author     shumkovnd <[email protected]>  2023-11-10 14:39:34 +0300
committer  shumkovnd <[email protected]>  2023-11-10 16:42:24 +0300
commit     77eb2d3fdcec5c978c64e025ced2764c57c00285 (patch)
tree       c51edb0748ca8d4a08d7c7323312c27ba1a8b79a /contrib/python/pandas/py2
parent     dd6d20cadb65582270ac23f4b3b14ae189704b9d (diff)
KIKIMR-19287: add task_stats_drawing script
Diffstat (limited to 'contrib/python/pandas/py2')
-rw-r--r--  contrib/python/pandas/py2/.dist-info/METADATA | 90
-rw-r--r--  contrib/python/pandas/py2/.dist-info/top_level.txt | 1
-rw-r--r--  contrib/python/pandas/py2/LICENSE | 29
-rw-r--r--  contrib/python/pandas/py2/README.md | 234
-rw-r--r--  contrib/python/pandas/py2/pandas/__init__.py | 101
-rw-r--r--  contrib/python/pandas/py2/pandas/_libs/__init__.py | 5
-rw-r--r--  contrib/python/pandas/py2/pandas/_libs/algos.pxd | 21
-rw-r--r--  contrib/python/pandas/py2/pandas/_libs/algos.pyx | 792
-rw-r--r--  contrib/python/pandas/py2/pandas/_libs/algos_common_helper.pxi | 403
-rw-r--r--  contrib/python/pandas/py2/pandas/_libs/algos_rank_helper.pxi | 870
-rw-r--r--  contrib/python/pandas/py2/pandas/_libs/algos_take_helper.pxi | 4986
-rw-r--r--  contrib/python/pandas/py2/pandas/_libs/groupby.pyx | 386
-rw-r--r--  contrib/python/pandas/py2/pandas/_libs/groupby_helper.pxi | 1638
-rw-r--r--  contrib/python/pandas/py2/pandas/_libs/hashing.pyx | 181
-rw-r--r--  contrib/python/pandas/py2/pandas/_libs/hashtable.pxd | 54
-rw-r--r--  contrib/python/pandas/py2/pandas/_libs/hashtable.pyx | 173
-rw-r--r--  contrib/python/pandas/py2/pandas/_libs/hashtable_class_helper.pxi | 1802
-rw-r--r--  contrib/python/pandas/py2/pandas/_libs/hashtable_func_helper.pxi | 764
-rw-r--r--  contrib/python/pandas/py2/pandas/_libs/index.pyx | 699
-rw-r--r--  contrib/python/pandas/py2/pandas/_libs/index_class_helper.pxi | 409
-rw-r--r--  contrib/python/pandas/py2/pandas/_libs/indexing.pyx | 23
-rw-r--r--  contrib/python/pandas/py2/pandas/_libs/internals.pyx | 465
-rw-r--r--  contrib/python/pandas/py2/pandas/_libs/interval.pyx | 488
-rw-r--r--  contrib/python/pandas/py2/pandas/_libs/intervaltree.pxi | 3618
-rw-r--r--  contrib/python/pandas/py2/pandas/_libs/join.pyx | 1006
-rw-r--r--  contrib/python/pandas/py2/pandas/_libs/khash.pxd | 141
-rw-r--r--  contrib/python/pandas/py2/pandas/_libs/lib.pyx | 2349
-rw-r--r--  contrib/python/pandas/py2/pandas/_libs/missing.pxd | 11
-rw-r--r--  contrib/python/pandas/py2/pandas/_libs/missing.pyx | 284
-rw-r--r--  contrib/python/pandas/py2/pandas/_libs/ops.pyx | 295
-rw-r--r--  contrib/python/pandas/py2/pandas/_libs/parsers.pyx | 2324
-rw-r--r--  contrib/python/pandas/py2/pandas/_libs/properties.pyx | 69
-rw-r--r--  contrib/python/pandas/py2/pandas/_libs/reduction.pyx | 641
-rw-r--r--  contrib/python/pandas/py2/pandas/_libs/reshape.pyx | 95
-rw-r--r--  contrib/python/pandas/py2/pandas/_libs/skiplist.pxd | 45
-rw-r--r--  contrib/python/pandas/py2/pandas/_libs/skiplist.pyx | 145
-rw-r--r--  contrib/python/pandas/py2/pandas/_libs/sparse.pyx | 807
-rw-r--r--  contrib/python/pandas/py2/pandas/_libs/sparse_op_helper.pxi | 5846
-rw-r--r--  contrib/python/pandas/py2/pandas/_libs/src/compat_helper.h | 50
-rw-r--r--  contrib/python/pandas/py2/pandas/_libs/src/headers/cmath | 36
-rw-r--r--  contrib/python/pandas/py2/pandas/_libs/src/headers/ms_stdint.h | 247
-rw-r--r--  contrib/python/pandas/py2/pandas/_libs/src/headers/portable.h | 15
-rw-r--r--  contrib/python/pandas/py2/pandas/_libs/src/headers/stdint.h | 10
-rw-r--r--  contrib/python/pandas/py2/pandas/_libs/src/inline_helper.h | 25
-rw-r--r--  contrib/python/pandas/py2/pandas/_libs/src/klib/khash.h | 569
-rw-r--r--  contrib/python/pandas/py2/pandas/_libs/src/klib/khash_python.h | 86
-rw-r--r--  contrib/python/pandas/py2/pandas/_libs/src/msgpack/pack.h | 103
-rw-r--r--  contrib/python/pandas/py2/pandas/_libs/src/msgpack/pack_template.h | 785
-rw-r--r--  contrib/python/pandas/py2/pandas/_libs/src/msgpack/sysdep.h | 194
-rw-r--r--  contrib/python/pandas/py2/pandas/_libs/src/msgpack/unpack.h | 278
-rw-r--r--  contrib/python/pandas/py2/pandas/_libs/src/msgpack/unpack_define.h | 95
-rw-r--r--  contrib/python/pandas/py2/pandas/_libs/src/msgpack/unpack_template.h | 475
-rw-r--r--  contrib/python/pandas/py2/pandas/_libs/src/parse_helper.h | 274
-rw-r--r--  contrib/python/pandas/py2/pandas/_libs/src/parser/io.c | 280
-rw-r--r--  contrib/python/pandas/py2/pandas/_libs/src/parser/io.h | 70
-rw-r--r--  contrib/python/pandas/py2/pandas/_libs/src/parser/tokenizer.c | 2033
-rw-r--r--  contrib/python/pandas/py2/pandas/_libs/src/parser/tokenizer.h | 270
-rw-r--r--  contrib/python/pandas/py2/pandas/_libs/src/skiplist.h | 279
-rw-r--r--  contrib/python/pandas/py2/pandas/_libs/src/ujson/lib/ultrajson.h | 317
-rw-r--r--  contrib/python/pandas/py2/pandas/_libs/src/ujson/lib/ultrajsondec.c | 1151
-rw-r--r--  contrib/python/pandas/py2/pandas/_libs/src/ujson/lib/ultrajsonenc.c | 1143
-rw-r--r--  contrib/python/pandas/py2/pandas/_libs/src/ujson/python/JSONtoObj.c | 638
-rw-r--r--  contrib/python/pandas/py2/pandas/_libs/src/ujson/python/objToJSON.c | 2539
-rw-r--r--  contrib/python/pandas/py2/pandas/_libs/src/ujson/python/py_defines.h | 58
-rw-r--r--  contrib/python/pandas/py2/pandas/_libs/src/ujson/python/ujson.c | 122
-rw-r--r--  contrib/python/pandas/py2/pandas/_libs/src/ujson/python/version.h | 43
-rw-r--r--  contrib/python/pandas/py2/pandas/_libs/testing.pyx | 220
-rw-r--r--  contrib/python/pandas/py2/pandas/_libs/tslib.pyx | 828
-rw-r--r--  contrib/python/pandas/py2/pandas/_libs/tslibs/__init__.py | 9
-rw-r--r--  contrib/python/pandas/py2/pandas/_libs/tslibs/ccalendar.pxd | 12
-rw-r--r--  contrib/python/pandas/py2/pandas/_libs/tslibs/ccalendar.pyx | 226
-rw-r--r--  contrib/python/pandas/py2/pandas/_libs/tslibs/conversion.pxd | 34
-rw-r--r--  contrib/python/pandas/py2/pandas/_libs/tslibs/conversion.pyx | 1335
-rw-r--r--  contrib/python/pandas/py2/pandas/_libs/tslibs/fields.pyx | 669
-rw-r--r--  contrib/python/pandas/py2/pandas/_libs/tslibs/frequencies.pxd | 9
-rw-r--r--  contrib/python/pandas/py2/pandas/_libs/tslibs/frequencies.pyx | 512
-rw-r--r--  contrib/python/pandas/py2/pandas/_libs/tslibs/nattype.pxd | 20
-rw-r--r--  contrib/python/pandas/py2/pandas/_libs/tslibs/nattype.pyx | 717
-rw-r--r--  contrib/python/pandas/py2/pandas/_libs/tslibs/np_datetime.pxd | 76
-rw-r--r--  contrib/python/pandas/py2/pandas/_libs/tslibs/np_datetime.pyx | 203
-rw-r--r--  contrib/python/pandas/py2/pandas/_libs/tslibs/offsets.pxd | 3
-rw-r--r--  contrib/python/pandas/py2/pandas/_libs/tslibs/offsets.pyx | 1127
-rw-r--r--  contrib/python/pandas/py2/pandas/_libs/tslibs/parsing.pyx | 749
-rw-r--r--  contrib/python/pandas/py2/pandas/_libs/tslibs/period.pyx | 2553
-rw-r--r--  contrib/python/pandas/py2/pandas/_libs/tslibs/resolution.pyx | 354
-rw-r--r--  contrib/python/pandas/py2/pandas/_libs/tslibs/src/datetime/np_datetime.c | 814
-rw-r--r--  contrib/python/pandas/py2/pandas/_libs/tslibs/src/datetime/np_datetime.h | 80
-rw-r--r--  contrib/python/pandas/py2/pandas/_libs/tslibs/src/datetime/np_datetime_strings.c | 886
-rw-r--r--  contrib/python/pandas/py2/pandas/_libs/tslibs/src/datetime/np_datetime_strings.h | 83
-rw-r--r--  contrib/python/pandas/py2/pandas/_libs/tslibs/strptime.pyx | 668
-rw-r--r--  contrib/python/pandas/py2/pandas/_libs/tslibs/timedeltas.pxd | 8
-rw-r--r--  contrib/python/pandas/py2/pandas/_libs/tslibs/timedeltas.pyx | 1534
-rw-r--r--  contrib/python/pandas/py2/pandas/_libs/tslibs/timestamps.pxd | 8
-rw-r--r--  contrib/python/pandas/py2/pandas/_libs/tslibs/timestamps.pyx | 1349
-rw-r--r--  contrib/python/pandas/py2/pandas/_libs/tslibs/timezones.pxd | 16
-rw-r--r--  contrib/python/pandas/py2/pandas/_libs/tslibs/timezones.pyx | 359
-rw-r--r--  contrib/python/pandas/py2/pandas/_libs/tslibs/util.pxd | 229
-rw-r--r--  contrib/python/pandas/py2/pandas/_libs/util.pxd | 114
-rw-r--r--  contrib/python/pandas/py2/pandas/_libs/window.pyx | 1910
-rw-r--r--  contrib/python/pandas/py2/pandas/_libs/writers.pyx | 167
-rw-r--r--  contrib/python/pandas/py2/pandas/_version.py | 23
-rw-r--r--  contrib/python/pandas/py2/pandas/api/__init__.py | 2
-rw-r--r--  contrib/python/pandas/py2/pandas/api/extensions/__init__.py | 10
-rw-r--r--  contrib/python/pandas/py2/pandas/api/types/__init__.py | 9
-rw-r--r--  contrib/python/pandas/py2/pandas/arrays/__init__.py | 23
-rw-r--r--  contrib/python/pandas/py2/pandas/compat/__init__.py | 470
-rw-r--r--  contrib/python/pandas/py2/pandas/compat/chainmap.py | 27
-rw-r--r--  contrib/python/pandas/py2/pandas/compat/chainmap_impl.py | 157
-rw-r--r--  contrib/python/pandas/py2/pandas/compat/numpy/__init__.py | 72
-rw-r--r--  contrib/python/pandas/py2/pandas/compat/numpy/function.py | 402
-rw-r--r--  contrib/python/pandas/py2/pandas/compat/pickle_compat.py | 229
-rw-r--r--  contrib/python/pandas/py2/pandas/core/__init__.py | 0
-rw-r--r--  contrib/python/pandas/py2/pandas/core/accessor.py | 281
-rw-r--r--  contrib/python/pandas/py2/pandas/core/algorithms.py | 1826
-rw-r--r--  contrib/python/pandas/py2/pandas/core/api.py | 64
-rw-r--r--  contrib/python/pandas/py2/pandas/core/apply.py | 411
-rw-r--r--  contrib/python/pandas/py2/pandas/core/arrays/__init__.py | 13
-rw-r--r--  contrib/python/pandas/py2/pandas/core/arrays/_ranges.py | 188
-rw-r--r--  contrib/python/pandas/py2/pandas/core/arrays/array_.py | 274
-rw-r--r--  contrib/python/pandas/py2/pandas/core/arrays/base.py | 1120
-rw-r--r--  contrib/python/pandas/py2/pandas/core/arrays/categorical.py | 2708
-rw-r--r--  contrib/python/pandas/py2/pandas/core/arrays/datetimelike.py | 1598
-rw-r--r--  contrib/python/pandas/py2/pandas/core/arrays/datetimes.py | 2152
-rw-r--r--  contrib/python/pandas/py2/pandas/core/arrays/integer.py | 706
-rw-r--r--  contrib/python/pandas/py2/pandas/core/arrays/interval.py | 1104
-rw-r--r--  contrib/python/pandas/py2/pandas/core/arrays/numpy_.py | 458
-rw-r--r--  contrib/python/pandas/py2/pandas/core/arrays/period.py | 956
-rw-r--r--  contrib/python/pandas/py2/pandas/core/arrays/sparse.py | 2028
-rw-r--r--  contrib/python/pandas/py2/pandas/core/arrays/timedeltas.py | 1069
-rw-r--r--  contrib/python/pandas/py2/pandas/core/base.py | 1530
-rw-r--r--  contrib/python/pandas/py2/pandas/core/categorical.py | 9
-rw-r--r--  contrib/python/pandas/py2/pandas/core/common.py | 472
-rw-r--r--  contrib/python/pandas/py2/pandas/core/computation/__init__.py | 0
-rw-r--r--  contrib/python/pandas/py2/pandas/core/computation/align.py | 179
-rw-r--r--  contrib/python/pandas/py2/pandas/core/computation/api.py | 3
-rw-r--r--  contrib/python/pandas/py2/pandas/core/computation/check.py | 24
-rw-r--r--  contrib/python/pandas/py2/pandas/core/computation/common.py | 26
-rw-r--r--  contrib/python/pandas/py2/pandas/core/computation/engines.py | 151
-rw-r--r--  contrib/python/pandas/py2/pandas/core/computation/eval.py | 351
-rw-r--r--  contrib/python/pandas/py2/pandas/core/computation/expr.py | 776
-rw-r--r--  contrib/python/pandas/py2/pandas/core/computation/expressions.py | 251
-rw-r--r--  contrib/python/pandas/py2/pandas/core/computation/ops.py | 561
-rw-r--r--  contrib/python/pandas/py2/pandas/core/computation/pytables.py | 604
-rw-r--r--  contrib/python/pandas/py2/pandas/core/computation/scope.py | 302
-rw-r--r--  contrib/python/pandas/py2/pandas/core/config.py | 837
-rw-r--r--  contrib/python/pandas/py2/pandas/core/config_init.py | 507
-rw-r--r--  contrib/python/pandas/py2/pandas/core/dtypes/__init__.py | 0
-rw-r--r--  contrib/python/pandas/py2/pandas/core/dtypes/api.py | 14
-rw-r--r--  contrib/python/pandas/py2/pandas/core/dtypes/base.py | 294
-rw-r--r--  contrib/python/pandas/py2/pandas/core/dtypes/cast.py | 1328
-rw-r--r--  contrib/python/pandas/py2/pandas/core/dtypes/common.py | 2031
-rw-r--r--  contrib/python/pandas/py2/pandas/core/dtypes/concat.py | 583
-rw-r--r--  contrib/python/pandas/py2/pandas/core/dtypes/dtypes.py | 991
-rw-r--r--  contrib/python/pandas/py2/pandas/core/dtypes/generic.py | 84
-rw-r--r--  contrib/python/pandas/py2/pandas/core/dtypes/inference.py | 499
-rw-r--r--  contrib/python/pandas/py2/pandas/core/dtypes/missing.py | 529
-rw-r--r--  contrib/python/pandas/py2/pandas/core/frame.py | 7976
-rw-r--r--  contrib/python/pandas/py2/pandas/core/generic.py | 11039
-rw-r--r--  contrib/python/pandas/py2/pandas/core/groupby/__init__.py | 4
-rw-r--r--  contrib/python/pandas/py2/pandas/core/groupby/base.py | 158
-rw-r--r--  contrib/python/pandas/py2/pandas/core/groupby/categorical.py | 100
-rw-r--r--  contrib/python/pandas/py2/pandas/core/groupby/generic.py | 1673
-rw-r--r--  contrib/python/pandas/py2/pandas/core/groupby/groupby.py | 2110
-rw-r--r--  contrib/python/pandas/py2/pandas/core/groupby/grouper.py | 632
-rw-r--r--  contrib/python/pandas/py2/pandas/core/groupby/ops.py | 898
-rw-r--r--  contrib/python/pandas/py2/pandas/core/index.py | 3
-rw-r--r--  contrib/python/pandas/py2/pandas/core/indexes/__init__.py | 0
-rw-r--r--  contrib/python/pandas/py2/pandas/core/indexes/accessors.py | 325
-rw-r--r--  contrib/python/pandas/py2/pandas/core/indexes/api.py | 286
-rw-r--r--  contrib/python/pandas/py2/pandas/core/indexes/base.py | 5410
-rw-r--r--  contrib/python/pandas/py2/pandas/core/indexes/category.py | 852
-rw-r--r--  contrib/python/pandas/py2/pandas/core/indexes/datetimelike.py | 724
-rw-r--r--  contrib/python/pandas/py2/pandas/core/indexes/datetimes.py | 1679
-rw-r--r--  contrib/python/pandas/py2/pandas/core/indexes/frozen.py | 196
-rw-r--r--  contrib/python/pandas/py2/pandas/core/indexes/interval.py | 1315
-rw-r--r--  contrib/python/pandas/py2/pandas/core/indexes/multi.py | 3166
-rw-r--r--  contrib/python/pandas/py2/pandas/core/indexes/numeric.py | 450
-rw-r--r--  contrib/python/pandas/py2/pandas/core/indexes/period.py | 966
-rw-r--r--  contrib/python/pandas/py2/pandas/core/indexes/range.py | 702
-rw-r--r--  contrib/python/pandas/py2/pandas/core/indexes/timedeltas.py | 804
-rwxr-xr-x  contrib/python/pandas/py2/pandas/core/indexing.py | 2766
-rw-r--r--  contrib/python/pandas/py2/pandas/core/internals/__init__.py | 13
-rw-r--r--  contrib/python/pandas/py2/pandas/core/internals/arrays.py | 55
-rw-r--r--  contrib/python/pandas/py2/pandas/core/internals/blocks.py | 3299
-rw-r--r--  contrib/python/pandas/py2/pandas/core/internals/concat.py | 485
-rw-r--r--  contrib/python/pandas/py2/pandas/core/internals/construction.py | 721
-rw-r--r--  contrib/python/pandas/py2/pandas/core/internals/managers.py | 2065
-rw-r--r--  contrib/python/pandas/py2/pandas/core/missing.py | 748
-rw-r--r--  contrib/python/pandas/py2/pandas/core/nanops.py | 1272
-rw-r--r--  contrib/python/pandas/py2/pandas/core/ops.py | 2309
-rw-r--r--  contrib/python/pandas/py2/pandas/core/panel.py | 1588
-rw-r--r--  contrib/python/pandas/py2/pandas/core/resample.py | 1766
-rw-r--r--  contrib/python/pandas/py2/pandas/core/reshape/__init__.py | 0
-rw-r--r--  contrib/python/pandas/py2/pandas/core/reshape/api.py | 8
-rw-r--r--  contrib/python/pandas/py2/pandas/core/reshape/concat.py | 635
-rw-r--r--  contrib/python/pandas/py2/pandas/core/reshape/melt.py | 461
-rw-r--r--  contrib/python/pandas/py2/pandas/core/reshape/merge.py | 1752
-rw-r--r--  contrib/python/pandas/py2/pandas/core/reshape/pivot.py | 618
-rw-r--r--  contrib/python/pandas/py2/pandas/core/reshape/reshape.py | 1044
-rw-r--r--  contrib/python/pandas/py2/pandas/core/reshape/tile.py | 559
-rw-r--r--  contrib/python/pandas/py2/pandas/core/reshape/util.py | 57
-rw-r--r--  contrib/python/pandas/py2/pandas/core/series.py | 4394
-rw-r--r--  contrib/python/pandas/py2/pandas/core/sorting.py | 508
-rw-r--r--  contrib/python/pandas/py2/pandas/core/sparse/__init__.py | 0
-rw-r--r--  contrib/python/pandas/py2/pandas/core/sparse/api.py | 5
-rw-r--r--  contrib/python/pandas/py2/pandas/core/sparse/frame.py | 1043
-rw-r--r--  contrib/python/pandas/py2/pandas/core/sparse/scipy_sparse.py | 131
-rw-r--r--  contrib/python/pandas/py2/pandas/core/sparse/series.py | 592
-rw-r--r--  contrib/python/pandas/py2/pandas/core/strings.py | 3184
-rw-r--r--  contrib/python/pandas/py2/pandas/core/tools/__init__.py | 0
-rw-r--r--  contrib/python/pandas/py2/pandas/core/tools/datetimes.py | 903
-rw-r--r--  contrib/python/pandas/py2/pandas/core/tools/numeric.py | 179
-rw-r--r--  contrib/python/pandas/py2/pandas/core/tools/timedeltas.py | 166
-rw-r--r--  contrib/python/pandas/py2/pandas/core/util/__init__.py | 0
-rw-r--r--  contrib/python/pandas/py2/pandas/core/util/hashing.py | 333
-rw-r--r--  contrib/python/pandas/py2/pandas/core/window.py | 2649
-rw-r--r--  contrib/python/pandas/py2/pandas/errors/__init__.py | 183
-rw-r--r--  contrib/python/pandas/py2/pandas/io/__init__.py | 0
-rw-r--r--  contrib/python/pandas/py2/pandas/io/api.py | 20
-rw-r--r--  contrib/python/pandas/py2/pandas/io/clipboard/__init__.py | 125
-rw-r--r--  contrib/python/pandas/py2/pandas/io/clipboard/clipboards.py | 145
-rw-r--r--  contrib/python/pandas/py2/pandas/io/clipboard/exceptions.py | 12
-rw-r--r--  contrib/python/pandas/py2/pandas/io/clipboard/windows.py | 154
-rw-r--r--  contrib/python/pandas/py2/pandas/io/clipboards.py | 145
-rw-r--r--  contrib/python/pandas/py2/pandas/io/common.py | 617
-rw-r--r--  contrib/python/pandas/py2/pandas/io/date_converters.py | 64
-rw-r--r--  contrib/python/pandas/py2/pandas/io/excel.py | 1996
-rw-r--r--  contrib/python/pandas/py2/pandas/io/feather_format.py | 127
-rw-r--r--  contrib/python/pandas/py2/pandas/io/formats/__init__.py | 0
-rw-r--r--  contrib/python/pandas/py2/pandas/io/formats/console.py | 159
-rw-r--r--  contrib/python/pandas/py2/pandas/io/formats/css.py | 250
-rw-r--r--  contrib/python/pandas/py2/pandas/io/formats/csvs.py | 315
-rw-r--r--  contrib/python/pandas/py2/pandas/io/formats/excel.py | 664
-rw-r--r--  contrib/python/pandas/py2/pandas/io/formats/format.py | 1626
-rw-r--r--  contrib/python/pandas/py2/pandas/io/formats/html.py | 531
-rw-r--r--  contrib/python/pandas/py2/pandas/io/formats/latex.py | 246
-rw-r--r--  contrib/python/pandas/py2/pandas/io/formats/printing.py | 435
-rw-r--r--  contrib/python/pandas/py2/pandas/io/formats/style.py | 1367
-rw-r--r--  contrib/python/pandas/py2/pandas/io/formats/templates/html.tpl | 70
-rw-r--r--  contrib/python/pandas/py2/pandas/io/formats/terminal.py | 152
-rw-r--r--  contrib/python/pandas/py2/pandas/io/gbq.py | 162
-rw-r--r--  contrib/python/pandas/py2/pandas/io/gcs.py | 16
-rw-r--r--  contrib/python/pandas/py2/pandas/io/html.py | 1094
-rw-r--r--  contrib/python/pandas/py2/pandas/io/json/__init__.py | 5
-rw-r--r--  contrib/python/pandas/py2/pandas/io/json/json.py | 951
-rw-r--r--  contrib/python/pandas/py2/pandas/io/json/normalize.py | 286
-rw-r--r--  contrib/python/pandas/py2/pandas/io/json/table_schema.py | 326
-rw-r--r--  contrib/python/pandas/py2/pandas/io/msgpack/__init__.py | 50
-rw-r--r--  contrib/python/pandas/py2/pandas/io/msgpack/_packer.pyx | 308
-rw-r--r--  contrib/python/pandas/py2/pandas/io/msgpack/_unpacker.pyx | 486
-rw-r--r--  contrib/python/pandas/py2/pandas/io/msgpack/_version.py | 1
-rw-r--r--  contrib/python/pandas/py2/pandas/io/msgpack/exceptions.py | 32
-rw-r--r--  contrib/python/pandas/py2/pandas/io/packers.py | 830
-rw-r--r--  contrib/python/pandas/py2/pandas/io/parquet.py | 282
-rwxr-xr-x  contrib/python/pandas/py2/pandas/io/parsers.py | 3610
-rw-r--r--  contrib/python/pandas/py2/pandas/io/pickle.py | 201
-rw-r--r--  contrib/python/pandas/py2/pandas/io/pytables.py | 4890
-rw-r--r--  contrib/python/pandas/py2/pandas/io/s3.py | 40
-rw-r--r--  contrib/python/pandas/py2/pandas/io/sas/__init__.py | 1
-rw-r--r--  contrib/python/pandas/py2/pandas/io/sas/sas.pyx | 445
-rw-r--r--  contrib/python/pandas/py2/pandas/io/sas/sas7bdat.py | 703
-rw-r--r--  contrib/python/pandas/py2/pandas/io/sas/sas_constants.py | 171
-rw-r--r--  contrib/python/pandas/py2/pandas/io/sas/sas_xport.py | 464
-rw-r--r--  contrib/python/pandas/py2/pandas/io/sas/sasreader.py | 68
-rw-r--r--  contrib/python/pandas/py2/pandas/io/sql.py | 1596
-rw-r--r--  contrib/python/pandas/py2/pandas/io/stata.py | 2988
-rw-r--r--  contrib/python/pandas/py2/pandas/plotting/__init__.py | 20
-rw-r--r--  contrib/python/pandas/py2/pandas/plotting/_compat.py | 25
-rw-r--r--  contrib/python/pandas/py2/pandas/plotting/_converter.py | 1154
-rw-r--r--  contrib/python/pandas/py2/pandas/plotting/_core.py | 3605
-rw-r--r--  contrib/python/pandas/py2/pandas/plotting/_misc.py | 640
-rw-r--r--  contrib/python/pandas/py2/pandas/plotting/_style.py | 168
-rw-r--r--  contrib/python/pandas/py2/pandas/plotting/_timeseries.py | 353
-rw-r--r--  contrib/python/pandas/py2/pandas/plotting/_tools.py | 382
-rw-r--r--  contrib/python/pandas/py2/pandas/testing.py | 8
-rw-r--r--  contrib/python/pandas/py2/pandas/tests/io/data/fixed_width_format.txt | 3
-rw-r--r--  contrib/python/pandas/py2/pandas/tests/io/data/gbq_fake_job.txt | 1
-rw-r--r--  contrib/python/pandas/py2/pandas/tests/io/msgpack/test_newspec.py | 92
-rw-r--r--  contrib/python/pandas/py2/pandas/tests/io/parser/data/utf16_ex.txt | bin 0 -> 11406 bytes
-rw-r--r--  contrib/python/pandas/py2/pandas/tseries/__init__.py | 0
-rw-r--r--  contrib/python/pandas/py2/pandas/tseries/api.py | 8
-rw-r--r--  contrib/python/pandas/py2/pandas/tseries/converter.py | 16
-rw-r--r--  contrib/python/pandas/py2/pandas/tseries/frequencies.py | 497
-rw-r--r--  contrib/python/pandas/py2/pandas/tseries/holiday.py | 513
-rw-r--r--  contrib/python/pandas/py2/pandas/tseries/offsets.py | 2514
-rw-r--r--  contrib/python/pandas/py2/pandas/tseries/plotting.py | 3
-rw-r--r--  contrib/python/pandas/py2/pandas/util/__init__.py | 2
-rw-r--r--  contrib/python/pandas/py2/pandas/util/_decorators.py | 352
-rw-r--r--  contrib/python/pandas/py2/pandas/util/_depr_module.py | 103
-rw-r--r--  contrib/python/pandas/py2/pandas/util/_doctools.py | 206
-rw-r--r--  contrib/python/pandas/py2/pandas/util/_exceptions.py | 16
-rw-r--r--  contrib/python/pandas/py2/pandas/util/_print_versions.py | 159
-rw-r--r--  contrib/python/pandas/py2/pandas/util/_test_decorators.py | 210
-rw-r--r--  contrib/python/pandas/py2/pandas/util/_tester.py | 29
-rw-r--r--  contrib/python/pandas/py2/pandas/util/_validators.py | 358
-rw-r--r--  contrib/python/pandas/py2/pandas/util/move.c | 268
-rw-r--r--  contrib/python/pandas/py2/pandas/util/testing.py | 3067
-rw-r--r--  contrib/python/pandas/py2/symbols.cmake | 175
-rw-r--r--  contrib/python/pandas/py2/ya.make | 299
299 files changed, 216103 insertions, 0 deletions
diff --git a/contrib/python/pandas/py2/.dist-info/METADATA b/contrib/python/pandas/py2/.dist-info/METADATA
new file mode 100644
index 00000000000..a1e9ec057d7
--- /dev/null
+++ b/contrib/python/pandas/py2/.dist-info/METADATA
@@ -0,0 +1,90 @@
+Metadata-Version: 2.1
+Name: pandas
+Version: 0.24.2
+Summary: Powerful data structures for data analysis, time series, and statistics
+Home-page: http://pandas.pydata.org
+Maintainer: The PyData Development Team
+Maintainer-email: [email protected]
+License: BSD
+Platform: any
+Classifier: Development Status :: 5 - Production/Stable
+Classifier: Environment :: Console
+Classifier: Operating System :: OS Independent
+Classifier: Intended Audience :: Science/Research
+Classifier: Programming Language :: Python
+Classifier: Programming Language :: Python :: 2
+Classifier: Programming Language :: Python :: 3
+Classifier: Programming Language :: Python :: 2.7
+Classifier: Programming Language :: Python :: 3.5
+Classifier: Programming Language :: Python :: 3.6
+Classifier: Programming Language :: Python :: 3.7
+Classifier: Programming Language :: Cython
+Classifier: Topic :: Scientific/Engineering
+Requires-Python: >=2.7,!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*
+Requires-Dist: python-dateutil (>=2.5.0)
+Requires-Dist: pytz (>=2011k)
+Requires-Dist: numpy (>=1.12.0)
+
+**pandas** is a Python package providing fast, flexible, and expressive data
+structures designed to make working with structured (tabular, multidimensional,
+potentially heterogeneous) and time series data both easy and intuitive. It
+aims to be the fundamental high-level building block for doing practical,
+**real world** data analysis in Python. Additionally, it has the broader goal
+of becoming **the most powerful and flexible open source data analysis /
+manipulation tool available in any language**. It is already well on its way
+toward this goal.
+
+pandas is well suited for many different kinds of data:
+
+ - Tabular data with heterogeneously-typed columns, as in an SQL table or
+ Excel spreadsheet
+ - Ordered and unordered (not necessarily fixed-frequency) time series data.
+ - Arbitrary matrix data (homogeneously typed or heterogeneous) with row and
+ column labels
+ - Any other form of observational / statistical data sets. The data actually
+ need not be labeled at all to be placed into a pandas data structure
+
+The two primary data structures of pandas, Series (1-dimensional) and DataFrame
+(2-dimensional), handle the vast majority of typical use cases in finance,
+statistics, social science, and many areas of engineering. For R users,
+DataFrame provides everything that R's ``data.frame`` provides and much
+more. pandas is built on top of `NumPy <http://www.numpy.org>`__ and is
+intended to integrate well within a scientific computing environment with many
+other 3rd party libraries.
+
+Here are just a few of the things that pandas does well:
+
+ - Easy handling of **missing data** (represented as NaN) in floating point as
+ well as non-floating point data
+ - Size mutability: columns can be **inserted and deleted** from DataFrame and
+ higher dimensional objects
+ - Automatic and explicit **data alignment**: objects can be explicitly
+ aligned to a set of labels, or the user can simply ignore the labels and
+ let `Series`, `DataFrame`, etc. automatically align the data for you in
+ computations
+ - Powerful, flexible **group by** functionality to perform
+ split-apply-combine operations on data sets, for both aggregating and
+ transforming data
+ - Make it **easy to convert** ragged, differently-indexed data in other
+ Python and NumPy data structures into DataFrame objects
+ - Intelligent label-based **slicing**, **fancy indexing**, and **subsetting**
+ of large data sets
+ - Intuitive **merging** and **joining** data sets
+ - Flexible **reshaping** and pivoting of data sets
+ - **Hierarchical** labeling of axes (possible to have multiple labels per
+ tick)
+ - Robust IO tools for loading data from **flat files** (CSV and delimited),
+ Excel files, databases, and saving / loading data from the ultrafast **HDF5
+ format**
+ - **Time series**-specific functionality: date range generation and frequency
+ conversion, moving window statistics, moving window linear regressions,
+ date shifting and lagging, etc.
+
+Many of these principles are here to address the shortcomings frequently
+experienced using other languages / scientific research environments. For data
+scientists, working with data is typically divided into multiple stages:
+munging and cleaning data, analyzing / modeling it, then organizing the results
+of the analysis into a form suitable for plotting or tabular display. pandas is
+the ideal tool for all of these tasks.
+
+
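To make the feature list in this METADATA concrete, here is a minimal illustrative sketch (an editorial addition, not part of the committed file), assuming only that pandas 0.24 and NumPy are importable:

```python
import numpy as np
import pandas as pd

# Missing data is represented as NaN, and objects align on their labels:
s1 = pd.Series([1.0, 2.0, np.nan], index=["a", "b", "c"])
s2 = pd.Series([10.0, 20.0], index=["b", "c"])
print(s1 + s2)  # union of the indexes; "a" has no counterpart in s2 -> NaN

# Split-apply-combine (group by) for aggregation:
df = pd.DataFrame({"key": ["x", "x", "y"], "val": [1, 2, 3]})
print(df.groupby("key")["val"].sum())  # x -> 3, y -> 3
```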
diff --git a/contrib/python/pandas/py2/.dist-info/top_level.txt b/contrib/python/pandas/py2/.dist-info/top_level.txt
new file mode 100644
index 00000000000..fb6c7ed7ec6
--- /dev/null
+++ b/contrib/python/pandas/py2/.dist-info/top_level.txt
@@ -0,0 +1 @@
+pandas
diff --git a/contrib/python/pandas/py2/LICENSE b/contrib/python/pandas/py2/LICENSE
new file mode 100644
index 00000000000..924de26253b
--- /dev/null
+++ b/contrib/python/pandas/py2/LICENSE
@@ -0,0 +1,29 @@
+BSD 3-Clause License
+
+Copyright (c) 2008-2012, AQR Capital Management, LLC, Lambda Foundry, Inc. and PyData Development Team
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+* Redistributions of source code must retain the above copyright notice, this
+ list of conditions and the following disclaimer.
+
+* Redistributions in binary form must reproduce the above copyright notice,
+ this list of conditions and the following disclaimer in the documentation
+ and/or other materials provided with the distribution.
+
+* Neither the name of the copyright holder nor the names of its
+ contributors may be used to endorse or promote products derived from
+ this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
diff --git a/contrib/python/pandas/py2/README.md b/contrib/python/pandas/py2/README.md
new file mode 100644
index 00000000000..ce228187058
--- /dev/null
+++ b/contrib/python/pandas/py2/README.md
@@ -0,0 +1,234 @@
+<div align="center">
+ <img src="https://github.com/pandas-dev/pandas/blob/master/doc/logo/pandas_logo.png"><br>
+</div>
+
+-----------------
+
+# pandas: powerful Python data analysis toolkit
+
+<table>
+<tr>
+ <td>Latest Release</td>
+ <td>
+ <a href="https://pypi.org/project/pandas/">
+ <img src="https://img.shields.io/pypi/v/pandas.svg" alt="latest release" />
+ </a>
+ </td>
+</tr>
+ <td></td>
+ <td>
+ <a href="https://anaconda.org/anaconda/pandas/">
+ <img src="https://anaconda.org/conda-forge/pandas/badges/version.svg" alt="latest release" />
+ </a>
+</td>
+</tr>
+<tr>
+ <td>Package Status</td>
+ <td>
+ <a href="https://pypi.org/project/pandas/">
+ <img src="https://img.shields.io/pypi/status/pandas.svg" alt="status" /></td>
+ </a>
+</tr>
+<tr>
+ <td>License</td>
+ <td>
+ <a href="https://github.com/pandas-dev/pandas/blob/master/LICENSE">
+ <img src="https://img.shields.io/pypi/l/pandas.svg" alt="license" />
+ </a>
+</td>
+</tr>
+<tr>
+ <td>Build Status</td>
+ <td>
+ <a href="https://travis-ci.org/pandas-dev/pandas">
+ <img src="https://travis-ci.org/pandas-dev/pandas.svg?branch=master" alt="travis build status" />
+ </a>
+ </td>
+</tr>
+<tr>
+ <td></td>
+ <td>
+ <a href="https://dev.azure.com/pandas-dev/pandas/_build/latest?definitionId=1&branch=master">
+ <img src="https://dev.azure.com/pandas-dev/pandas/_apis/build/status/pandas-dev.pandas?branch=master" alt="Azure Pipelines build status" />
+ </a>
+ </td>
+</tr>
+<tr>
+ <td>Coverage</td>
+  <td>
+ <a href="https://codecov.io/gh/pandas-dev/pandas">
+ <img src="https://codecov.io/github/pandas-dev/pandas/coverage.svg?branch=master" alt="coverage" />
+ </a>
+ </td>
+</tr>
+<tr>
+ <td>Downloads</td>
+ <td>
+ <a href="https://pandas.pydata.org">
+ <img src="https://anaconda.org/conda-forge/pandas/badges/downloads.svg" alt="conda-forge downloads" />
+ </a>
+ </td>
+</tr>
+<tr>
+ <td>Gitter</td>
+ <td>
+ <a href="https://gitter.im/pydata/pandas">
+ <img src="https://badges.gitter.im/Join%20Chat.svg"
+ </a>
+ </td>
+</tr>
+</table>
+
+
+
+## What is it?
+
+**pandas** is a Python package providing fast, flexible, and expressive data
+structures designed to make working with "relational" or "labeled" data both
+easy and intuitive. It aims to be the fundamental high-level building block for
+doing practical, **real world** data analysis in Python. Additionally, it has
+the broader goal of becoming **the most powerful and flexible open source data
+analysis / manipulation tool available in any language**. It is already well on
+its way towards this goal.
+
+## Main Features
+Here are just a few of the things that pandas does well:
+
+ - Easy handling of [**missing data**][missing-data] (represented as
+ `NaN`) in floating point as well as non-floating point data
+ - Size mutability: columns can be [**inserted and
+ deleted**][insertion-deletion] from DataFrame and higher dimensional
+ objects
+ - Automatic and explicit [**data alignment**][alignment]: objects can
+ be explicitly aligned to a set of labels, or the user can simply
+ ignore the labels and let `Series`, `DataFrame`, etc. automatically
+ align the data for you in computations
+ - Powerful, flexible [**group by**][groupby] functionality to perform
+ split-apply-combine operations on data sets, for both aggregating
+ and transforming data
+ - Make it [**easy to convert**][conversion] ragged,
+ differently-indexed data in other Python and NumPy data structures
+ into DataFrame objects
+ - Intelligent label-based [**slicing**][slicing], [**fancy
+ indexing**][fancy-indexing], and [**subsetting**][subsetting] of
+ large data sets
+ - Intuitive [**merging**][merging] and [**joining**][joining] data
+ sets
+ - Flexible [**reshaping**][reshape] and [**pivoting**][pivot-table] of
+ data sets
+ - [**Hierarchical**][mi] labeling of axes (possible to have multiple
+ labels per tick)
+ - Robust IO tools for loading data from [**flat files**][flat-files]
+ (CSV and delimited), [**Excel files**][excel], [**databases**][db],
+ and saving/loading data from the ultrafast [**HDF5 format**][hdfstore]
+ - [**Time series**][timeseries]-specific functionality: date range
+ generation and frequency conversion, moving window statistics,
+ moving window linear regressions, date shifting and lagging, etc.
+
+
+ [missing-data]: https://pandas.pydata.org/pandas-docs/stable/missing_data.html#working-with-missing-data
+ [insertion-deletion]: https://pandas.pydata.org/pandas-docs/stable/dsintro.html#column-selection-addition-deletion
+ [alignment]: https://pandas.pydata.org/pandas-docs/stable/dsintro.html?highlight=alignment#intro-to-data-structures
+ [groupby]: https://pandas.pydata.org/pandas-docs/stable/groupby.html#group-by-split-apply-combine
+ [conversion]: https://pandas.pydata.org/pandas-docs/stable/dsintro.html#dataframe
+ [slicing]: https://pandas.pydata.org/pandas-docs/stable/indexing.html#slicing-ranges
+ [fancy-indexing]: https://pandas.pydata.org/pandas-docs/stable/indexing.html#advanced-indexing-with-ix
+ [subsetting]: https://pandas.pydata.org/pandas-docs/stable/indexing.html#boolean-indexing
+ [merging]: https://pandas.pydata.org/pandas-docs/stable/merging.html#database-style-dataframe-joining-merging
+ [joining]: https://pandas.pydata.org/pandas-docs/stable/merging.html#joining-on-index
+ [reshape]: https://pandas.pydata.org/pandas-docs/stable/reshaping.html#reshaping-and-pivot-tables
+ [pivot-table]: https://pandas.pydata.org/pandas-docs/stable/reshaping.html#pivot-tables-and-cross-tabulations
+ [mi]: https://pandas.pydata.org/pandas-docs/stable/indexing.html#hierarchical-indexing-multiindex
+ [flat-files]: https://pandas.pydata.org/pandas-docs/stable/io.html#csv-text-files
+ [excel]: https://pandas.pydata.org/pandas-docs/stable/io.html#excel-files
+ [db]: https://pandas.pydata.org/pandas-docs/stable/io.html#sql-queries
+ [hdfstore]: https://pandas.pydata.org/pandas-docs/stable/io.html#hdf5-pytables
+ [timeseries]: https://pandas.pydata.org/pandas-docs/stable/timeseries.html#time-series-date-functionality
+
+## Where to get it
+The source code is currently hosted on GitHub at:
+https://github.com/pandas-dev/pandas
+
+Binary installers for the latest released version are available at the [Python
+package index](https://pypi.org/project/pandas) and on conda.
+
+```sh
+# conda
+conda install pandas
+```
+
+```sh
+# or PyPI
+pip install pandas
+```
+
+## Dependencies
+- [NumPy](https://www.numpy.org): 1.12.0 or higher
+- [python-dateutil](https://labix.org/python-dateutil): 2.5.0 or higher
+- [pytz](https://pythonhosted.org/pytz): 2011k or higher
+
+See the [full installation instructions](https://pandas.pydata.org/pandas-docs/stable/install.html#dependencies)
+for recommended and optional dependencies.
+
+## Installation from sources
+To install pandas from source you need Cython in addition to the normal
+dependencies above. Cython can be installed from pypi:
+
+```sh
+pip install cython
+```
+
+In the `pandas` directory (same one where you found this file after
+cloning the git repo), execute:
+
+```sh
+python setup.py install
+```
+
+or for installing in [development mode](https://pip.pypa.io/en/latest/reference/pip_install.html#editable-installs):
+
+```sh
+python setup.py develop
+```
+
+Alternatively, you can use `pip` if you want all the dependencies pulled
+in automatically (the `-e` option is for installing it in [development
+mode](https://pip.pypa.io/en/latest/reference/pip_install.html#editable-installs)):
+
+```sh
+pip install -e .
+```
+
+See the full instructions for [installing from source](https://pandas.pydata.org/pandas-docs/stable/install.html#installing-from-source).
+
+## License
+[BSD 3](LICENSE)
+
+## Documentation
+The official documentation is hosted on PyData.org: https://pandas.pydata.org/pandas-docs/stable
+
+## Background
+Work on ``pandas`` started at AQR (a quantitative hedge fund) in 2008 and
+has been under active development since then.
+
+## Getting Help
+
+For usage questions, the best place to go to is [StackOverflow](https://stackoverflow.com/questions/tagged/pandas).
+Further, general questions and discussions can also take place on the [pydata mailing list](https://groups.google.com/forum/?fromgroups#!forum/pydata).
+
+## Discussion and Development
+Most development discussion is taking place on github in this repo. Further, the [pandas-dev mailing list](https://mail.python.org/mailman/listinfo/pandas-dev) can also be used for specialized discussions or design issues, and a [Gitter channel](https://gitter.im/pydata/pandas) is available for quick development related questions.
+
+## Contributing to pandas [![Open Source Helpers](https://www.codetriage.com/pandas-dev/pandas/badges/users.svg)](https://www.codetriage.com/pandas-dev/pandas)
+
+All contributions, bug reports, bug fixes, documentation improvements, enhancements and ideas are welcome.
+
+A detailed overview on how to contribute can be found in the **[contributing guide](https://pandas-docs.github.io/pandas-docs-travis/contributing.html)**. There is also an [overview](.github/CONTRIBUTING.md) on GitHub.
+
+If you are simply looking to start working with the pandas codebase, navigate to the [GitHub "issues" tab](https://github.com/pandas-dev/pandas/issues) and start looking through interesting issues. There are a number of issues listed under [Docs](https://github.com/pandas-dev/pandas/issues?labels=Docs&sort=updated&state=open) and [good first issue](https://github.com/pandas-dev/pandas/issues?labels=good+first+issue&sort=updated&state=open) where you could start out.
+
+You can also triage issues which may include reproducing bug reports, or asking for vital information such as version numbers or reproduction instructions. If you would like to start triaging issues, one easy way to get started is to [subscribe to pandas on CodeTriage](https://www.codetriage.com/pandas-dev/pandas).
+
+Or maybe through using pandas you have an idea of your own or are looking for something in the documentation and thinking ‘this can be improved’...you can do something about it!
+
+Feel free to ask questions on the [mailing list](https://groups.google.com/forum/?fromgroups#!forum/pydata) or on [Gitter](https://gitter.im/pydata/pandas).
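The time-series bullet in the Main Features list above can be illustrated with a short sketch (an editorial addition, not part of the committed README), assuming a plain pandas 0.24 installation:

```python
import numpy as np
import pandas as pd

# Date range generation and frequency conversion (resampling):
idx = pd.date_range("2019-01-01", periods=10, freq="D")
ts = pd.Series(np.arange(10, dtype=float), index=idx)
print(ts.resample("W").mean())  # daily observations downsampled to weekly means

# Moving window statistics:
print(ts.rolling(window=3).mean())  # trailing 3-day average; NaN until the window fills
```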
diff --git a/contrib/python/pandas/py2/pandas/__init__.py b/contrib/python/pandas/py2/pandas/__init__.py
new file mode 100644
index 00000000000..427157acb43
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/__init__.py
@@ -0,0 +1,101 @@
+# pylint: disable-msg=W0614,W0401,W0611,W0622
+
+# flake8: noqa
+
+__docformat__ = 'restructuredtext'
+
+# Let users know if they're missing any of our hard dependencies
+hard_dependencies = ("numpy", "pytz", "dateutil")
+missing_dependencies = []
+
+for dependency in hard_dependencies:
+ try:
+ __import__(dependency)
+ except ImportError as e:
+ missing_dependencies.append(dependency)
+
+if missing_dependencies:
+ raise ImportError(
+ "Missing required dependencies {0}".format(missing_dependencies))
+del hard_dependencies, dependency, missing_dependencies
+
+# numpy compat
+from pandas.compat.numpy import *
+
+try:
+ from pandas._libs import (hashtable as _hashtable,
+ lib as _lib,
+ tslib as _tslib)
+except ImportError as e: # pragma: no cover
+ # hack but overkill to use re
+ module = str(e).replace('cannot import name ', '')
+ raise ImportError("C extension: {0} not built. If you want to import "
+ "pandas from the source directory, you may need to run "
+ "'python setup.py build_ext --inplace --force' to build "
+ "the C extensions first.".format(module))
+
+from datetime import datetime
+
+# let init-time option registration happen
+import pandas.core.config_init
+
+from pandas.core.api import *
+from pandas.core.sparse.api import *
+from pandas.tseries.api import *
+from pandas.core.computation.api import *
+from pandas.core.reshape.api import *
+
+from pandas.util._print_versions import show_versions
+from pandas.io.api import *
+from pandas.util._tester import test
+import pandas.testing
+import pandas.arrays
+
+# use the closest tagged version if possible
+from ._version import get_versions
+v = get_versions()
+__version__ = v.get('closest-tag', v['version'])
+__git_version__ = v.get('full-revisionid')
+del get_versions, v
+
+# module level doc-string
+__doc__ = """
+pandas - a powerful data analysis and manipulation library for Python
+=====================================================================
+
+**pandas** is a Python package providing fast, flexible, and expressive data
+structures designed to make working with "relational" or "labeled" data both
+easy and intuitive. It aims to be the fundamental high-level building block for
+doing practical, **real world** data analysis in Python. Additionally, it has
+the broader goal of becoming **the most powerful and flexible open source data
+analysis / manipulation tool available in any language**. It is already well on
+its way toward this goal.
+
+Main Features
+-------------
+Here are just a few of the things that pandas does well:
+
+ - Easy handling of missing data in floating point as well as non-floating
+ point data.
+ - Size mutability: columns can be inserted and deleted from DataFrame and
+ higher dimensional objects
+ - Automatic and explicit data alignment: objects can be explicitly aligned
+ to a set of labels, or the user can simply ignore the labels and let
+ `Series`, `DataFrame`, etc. automatically align the data for you in
+ computations.
+ - Powerful, flexible group by functionality to perform split-apply-combine
+ operations on data sets, for both aggregating and transforming data.
+ - Make it easy to convert ragged, differently-indexed data in other Python
+ and NumPy data structures into DataFrame objects.
+ - Intelligent label-based slicing, fancy indexing, and subsetting of large
+ data sets.
+ - Intuitive merging and joining data sets.
+ - Flexible reshaping and pivoting of data sets.
+ - Hierarchical labeling of axes (possible to have multiple labels per tick).
+ - Robust IO tools for loading data from flat files (CSV and delimited),
+ Excel files, databases, and saving/loading data from the ultrafast HDF5
+ format.
+ - Time series-specific functionality: date range generation and frequency
+ conversion, moving window statistics, moving window linear regressions,
+ date shifting and lagging, etc.
+"""
diff --git a/contrib/python/pandas/py2/pandas/_libs/__init__.py b/contrib/python/pandas/py2/pandas/_libs/__init__.py
new file mode 100644
index 00000000000..b02c423b79f
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/_libs/__init__.py
@@ -0,0 +1,5 @@
+# -*- coding: utf-8 -*-
+# flake8: noqa
+
+from .tslibs import (
+ iNaT, NaT, Timestamp, Timedelta, OutOfBoundsDatetime, Period)
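The objects re-exported here from `tslibs` are the same ones exposed in the top-level `pandas` namespace; a brief illustrative sketch (editorial, not part of the commit):

```python
import pandas as pd

print(pd.NaT)                          # "not a time": missing-value sentinel for datetimes
print(pd.Timestamp("2019-01-01"))      # scalar datetime with nanosecond resolution
print(pd.Timedelta("1 days 2 hours"))  # scalar duration
print(pd.Period("2019-01", freq="M"))  # fixed-frequency time span
```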
diff --git a/contrib/python/pandas/py2/pandas/_libs/algos.pxd b/contrib/python/pandas/py2/pandas/_libs/algos.pxd
new file mode 100644
index 00000000000..4bca5b33a3c
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/_libs/algos.pxd
@@ -0,0 +1,21 @@
+from pandas._libs.util cimport numeric
+
+
+cdef inline Py_ssize_t swap(numeric *a, numeric *b) nogil:
+ cdef:
+ numeric t
+
+ # cython doesn't allow pointer dereference so use array syntax
+ t = a[0]
+ a[0] = b[0]
+ b[0] = t
+ return 0
+
+
+cdef enum TiebreakEnumType:
+ TIEBREAK_AVERAGE
+ TIEBREAK_MIN,
+ TIEBREAK_MAX
+ TIEBREAK_FIRST
+ TIEBREAK_FIRST_DESCENDING
+ TIEBREAK_DENSE
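The `TiebreakEnumType` values declared above are selected through the `tiebreakers` mapping in `algos.pyx` (next file) and surface as the `method` argument of `Series.rank` / `DataFrame.rank`. A hedged illustration of the user-facing behaviour:

```python
import pandas as pd

s = pd.Series([7, 3, 3, 9])

print(s.rank(method="average"))  # TIEBREAK_AVERAGE: tied values share their mean rank
print(s.rank(method="min"))      # TIEBREAK_MIN: tied values take the lowest rank
print(s.rank(method="max"))      # TIEBREAK_MAX: tied values take the highest rank
print(s.rank(method="first"))    # TIEBREAK_FIRST: ties broken by order of appearance
print(s.rank(method="dense"))    # TIEBREAK_DENSE: like "min" but with no gaps in ranks
```

`TIEBREAK_FIRST_DESCENDING` has no `method` string of its own; it is used internally for descending ranks with `method="first"`.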
diff --git a/contrib/python/pandas/py2/pandas/_libs/algos.pyx b/contrib/python/pandas/py2/pandas/_libs/algos.pyx
new file mode 100644
index 00000000000..b3c519ab99b
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/_libs/algos.pyx
@@ -0,0 +1,792 @@
+# -*- coding: utf-8 -*-
+
+import cython
+from cython import Py_ssize_t
+
+from libc.stdlib cimport malloc, free
+from libc.string cimport memmove
+from libc.math cimport fabs, sqrt
+
+import numpy as np
+cimport numpy as cnp
+from numpy cimport (ndarray,
+ NPY_INT64, NPY_INT32, NPY_INT16, NPY_INT8,
+ NPY_UINT64, NPY_UINT32, NPY_UINT16, NPY_UINT8,
+ NPY_FLOAT32, NPY_FLOAT64,
+ NPY_OBJECT,
+ int8_t, int16_t, int32_t, int64_t, uint8_t, uint16_t,
+ uint32_t, uint64_t, float32_t, float64_t)
+cnp.import_array()
+
+
+cimport pandas._libs.util as util
+from pandas._libs.util cimport numeric, get_nat
+
+from pandas._libs.khash cimport (
+ khiter_t, kh_destroy_int64, kh_put_int64, kh_init_int64, kh_int64_t,
+ kh_resize_int64, kh_get_int64)
+
+import pandas._libs.missing as missing
+
+cdef float64_t FP_ERR = 1e-13
+
+cdef float64_t NaN = <float64_t>np.NaN
+
+cdef int64_t NPY_NAT = get_nat()
+
+tiebreakers = {
+ 'average': TIEBREAK_AVERAGE,
+ 'min': TIEBREAK_MIN,
+ 'max': TIEBREAK_MAX,
+ 'first': TIEBREAK_FIRST,
+ 'dense': TIEBREAK_DENSE,
+}
+
+
+cdef inline bint are_diff(object left, object right):
+ try:
+ return fabs(left - right) > FP_ERR
+ except TypeError:
+ return left != right
+
+
+class Infinity(object):
+ """ provide a positive Infinity comparison method for ranking """
+
+ __lt__ = lambda self, other: False
+ __le__ = lambda self, other: isinstance(other, Infinity)
+ __eq__ = lambda self, other: isinstance(other, Infinity)
+ __ne__ = lambda self, other: not isinstance(other, Infinity)
+ __gt__ = lambda self, other: (not isinstance(other, Infinity) and
+ not missing.checknull(other))
+ __ge__ = lambda self, other: not missing.checknull(other)
+
+
+class NegInfinity(object):
+ """ provide a negative Infinity comparison method for ranking """
+
+ __lt__ = lambda self, other: (not isinstance(other, NegInfinity) and
+ not missing.checknull(other))
+ __le__ = lambda self, other: not missing.checknull(other)
+ __eq__ = lambda self, other: isinstance(other, NegInfinity)
+ __ne__ = lambda self, other: not isinstance(other, NegInfinity)
+ __gt__ = lambda self, other: False
+ __ge__ = lambda self, other: isinstance(other, NegInfinity)
+
+
+cpdef ndarray[int64_t, ndim=1] unique_deltas(ndarray[int64_t] arr):
+ """
+ Efficiently find the unique first-differences of the given array.
+
+ Parameters
+ ----------
+ arr : ndarray[int64_t]
+
+ Returns
+ -------
+ result : ndarray[int64_t]
+ result is sorted
+ """
+ cdef:
+ Py_ssize_t i, n = len(arr)
+ int64_t val
+ khiter_t k
+ kh_int64_t *table
+ int ret = 0
+ list uniques = []
+
+ table = kh_init_int64()
+ kh_resize_int64(table, 10)
+ for i in range(n - 1):
+ val = arr[i + 1] - arr[i]
+ k = kh_get_int64(table, val)
+ if k == table.n_buckets:
+ kh_put_int64(table, val, &ret)
+ uniques.append(val)
+ kh_destroy_int64(table)
+
+ result = np.array(uniques, dtype=np.int64)
+ result.sort()
+ return result
+
+
+def is_lexsorted(list_of_arrays: list) -> bint:
+ cdef:
+ Py_ssize_t i
+ Py_ssize_t n, nlevels
+ int64_t k, cur, pre
+ ndarray arr
+ bint result = True
+
+ nlevels = len(list_of_arrays)
+ n = len(list_of_arrays[0])
+
+ cdef int64_t **vecs = <int64_t**>malloc(nlevels * sizeof(int64_t*))
+ for i in range(nlevels):
+ arr = list_of_arrays[i]
+ assert arr.dtype.name == 'int64'
+ vecs[i] = <int64_t*>cnp.PyArray_DATA(arr)
+
+ # Assume uniqueness??
+ with nogil:
+ for i in range(1, n):
+ for k in range(nlevels):
+ cur = vecs[k][i]
+ pre = vecs[k][i -1]
+ if cur == pre:
+ continue
+ elif cur > pre:
+ break
+ else:
+ result = False
+ break
+ free(vecs)
+ return result
+
+
+def groupsort_indexer(ndarray[int64_t] index, Py_ssize_t ngroups):
+ """
+ compute a 1-d indexer that is an ordering of the passed index,
+ ordered by the groups. This is a reverse of the label
+ factorization process.
+
+ Parameters
+ ----------
+ index: int64 ndarray
+ mappings from group -> position
+ ngroups: int64
+ number of groups
+
+ return a tuple of (1-d indexer ordered by groups, group counts)
+ """
+
+ cdef:
+ Py_ssize_t i, loc, label, n
+ ndarray[int64_t] counts, where, result
+
+ counts = np.zeros(ngroups + 1, dtype=np.int64)
+ n = len(index)
+ result = np.zeros(n, dtype=np.int64)
+ where = np.zeros(ngroups + 1, dtype=np.int64)
+
+ with nogil:
+
+ # count group sizes, location 0 for NA
+ for i in range(n):
+ counts[index[i] + 1] += 1
+
+ # mark the start of each contiguous group of like-indexed data
+ for i in range(1, ngroups + 1):
+ where[i] = where[i - 1] + counts[i - 1]
+
+ # this is our indexer
+ for i in range(n):
+ label = index[i] + 1
+ result[where[label]] = i
+ where[label] += 1
+
+ return result, counts
+
+
+def kth_smallest(numeric[:] a, Py_ssize_t k) -> numeric:
+ cdef:
+ Py_ssize_t i, j, l, m, n = a.shape[0]
+ numeric x
+
+ with nogil:
+ l = 0
+ m = n - 1
+
+ while l < m:
+ x = a[k]
+ i = l
+ j = m
+
+ while 1:
+ while a[i] < x: i += 1
+ while x < a[j]: j -= 1
+ if i <= j:
+ swap(&a[i], &a[j])
+ i += 1; j -= 1
+
+ if i > j: break
+
+ if j < k: l = i
+ if k < i: m = j
+ return a[k]
+
+
+# ----------------------------------------------------------------------
+# Pairwise correlation/covariance
+
+
+def nancorr(ndarray[float64_t, ndim=2] mat, bint cov=0, minp=None):
+ cdef:
+ Py_ssize_t i, j, xi, yi, N, K
+ bint minpv
+ ndarray[float64_t, ndim=2] result
+ ndarray[uint8_t, ndim=2] mask
+ int64_t nobs = 0
+ float64_t vx, vy, sumx, sumy, sumxx, sumyy, meanx, meany, divisor
+
+ N, K = (<object>mat).shape
+
+ if minp is None:
+ minpv = 1
+ else:
+ minpv = <int>minp
+
+ result = np.empty((K, K), dtype=np.float64)
+ mask = np.isfinite(mat).view(np.uint8)
+
+ with nogil:
+ for xi in range(K):
+ for yi in range(xi + 1):
+ nobs = sumxx = sumyy = sumx = sumy = 0
+ for i in range(N):
+ if mask[i, xi] and mask[i, yi]:
+ vx = mat[i, xi]
+ vy = mat[i, yi]
+ nobs += 1
+ sumx += vx
+ sumy += vy
+
+ if nobs < minpv:
+ result[xi, yi] = result[yi, xi] = NaN
+ else:
+ meanx = sumx / nobs
+ meany = sumy / nobs
+
+ # now the cov numerator
+ sumx = 0
+
+ for i in range(N):
+ if mask[i, xi] and mask[i, yi]:
+ vx = mat[i, xi] - meanx
+ vy = mat[i, yi] - meany
+
+ sumx += vx * vy
+ sumxx += vx * vx
+ sumyy += vy * vy
+
+ divisor = (nobs - 1.0) if cov else sqrt(sumxx * sumyy)
+
+ if divisor != 0:
+ result[xi, yi] = result[yi, xi] = sumx / divisor
+ else:
+ result[xi, yi] = result[yi, xi] = NaN
+
+ return result
+
+# ----------------------------------------------------------------------
+# Pairwise Spearman correlation
+
+
+def nancorr_spearman(ndarray[float64_t, ndim=2] mat, Py_ssize_t minp=1):
+ cdef:
+ Py_ssize_t i, j, xi, yi, N, K
+ ndarray[float64_t, ndim=2] result
+ ndarray[float64_t, ndim=1] maskedx
+ ndarray[float64_t, ndim=1] maskedy
+ ndarray[uint8_t, ndim=2] mask
+ int64_t nobs = 0
+ float64_t vx, vy, sumx, sumxx, sumyy, mean, divisor
+
+ N, K = (<object>mat).shape
+
+ result = np.empty((K, K), dtype=np.float64)
+ mask = np.isfinite(mat).view(np.uint8)
+
+ for xi in range(K):
+ for yi in range(xi + 1):
+ nobs = 0
+ for i in range(N):
+ if mask[i, xi] and mask[i, yi]:
+ nobs += 1
+
+ if nobs < minp:
+ result[xi, yi] = result[yi, xi] = NaN
+ else:
+ maskedx = np.empty(nobs, dtype=np.float64)
+ maskedy = np.empty(nobs, dtype=np.float64)
+ j = 0
+ for i in range(N):
+ if mask[i, xi] and mask[i, yi]:
+ maskedx[j] = mat[i, xi]
+ maskedy[j] = mat[i, yi]
+ j += 1
+ maskedx = rank_1d_float64(maskedx)
+ maskedy = rank_1d_float64(maskedy)
+
+ mean = (nobs + 1) / 2.
+
+ # now the cov numerator
+ sumx = sumxx = sumyy = 0
+
+ for i in range(nobs):
+ vx = maskedx[i] - mean
+ vy = maskedy[i] - mean
+
+ sumx += vx * vy
+ sumxx += vx * vx
+ sumyy += vy * vy
+
+ divisor = sqrt(sumxx * sumyy)
+
+ if divisor != 0:
+ result[xi, yi] = result[yi, xi] = sumx / divisor
+ else:
+ result[xi, yi] = result[yi, xi] = NaN
+
+ return result
+
+
+# ----------------------------------------------------------------------
+
+ctypedef fused algos_t:
+ float64_t
+ float32_t
+ object
+ int64_t
+ int32_t
+ int16_t
+ int8_t
+ uint64_t
+ uint32_t
+ uint16_t
+ uint8_t
+
+
+def pad(ndarray[algos_t] old, ndarray[algos_t] new, limit=None):
+ cdef:
+ Py_ssize_t i, j, nleft, nright
+ ndarray[int64_t, ndim=1] indexer
+ algos_t cur, next
+ int lim, fill_count = 0
+
+ nleft = len(old)
+ nright = len(new)
+ indexer = np.empty(nright, dtype=np.int64)
+ indexer[:] = -1
+
+ if limit is None:
+ lim = nright
+ else:
+ if not util.is_integer_object(limit):
+ raise ValueError('Limit must be an integer')
+ if limit < 1:
+ raise ValueError('Limit must be greater than 0')
+ lim = limit
+
+ if nleft == 0 or nright == 0 or new[nright - 1] < old[0]:
+ return indexer
+
+ i = j = 0
+
+ cur = old[0]
+
+ while j <= nright - 1 and new[j] < cur:
+ j += 1
+
+ while True:
+ if j == nright:
+ break
+
+ if i == nleft - 1:
+ while j < nright:
+ if new[j] == cur:
+ indexer[j] = i
+ elif new[j] > cur and fill_count < lim:
+ indexer[j] = i
+ fill_count += 1
+ j += 1
+ break
+
+ next = old[i + 1]
+
+ while j < nright and cur <= new[j] < next:
+ if new[j] == cur:
+ indexer[j] = i
+ elif fill_count < lim:
+ indexer[j] = i
+ fill_count += 1
+ j += 1
+
+ fill_count = 0
+ i += 1
+ cur = next
+
+ return indexer
+
+
+def pad_inplace(ndarray[algos_t] values,
+ ndarray[uint8_t, cast=True] mask,
+ limit=None):
+ cdef:
+ Py_ssize_t i, N
+ algos_t val
+ int lim, fill_count = 0
+
+ N = len(values)
+
+ # GH#2778
+ if N == 0:
+ return
+
+ if limit is None:
+ lim = N
+ else:
+ if not util.is_integer_object(limit):
+ raise ValueError('Limit must be an integer')
+ if limit < 1:
+ raise ValueError('Limit must be greater than 0')
+ lim = limit
+
+ val = values[0]
+ for i in range(N):
+ if mask[i]:
+ if fill_count >= lim:
+ continue
+ fill_count += 1
+ values[i] = val
+ else:
+ fill_count = 0
+ val = values[i]
+
+
+def pad_2d_inplace(ndarray[algos_t, ndim=2] values,
+ ndarray[uint8_t, ndim=2] mask,
+ limit=None):
+ cdef:
+ Py_ssize_t i, j, N, K
+ algos_t val
+ int lim, fill_count = 0
+
+ K, N = (<object>values).shape
+
+ # GH#2778
+ if N == 0:
+ return
+
+ if limit is None:
+ lim = N
+ else:
+ if not util.is_integer_object(limit):
+ raise ValueError('Limit must be an integer')
+ if limit < 1:
+ raise ValueError('Limit must be greater than 0')
+ lim = limit
+
+ for j in range(K):
+ fill_count = 0
+ val = values[j, 0]
+ for i in range(N):
+ if mask[j, i]:
+ if fill_count >= lim:
+ continue
+ fill_count += 1
+ values[j, i] = val
+ else:
+ fill_count = 0
+ val = values[j, i]
+
+
+"""
+Backfilling logic for generating fill vector
+
+Diagram of what's going on
+
+Old New Fill vector Mask
+ . 0 1
+ . 0 1
+ . 0 1
+A A 0 1
+ . 1 1
+ . 1 1
+ . 1 1
+ . 1 1
+ . 1 1
+B B 1 1
+ . 2 1
+ . 2 1
+ . 2 1
+C C 2 1
+ . 0
+ . 0
+D
+"""
+
+
+def backfill(ndarray[algos_t] old, ndarray[algos_t] new, limit=None):
+ cdef:
+ Py_ssize_t i, j, nleft, nright
+ ndarray[int64_t, ndim=1] indexer
+ algos_t cur, prev
+ int lim, fill_count = 0
+
+ nleft = len(old)
+ nright = len(new)
+ indexer = np.empty(nright, dtype=np.int64)
+ indexer[:] = -1
+
+ if limit is None:
+ lim = nright
+ else:
+ if not util.is_integer_object(limit):
+ raise ValueError('Limit must be an integer')
+ if limit < 1:
+ raise ValueError('Limit must be greater than 0')
+ lim = limit
+
+ if nleft == 0 or nright == 0 or new[0] > old[nleft - 1]:
+ return indexer
+
+ i = nleft - 1
+ j = nright - 1
+
+ cur = old[nleft - 1]
+
+ while j >= 0 and new[j] > cur:
+ j -= 1
+
+ while True:
+ if j < 0:
+ break
+
+ if i == 0:
+ while j >= 0:
+ if new[j] == cur:
+ indexer[j] = i
+ elif new[j] < cur and fill_count < lim:
+ indexer[j] = i
+ fill_count += 1
+ j -= 1
+ break
+
+ prev = old[i - 1]
+
+ while j >= 0 and prev < new[j] <= cur:
+ if new[j] == cur:
+ indexer[j] = i
+ elif new[j] < cur and fill_count < lim:
+ indexer[j] = i
+ fill_count += 1
+ j -= 1
+
+ fill_count = 0
+ i -= 1
+ cur = prev
+
+ return indexer
+
+
+def backfill_inplace(ndarray[algos_t] values,
+ ndarray[uint8_t, cast=True] mask,
+ limit=None):
+ cdef:
+ Py_ssize_t i, N
+ algos_t val
+ int lim, fill_count = 0
+
+ N = len(values)
+
+ # GH#2778
+ if N == 0:
+ return
+
+ if limit is None:
+ lim = N
+ else:
+ if not util.is_integer_object(limit):
+ raise ValueError('Limit must be an integer')
+ if limit < 1:
+ raise ValueError('Limit must be greater than 0')
+ lim = limit
+
+ val = values[N - 1]
+ for i in range(N - 1, -1, -1):
+ if mask[i]:
+ if fill_count >= lim:
+ continue
+ fill_count += 1
+ values[i] = val
+ else:
+ fill_count = 0
+ val = values[i]
+
+
+def backfill_2d_inplace(ndarray[algos_t, ndim=2] values,
+ ndarray[uint8_t, ndim=2] mask,
+ limit=None):
+ cdef:
+ Py_ssize_t i, j, N, K
+ algos_t val
+ int lim, fill_count = 0
+
+ K, N = (<object>values).shape
+
+ # GH#2778
+ if N == 0:
+ return
+
+ if limit is None:
+ lim = N
+ else:
+ if not util.is_integer_object(limit):
+ raise ValueError('Limit must be an integer')
+ if limit < 1:
+ raise ValueError('Limit must be greater than 0')
+ lim = limit
+
+ for j in range(K):
+ fill_count = 0
+ val = values[j, N - 1]
+ for i in range(N - 1, -1, -1):
+ if mask[j, i]:
+ if fill_count >= lim:
+ continue
+ fill_count += 1
+ values[j, i] = val
+ else:
+ fill_count = 0
+ val = values[j, i]
+
+
+def arrmap(ndarray[algos_t] index, object func):
+ cdef:
+ Py_ssize_t length = index.shape[0]
+ Py_ssize_t i = 0
+ ndarray[object] result = np.empty(length, dtype=np.object_)
+
+ from pandas._libs.lib import maybe_convert_objects
+
+ for i in range(length):
+ result[i] = func(index[i])
+
+ return maybe_convert_objects(result)
+
+
+arrmap_float64 = arrmap["float64_t"]
+arrmap_float32 = arrmap["float32_t"]
+arrmap_object = arrmap["object"]
+arrmap_int64 = arrmap["int64_t"]
+arrmap_int32 = arrmap["int32_t"]
+arrmap_uint64 = arrmap["uint64_t"]
+arrmap_bool = arrmap["uint8_t"]
+
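+# Editor's note -- illustrative sketch, not part of upstream pandas. The
+# aliases above expose concrete specializations of the fused `arrmap`, which
+# applies a Python callable elementwise and then lets maybe_convert_objects
+# tighten the object result back to a numeric dtype where possible:
+#
+#     arrmap_float64(np.array([1.5, 2.5]), lambda x: x * 2)
+#     # -> array([3., 5.]) (object intermediate converted back to float64)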
+
+def is_monotonic(ndarray[algos_t, ndim=1] arr, bint timelike):
+ """
+ Returns
+ -------
+ is_monotonic_inc, is_monotonic_dec, is_strict_monotonic
+ """
+ cdef:
+ Py_ssize_t i, n
+ algos_t prev, cur
+ bint is_monotonic_inc = 1
+ bint is_monotonic_dec = 1
+ bint is_unique = 1
+ bint is_strict_monotonic = 1
+
+ n = len(arr)
+
+ if n == 1:
+ if arr[0] != arr[0] or (timelike and <int64_t>arr[0] == NPY_NAT):
+ # single value is NaN
+ return False, False, True
+ else:
+ return True, True, True
+ elif n < 2:
+ return True, True, True
+
+ if timelike and <int64_t>arr[0] == NPY_NAT:
+ return False, False, True
+
+ if algos_t is not object:
+ with nogil:
+ prev = arr[0]
+ for i in range(1, n):
+ cur = arr[i]
+ if timelike and <int64_t>cur == NPY_NAT:
+ is_monotonic_inc = 0
+ is_monotonic_dec = 0
+ break
+ if cur < prev:
+ is_monotonic_inc = 0
+ elif cur > prev:
+ is_monotonic_dec = 0
+ elif cur == prev:
+ is_unique = 0
+ else:
+ # cur or prev is NaN
+ is_monotonic_inc = 0
+ is_monotonic_dec = 0
+ break
+ if not is_monotonic_inc and not is_monotonic_dec:
+ is_monotonic_inc = 0
+ is_monotonic_dec = 0
+ break
+ prev = cur
+ else:
+ # object-dtype, identical to above except we cannot use `with nogil`
+ prev = arr[0]
+ for i in range(1, n):
+ cur = arr[i]
+ if timelike and <int64_t>cur == NPY_NAT:
+ is_monotonic_inc = 0
+ is_monotonic_dec = 0
+ break
+ if cur < prev:
+ is_monotonic_inc = 0
+ elif cur > prev:
+ is_monotonic_dec = 0
+ elif cur == prev:
+ is_unique = 0
+ else:
+ # cur or prev is NaN
+ is_monotonic_inc = 0
+ is_monotonic_dec = 0
+ break
+ if not is_monotonic_inc and not is_monotonic_dec:
+ is_monotonic_inc = 0
+ is_monotonic_dec = 0
+ break
+ prev = cur
+
+ is_strict_monotonic = is_unique and (is_monotonic_inc or is_monotonic_dec)
+ return is_monotonic_inc, is_monotonic_dec, is_strict_monotonic
+
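+# Editor's note -- illustrative sketch, not part of upstream pandas. The
+# three values returned above are (monotonic increasing, monotonic
+# decreasing, strictly monotonic); a run of equal values keeps an array
+# monotonic but not strict:
+#
+#     is_monotonic["float64_t"](np.array([1.0, 2.0, 2.0, 3.0]), False)
+#     # -> (True, False, False)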
+
+# generated from template
+include "algos_common_helper.pxi"
+include "algos_rank_helper.pxi"
+include "algos_take_helper.pxi"
diff --git a/contrib/python/pandas/py2/pandas/_libs/algos_common_helper.pxi b/contrib/python/pandas/py2/pandas/_libs/algos_common_helper.pxi
new file mode 100644
index 00000000000..5ca987ba4ab
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/_libs/algos_common_helper.pxi
@@ -0,0 +1,403 @@
+"""
+Template for each `dtype` helper function using 1-d template
+
+WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in
+"""
+
+
+def diff_2d_float64(ndarray[float64_t, ndim=2] arr,
+ ndarray[float64_t, ndim=2] out,
+ Py_ssize_t periods, int axis):
+ cdef:
+ Py_ssize_t i, j, sx, sy
+
+ sx, sy = (<object>arr).shape
+ if arr.flags.f_contiguous:
+ if axis == 0:
+ if periods >= 0:
+ start, stop = periods, sx
+ else:
+ start, stop = 0, sx + periods
+ for j in range(sy):
+ for i in range(start, stop):
+ out[i, j] = arr[i, j] - arr[i - periods, j]
+ else:
+ if periods >= 0:
+ start, stop = periods, sy
+ else:
+ start, stop = 0, sy + periods
+ for j in range(start, stop):
+ for i in range(sx):
+ out[i, j] = arr[i, j] - arr[i, j - periods]
+ else:
+ if axis == 0:
+ if periods >= 0:
+ start, stop = periods, sx
+ else:
+ start, stop = 0, sx + periods
+ for i in range(start, stop):
+ for j in range(sy):
+ out[i, j] = arr[i, j] - arr[i - periods, j]
+ else:
+ if periods >= 0:
+ start, stop = periods, sy
+ else:
+ start, stop = 0, sy + periods
+ for i in range(sx):
+ for j in range(start, stop):
+ out[i, j] = arr[i, j] - arr[i, j - periods]
+
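+# Editor's note -- illustrative sketch, not part of upstream pandas. The
+# branches above only change loop order so the innermost loop walks the
+# contiguous axis (Fortran- vs C-ordered input); the arithmetic is the same
+# discrete difference shifted by `periods` along `axis`, written into `out`
+# in place:
+#
+#     arr = np.array([[1., 2.], [4., 8.]])
+#     out = np.zeros_like(arr)
+#     diff_2d_float64(arr, out, 1, 0)   # periods=1 along axis 0
+#     # out -> [[0., 0.], [3., 6.]]; the first row is never written, so it
+#     #        keeps whatever `out` was initialized with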
+
+def diff_2d_float32(ndarray[float32_t, ndim=2] arr,
+ ndarray[float32_t, ndim=2] out,
+ Py_ssize_t periods, int axis):
+ cdef:
+ Py_ssize_t i, j, sx, sy
+
+ sx, sy = (<object>arr).shape
+ if arr.flags.f_contiguous:
+ if axis == 0:
+ if periods >= 0:
+ start, stop = periods, sx
+ else:
+ start, stop = 0, sx + periods
+ for j in range(sy):
+ for i in range(start, stop):
+ out[i, j] = arr[i, j] - arr[i - periods, j]
+ else:
+ if periods >= 0:
+ start, stop = periods, sy
+ else:
+ start, stop = 0, sy + periods
+ for j in range(start, stop):
+ for i in range(sx):
+ out[i, j] = arr[i, j] - arr[i, j - periods]
+ else:
+ if axis == 0:
+ if periods >= 0:
+ start, stop = periods, sx
+ else:
+ start, stop = 0, sx + periods
+ for i in range(start, stop):
+ for j in range(sy):
+ out[i, j] = arr[i, j] - arr[i - periods, j]
+ else:
+ if periods >= 0:
+ start, stop = periods, sy
+ else:
+ start, stop = 0, sy + periods
+ for i in range(sx):
+ for j in range(start, stop):
+ out[i, j] = arr[i, j] - arr[i, j - periods]
+
+
+def diff_2d_int8(ndarray[int8_t, ndim=2] arr,
+ ndarray[float32_t, ndim=2] out,
+ Py_ssize_t periods, int axis):
+ cdef:
+ Py_ssize_t i, j, sx, sy
+
+ sx, sy = (<object>arr).shape
+ if arr.flags.f_contiguous:
+ if axis == 0:
+ if periods >= 0:
+ start, stop = periods, sx
+ else:
+ start, stop = 0, sx + periods
+ for j in range(sy):
+ for i in range(start, stop):
+ out[i, j] = arr[i, j] - arr[i - periods, j]
+ else:
+ if periods >= 0:
+ start, stop = periods, sy
+ else:
+ start, stop = 0, sy + periods
+ for j in range(start, stop):
+ for i in range(sx):
+ out[i, j] = arr[i, j] - arr[i, j - periods]
+ else:
+ if axis == 0:
+ if periods >= 0:
+ start, stop = periods, sx
+ else:
+ start, stop = 0, sx + periods
+ for i in range(start, stop):
+ for j in range(sy):
+ out[i, j] = arr[i, j] - arr[i - periods, j]
+ else:
+ if periods >= 0:
+ start, stop = periods, sy
+ else:
+ start, stop = 0, sy + periods
+ for i in range(sx):
+ for j in range(start, stop):
+ out[i, j] = arr[i, j] - arr[i, j - periods]
+
+
+def diff_2d_int16(ndarray[int16_t, ndim=2] arr,
+ ndarray[float32_t, ndim=2] out,
+ Py_ssize_t periods, int axis):
+ cdef:
+ Py_ssize_t i, j, sx, sy
+
+ sx, sy = (<object>arr).shape
+ if arr.flags.f_contiguous:
+ if axis == 0:
+ if periods >= 0:
+ start, stop = periods, sx
+ else:
+ start, stop = 0, sx + periods
+ for j in range(sy):
+ for i in range(start, stop):
+ out[i, j] = arr[i, j] - arr[i - periods, j]
+ else:
+ if periods >= 0:
+ start, stop = periods, sy
+ else:
+ start, stop = 0, sy + periods
+ for j in range(start, stop):
+ for i in range(sx):
+ out[i, j] = arr[i, j] - arr[i, j - periods]
+ else:
+ if axis == 0:
+ if periods >= 0:
+ start, stop = periods, sx
+ else:
+ start, stop = 0, sx + periods
+ for i in range(start, stop):
+ for j in range(sy):
+ out[i, j] = arr[i, j] - arr[i - periods, j]
+ else:
+ if periods >= 0:
+ start, stop = periods, sy
+ else:
+ start, stop = 0, sy + periods
+ for i in range(sx):
+ for j in range(start, stop):
+ out[i, j] = arr[i, j] - arr[i, j - periods]
+
+
+def diff_2d_int32(ndarray[int32_t, ndim=2] arr,
+ ndarray[float64_t, ndim=2] out,
+ Py_ssize_t periods, int axis):
+ cdef:
+ Py_ssize_t i, j, sx, sy
+
+ sx, sy = (<object>arr).shape
+ if arr.flags.f_contiguous:
+ if axis == 0:
+ if periods >= 0:
+ start, stop = periods, sx
+ else:
+ start, stop = 0, sx + periods
+ for j in range(sy):
+ for i in range(start, stop):
+ out[i, j] = arr[i, j] - arr[i - periods, j]
+ else:
+ if periods >= 0:
+ start, stop = periods, sy
+ else:
+ start, stop = 0, sy + periods
+ for j in range(start, stop):
+ for i in range(sx):
+ out[i, j] = arr[i, j] - arr[i, j - periods]
+ else:
+ if axis == 0:
+ if periods >= 0:
+ start, stop = periods, sx
+ else:
+ start, stop = 0, sx + periods
+ for i in range(start, stop):
+ for j in range(sy):
+ out[i, j] = arr[i, j] - arr[i - periods, j]
+ else:
+ if periods >= 0:
+ start, stop = periods, sy
+ else:
+ start, stop = 0, sy + periods
+ for i in range(sx):
+ for j in range(start, stop):
+ out[i, j] = arr[i, j] - arr[i, j - periods]
+
+
+def diff_2d_int64(ndarray[int64_t, ndim=2] arr,
+ ndarray[float64_t, ndim=2] out,
+ Py_ssize_t periods, int axis):
+ cdef:
+ Py_ssize_t i, j, sx, sy
+
+ sx, sy = (<object>arr).shape
+ if arr.flags.f_contiguous:
+ if axis == 0:
+ if periods >= 0:
+ start, stop = periods, sx
+ else:
+ start, stop = 0, sx + periods
+ for j in range(sy):
+ for i in range(start, stop):
+ out[i, j] = arr[i, j] - arr[i - periods, j]
+ else:
+ if periods >= 0:
+ start, stop = periods, sy
+ else:
+ start, stop = 0, sy + periods
+ for j in range(start, stop):
+ for i in range(sx):
+ out[i, j] = arr[i, j] - arr[i, j - periods]
+ else:
+ if axis == 0:
+ if periods >= 0:
+ start, stop = periods, sx
+ else:
+ start, stop = 0, sx + periods
+ for i in range(start, stop):
+ for j in range(sy):
+ out[i, j] = arr[i, j] - arr[i - periods, j]
+ else:
+ if periods >= 0:
+ start, stop = periods, sy
+ else:
+ start, stop = 0, sy + periods
+ for i in range(sx):
+ for j in range(start, stop):
+ out[i, j] = arr[i, j] - arr[i, j - periods]
+
+# ----------------------------------------------------------------------
+# ensure_dtype
+# ----------------------------------------------------------------------
+
+cdef int PLATFORM_INT = (<ndarray>np.arange(0, dtype=np.intp)).descr.type_num
+
+
+def ensure_platform_int(object arr):
+ # GH3033, GH1392
+ # platform int is the size of the int pointer, e.g. np.intp
+ if util.is_array(arr):
+ if (<ndarray>arr).descr.type_num == PLATFORM_INT:
+ return arr
+ else:
+ return arr.astype(np.intp)
+ else:
+ return np.array(arr, dtype=np.intp)
+
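+# Editor's note -- illustrative sketch, not part of upstream pandas. Each
+# ensure_* helper below is a no-op when the input already has the requested
+# dtype and otherwise copies/converts:
+#
+#     ensure_platform_int(np.array([0, 2, 1], dtype=np.int32)).dtype
+#     # -> np.intp (int64 on 64-bit builds); intp input is returned as-is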
+
+def ensure_object(object arr):
+ if util.is_array(arr):
+ if (<ndarray>arr).descr.type_num == NPY_OBJECT:
+ return arr
+ else:
+ return arr.astype(np.object_)
+ else:
+ return np.array(arr, dtype=np.object_)
+
+
+def ensure_float64(object arr, copy=True):
+ if util.is_array(arr):
+ if (<ndarray>arr).descr.type_num == NPY_FLOAT64:
+ return arr
+ else:
+ return arr.astype(np.float64, copy=copy)
+ else:
+ return np.array(arr, dtype=np.float64)
+
+
+def ensure_float32(object arr, copy=True):
+ if util.is_array(arr):
+ if (<ndarray>arr).descr.type_num == NPY_FLOAT32:
+ return arr
+ else:
+ return arr.astype(np.float32, copy=copy)
+ else:
+ return np.array(arr, dtype=np.float32)
+
+
+def ensure_int8(object arr, copy=True):
+ if util.is_array(arr):
+ if (<ndarray>arr).descr.type_num == NPY_INT8:
+ return arr
+ else:
+ return arr.astype(np.int8, copy=copy)
+ else:
+ return np.array(arr, dtype=np.int8)
+
+
+def ensure_int16(object arr, copy=True):
+ if util.is_array(arr):
+ if (<ndarray>arr).descr.type_num == NPY_INT16:
+ return arr
+ else:
+ return arr.astype(np.int16, copy=copy)
+ else:
+ return np.array(arr, dtype=np.int16)
+
+
+def ensure_int32(object arr, copy=True):
+ if util.is_array(arr):
+ if (<ndarray>arr).descr.type_num == NPY_INT32:
+ return arr
+ else:
+ return arr.astype(np.int32, copy=copy)
+ else:
+ return np.array(arr, dtype=np.int32)
+
+
+def ensure_int64(object arr, copy=True):
+ if util.is_array(arr):
+ if (<ndarray>arr).descr.type_num == NPY_INT64:
+ return arr
+ else:
+ return arr.astype(np.int64, copy=copy)
+ else:
+ return np.array(arr, dtype=np.int64)
+
+
+def ensure_uint8(object arr, copy=True):
+ if util.is_array(arr):
+ if (<ndarray>arr).descr.type_num == NPY_UINT8:
+ return arr
+ else:
+ return arr.astype(np.uint8, copy=copy)
+ else:
+ return np.array(arr, dtype=np.uint8)
+
+
+def ensure_uint16(object arr, copy=True):
+ if util.is_array(arr):
+ if (<ndarray>arr).descr.type_num == NPY_UINT16:
+ return arr
+ else:
+ return arr.astype(np.uint16, copy=copy)
+ else:
+ return np.array(arr, dtype=np.uint16)
+
+
+def ensure_uint32(object arr, copy=True):
+ if util.is_array(arr):
+ if (<ndarray>arr).descr.type_num == NPY_UINT32:
+ return arr
+ else:
+ return arr.astype(np.uint32, copy=copy)
+ else:
+ return np.array(arr, dtype=np.uint32)
+
+
+def ensure_uint64(object arr, copy=True):
+ if util.is_array(arr):
+ if (<ndarray>arr).descr.type_num == NPY_UINT64:
+ return arr
+ else:
+ return arr.astype(np.uint64, copy=copy)
+ else:
+ return np.array(arr, dtype=np.uint64)
diff --git a/contrib/python/pandas/py2/pandas/_libs/algos_rank_helper.pxi b/contrib/python/pandas/py2/pandas/_libs/algos_rank_helper.pxi
new file mode 100644
index 00000000000..df0bf0e9e71
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/_libs/algos_rank_helper.pxi
@@ -0,0 +1,870 @@
+"""
+Template for each `dtype` helper function for rank
+
+WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in
+"""
+
+# ----------------------------------------------------------------------
+# rank_1d, rank_2d
+# ----------------------------------------------------------------------
+
+
+def rank_1d_object(object in_arr, ties_method='average',
+ ascending=True, na_option='keep', pct=False):
+ """
+ Fast NaN-friendly version of scipy.stats.rankdata
+ """
+
+ cdef:
+ Py_ssize_t i, j, n, dups = 0, total_tie_count = 0, non_na_idx = 0
+
+ ndarray sorted_data, values
+
+ ndarray[float64_t] ranks
+ ndarray[int64_t] argsorted
+ ndarray[uint8_t, cast=True] sorted_mask
+
+ object val, nan_value
+
+ float64_t sum_ranks = 0
+ int tiebreak = 0
+ bint keep_na = 0
+ bint isnan
+ float64_t count = 0.0
+ tiebreak = tiebreakers[ties_method]
+
+ values = np.array(in_arr, copy=True)
+
+ if values.dtype != np.object_:
+ values = values.astype('O')
+
+ keep_na = na_option == 'keep'
+
+ mask = missing.isnaobj(values)
+
+ # double sort first by mask and then by values to ensure nan values are
+ # either at the beginning or the end. mask/(~mask) controls padding at
+ # tail or the head
+ if ascending ^ (na_option == 'top'):
+ nan_value = Infinity()
+ order = (values, mask)
+ else:
+ nan_value = NegInfinity()
+ order = (values, ~mask)
+ np.putmask(values, mask, nan_value)
+
+ n = len(values)
+ ranks = np.empty(n, dtype='f8')
+
+ _as = np.lexsort(keys=order)
+
+ if not ascending:
+ _as = _as[::-1]
+
+ sorted_data = values.take(_as)
+ sorted_mask = mask.take(_as)
+ _indices = np.diff(sorted_mask.astype(int)).nonzero()[0]
+ non_na_idx = _indices[0] if len(_indices) > 0 else -1
+ argsorted = _as.astype('i8')
+
+ if True:
+ # TODO: why does the 2d version not have a nogil block?
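+ # (editor's note: object dtype cannot release the GIL, so this `if True:`
+ # stands in where the numeric specializations use `with nogil:`)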
+ for i in range(n):
+ sum_ranks += i + 1
+ dups += 1
+
+ val = util.get_value_at(sorted_data, i)
+ isnan = sorted_mask[i]
+ if isnan and keep_na:
+ ranks[argsorted[i]] = NaN
+ continue
+
+ count += 1.0
+
+ if (i == n - 1 or
+ are_diff(util.get_value_at(sorted_data, i + 1), val) or
+ i == non_na_idx):
+
+ if tiebreak == TIEBREAK_AVERAGE:
+ for j in range(i - dups + 1, i + 1):
+ ranks[argsorted[j]] = sum_ranks / dups
+ elif tiebreak == TIEBREAK_MIN:
+ for j in range(i - dups + 1, i + 1):
+ ranks[argsorted[j]] = i - dups + 2
+ elif tiebreak == TIEBREAK_MAX:
+ for j in range(i - dups + 1, i + 1):
+ ranks[argsorted[j]] = i + 1
+ elif tiebreak == TIEBREAK_FIRST:
+ raise ValueError('first not supported for '
+ 'non-numeric data')
+ elif tiebreak == TIEBREAK_FIRST_DESCENDING:
+ for j in range(i - dups + 1, i + 1):
+ ranks[argsorted[j]] = 2 * i - j - dups + 2
+ elif tiebreak == TIEBREAK_DENSE:
+ total_tie_count += 1
+ for j in range(i - dups + 1, i + 1):
+ ranks[argsorted[j]] = total_tie_count
+ sum_ranks = dups = 0
+ if pct:
+ if tiebreak == TIEBREAK_DENSE:
+ return ranks / total_tie_count
+ else:
+ return ranks / count
+ else:
+ return ranks
+
+
+def rank_2d_object(object in_arr, axis=0, ties_method='average',
+ ascending=True, na_option='keep', pct=False):
+ """
+ Fast NaN-friendly version of scipy.stats.rankdata
+ """
+
+ cdef:
+ Py_ssize_t i, j, z, k, n, dups = 0, total_tie_count = 0
+
+ Py_ssize_t infs
+
+ ndarray[float64_t, ndim=2] ranks
+ ndarray[object, ndim=2] values
+
+ ndarray[int64_t, ndim=2] argsorted
+
+ object val, nan_value
+
+ float64_t sum_ranks = 0
+ int tiebreak = 0
+ bint keep_na = 0
+ float64_t count = 0.0
+
+ tiebreak = tiebreakers[ties_method]
+
+ keep_na = na_option == 'keep'
+
+ in_arr = np.asarray(in_arr)
+
+ if axis == 0:
+ values = in_arr.T.copy()
+ else:
+ values = in_arr.copy()
+
+ if values.dtype != np.object_:
+ values = values.astype('O')
+ if ascending ^ (na_option == 'top'):
+ nan_value = Infinity()
+ else:
+ nan_value = NegInfinity()
+
+ mask = missing.isnaobj2d(values)
+
+ np.putmask(values, mask, nan_value)
+
+ n, k = (<object>values).shape
+ ranks = np.empty((n, k), dtype='f8')
+
+ try:
+ _as = values.argsort(1)
+ except TypeError:
+ values = in_arr
+ for i in range(len(values)):
+ ranks[i] = rank_1d_object(in_arr[i], ties_method=ties_method,
+ ascending=ascending, pct=pct)
+ if axis == 0:
+ return ranks.T
+ else:
+ return ranks
+
+ if not ascending:
+ _as = _as[:, ::-1]
+
+ values = _take_2d_object(values, _as)
+ argsorted = _as.astype('i8')
+
+ for i in range(n):
+ dups = sum_ranks = infs = 0
+
+ total_tie_count = 0
+ count = 0.0
+ for j in range(k):
+
+ val = values[i, j]
+
+ if (val is nan_value) and keep_na:
+ ranks[i, argsorted[i, j]] = NaN
+
+ infs += 1
+
+ continue
+
+ count += 1.0
+
+ sum_ranks += (j - infs) + 1
+ dups += 1
+ if j == k - 1 or are_diff(values[i, j + 1], val):
+ if tiebreak == TIEBREAK_AVERAGE:
+ for z in range(j - dups + 1, j + 1):
+ ranks[i, argsorted[i, z]] = sum_ranks / dups
+ elif tiebreak == TIEBREAK_MIN:
+ for z in range(j - dups + 1, j + 1):
+ ranks[i, argsorted[i, z]] = j - dups + 2
+ elif tiebreak == TIEBREAK_MAX:
+ for z in range(j - dups + 1, j + 1):
+ ranks[i, argsorted[i, z]] = j + 1
+ elif tiebreak == TIEBREAK_FIRST:
+ raise ValueError('first not supported '
+ 'for non-numeric data')
+ elif tiebreak == TIEBREAK_FIRST_DESCENDING:
+ for z in range(j - dups + 1, j + 1):
+ ranks[i, argsorted[i, z]] = 2 * j - z - dups + 2
+ elif tiebreak == TIEBREAK_DENSE:
+ total_tie_count += 1
+ for z in range(j - dups + 1, j + 1):
+ ranks[i, argsorted[i, z]] = total_tie_count
+ sum_ranks = dups = 0
+ if pct:
+ if tiebreak == TIEBREAK_DENSE:
+ ranks[i, :] /= total_tie_count
+ else:
+ ranks[i, :] /= count
+ if axis == 0:
+ return ranks.T
+ else:
+ return ranks
+
+
+def rank_1d_float64(object in_arr, ties_method='average',
+ ascending=True, na_option='keep', pct=False):
+ """
+ Fast NaN-friendly version of scipy.stats.rankdata
+ """
+
+ cdef:
+ Py_ssize_t i, j, n, dups = 0, total_tie_count = 0, non_na_idx = 0
+
+ ndarray[float64_t] sorted_data, values
+
+ ndarray[float64_t] ranks
+ ndarray[int64_t] argsorted
+ ndarray[uint8_t, cast=True] sorted_mask
+
+ float64_t val, nan_value
+
+ float64_t sum_ranks = 0
+ int tiebreak = 0
+ bint keep_na = 0
+ bint isnan
+ float64_t count = 0.0
+ tiebreak = tiebreakers[ties_method]
+
+ values = np.asarray(in_arr).copy()
+
+ keep_na = na_option == 'keep'
+
+ mask = np.isnan(values)
+
+ # double sort first by mask and then by values to ensure nan values are
+ # either at the beginning or the end. mask/(~mask) controls padding at
+ # tail or the head
+ if ascending ^ (na_option == 'top'):
+ nan_value = np.inf
+ order = (values, mask)
+ else:
+ nan_value = -np.inf
+ order = (values, ~mask)
+ np.putmask(values, mask, nan_value)
+
+ n = len(values)
+ ranks = np.empty(n, dtype='f8')
+
+ if tiebreak == TIEBREAK_FIRST:
+ # need to use a stable sort here
+ _as = np.lexsort(keys=order)
+ if not ascending:
+ tiebreak = TIEBREAK_FIRST_DESCENDING
+ else:
+ _as = np.lexsort(keys=order)
+
+ if not ascending:
+ _as = _as[::-1]
+
+ sorted_data = values.take(_as)
+ sorted_mask = mask.take(_as)
+ _indices = np.diff(sorted_mask.astype(int)).nonzero()[0]
+ non_na_idx = _indices[0] if len(_indices) > 0 else -1
+ argsorted = _as.astype('i8')
+
+ with nogil:
+ # TODO: why does the 2d version not have a nogil block?
+ for i in range(n):
+ sum_ranks += i + 1
+ dups += 1
+
+ val = sorted_data[i]
+ isnan = sorted_mask[i]
+ if isnan and keep_na:
+ ranks[argsorted[i]] = NaN
+ continue
+
+ count += 1.0
+
+ if (i == n - 1 or sorted_data[i + 1] != val or i == non_na_idx):
+
+ if tiebreak == TIEBREAK_AVERAGE:
+ for j in range(i - dups + 1, i + 1):
+ ranks[argsorted[j]] = sum_ranks / dups
+ elif tiebreak == TIEBREAK_MIN:
+ for j in range(i - dups + 1, i + 1):
+ ranks[argsorted[j]] = i - dups + 2
+ elif tiebreak == TIEBREAK_MAX:
+ for j in range(i - dups + 1, i + 1):
+ ranks[argsorted[j]] = i + 1
+ elif tiebreak == TIEBREAK_FIRST:
+ for j in range(i - dups + 1, i + 1):
+ ranks[argsorted[j]] = j + 1
+ elif tiebreak == TIEBREAK_FIRST_DESCENDING:
+ for j in range(i - dups + 1, i + 1):
+ ranks[argsorted[j]] = 2 * i - j - dups + 2
+ elif tiebreak == TIEBREAK_DENSE:
+ total_tie_count += 1
+ for j in range(i - dups + 1, i + 1):
+ ranks[argsorted[j]] = total_tie_count
+ sum_ranks = dups = 0
+ if pct:
+ if tiebreak == TIEBREAK_DENSE:
+ return ranks / total_tie_count
+ else:
+ return ranks / count
+ else:
+ return ranks
+
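+# Editor's note -- illustrative sketch, not part of upstream pandas. With the
+# default ties_method='average' and na_option='keep', NaNs keep rank NaN and
+# tied values share the mean of the positions they occupy:
+#
+#     rank_1d_float64(np.array([3.0, 1.0, 3.0, np.nan]))
+#     # -> [2.5, 1.0, 2.5, nan]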
+
+def rank_2d_float64(object in_arr, axis=0, ties_method='average',
+ ascending=True, na_option='keep', pct=False):
+ """
+ Fast NaN-friendly version of scipy.stats.rankdata
+ """
+
+ cdef:
+ Py_ssize_t i, j, z, k, n, dups = 0, total_tie_count = 0
+
+
+ ndarray[float64_t, ndim=2] ranks
+ ndarray[float64_t, ndim=2] values
+
+ ndarray[int64_t, ndim=2] argsorted
+
+ float64_t val, nan_value
+
+ float64_t sum_ranks = 0
+ int tiebreak = 0
+ bint keep_na = 0
+ float64_t count = 0.0
+
+ tiebreak = tiebreakers[ties_method]
+
+ keep_na = na_option == 'keep'
+
+ in_arr = np.asarray(in_arr)
+
+ if axis == 0:
+ values = in_arr.T.copy()
+ else:
+ values = in_arr.copy()
+
+ if ascending ^ (na_option == 'top'):
+ nan_value = np.inf
+ else:
+ nan_value = -np.inf
+
+ mask = np.isnan(values)
+
+ np.putmask(values, mask, nan_value)
+
+ n, k = (<object>values).shape
+ ranks = np.empty((n, k), dtype='f8')
+
+ if tiebreak == TIEBREAK_FIRST:
+ # need to use a stable sort here
+ _as = values.argsort(axis=1, kind='mergesort')
+ if not ascending:
+ tiebreak = TIEBREAK_FIRST_DESCENDING
+ else:
+ _as = values.argsort(1)
+
+ if not ascending:
+ _as = _as[:, ::-1]
+
+ values = _take_2d_float64(values, _as)
+ argsorted = _as.astype('i8')
+
+ for i in range(n):
+ dups = sum_ranks = 0
+
+ total_tie_count = 0
+ count = 0.0
+ for j in range(k):
+ sum_ranks += j + 1
+ dups += 1
+
+ val = values[i, j]
+
+ if (val == nan_value) and keep_na:
+ ranks[i, argsorted[i, j]] = NaN
+
+
+ continue
+
+ count += 1.0
+
+ if j == k - 1 or values[i, j + 1] != val:
+ if tiebreak == TIEBREAK_AVERAGE:
+ for z in range(j - dups + 1, j + 1):
+ ranks[i, argsorted[i, z]] = sum_ranks / dups
+ elif tiebreak == TIEBREAK_MIN:
+ for z in range(j - dups + 1, j + 1):
+ ranks[i, argsorted[i, z]] = j - dups + 2
+ elif tiebreak == TIEBREAK_MAX:
+ for z in range(j - dups + 1, j + 1):
+ ranks[i, argsorted[i, z]] = j + 1
+ elif tiebreak == TIEBREAK_FIRST:
+ for z in range(j - dups + 1, j + 1):
+ ranks[i, argsorted[i, z]] = z + 1
+ elif tiebreak == TIEBREAK_FIRST_DESCENDING:
+ for z in range(j - dups + 1, j + 1):
+ ranks[i, argsorted[i, z]] = 2 * j - z - dups + 2
+ elif tiebreak == TIEBREAK_DENSE:
+ total_tie_count += 1
+ for z in range(j - dups + 1, j + 1):
+ ranks[i, argsorted[i, z]] = total_tie_count
+ sum_ranks = dups = 0
+ if pct:
+ if tiebreak == TIEBREAK_DENSE:
+ ranks[i, :] /= total_tie_count
+ else:
+ ranks[i, :] /= count
+ if axis == 0:
+ return ranks.T
+ else:
+ return ranks
+
+
+def rank_1d_uint64(object in_arr, ties_method='average',
+ ascending=True, na_option='keep', pct=False):
+ """
+ Fast NaN-friendly version of scipy.stats.rankdata
+ """
+
+ cdef:
+ Py_ssize_t i, j, n, dups = 0, total_tie_count = 0, non_na_idx = 0
+
+ ndarray[uint64_t] sorted_data, values
+
+ ndarray[float64_t] ranks
+ ndarray[int64_t] argsorted
+ ndarray[uint8_t, cast=True] sorted_mask
+
+ uint64_t val
+
+ float64_t sum_ranks = 0
+ int tiebreak = 0
+ bint keep_na = 0
+ bint isnan
+ float64_t count = 0.0
+ tiebreak = tiebreakers[ties_method]
+
+ values = np.asarray(in_arr)
+
+ keep_na = na_option == 'keep'
+
+
+ # double sort first by mask and then by values to ensure nan values are
+ # either at the beginning or the end. mask/(~mask) controls padding at
+ # tail or the head
+ mask = np.zeros(shape=len(values), dtype=bool)
+ order = (values, mask)
+
+ n = len(values)
+ ranks = np.empty(n, dtype='f8')
+
+ if tiebreak == TIEBREAK_FIRST:
+ # need to use a stable sort here
+ _as = np.lexsort(keys=order)
+ if not ascending:
+ tiebreak = TIEBREAK_FIRST_DESCENDING
+ else:
+ _as = np.lexsort(keys=order)
+
+ if not ascending:
+ _as = _as[::-1]
+
+ sorted_data = values.take(_as)
+ sorted_mask = mask.take(_as)
+ _indices = np.diff(sorted_mask.astype(int)).nonzero()[0]
+ non_na_idx = _indices[0] if len(_indices) > 0 else -1
+ argsorted = _as.astype('i8')
+
+ with nogil:
+ # TODO: why does the 2d version not have a nogil block?
+ for i in range(n):
+ sum_ranks += i + 1
+ dups += 1
+
+ val = sorted_data[i]
+
+ count += 1.0
+
+ if (i == n - 1 or sorted_data[i + 1] != val or i == non_na_idx):
+
+ if tiebreak == TIEBREAK_AVERAGE:
+ for j in range(i - dups + 1, i + 1):
+ ranks[argsorted[j]] = sum_ranks / dups
+ elif tiebreak == TIEBREAK_MIN:
+ for j in range(i - dups + 1, i + 1):
+ ranks[argsorted[j]] = i - dups + 2
+ elif tiebreak == TIEBREAK_MAX:
+ for j in range(i - dups + 1, i + 1):
+ ranks[argsorted[j]] = i + 1
+ elif tiebreak == TIEBREAK_FIRST:
+ for j in range(i - dups + 1, i + 1):
+ ranks[argsorted[j]] = j + 1
+ elif tiebreak == TIEBREAK_FIRST_DESCENDING:
+ for j in range(i - dups + 1, i + 1):
+ ranks[argsorted[j]] = 2 * i - j - dups + 2
+ elif tiebreak == TIEBREAK_DENSE:
+ total_tie_count += 1
+ for j in range(i - dups + 1, i + 1):
+ ranks[argsorted[j]] = total_tie_count
+ sum_ranks = dups = 0
+ if pct:
+ if tiebreak == TIEBREAK_DENSE:
+ return ranks / total_tie_count
+ else:
+ return ranks / count
+ else:
+ return ranks
+
+
+def rank_2d_uint64(object in_arr, axis=0, ties_method='average',
+ ascending=True, na_option='keep', pct=False):
+ """
+ Fast NaN-friendly version of scipy.stats.rankdata
+ """
+
+ cdef:
+ Py_ssize_t i, j, z, k, n, dups = 0, total_tie_count = 0
+
+
+ ndarray[float64_t, ndim=2] ranks
+ ndarray[uint64_t, ndim=2, cast=True] values
+
+ ndarray[int64_t, ndim=2] argsorted
+
+ uint64_t val
+
+ float64_t sum_ranks = 0
+ int tiebreak = 0
+ bint keep_na = 0
+ float64_t count = 0.0
+
+ tiebreak = tiebreakers[ties_method]
+
+ keep_na = na_option == 'keep'
+
+ in_arr = np.asarray(in_arr)
+
+ if axis == 0:
+ values = in_arr.T.copy()
+ else:
+ values = in_arr.copy()
+
+
+ n, k = (<object>values).shape
+ ranks = np.empty((n, k), dtype='f8')
+
+ if tiebreak == TIEBREAK_FIRST:
+ # need to use a stable sort here
+ _as = values.argsort(axis=1, kind='mergesort')
+ if not ascending:
+ tiebreak = TIEBREAK_FIRST_DESCENDING
+ else:
+ _as = values.argsort(1)
+
+ if not ascending:
+ _as = _as[:, ::-1]
+
+ values = _take_2d_uint64(values, _as)
+ argsorted = _as.astype('i8')
+
+ for i in range(n):
+ dups = sum_ranks = 0
+
+ total_tie_count = 0
+ count = 0.0
+ for j in range(k):
+ sum_ranks += j + 1
+ dups += 1
+
+ val = values[i, j]
+
+
+ count += 1.0
+
+ if j == k - 1 or values[i, j + 1] != val:
+ if tiebreak == TIEBREAK_AVERAGE:
+ for z in range(j - dups + 1, j + 1):
+ ranks[i, argsorted[i, z]] = sum_ranks / dups
+ elif tiebreak == TIEBREAK_MIN:
+ for z in range(j - dups + 1, j + 1):
+ ranks[i, argsorted[i, z]] = j - dups + 2
+ elif tiebreak == TIEBREAK_MAX:
+ for z in range(j - dups + 1, j + 1):
+ ranks[i, argsorted[i, z]] = j + 1
+ elif tiebreak == TIEBREAK_FIRST:
+ for z in range(j - dups + 1, j + 1):
+ ranks[i, argsorted[i, z]] = z + 1
+ elif tiebreak == TIEBREAK_FIRST_DESCENDING:
+ for z in range(j - dups + 1, j + 1):
+ ranks[i, argsorted[i, z]] = 2 * j - z - dups + 2
+ elif tiebreak == TIEBREAK_DENSE:
+ total_tie_count += 1
+ for z in range(j - dups + 1, j + 1):
+ ranks[i, argsorted[i, z]] = total_tie_count
+ sum_ranks = dups = 0
+ if pct:
+ if tiebreak == TIEBREAK_DENSE:
+ ranks[i, :] /= total_tie_count
+ else:
+ ranks[i, :] /= count
+ if axis == 0:
+ return ranks.T
+ else:
+ return ranks
+
+
+def rank_1d_int64(object in_arr, ties_method='average',
+ ascending=True, na_option='keep', pct=False):
+ """
+ Fast NaN-friendly version of scipy.stats.rankdata
+ """
+
+ cdef:
+ Py_ssize_t i, j, n, dups = 0, total_tie_count = 0, non_na_idx = 0
+
+ ndarray[int64_t] sorted_data, values
+
+ ndarray[float64_t] ranks
+ ndarray[int64_t] argsorted
+ ndarray[uint8_t, cast=True] sorted_mask
+
+ int64_t val, nan_value
+
+ float64_t sum_ranks = 0
+ int tiebreak = 0
+ bint keep_na = 0
+ bint isnan
+ float64_t count = 0.0
+ tiebreak = tiebreakers[ties_method]
+
+ values = np.asarray(in_arr)
+
+ keep_na = na_option == 'keep'
+
+ mask = values == NPY_NAT
+
+ # create copy in case of NPY_NAT
+ # values are mutated inplace
+ if mask.any():
+ values = values.copy()
+
+ # double sort first by mask and then by values to ensure nan values are
+ # either at the beginning or the end. mask/(~mask) controls padding at
+ # tail or the head
+ if ascending ^ (na_option == 'top'):
+ nan_value = np.iinfo(np.int64).max
+ order = (values, mask)
+ else:
+ nan_value = np.iinfo(np.int64).min
+ order = (values, ~mask)
+ np.putmask(values, mask, nan_value)
+
+ n = len(values)
+ ranks = np.empty(n, dtype='f8')
+
+ if tiebreak == TIEBREAK_FIRST:
+ # need to use a stable sort here
+ _as = np.lexsort(keys=order)
+ if not ascending:
+ tiebreak = TIEBREAK_FIRST_DESCENDING
+ else:
+ _as = np.lexsort(keys=order)
+
+ if not ascending:
+ _as = _as[::-1]
+
+ sorted_data = values.take(_as)
+ sorted_mask = mask.take(_as)
+ _indices = np.diff(sorted_mask.astype(int)).nonzero()[0]
+ non_na_idx = _indices[0] if len(_indices) > 0 else -1
+ argsorted = _as.astype('i8')
+
+ with nogil:
+ # TODO: why does the 2d version not have a nogil block?
+ for i in range(n):
+ sum_ranks += i + 1
+ dups += 1
+
+ val = sorted_data[i]
+ isnan = sorted_mask[i]
+ if isnan and keep_na:
+ ranks[argsorted[i]] = NaN
+ continue
+
+ count += 1.0
+
+ if (i == n - 1 or sorted_data[i + 1] != val or i == non_na_idx):
+
+ if tiebreak == TIEBREAK_AVERAGE:
+ for j in range(i - dups + 1, i + 1):
+ ranks[argsorted[j]] = sum_ranks / dups
+ elif tiebreak == TIEBREAK_MIN:
+ for j in range(i - dups + 1, i + 1):
+ ranks[argsorted[j]] = i - dups + 2
+ elif tiebreak == TIEBREAK_MAX:
+ for j in range(i - dups + 1, i + 1):
+ ranks[argsorted[j]] = i + 1
+ elif tiebreak == TIEBREAK_FIRST:
+ for j in range(i - dups + 1, i + 1):
+ ranks[argsorted[j]] = j + 1
+ elif tiebreak == TIEBREAK_FIRST_DESCENDING:
+ for j in range(i - dups + 1, i + 1):
+ ranks[argsorted[j]] = 2 * i - j - dups + 2
+ elif tiebreak == TIEBREAK_DENSE:
+ total_tie_count += 1
+ for j in range(i - dups + 1, i + 1):
+ ranks[argsorted[j]] = total_tie_count
+ sum_ranks = dups = 0
+ if pct:
+ if tiebreak == TIEBREAK_DENSE:
+ return ranks / total_tie_count
+ else:
+ return ranks / count
+ else:
+ return ranks
+
+
+def rank_2d_int64(object in_arr, axis=0, ties_method='average',
+ ascending=True, na_option='keep', pct=False):
+ """
+ Fast NaN-friendly version of scipy.stats.rankdata
+ """
+
+ cdef:
+ Py_ssize_t i, j, z, k, n, dups = 0, total_tie_count = 0
+
+
+ ndarray[float64_t, ndim=2] ranks
+ ndarray[int64_t, ndim=2, cast=True] values
+
+ ndarray[int64_t, ndim=2] argsorted
+
+ int64_t val, nan_value
+
+ float64_t sum_ranks = 0
+ int tiebreak = 0
+ bint keep_na = 0
+ float64_t count = 0.0
+
+ tiebreak = tiebreakers[ties_method]
+
+ keep_na = na_option == 'keep'
+
+ in_arr = np.asarray(in_arr)
+
+ if axis == 0:
+ values = in_arr.T.copy()
+ else:
+ values = in_arr.copy()
+
+ if ascending ^ (na_option == 'top'):
+ nan_value = np.iinfo(np.int64).max
+ else:
+ nan_value = np.iinfo(np.int64).min
+
+ mask = values == NPY_NAT
+
+ np.putmask(values, mask, nan_value)
+
+ n, k = (<object>values).shape
+ ranks = np.empty((n, k), dtype='f8')
+
+ if tiebreak == TIEBREAK_FIRST:
+ # need to use a stable sort here
+ _as = values.argsort(axis=1, kind='mergesort')
+ if not ascending:
+ tiebreak = TIEBREAK_FIRST_DESCENDING
+ else:
+ _as = values.argsort(1)
+
+ if not ascending:
+ _as = _as[:, ::-1]
+
+ values = _take_2d_int64(values, _as)
+ argsorted = _as.astype('i8')
+
+ for i in range(n):
+ dups = sum_ranks = 0
+
+ total_tie_count = 0
+ count = 0.0
+ for j in range(k):
+ sum_ranks += j + 1
+ dups += 1
+
+ val = values[i, j]
+
+ if (val == nan_value) and keep_na:
+ ranks[i, argsorted[i, j]] = NaN
+
+
+ continue
+
+ count += 1.0
+
+ if j == k - 1 or values[i, j + 1] != val:
+ if tiebreak == TIEBREAK_AVERAGE:
+ for z in range(j - dups + 1, j + 1):
+ ranks[i, argsorted[i, z]] = sum_ranks / dups
+ elif tiebreak == TIEBREAK_MIN:
+ for z in range(j - dups + 1, j + 1):
+ ranks[i, argsorted[i, z]] = j - dups + 2
+ elif tiebreak == TIEBREAK_MAX:
+ for z in range(j - dups + 1, j + 1):
+ ranks[i, argsorted[i, z]] = j + 1
+ elif tiebreak == TIEBREAK_FIRST:
+ for z in range(j - dups + 1, j + 1):
+ ranks[i, argsorted[i, z]] = z + 1
+ elif tiebreak == TIEBREAK_FIRST_DESCENDING:
+ for z in range(j - dups + 1, j + 1):
+ ranks[i, argsorted[i, z]] = 2 * j - z - dups + 2
+ elif tiebreak == TIEBREAK_DENSE:
+ total_tie_count += 1
+ for z in range(j - dups + 1, j + 1):
+ ranks[i, argsorted[i, z]] = total_tie_count
+ sum_ranks = dups = 0
+ if pct:
+ if tiebreak == TIEBREAK_DENSE:
+ ranks[i, :] /= total_tie_count
+ else:
+ ranks[i, :] /= count
+ if axis == 0:
+ return ranks.T
+ else:
+ return ranks
diff --git a/contrib/python/pandas/py2/pandas/_libs/algos_take_helper.pxi b/contrib/python/pandas/py2/pandas/_libs/algos_take_helper.pxi
new file mode 100644
index 00000000000..482b9e3ea6a
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/_libs/algos_take_helper.pxi
@@ -0,0 +1,4986 @@
+"""
+Template for each `dtype` helper function for take
+
+WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in
+"""
+
+# ----------------------------------------------------------------------
+# take_1d, take_2d
+# ----------------------------------------------------------------------
+
+
+cdef inline take_1d_bool_bool_memview(uint8_t[:] values,
+ int64_t[:] indexer,
+ uint8_t[:] out,
+ fill_value=np.nan):
+
+
+
+ cdef:
+ Py_ssize_t i, n, idx
+ uint8_t fv
+
+ n = indexer.shape[0]
+
+ fv = fill_value
+
+ with nogil:
+ for i in range(n):
+ idx = indexer[i]
+ if idx == -1:
+ out[i] = fv
+ else:
+ out[i] = values[idx]
+
+
+
+def take_1d_bool_bool(ndarray[uint8_t, ndim=1] values,
+ int64_t[:] indexer,
+ uint8_t[:] out,
+ fill_value=np.nan):
+
+ if values.flags.writeable:
+ # We can call the memoryview version of the code
+ take_1d_bool_bool_memview(values, indexer, out,
+ fill_value=fill_value)
+ return
+
+ # We cannot use the memoryview version on readonly-buffers due to
+ # a limitation of Cython's typed memoryviews. Instead we can use
+ # the slightly slower Cython ndarray type directly.
+
+ cdef:
+ Py_ssize_t i, n, idx
+ uint8_t fv
+
+ n = indexer.shape[0]
+
+ fv = fill_value
+
+ with nogil:
+ for i in range(n):
+ idx = indexer[i]
+ if idx == -1:
+ out[i] = fv
+ else:
+ out[i] = values[idx]
+
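+# Editor's note -- illustrative sketch, not part of upstream pandas. Every
+# take_* wrapper in this file follows the same pattern: writable inputs go
+# through the fast typed-memoryview path, read-only buffers fall back to the
+# ndarray path, and -1 in the indexer selects `fill_value`:
+#
+#     values = np.array([1, 0, 1], dtype=np.uint8)
+#     out = np.empty(4, dtype=np.uint8)
+#     take_1d_bool_bool(values, np.array([2, 0, 1, -1], dtype=np.int64),
+#                       out, fill_value=0)
+#     # out -> [1, 1, 0, 0]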
+
+
+cdef inline take_2d_axis0_bool_bool_memview(uint8_t[:, :] values,
+ int64_t[:] indexer,
+ uint8_t[:, :] out,
+ fill_value=np.nan):
+ cdef:
+ Py_ssize_t i, j, k, n, idx
+ uint8_t fv
+
+ n = len(indexer)
+ k = values.shape[1]
+
+ fv = fill_value
+
+ IF True:
+ cdef:
+ uint8_t *v
+ uint8_t *o
+
+ #GH3130
+ if (values.strides[1] == out.strides[1] and
+ values.strides[1] == sizeof(uint8_t) and
+ sizeof(uint8_t) * n >= 256):
+
+ for i in range(n):
+ idx = indexer[i]
+ if idx == -1:
+ for j in range(k):
+ out[i, j] = fv
+ else:
+ v = &values[idx, 0]
+ o = &out[i, 0]
+ memmove(o, v, <size_t>(sizeof(uint8_t) * k))
+ return
+
+ for i in range(n):
+ idx = indexer[i]
+ if idx == -1:
+ for j in range(k):
+ out[i, j] = fv
+ else:
+ for j in range(k):
+ out[i, j] = values[idx, j]
+
+
+
+def take_2d_axis0_bool_bool(ndarray[uint8_t, ndim=2] values,
+ ndarray[int64_t] indexer,
+ uint8_t[:, :] out,
+ fill_value=np.nan):
+ if values.flags.writeable:
+ # We can call the memoryview version of the code
+ take_2d_axis0_bool_bool_memview(values, indexer, out,
+ fill_value=fill_value)
+ return
+
+ # We cannot use the memoryview version on readonly-buffers due to
+ # a limitation of Cython's typed memoryviews. Instead we can use
+ # the slightly slower Cython ndarray type directly.
+ cdef:
+ Py_ssize_t i, j, k, n, idx
+ uint8_t fv
+
+ n = len(indexer)
+ k = values.shape[1]
+
+ fv = fill_value
+
+ IF True:
+ cdef:
+ uint8_t *v
+ uint8_t *o
+
+ #GH3130
+ if (values.strides[1] == out.strides[1] and
+ values.strides[1] == sizeof(uint8_t) and
+ sizeof(uint8_t) * n >= 256):
+
+ for i in range(n):
+ idx = indexer[i]
+ if idx == -1:
+ for j in range(k):
+ out[i, j] = fv
+ else:
+ v = &values[idx, 0]
+ o = &out[i, 0]
+ memmove(o, v, <size_t>(sizeof(uint8_t) * k))
+ return
+
+ for i in range(n):
+ idx = indexer[i]
+ if idx == -1:
+ for j in range(k):
+ out[i, j] = fv
+ else:
+ for j in range(k):
+ out[i, j] = values[idx, j]
+
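+# Editor's note (illustrative, not part of upstream pandas): the GH3130
+# branch above is a fast path -- when rows are laid out contiguously in both
+# buffers (the axis-1 stride equals the element size) and the take moves
+# enough data to amortize the setup, each selected row is copied with a
+# single memmove instead of the element-by-element inner loop.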
+
+
+cdef inline take_2d_axis1_bool_bool_memview(uint8_t[:, :] values,
+ int64_t[:] indexer,
+ uint8_t[:, :] out,
+ fill_value=np.nan):
+ cdef:
+ Py_ssize_t i, j, k, n, idx
+ uint8_t fv
+
+ n = len(values)
+ k = len(indexer)
+
+ if n == 0 or k == 0:
+ return
+
+ fv = fill_value
+
+ for i in range(n):
+ for j in range(k):
+ idx = indexer[j]
+ if idx == -1:
+ out[i, j] = fv
+ else:
+ out[i, j] = values[i, idx]
+
+
+
+def take_2d_axis1_bool_bool(ndarray[uint8_t, ndim=2] values,
+ ndarray[int64_t] indexer,
+ uint8_t[:, :] out,
+ fill_value=np.nan):
+
+ if values.flags.writeable:
+ # We can call the memoryview version of the code
+ take_2d_axis1_bool_bool_memview(values, indexer, out,
+ fill_value=fill_value)
+ return
+
+ # We cannot use the memoryview version on readonly-buffers due to
+ # a limitation of Cython's typed memoryviews. Instead we can use
+ # the slightly slower Cython ndarray type directly.
+ cdef:
+ Py_ssize_t i, j, k, n, idx
+ uint8_t fv
+
+ n = len(values)
+ k = len(indexer)
+
+ if n == 0 or k == 0:
+ return
+
+ fv = fill_value
+
+ for i in range(n):
+ for j in range(k):
+ idx = indexer[j]
+ if idx == -1:
+ out[i, j] = fv
+ else:
+ out[i, j] = values[i, idx]
+
+
+
+def take_2d_multi_bool_bool(ndarray[uint8_t, ndim=2] values,
+ indexer,
+ ndarray[uint8_t, ndim=2] out,
+ fill_value=np.nan):
+ cdef:
+ Py_ssize_t i, j, k, n, idx
+ ndarray[int64_t] idx0 = indexer[0]
+ ndarray[int64_t] idx1 = indexer[1]
+ uint8_t fv
+
+ n = len(idx0)
+ k = len(idx1)
+
+ fv = fill_value
+ for i in range(n):
+ idx = idx0[i]
+ if idx == -1:
+ for j in range(k):
+ out[i, j] = fv
+ else:
+ for j in range(k):
+ if idx1[j] == -1:
+ out[i, j] = fv
+ else:
+ out[i, j] = values[idx, idx1[j]]
+
+
+cdef inline take_1d_bool_object_memview(uint8_t[:] values,
+ int64_t[:] indexer,
+ object[:] out,
+ fill_value=np.nan):
+
+
+
+ cdef:
+ Py_ssize_t i, n, idx
+ object fv
+
+ n = indexer.shape[0]
+
+ fv = fill_value
+
+
+ for i in range(n):
+ idx = indexer[i]
+ if idx == -1:
+ out[i] = fv
+ else:
+ out[i] = True if values[idx] > 0 else False
+
+
+
+def take_1d_bool_object(ndarray[uint8_t, ndim=1] values,
+ int64_t[:] indexer,
+ object[:] out,
+ fill_value=np.nan):
+
+ if values.flags.writeable:
+ # We can call the memoryview version of the code
+ take_1d_bool_object_memview(values, indexer, out,
+ fill_value=fill_value)
+ return
+
+ # We cannot use the memoryview version on readonly-buffers due to
+ # a limitation of Cython's typed memoryviews. Instead we can use
+ # the slightly slower Cython ndarray type directly.
+
+ cdef:
+ Py_ssize_t i, n, idx
+ object fv
+
+ n = indexer.shape[0]
+
+ fv = fill_value
+
+
+ for i in range(n):
+ idx = indexer[i]
+ if idx == -1:
+ out[i] = fv
+ else:
+ out[i] = True if values[idx] > 0 else False
+
+
+
+cdef inline take_2d_axis0_bool_object_memview(uint8_t[:, :] values,
+ int64_t[:] indexer,
+ object[:, :] out,
+ fill_value=np.nan):
+ cdef:
+ Py_ssize_t i, j, k, n, idx
+ object fv
+
+ n = len(indexer)
+ k = values.shape[1]
+
+ fv = fill_value
+
+ IF False:
+ cdef:
+ object *v
+ object *o
+
+ #GH3130
+ if (values.strides[1] == out.strides[1] and
+ values.strides[1] == sizeof(object) and
+ sizeof(object) * n >= 256):
+
+ for i in range(n):
+ idx = indexer[i]
+ if idx == -1:
+ for j in range(k):
+ out[i, j] = fv
+ else:
+ v = &values[idx, 0]
+ o = &out[i, 0]
+ memmove(o, v, <size_t>(sizeof(object) * k))
+ return
+
+ for i in range(n):
+ idx = indexer[i]
+ if idx == -1:
+ for j in range(k):
+ out[i, j] = fv
+ else:
+ for j in range(k):
+ out[i, j] = True if values[idx, j] > 0 else False
+
+
+
+def take_2d_axis0_bool_object(ndarray[uint8_t, ndim=2] values,
+ ndarray[int64_t] indexer,
+ object[:, :] out,
+ fill_value=np.nan):
+ if values.flags.writeable:
+ # We can call the memoryview version of the code
+ take_2d_axis0_bool_object_memview(values, indexer, out,
+ fill_value=fill_value)
+ return
+
+ # We cannot use the memoryview version on readonly-buffers due to
+ # a limitation of Cython's typed memoryviews. Instead we can use
+ # the slightly slower Cython ndarray type directly.
+ cdef:
+ Py_ssize_t i, j, k, n, idx
+ object fv
+
+ n = len(indexer)
+ k = values.shape[1]
+
+ fv = fill_value
+
+ IF False:
+ cdef:
+ object *v
+ object *o
+
+ #GH3130
+ if (values.strides[1] == out.strides[1] and
+ values.strides[1] == sizeof(object) and
+ sizeof(object) * n >= 256):
+
+ for i in range(n):
+ idx = indexer[i]
+ if idx == -1:
+ for j in range(k):
+ out[i, j] = fv
+ else:
+ v = &values[idx, 0]
+ o = &out[i, 0]
+ memmove(o, v, <size_t>(sizeof(object) * k))
+ return
+
+ for i in range(n):
+ idx = indexer[i]
+ if idx == -1:
+ for j in range(k):
+ out[i, j] = fv
+ else:
+ for j in range(k):
+ out[i, j] = True if values[idx, j] > 0 else False
+
+
+
+cdef inline take_2d_axis1_bool_object_memview(uint8_t[:, :] values,
+ int64_t[:] indexer,
+ object[:, :] out,
+ fill_value=np.nan):
+ cdef:
+ Py_ssize_t i, j, k, n, idx
+ object fv
+
+ n = len(values)
+ k = len(indexer)
+
+ if n == 0 or k == 0:
+ return
+
+ fv = fill_value
+
+ for i in range(n):
+ for j in range(k):
+ idx = indexer[j]
+ if idx == -1:
+ out[i, j] = fv
+ else:
+ out[i, j] = True if values[i, idx] > 0 else False
+
+
+
+def take_2d_axis1_bool_object(ndarray[uint8_t, ndim=2] values,
+ ndarray[int64_t] indexer,
+ object[:, :] out,
+ fill_value=np.nan):
+
+ if values.flags.writeable:
+ # We can call the memoryview version of the code
+ take_2d_axis1_bool_object_memview(values, indexer, out,
+ fill_value=fill_value)
+ return
+
+ # We cannot use the memoryview version on readonly-buffers due to
+ # a limitation of Cython's typed memoryviews. Instead we can use
+ # the slightly slower Cython ndarray type directly.
+ cdef:
+ Py_ssize_t i, j, k, n, idx
+ object fv
+
+ n = len(values)
+ k = len(indexer)
+
+ if n == 0 or k == 0:
+ return
+
+ fv = fill_value
+
+ for i in range(n):
+ for j in range(k):
+ idx = indexer[j]
+ if idx == -1:
+ out[i, j] = fv
+ else:
+ out[i, j] = True if values[i, idx] > 0 else False
+
+
+
+def take_2d_multi_bool_object(ndarray[uint8_t, ndim=2] values,
+ indexer,
+ ndarray[object, ndim=2] out,
+ fill_value=np.nan):
+ cdef:
+ Py_ssize_t i, j, k, n, idx
+ ndarray[int64_t] idx0 = indexer[0]
+ ndarray[int64_t] idx1 = indexer[1]
+ object fv
+
+ n = len(idx0)
+ k = len(idx1)
+
+ fv = fill_value
+ for i in range(n):
+ idx = idx0[i]
+ if idx == -1:
+ for j in range(k):
+ out[i, j] = fv
+ else:
+ for j in range(k):
+ if idx1[j] == -1:
+ out[i, j] = fv
+ else:
+ out[i, j] = True if values[idx, idx1[j]] > 0 else False
+
+
+cdef inline take_1d_int8_int8_memview(int8_t[:] values,
+ int64_t[:] indexer,
+ int8_t[:] out,
+ fill_value=np.nan):
+
+
+
+ cdef:
+ Py_ssize_t i, n, idx
+ int8_t fv
+
+ n = indexer.shape[0]
+
+ fv = fill_value
+
+
+ for i in range(n):
+ idx = indexer[i]
+ if idx == -1:
+ out[i] = fv
+ else:
+ out[i] = values[idx]
+
+
+
+def take_1d_int8_int8(ndarray[int8_t, ndim=1] values,
+ int64_t[:] indexer,
+ int8_t[:] out,
+ fill_value=np.nan):
+
+ if values.flags.writeable:
+ # We can call the memoryview version of the code
+ take_1d_int8_int8_memview(values, indexer, out,
+ fill_value=fill_value)
+ return
+
+ # We cannot use the memoryview version on readonly-buffers due to
+ # a limitation of Cython's typed memoryviews. Instead we can use
+ # the slightly slower Cython ndarray type directly.
+
+ cdef:
+ Py_ssize_t i, n, idx
+ int8_t fv
+
+ n = indexer.shape[0]
+
+ fv = fill_value
+
+
+ for i in range(n):
+ idx = indexer[i]
+ if idx == -1:
+ out[i] = fv
+ else:
+ out[i] = values[idx]
+
+
+
+cdef inline take_2d_axis0_int8_int8_memview(int8_t[:, :] values,
+ int64_t[:] indexer,
+ int8_t[:, :] out,
+ fill_value=np.nan):
+ cdef:
+ Py_ssize_t i, j, k, n, idx
+ int8_t fv
+
+ n = len(indexer)
+ k = values.shape[1]
+
+ fv = fill_value
+
+ IF True:
+ cdef:
+ int8_t *v
+ int8_t *o
+
+ #GH3130
+ if (values.strides[1] == out.strides[1] and
+ values.strides[1] == sizeof(int8_t) and
+ sizeof(int8_t) * n >= 256):
+
+ for i in range(n):
+ idx = indexer[i]
+ if idx == -1:
+ for j in range(k):
+ out[i, j] = fv
+ else:
+ v = &values[idx, 0]
+ o = &out[i, 0]
+ memmove(o, v, <size_t>(sizeof(int8_t) * k))
+ return
+
+ for i in range(n):
+ idx = indexer[i]
+ if idx == -1:
+ for j in range(k):
+ out[i, j] = fv
+ else:
+ for j in range(k):
+ out[i, j] = values[idx, j]
+
+
+
+def take_2d_axis0_int8_int8(ndarray[int8_t, ndim=2] values,
+ ndarray[int64_t] indexer,
+ int8_t[:, :] out,
+ fill_value=np.nan):
+ if values.flags.writeable:
+ # We can call the memoryview version of the code
+ take_2d_axis0_int8_int8_memview(values, indexer, out,
+ fill_value=fill_value)
+ return
+
+ # We cannot use the memoryview version on readonly-buffers due to
+ # a limitation of Cython's typed memoryviews. Instead we can use
+ # the slightly slower Cython ndarray type directly.
+ cdef:
+ Py_ssize_t i, j, k, n, idx
+ int8_t fv
+
+ n = len(indexer)
+ k = values.shape[1]
+
+ fv = fill_value
+
+ IF True:
+ cdef:
+ int8_t *v
+ int8_t *o
+
+ #GH3130
+ if (values.strides[1] == out.strides[1] and
+ values.strides[1] == sizeof(int8_t) and
+ sizeof(int8_t) * n >= 256):
+
+ for i in range(n):
+ idx = indexer[i]
+ if idx == -1:
+ for j in range(k):
+ out[i, j] = fv
+ else:
+ v = &values[idx, 0]
+ o = &out[i, 0]
+ memmove(o, v, <size_t>(sizeof(int8_t) * k))
+ return
+
+ for i in range(n):
+ idx = indexer[i]
+ if idx == -1:
+ for j in range(k):
+ out[i, j] = fv
+ else:
+ for j in range(k):
+ out[i, j] = values[idx, j]
+
+
+
+cdef inline take_2d_axis1_int8_int8_memview(int8_t[:, :] values,
+ int64_t[:] indexer,
+ int8_t[:, :] out,
+ fill_value=np.nan):
+ cdef:
+ Py_ssize_t i, j, k, n, idx
+ int8_t fv
+
+ n = len(values)
+ k = len(indexer)
+
+ if n == 0 or k == 0:
+ return
+
+ fv = fill_value
+
+ for i in range(n):
+ for j in range(k):
+ idx = indexer[j]
+ if idx == -1:
+ out[i, j] = fv
+ else:
+ out[i, j] = values[i, idx]
+
+
+
+def take_2d_axis1_int8_int8(ndarray[int8_t, ndim=2] values,
+ ndarray[int64_t] indexer,
+ int8_t[:, :] out,
+ fill_value=np.nan):
+
+ if values.flags.writeable:
+ # We can call the memoryview version of the code
+ take_2d_axis1_int8_int8_memview(values, indexer, out,
+ fill_value=fill_value)
+ return
+
+ # We cannot use the memoryview version on readonly-buffers due to
+ # a limitation of Cython's typed memoryviews. Instead we can use
+ # the slightly slower Cython ndarray type directly.
+ cdef:
+ Py_ssize_t i, j, k, n, idx
+ int8_t fv
+
+ n = len(values)
+ k = len(indexer)
+
+ if n == 0 or k == 0:
+ return
+
+ fv = fill_value
+
+ for i in range(n):
+ for j in range(k):
+ idx = indexer[j]
+ if idx == -1:
+ out[i, j] = fv
+ else:
+ out[i, j] = values[i, idx]
+
+
+
+def take_2d_multi_int8_int8(ndarray[int8_t, ndim=2] values,
+ indexer,
+ ndarray[int8_t, ndim=2] out,
+ fill_value=np.nan):
+ cdef:
+ Py_ssize_t i, j, k, n, idx
+ ndarray[int64_t] idx0 = indexer[0]
+ ndarray[int64_t] idx1 = indexer[1]
+ int8_t fv
+
+ n = len(idx0)
+ k = len(idx1)
+
+ fv = fill_value
+ for i in range(n):
+ idx = idx0[i]
+ if idx == -1:
+ for j in range(k):
+ out[i, j] = fv
+ else:
+ for j in range(k):
+ if idx1[j] == -1:
+ out[i, j] = fv
+ else:
+ out[i, j] = values[idx, idx1[j]]
+
+
+cdef inline take_1d_int8_int32_memview(int8_t[:] values,
+ int64_t[:] indexer,
+ int32_t[:] out,
+ fill_value=np.nan):
+
+
+
+ cdef:
+ Py_ssize_t i, n, idx
+ int32_t fv
+
+ n = indexer.shape[0]
+
+ fv = fill_value
+
+ with nogil:
+ for i in range(n):
+ idx = indexer[i]
+ if idx == -1:
+ out[i] = fv
+ else:
+ out[i] = values[idx]
+
+
+
+def take_1d_int8_int32(ndarray[int8_t, ndim=1] values,
+ int64_t[:] indexer,
+ int32_t[:] out,
+ fill_value=np.nan):
+
+ if values.flags.writeable:
+ # We can call the memoryview version of the code
+ take_1d_int8_int32_memview(values, indexer, out,
+ fill_value=fill_value)
+ return
+
+ # We cannot use the memoryview version on readonly-buffers due to
+ # a limitation of Cython's typed memoryviews. Instead we can use
+ # the slightly slower Cython ndarray type directly.
+
+ cdef:
+ Py_ssize_t i, n, idx
+ int32_t fv
+
+ n = indexer.shape[0]
+
+ fv = fill_value
+
+ with nogil:
+ for i in range(n):
+ idx = indexer[i]
+ if idx == -1:
+ out[i] = fv
+ else:
+ out[i] = values[idx]
+
+
+
+cdef inline take_2d_axis0_int8_int32_memview(int8_t[:, :] values,
+ int64_t[:] indexer,
+ int32_t[:, :] out,
+ fill_value=np.nan):
+ cdef:
+ Py_ssize_t i, j, k, n, idx
+ int32_t fv
+
+ n = len(indexer)
+ k = values.shape[1]
+
+ fv = fill_value
+
+ IF False:
+ cdef:
+ int32_t *v
+ int32_t *o
+
+ #GH3130
+ if (values.strides[1] == out.strides[1] and
+ values.strides[1] == sizeof(int32_t) and
+ sizeof(int32_t) * n >= 256):
+
+ for i in range(n):
+ idx = indexer[i]
+ if idx == -1:
+ for j in range(k):
+ out[i, j] = fv
+ else:
+ v = &values[idx, 0]
+ o = &out[i, 0]
+ memmove(o, v, <size_t>(sizeof(int32_t) * k))
+ return
+
+ for i in range(n):
+ idx = indexer[i]
+ if idx == -1:
+ for j in range(k):
+ out[i, j] = fv
+ else:
+ for j in range(k):
+ out[i, j] = values[idx, j]
+
+
+
+def take_2d_axis0_int8_int32(ndarray[int8_t, ndim=2] values,
+ ndarray[int64_t] indexer,
+ int32_t[:, :] out,
+ fill_value=np.nan):
+ if values.flags.writeable:
+ # We can call the memoryview version of the code
+ take_2d_axis0_int8_int32_memview(values, indexer, out,
+ fill_value=fill_value)
+ return
+
+ # We cannot use the memoryview version on readonly-buffers due to
+ # a limitation of Cython's typed memoryviews. Instead we can use
+ # the slightly slower Cython ndarray type directly.
+ cdef:
+ Py_ssize_t i, j, k, n, idx
+ int32_t fv
+
+ n = len(indexer)
+ k = values.shape[1]
+
+ fv = fill_value
+
+ IF False:
+ cdef:
+ int32_t *v
+ int32_t *o
+
+ #GH3130
+ if (values.strides[1] == out.strides[1] and
+ values.strides[1] == sizeof(int32_t) and
+ sizeof(int32_t) * n >= 256):
+
+ for i in range(n):
+ idx = indexer[i]
+ if idx == -1:
+ for j in range(k):
+ out[i, j] = fv
+ else:
+ v = &values[idx, 0]
+ o = &out[i, 0]
+ memmove(o, v, <size_t>(sizeof(int32_t) * k))
+ return
+
+ for i in range(n):
+ idx = indexer[i]
+ if idx == -1:
+ for j in range(k):
+ out[i, j] = fv
+ else:
+ for j in range(k):
+ out[i, j] = values[idx, j]
+
+
+
+cdef inline take_2d_axis1_int8_int32_memview(int8_t[:, :] values,
+ int64_t[:] indexer,
+ int32_t[:, :] out,
+ fill_value=np.nan):
+ cdef:
+ Py_ssize_t i, j, k, n, idx
+ int32_t fv
+
+ n = len(values)
+ k = len(indexer)
+
+ if n == 0 or k == 0:
+ return
+
+ fv = fill_value
+
+ for i in range(n):
+ for j in range(k):
+ idx = indexer[j]
+ if idx == -1:
+ out[i, j] = fv
+ else:
+ out[i, j] = values[i, idx]
+
+
+
+def take_2d_axis1_int8_int32(ndarray[int8_t, ndim=2] values,
+ ndarray[int64_t] indexer,
+ int32_t[:, :] out,
+ fill_value=np.nan):
+
+ if values.flags.writeable:
+ # We can call the memoryview version of the code
+ take_2d_axis1_int8_int32_memview(values, indexer, out,
+ fill_value=fill_value)
+ return
+
+ # We cannot use the memoryview version on readonly-buffers due to
+ # a limitation of Cython's typed memoryviews. Instead we can use
+ # the slightly slower Cython ndarray type directly.
+ cdef:
+ Py_ssize_t i, j, k, n, idx
+ int32_t fv
+
+ n = len(values)
+ k = len(indexer)
+
+ if n == 0 or k == 0:
+ return
+
+ fv = fill_value
+
+ for i in range(n):
+ for j in range(k):
+ idx = indexer[j]
+ if idx == -1:
+ out[i, j] = fv
+ else:
+ out[i, j] = values[i, idx]
+
+
+
+def take_2d_multi_int8_int32(ndarray[int8_t, ndim=2] values,
+ indexer,
+ ndarray[int32_t, ndim=2] out,
+ fill_value=np.nan):
+ cdef:
+ Py_ssize_t i, j, k, n, idx
+ ndarray[int64_t] idx0 = indexer[0]
+ ndarray[int64_t] idx1 = indexer[1]
+ int32_t fv
+
+ n = len(idx0)
+ k = len(idx1)
+
+ fv = fill_value
+ for i in range(n):
+ idx = idx0[i]
+ if idx == -1:
+ for j in range(k):
+ out[i, j] = fv
+ else:
+ for j in range(k):
+ if idx1[j] == -1:
+ out[i, j] = fv
+ else:
+ out[i, j] = values[idx, idx1[j]]
+
+
+cdef inline take_1d_int8_int64_memview(int8_t[:] values,
+ int64_t[:] indexer,
+ int64_t[:] out,
+ fill_value=np.nan):
+
+
+
+ cdef:
+ Py_ssize_t i, n, idx
+ int64_t fv
+
+ n = indexer.shape[0]
+
+ fv = fill_value
+
+ with nogil:
+ for i in range(n):
+ idx = indexer[i]
+ if idx == -1:
+ out[i] = fv
+ else:
+ out[i] = values[idx]
+
+
+
+def take_1d_int8_int64(ndarray[int8_t, ndim=1] values,
+ int64_t[:] indexer,
+ int64_t[:] out,
+ fill_value=np.nan):
+
+ if values.flags.writeable:
+ # We can call the memoryview version of the code
+ take_1d_int8_int64_memview(values, indexer, out,
+ fill_value=fill_value)
+ return
+
+ # We cannot use the memoryview version on readonly-buffers due to
+ # a limitation of Cython's typed memoryviews. Instead we can use
+ # the slightly slower Cython ndarray type directly.
+
+ cdef:
+ Py_ssize_t i, n, idx
+ int64_t fv
+
+ n = indexer.shape[0]
+
+ fv = fill_value
+
+ with nogil:
+ for i in range(n):
+ idx = indexer[i]
+ if idx == -1:
+ out[i] = fv
+ else:
+ out[i] = values[idx]
+
+
+
+cdef inline take_2d_axis0_int8_int64_memview(int8_t[:, :] values,
+ int64_t[:] indexer,
+ int64_t[:, :] out,
+ fill_value=np.nan):
+ cdef:
+ Py_ssize_t i, j, k, n, idx
+ int64_t fv
+
+ n = len(indexer)
+ k = values.shape[1]
+
+ fv = fill_value
+
+ IF False:
+ cdef:
+ int64_t *v
+ int64_t *o
+
+ #GH3130
+ if (values.strides[1] == out.strides[1] and
+ values.strides[1] == sizeof(int64_t) and
+ sizeof(int64_t) * n >= 256):
+
+ for i in range(n):
+ idx = indexer[i]
+ if idx == -1:
+ for j in range(k):
+ out[i, j] = fv
+ else:
+ v = &values[idx, 0]
+ o = &out[i, 0]
+ memmove(o, v, <size_t>(sizeof(int64_t) * k))
+ return
+
+ for i in range(n):
+ idx = indexer[i]
+ if idx == -1:
+ for j in range(k):
+ out[i, j] = fv
+ else:
+ for j in range(k):
+ out[i, j] = values[idx, j]
+
+
+
+def take_2d_axis0_int8_int64(ndarray[int8_t, ndim=2] values,
+ ndarray[int64_t] indexer,
+ int64_t[:, :] out,
+ fill_value=np.nan):
+ if values.flags.writeable:
+ # We can call the memoryview version of the code
+ take_2d_axis0_int8_int64_memview(values, indexer, out,
+ fill_value=fill_value)
+ return
+
+ # We cannot use the memoryview version on readonly-buffers due to
+ # a limitation of Cython's typed memoryviews. Instead we can use
+ # the slightly slower Cython ndarray type directly.
+ cdef:
+ Py_ssize_t i, j, k, n, idx
+ int64_t fv
+
+ n = len(indexer)
+ k = values.shape[1]
+
+ fv = fill_value
+
+ IF False:
+ cdef:
+ int64_t *v
+ int64_t *o
+
+ #GH3130
+ if (values.strides[1] == out.strides[1] and
+ values.strides[1] == sizeof(int64_t) and
+ sizeof(int64_t) * n >= 256):
+
+ for i in range(n):
+ idx = indexer[i]
+ if idx == -1:
+ for j in range(k):
+ out[i, j] = fv
+ else:
+ v = &values[idx, 0]
+ o = &out[i, 0]
+ memmove(o, v, <size_t>(sizeof(int64_t) * k))
+ return
+
+ for i in range(n):
+ idx = indexer[i]
+ if idx == -1:
+ for j in range(k):
+ out[i, j] = fv
+ else:
+ for j in range(k):
+ out[i, j] = values[idx, j]
+
+
+
+cdef inline take_2d_axis1_int8_int64_memview(int8_t[:, :] values,
+ int64_t[:] indexer,
+ int64_t[:, :] out,
+ fill_value=np.nan):
+ cdef:
+ Py_ssize_t i, j, k, n, idx
+ int64_t fv
+
+ n = len(values)
+ k = len(indexer)
+
+ if n == 0 or k == 0:
+ return
+
+ fv = fill_value
+
+ for i in range(n):
+ for j in range(k):
+ idx = indexer[j]
+ if idx == -1:
+ out[i, j] = fv
+ else:
+ out[i, j] = values[i, idx]
+
+
+
+def take_2d_axis1_int8_int64(ndarray[int8_t, ndim=2] values,
+ ndarray[int64_t] indexer,
+ int64_t[:, :] out,
+ fill_value=np.nan):
+
+ if values.flags.writeable:
+ # We can call the memoryview version of the code
+ take_2d_axis1_int8_int64_memview(values, indexer, out,
+ fill_value=fill_value)
+ return
+
+ # We cannot use the memoryview version on readonly-buffers due to
+ # a limitation of Cython's typed memoryviews. Instead we can use
+ # the slightly slower Cython ndarray type directly.
+ cdef:
+ Py_ssize_t i, j, k, n, idx
+ int64_t fv
+
+ n = len(values)
+ k = len(indexer)
+
+ if n == 0 or k == 0:
+ return
+
+ fv = fill_value
+
+ for i in range(n):
+ for j in range(k):
+ idx = indexer[j]
+ if idx == -1:
+ out[i, j] = fv
+ else:
+ out[i, j] = values[i, idx]
+
+
+
+def take_2d_multi_int8_int64(ndarray[int8_t, ndim=2] values,
+ indexer,
+ ndarray[int64_t, ndim=2] out,
+ fill_value=np.nan):
+ cdef:
+ Py_ssize_t i, j, k, n, idx
+ ndarray[int64_t] idx0 = indexer[0]
+ ndarray[int64_t] idx1 = indexer[1]
+ int64_t fv
+
+ n = len(idx0)
+ k = len(idx1)
+
+ fv = fill_value
+ for i in range(n):
+ idx = idx0[i]
+ if idx == -1:
+ for j in range(k):
+ out[i, j] = fv
+ else:
+ for j in range(k):
+ if idx1[j] == -1:
+ out[i, j] = fv
+ else:
+ out[i, j] = values[idx, idx1[j]]
+
+
+cdef inline take_1d_int8_float64_memview(int8_t[:] values,
+ int64_t[:] indexer,
+ float64_t[:] out,
+ fill_value=np.nan):
+
+
+
+ cdef:
+ Py_ssize_t i, n, idx
+ float64_t fv
+
+ n = indexer.shape[0]
+
+ fv = fill_value
+
+ with nogil:
+ for i in range(n):
+ idx = indexer[i]
+ if idx == -1:
+ out[i] = fv
+ else:
+ out[i] = values[idx]
+
+
+
+def take_1d_int8_float64(ndarray[int8_t, ndim=1] values,
+ int64_t[:] indexer,
+ float64_t[:] out,
+ fill_value=np.nan):
+
+ if values.flags.writeable:
+ # We can call the memoryview version of the code
+ take_1d_int8_float64_memview(values, indexer, out,
+ fill_value=fill_value)
+ return
+
+ # We cannot use the memoryview version on readonly-buffers due to
+ # a limitation of Cython's typed memoryviews. Instead we can use
+ # the slightly slower Cython ndarray type directly.
+
+ cdef:
+ Py_ssize_t i, n, idx
+ float64_t fv
+
+ n = indexer.shape[0]
+
+ fv = fill_value
+
+ with nogil:
+ for i in range(n):
+ idx = indexer[i]
+ if idx == -1:
+ out[i] = fv
+ else:
+ out[i] = values[idx]
+
+
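The 1d kernels run their loop under "with nogil", and fv is declared with the output dtype, so the default fill_value=np.nan is usable whenever the output type is floating, as in this int8 -> float64 variant. Sketch, same assumption:

    import numpy as np
    from pandas._libs import algos  # assumed layout, as in upstream pandas

    values = np.array([10, 20, 30], dtype=np.int8)
    indexer = np.array([2, -1, 0], dtype=np.int64)
    out = np.empty(len(indexer), dtype=np.float64)

    algos.take_1d_int8_float64(values, indexer, out)  # default fill_value=np.nan
    # out -> array([30., nan, 10.])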
+
+cdef inline take_2d_axis0_int8_float64_memview(int8_t[:, :] values,
+ int64_t[:] indexer,
+ float64_t[:, :] out,
+ fill_value=np.nan):
+ cdef:
+ Py_ssize_t i, j, k, n, idx
+ float64_t fv
+
+ n = len(indexer)
+ k = values.shape[1]
+
+ fv = fill_value
+
+ IF False:
+ cdef:
+ float64_t *v
+ float64_t *o
+
+ #GH3130
+ if (values.strides[1] == out.strides[1] and
+ values.strides[1] == sizeof(float64_t) and
+ sizeof(float64_t) * n >= 256):
+
+ for i in range(n):
+ idx = indexer[i]
+ if idx == -1:
+ for j in range(k):
+ out[i, j] = fv
+ else:
+ v = &values[idx, 0]
+ o = &out[i, 0]
+ memmove(o, v, <size_t>(sizeof(float64_t) * k))
+ return
+
+ for i in range(n):
+ idx = indexer[i]
+ if idx == -1:
+ for j in range(k):
+ out[i, j] = fv
+ else:
+ for j in range(k):
+ out[i, j] = values[idx, j]
+
+
+
+def take_2d_axis0_int8_float64(ndarray[int8_t, ndim=2] values,
+ ndarray[int64_t] indexer,
+ float64_t[:, :] out,
+ fill_value=np.nan):
+ if values.flags.writeable:
+ # We can call the memoryview version of the code
+ take_2d_axis0_int8_float64_memview(values, indexer, out,
+ fill_value=fill_value)
+ return
+
+ # We cannot use the memoryview version on readonly-buffers due to
+ # a limitation of Cython's typed memoryviews. Instead we can use
+ # the slightly slower Cython ndarray type directly.
+ cdef:
+ Py_ssize_t i, j, k, n, idx
+ float64_t fv
+
+ n = len(indexer)
+ k = values.shape[1]
+
+ fv = fill_value
+
+ IF False:
+ cdef:
+ float64_t *v
+ float64_t *o
+
+ #GH3130
+ if (values.strides[1] == out.strides[1] and
+ values.strides[1] == sizeof(float64_t) and
+ sizeof(float64_t) * n >= 256):
+
+ for i in range(n):
+ idx = indexer[i]
+ if idx == -1:
+ for j in range(k):
+ out[i, j] = fv
+ else:
+ v = &values[idx, 0]
+ o = &out[i, 0]
+ memmove(o, v, <size_t>(sizeof(float64_t) * k))
+ return
+
+ for i in range(n):
+ idx = indexer[i]
+ if idx == -1:
+ for j in range(k):
+ out[i, j] = fv
+ else:
+ for j in range(k):
+ out[i, j] = values[idx, j]
+
+
+
+cdef inline take_2d_axis1_int8_float64_memview(int8_t[:, :] values,
+ int64_t[:] indexer,
+ float64_t[:, :] out,
+ fill_value=np.nan):
+ cdef:
+ Py_ssize_t i, j, k, n, idx
+ float64_t fv
+
+ n = len(values)
+ k = len(indexer)
+
+ if n == 0 or k == 0:
+ return
+
+ fv = fill_value
+
+ for i in range(n):
+ for j in range(k):
+ idx = indexer[j]
+ if idx == -1:
+ out[i, j] = fv
+ else:
+ out[i, j] = values[i, idx]
+
+
+
+def take_2d_axis1_int8_float64(ndarray[int8_t, ndim=2] values,
+ ndarray[int64_t] indexer,
+ float64_t[:, :] out,
+ fill_value=np.nan):
+
+ if values.flags.writeable:
+ # We can call the memoryview version of the code
+ take_2d_axis1_int8_float64_memview(values, indexer, out,
+ fill_value=fill_value)
+ return
+
+ # We cannot use the memoryview version on readonly-buffers due to
+ # a limitation of Cython's typed memoryviews. Instead we can use
+ # the slightly slower Cython ndarray type directly.
+ cdef:
+ Py_ssize_t i, j, k, n, idx
+ float64_t fv
+
+ n = len(values)
+ k = len(indexer)
+
+ if n == 0 or k == 0:
+ return
+
+ fv = fill_value
+
+ for i in range(n):
+ for j in range(k):
+ idx = indexer[j]
+ if idx == -1:
+ out[i, j] = fv
+ else:
+ out[i, j] = values[i, idx]
+
+
+
+def take_2d_multi_int8_float64(ndarray[int8_t, ndim=2] values,
+ indexer,
+ ndarray[float64_t, ndim=2] out,
+ fill_value=np.nan):
+ cdef:
+ Py_ssize_t i, j, k, n, idx
+ ndarray[int64_t] idx0 = indexer[0]
+ ndarray[int64_t] idx1 = indexer[1]
+ float64_t fv
+
+ n = len(idx0)
+ k = len(idx1)
+
+ fv = fill_value
+ for i in range(n):
+ idx = idx0[i]
+ if idx == -1:
+ for j in range(k):
+ out[i, j] = fv
+ else:
+ for j in range(k):
+ if idx1[j] == -1:
+ out[i, j] = fv
+ else:
+ out[i, j] = values[idx, idx1[j]]
+
+
+cdef inline take_1d_int16_int16_memview(int16_t[:] values,
+ int64_t[:] indexer,
+ int16_t[:] out,
+ fill_value=np.nan):
+
+
+
+ cdef:
+ Py_ssize_t i, n, idx
+ int16_t fv
+
+ n = indexer.shape[0]
+
+ fv = fill_value
+
+ with nogil:
+ for i in range(n):
+ idx = indexer[i]
+ if idx == -1:
+ out[i] = fv
+ else:
+ out[i] = values[idx]
+
+
+
+def take_1d_int16_int16(ndarray[int16_t, ndim=1] values,
+ int64_t[:] indexer,
+ int16_t[:] out,
+ fill_value=np.nan):
+
+ if values.flags.writeable:
+ # We can call the memoryview version of the code
+ take_1d_int16_int16_memview(values, indexer, out,
+ fill_value=fill_value)
+ return
+
+ # We cannot use the memoryview version on readonly-buffers due to
+ # a limitation of Cython's typed memoryviews. Instead we can use
+ # the slightly slower Cython ndarray type directly.
+
+ cdef:
+ Py_ssize_t i, n, idx
+ int16_t fv
+
+ n = indexer.shape[0]
+
+ fv = fill_value
+
+ with nogil:
+ for i in range(n):
+ idx = indexer[i]
+ if idx == -1:
+ out[i] = fv
+ else:
+ out[i] = values[idx]
+
+
+
+cdef inline take_2d_axis0_int16_int16_memview(int16_t[:, :] values,
+ int64_t[:] indexer,
+ int16_t[:, :] out,
+ fill_value=np.nan):
+ cdef:
+ Py_ssize_t i, j, k, n, idx
+ int16_t fv
+
+ n = len(indexer)
+ k = values.shape[1]
+
+ fv = fill_value
+
+ IF True:
+ cdef:
+ int16_t *v
+ int16_t *o
+
+ #GH3130
+ if (values.strides[1] == out.strides[1] and
+ values.strides[1] == sizeof(int16_t) and
+ sizeof(int16_t) * n >= 256):
+
+ for i in range(n):
+ idx = indexer[i]
+ if idx == -1:
+ for j in range(k):
+ out[i, j] = fv
+ else:
+ v = &values[idx, 0]
+ o = &out[i, 0]
+ memmove(o, v, <size_t>(sizeof(int16_t) * k))
+ return
+
+ for i in range(n):
+ idx = indexer[i]
+ if idx == -1:
+ for j in range(k):
+ out[i, j] = fv
+ else:
+ for j in range(k):
+ out[i, j] = values[idx, j]
+
+
+
+def take_2d_axis0_int16_int16(ndarray[int16_t, ndim=2] values,
+ ndarray[int64_t] indexer,
+ int16_t[:, :] out,
+ fill_value=np.nan):
+ if values.flags.writeable:
+ # We can call the memoryview version of the code
+ take_2d_axis0_int16_int16_memview(values, indexer, out,
+ fill_value=fill_value)
+ return
+
+ # We cannot use the memoryview version on readonly-buffers due to
+ # a limitation of Cython's typed memoryviews. Instead we can use
+ # the slightly slower Cython ndarray type directly.
+ cdef:
+ Py_ssize_t i, j, k, n, idx
+ int16_t fv
+
+ n = len(indexer)
+ k = values.shape[1]
+
+ fv = fill_value
+
+ IF True:
+ cdef:
+ int16_t *v
+ int16_t *o
+
+ #GH3130
+ if (values.strides[1] == out.strides[1] and
+ values.strides[1] == sizeof(int16_t) and
+ sizeof(int16_t) * n >= 256):
+
+ for i in range(n):
+ idx = indexer[i]
+ if idx == -1:
+ for j in range(k):
+ out[i, j] = fv
+ else:
+ v = &values[idx, 0]
+ o = &out[i, 0]
+ memmove(o, v, <size_t>(sizeof(int16_t) * k))
+ return
+
+ for i in range(n):
+ idx = indexer[i]
+ if idx == -1:
+ for j in range(k):
+ out[i, j] = fv
+ else:
+ for j in range(k):
+ out[i, j] = values[idx, j]
+
+
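In the same-dtype variants such as int16 -> int16 the generated code switches the compile-time block on (IF True:), so the memmove row copy from GH3130 is actually compiled in; the mixed-dtype variants earlier in the file compile it out (IF False:). The fast path only triggers when the stride checks pass and the take is large enough, and it must be indistinguishable from the scalar loop. A small consistency check, same pandas._libs.algos assumption as above:

    import numpy as np
    from pandas._libs import algos  # assumed layout, as in upstream pandas

    values = np.arange(12, dtype=np.int16).reshape(4, 3)
    indexer = np.array([3, 1, -1, 0], dtype=np.int64)
    out = np.empty((len(indexer), values.shape[1]), dtype=np.int16)

    algos.take_2d_axis0_int16_int16(values, indexer, out, fill_value=0)

    # reference result built with plain NumPy
    expected = values.take(np.where(indexer == -1, 0, indexer), axis=0)
    expected[indexer == -1] = 0
    assert (out == expected).all()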
+
+cdef inline take_2d_axis1_int16_int16_memview(int16_t[:, :] values,
+ int64_t[:] indexer,
+ int16_t[:, :] out,
+ fill_value=np.nan):
+ cdef:
+ Py_ssize_t i, j, k, n, idx
+ int16_t fv
+
+ n = len(values)
+ k = len(indexer)
+
+ if n == 0 or k == 0:
+ return
+
+ fv = fill_value
+
+ for i in range(n):
+ for j in range(k):
+ idx = indexer[j]
+ if idx == -1:
+ out[i, j] = fv
+ else:
+ out[i, j] = values[i, idx]
+
+
+
+def take_2d_axis1_int16_int16(ndarray[int16_t, ndim=2] values,
+ ndarray[int64_t] indexer,
+ int16_t[:, :] out,
+ fill_value=np.nan):
+
+ if values.flags.writeable:
+ # We can call the memoryview version of the code
+ take_2d_axis1_int16_int16_memview(values, indexer, out,
+ fill_value=fill_value)
+ return
+
+ # We cannot use the memoryview version on readonly-buffers due to
+ # a limitation of Cython's typed memoryviews. Instead we can use
+ # the slightly slower Cython ndarray type directly.
+ cdef:
+ Py_ssize_t i, j, k, n, idx
+ int16_t fv
+
+ n = len(values)
+ k = len(indexer)
+
+ if n == 0 or k == 0:
+ return
+
+ fv = fill_value
+
+ for i in range(n):
+ for j in range(k):
+ idx = indexer[j]
+ if idx == -1:
+ out[i, j] = fv
+ else:
+ out[i, j] = values[i, idx]
+
+
+
+def take_2d_multi_int16_int16(ndarray[int16_t, ndim=2] values,
+ indexer,
+ ndarray[int16_t, ndim=2] out,
+ fill_value=np.nan):
+ cdef:
+ Py_ssize_t i, j, k, n, idx
+ ndarray[int64_t] idx0 = indexer[0]
+ ndarray[int64_t] idx1 = indexer[1]
+ int16_t fv
+
+ n = len(idx0)
+ k = len(idx1)
+
+ fv = fill_value
+ for i in range(n):
+ idx = idx0[i]
+ if idx == -1:
+ for j in range(k):
+ out[i, j] = fv
+ else:
+ for j in range(k):
+ if idx1[j] == -1:
+ out[i, j] = fv
+ else:
+ out[i, j] = values[idx, idx1[j]]
+
+
+cdef inline take_1d_int16_int32_memview(int16_t[:] values,
+ int64_t[:] indexer,
+ int32_t[:] out,
+ fill_value=np.nan):
+
+
+
+ cdef:
+ Py_ssize_t i, n, idx
+ int32_t fv
+
+ n = indexer.shape[0]
+
+ fv = fill_value
+
+ with nogil:
+ for i in range(n):
+ idx = indexer[i]
+ if idx == -1:
+ out[i] = fv
+ else:
+ out[i] = values[idx]
+
+
+
+def take_1d_int16_int32(ndarray[int16_t, ndim=1] values,
+ int64_t[:] indexer,
+ int32_t[:] out,
+ fill_value=np.nan):
+
+ if values.flags.writeable:
+ # We can call the memoryview version of the code
+ take_1d_int16_int32_memview(values, indexer, out,
+ fill_value=fill_value)
+ return
+
+ # We cannot use the memoryview version on readonly-buffers due to
+ # a limitation of Cython's typed memoryviews. Instead we can use
+ # the slightly slower Cython ndarray type directly.
+
+ cdef:
+ Py_ssize_t i, n, idx
+ int32_t fv
+
+ n = indexer.shape[0]
+
+ fv = fill_value
+
+ with nogil:
+ for i in range(n):
+ idx = indexer[i]
+ if idx == -1:
+ out[i] = fv
+ else:
+ out[i] = values[idx]
+
+
+
+cdef inline take_2d_axis0_int16_int32_memview(int16_t[:, :] values,
+ int64_t[:] indexer,
+ int32_t[:, :] out,
+ fill_value=np.nan):
+ cdef:
+ Py_ssize_t i, j, k, n, idx
+ int32_t fv
+
+ n = len(indexer)
+ k = values.shape[1]
+
+ fv = fill_value
+
+ IF False:
+ cdef:
+ int32_t *v
+ int32_t *o
+
+ #GH3130
+ if (values.strides[1] == out.strides[1] and
+ values.strides[1] == sizeof(int32_t) and
+ sizeof(int32_t) * n >= 256):
+
+ for i in range(n):
+ idx = indexer[i]
+ if idx == -1:
+ for j in range(k):
+ out[i, j] = fv
+ else:
+ v = &values[idx, 0]
+ o = &out[i, 0]
+ memmove(o, v, <size_t>(sizeof(int32_t) * k))
+ return
+
+ for i in range(n):
+ idx = indexer[i]
+ if idx == -1:
+ for j in range(k):
+ out[i, j] = fv
+ else:
+ for j in range(k):
+ out[i, j] = values[idx, j]
+
+
+
+def take_2d_axis0_int16_int32(ndarray[int16_t, ndim=2] values,
+ ndarray[int64_t] indexer,
+ int32_t[:, :] out,
+ fill_value=np.nan):
+ if values.flags.writeable:
+ # We can call the memoryview version of the code
+ take_2d_axis0_int16_int32_memview(values, indexer, out,
+ fill_value=fill_value)
+ return
+
+ # We cannot use the memoryview version on readonly-buffers due to
+ # a limitation of Cython's typed memoryviews. Instead we can use
+ # the slightly slower Cython ndarray type directly.
+ cdef:
+ Py_ssize_t i, j, k, n, idx
+ int32_t fv
+
+ n = len(indexer)
+ k = values.shape[1]
+
+ fv = fill_value
+
+ IF False:
+ cdef:
+ int32_t *v
+ int32_t *o
+
+ #GH3130
+ if (values.strides[1] == out.strides[1] and
+ values.strides[1] == sizeof(int32_t) and
+ sizeof(int32_t) * n >= 256):
+
+ for i in range(n):
+ idx = indexer[i]
+ if idx == -1:
+ for j in range(k):
+ out[i, j] = fv
+ else:
+ v = &values[idx, 0]
+ o = &out[i, 0]
+ memmove(o, v, <size_t>(sizeof(int32_t) * k))
+ return
+
+ for i in range(n):
+ idx = indexer[i]
+ if idx == -1:
+ for j in range(k):
+ out[i, j] = fv
+ else:
+ for j in range(k):
+ out[i, j] = values[idx, j]
+
+
+
+cdef inline take_2d_axis1_int16_int32_memview(int16_t[:, :] values,
+ int64_t[:] indexer,
+ int32_t[:, :] out,
+ fill_value=np.nan):
+ cdef:
+ Py_ssize_t i, j, k, n, idx
+ int32_t fv
+
+ n = len(values)
+ k = len(indexer)
+
+ if n == 0 or k == 0:
+ return
+
+ fv = fill_value
+
+ for i in range(n):
+ for j in range(k):
+ idx = indexer[j]
+ if idx == -1:
+ out[i, j] = fv
+ else:
+ out[i, j] = values[i, idx]
+
+
+
+def take_2d_axis1_int16_int32(ndarray[int16_t, ndim=2] values,
+ ndarray[int64_t] indexer,
+ int32_t[:, :] out,
+ fill_value=np.nan):
+
+ if values.flags.writeable:
+ # We can call the memoryview version of the code
+ take_2d_axis1_int16_int32_memview(values, indexer, out,
+ fill_value=fill_value)
+ return
+
+ # We cannot use the memoryview version on readonly-buffers due to
+ # a limitation of Cython's typed memoryviews. Instead we can use
+ # the slightly slower Cython ndarray type directly.
+ cdef:
+ Py_ssize_t i, j, k, n, idx
+ int32_t fv
+
+ n = len(values)
+ k = len(indexer)
+
+ if n == 0 or k == 0:
+ return
+
+ fv = fill_value
+
+ for i in range(n):
+ for j in range(k):
+ idx = indexer[j]
+ if idx == -1:
+ out[i, j] = fv
+ else:
+ out[i, j] = values[i, idx]
+
+
+
+def take_2d_multi_int16_int32(ndarray[int16_t, ndim=2] values,
+ indexer,
+ ndarray[int32_t, ndim=2] out,
+ fill_value=np.nan):
+ cdef:
+ Py_ssize_t i, j, k, n, idx
+ ndarray[int64_t] idx0 = indexer[0]
+ ndarray[int64_t] idx1 = indexer[1]
+ int32_t fv
+
+ n = len(idx0)
+ k = len(idx1)
+
+ fv = fill_value
+ for i in range(n):
+ idx = idx0[i]
+ if idx == -1:
+ for j in range(k):
+ out[i, j] = fv
+ else:
+ for j in range(k):
+ if idx1[j] == -1:
+ out[i, j] = fv
+ else:
+ out[i, j] = values[idx, idx1[j]]
+
+
+cdef inline take_1d_int16_int64_memview(int16_t[:] values,
+ int64_t[:] indexer,
+ int64_t[:] out,
+ fill_value=np.nan):
+
+
+
+ cdef:
+ Py_ssize_t i, n, idx
+ int64_t fv
+
+ n = indexer.shape[0]
+
+ fv = fill_value
+
+ with nogil:
+ for i in range(n):
+ idx = indexer[i]
+ if idx == -1:
+ out[i] = fv
+ else:
+ out[i] = values[idx]
+
+
+
+def take_1d_int16_int64(ndarray[int16_t, ndim=1] values,
+ int64_t[:] indexer,
+ int64_t[:] out,
+ fill_value=np.nan):
+
+ if values.flags.writeable:
+ # We can call the memoryview version of the code
+ take_1d_int16_int64_memview(values, indexer, out,
+ fill_value=fill_value)
+ return
+
+ # We cannot use the memoryview version on readonly-buffers due to
+ # a limitation of Cython's typed memoryviews. Instead we can use
+ # the slightly slower Cython ndarray type directly.
+
+ cdef:
+ Py_ssize_t i, n, idx
+ int64_t fv
+
+ n = indexer.shape[0]
+
+ fv = fill_value
+
+ with nogil:
+ for i in range(n):
+ idx = indexer[i]
+ if idx == -1:
+ out[i] = fv
+ else:
+ out[i] = values[idx]
+
+
+
+cdef inline take_2d_axis0_int16_int64_memview(int16_t[:, :] values,
+ int64_t[:] indexer,
+ int64_t[:, :] out,
+ fill_value=np.nan):
+ cdef:
+ Py_ssize_t i, j, k, n, idx
+ int64_t fv
+
+ n = len(indexer)
+ k = values.shape[1]
+
+ fv = fill_value
+
+ IF False:
+ cdef:
+ int64_t *v
+ int64_t *o
+
+ #GH3130
+ if (values.strides[1] == out.strides[1] and
+ values.strides[1] == sizeof(int64_t) and
+ sizeof(int64_t) * n >= 256):
+
+ for i in range(n):
+ idx = indexer[i]
+ if idx == -1:
+ for j in range(k):
+ out[i, j] = fv
+ else:
+ v = &values[idx, 0]
+ o = &out[i, 0]
+ memmove(o, v, <size_t>(sizeof(int64_t) * k))
+ return
+
+ for i in range(n):
+ idx = indexer[i]
+ if idx == -1:
+ for j in range(k):
+ out[i, j] = fv
+ else:
+ for j in range(k):
+ out[i, j] = values[idx, j]
+
+
+
+def take_2d_axis0_int16_int64(ndarray[int16_t, ndim=2] values,
+ ndarray[int64_t] indexer,
+ int64_t[:, :] out,
+ fill_value=np.nan):
+ if values.flags.writeable:
+ # We can call the memoryview version of the code
+ take_2d_axis0_int16_int64_memview(values, indexer, out,
+ fill_value=fill_value)
+ return
+
+ # We cannot use the memoryview version on readonly-buffers due to
+ # a limitation of Cython's typed memoryviews. Instead we can use
+ # the slightly slower Cython ndarray type directly.
+ cdef:
+ Py_ssize_t i, j, k, n, idx
+ int64_t fv
+
+ n = len(indexer)
+ k = values.shape[1]
+
+ fv = fill_value
+
+ IF False:
+ cdef:
+ int64_t *v
+ int64_t *o
+
+ #GH3130
+ if (values.strides[1] == out.strides[1] and
+ values.strides[1] == sizeof(int64_t) and
+ sizeof(int64_t) * n >= 256):
+
+ for i in range(n):
+ idx = indexer[i]
+ if idx == -1:
+ for j in range(k):
+ out[i, j] = fv
+ else:
+ v = &values[idx, 0]
+ o = &out[i, 0]
+ memmove(o, v, <size_t>(sizeof(int64_t) * k))
+ return
+
+ for i in range(n):
+ idx = indexer[i]
+ if idx == -1:
+ for j in range(k):
+ out[i, j] = fv
+ else:
+ for j in range(k):
+ out[i, j] = values[idx, j]
+
+
+
+cdef inline take_2d_axis1_int16_int64_memview(int16_t[:, :] values,
+ int64_t[:] indexer,
+ int64_t[:, :] out,
+ fill_value=np.nan):
+ cdef:
+ Py_ssize_t i, j, k, n, idx
+ int64_t fv
+
+ n = len(values)
+ k = len(indexer)
+
+ if n == 0 or k == 0:
+ return
+
+ fv = fill_value
+
+ for i in range(n):
+ for j in range(k):
+ idx = indexer[j]
+ if idx == -1:
+ out[i, j] = fv
+ else:
+ out[i, j] = values[i, idx]
+
+
+
+def take_2d_axis1_int16_int64(ndarray[int16_t, ndim=2] values,
+ ndarray[int64_t] indexer,
+ int64_t[:, :] out,
+ fill_value=np.nan):
+
+ if values.flags.writeable:
+ # We can call the memoryview version of the code
+ take_2d_axis1_int16_int64_memview(values, indexer, out,
+ fill_value=fill_value)
+ return
+
+ # We cannot use the memoryview version on readonly-buffers due to
+ # a limitation of Cython's typed memoryviews. Instead we can use
+ # the slightly slower Cython ndarray type directly.
+ cdef:
+ Py_ssize_t i, j, k, n, idx
+ int64_t fv
+
+ n = len(values)
+ k = len(indexer)
+
+ if n == 0 or k == 0:
+ return
+
+ fv = fill_value
+
+ for i in range(n):
+ for j in range(k):
+ idx = indexer[j]
+ if idx == -1:
+ out[i, j] = fv
+ else:
+ out[i, j] = values[i, idx]
+
+
+
+def take_2d_multi_int16_int64(ndarray[int16_t, ndim=2] values,
+ indexer,
+ ndarray[int64_t, ndim=2] out,
+ fill_value=np.nan):
+ cdef:
+ Py_ssize_t i, j, k, n, idx
+ ndarray[int64_t] idx0 = indexer[0]
+ ndarray[int64_t] idx1 = indexer[1]
+ int64_t fv
+
+ n = len(idx0)
+ k = len(idx1)
+
+ fv = fill_value
+ for i in range(n):
+ idx = idx0[i]
+ if idx == -1:
+ for j in range(k):
+ out[i, j] = fv
+ else:
+ for j in range(k):
+ if idx1[j] == -1:
+ out[i, j] = fv
+ else:
+ out[i, j] = values[idx, idx1[j]]
+
+
+cdef inline take_1d_int16_float64_memview(int16_t[:] values,
+ int64_t[:] indexer,
+ float64_t[:] out,
+ fill_value=np.nan):
+
+
+
+ cdef:
+ Py_ssize_t i, n, idx
+ float64_t fv
+
+ n = indexer.shape[0]
+
+ fv = fill_value
+
+ with nogil:
+ for i in range(n):
+ idx = indexer[i]
+ if idx == -1:
+ out[i] = fv
+ else:
+ out[i] = values[idx]
+
+
+
+def take_1d_int16_float64(ndarray[int16_t, ndim=1] values,
+ int64_t[:] indexer,
+ float64_t[:] out,
+ fill_value=np.nan):
+
+ if values.flags.writeable:
+ # We can call the memoryview version of the code
+ take_1d_int16_float64_memview(values, indexer, out,
+ fill_value=fill_value)
+ return
+
+ # We cannot use the memoryview version on readonly-buffers due to
+ # a limitation of Cython's typed memoryviews. Instead we can use
+ # the slightly slower Cython ndarray type directly.
+
+ cdef:
+ Py_ssize_t i, n, idx
+ float64_t fv
+
+ n = indexer.shape[0]
+
+ fv = fill_value
+
+ with nogil:
+ for i in range(n):
+ idx = indexer[i]
+ if idx == -1:
+ out[i] = fv
+ else:
+ out[i] = values[idx]
+
+
+
+cdef inline take_2d_axis0_int16_float64_memview(int16_t[:, :] values,
+ int64_t[:] indexer,
+ float64_t[:, :] out,
+ fill_value=np.nan):
+ cdef:
+ Py_ssize_t i, j, k, n, idx
+ float64_t fv
+
+ n = len(indexer)
+ k = values.shape[1]
+
+ fv = fill_value
+
+ IF False:
+ cdef:
+ float64_t *v
+ float64_t *o
+
+ #GH3130
+ if (values.strides[1] == out.strides[1] and
+ values.strides[1] == sizeof(float64_t) and
+ sizeof(float64_t) * n >= 256):
+
+ for i in range(n):
+ idx = indexer[i]
+ if idx == -1:
+ for j in range(k):
+ out[i, j] = fv
+ else:
+ v = &values[idx, 0]
+ o = &out[i, 0]
+ memmove(o, v, <size_t>(sizeof(float64_t) * k))
+ return
+
+ for i in range(n):
+ idx = indexer[i]
+ if idx == -1:
+ for j in range(k):
+ out[i, j] = fv
+ else:
+ for j in range(k):
+ out[i, j] = values[idx, j]
+
+
+
+def take_2d_axis0_int16_float64(ndarray[int16_t, ndim=2] values,
+ ndarray[int64_t] indexer,
+ float64_t[:, :] out,
+ fill_value=np.nan):
+ if values.flags.writeable:
+ # We can call the memoryview version of the code
+ take_2d_axis0_int16_float64_memview(values, indexer, out,
+ fill_value=fill_value)
+ return
+
+ # We cannot use the memoryview version on readonly-buffers due to
+ # a limitation of Cython's typed memoryviews. Instead we can use
+ # the slightly slower Cython ndarray type directly.
+ cdef:
+ Py_ssize_t i, j, k, n, idx
+ float64_t fv
+
+ n = len(indexer)
+ k = values.shape[1]
+
+ fv = fill_value
+
+ IF False:
+ cdef:
+ float64_t *v
+ float64_t *o
+
+ #GH3130
+ if (values.strides[1] == out.strides[1] and
+ values.strides[1] == sizeof(float64_t) and
+ sizeof(float64_t) * n >= 256):
+
+ for i in range(n):
+ idx = indexer[i]
+ if idx == -1:
+ for j in range(k):
+ out[i, j] = fv
+ else:
+ v = &values[idx, 0]
+ o = &out[i, 0]
+ memmove(o, v, <size_t>(sizeof(float64_t) * k))
+ return
+
+ for i in range(n):
+ idx = indexer[i]
+ if idx == -1:
+ for j in range(k):
+ out[i, j] = fv
+ else:
+ for j in range(k):
+ out[i, j] = values[idx, j]
+
+
+
+cdef inline take_2d_axis1_int16_float64_memview(int16_t[:, :] values,
+ int64_t[:] indexer,
+ float64_t[:, :] out,
+ fill_value=np.nan):
+ cdef:
+ Py_ssize_t i, j, k, n, idx
+ float64_t fv
+
+ n = len(values)
+ k = len(indexer)
+
+ if n == 0 or k == 0:
+ return
+
+ fv = fill_value
+
+ for i in range(n):
+ for j in range(k):
+ idx = indexer[j]
+ if idx == -1:
+ out[i, j] = fv
+ else:
+ out[i, j] = values[i, idx]
+
+
+
+def take_2d_axis1_int16_float64(ndarray[int16_t, ndim=2] values,
+ ndarray[int64_t] indexer,
+ float64_t[:, :] out,
+ fill_value=np.nan):
+
+ if values.flags.writeable:
+ # We can call the memoryview version of the code
+ take_2d_axis1_int16_float64_memview(values, indexer, out,
+ fill_value=fill_value)
+ return
+
+ # We cannot use the memoryview version on readonly-buffers due to
+ # a limitation of Cython's typed memoryviews. Instead we can use
+ # the slightly slower Cython ndarray type directly.
+ cdef:
+ Py_ssize_t i, j, k, n, idx
+ float64_t fv
+
+ n = len(values)
+ k = len(indexer)
+
+ if n == 0 or k == 0:
+ return
+
+ fv = fill_value
+
+ for i in range(n):
+ for j in range(k):
+ idx = indexer[j]
+ if idx == -1:
+ out[i, j] = fv
+ else:
+ out[i, j] = values[i, idx]
+
+
+
+def take_2d_multi_int16_float64(ndarray[int16_t, ndim=2] values,
+ indexer,
+ ndarray[float64_t, ndim=2] out,
+ fill_value=np.nan):
+ cdef:
+ Py_ssize_t i, j, k, n, idx
+ ndarray[int64_t] idx0 = indexer[0]
+ ndarray[int64_t] idx1 = indexer[1]
+ float64_t fv
+
+ n = len(idx0)
+ k = len(idx1)
+
+ fv = fill_value
+ for i in range(n):
+ idx = idx0[i]
+ if idx == -1:
+ for j in range(k):
+ out[i, j] = fv
+ else:
+ for j in range(k):
+ if idx1[j] == -1:
+ out[i, j] = fv
+ else:
+ out[i, j] = values[idx, idx1[j]]
+
+
+cdef inline take_1d_int32_int32_memview(int32_t[:] values,
+ int64_t[:] indexer,
+ int32_t[:] out,
+ fill_value=np.nan):
+
+
+
+ cdef:
+ Py_ssize_t i, n, idx
+ int32_t fv
+
+ n = indexer.shape[0]
+
+ fv = fill_value
+
+ with nogil:
+ for i in range(n):
+ idx = indexer[i]
+ if idx == -1:
+ out[i] = fv
+ else:
+ out[i] = values[idx]
+
+
+
+def take_1d_int32_int32(ndarray[int32_t, ndim=1] values,
+ int64_t[:] indexer,
+ int32_t[:] out,
+ fill_value=np.nan):
+
+ if values.flags.writeable:
+ # We can call the memoryview version of the code
+ take_1d_int32_int32_memview(values, indexer, out,
+ fill_value=fill_value)
+ return
+
+ # We cannot use the memoryview version on readonly-buffers due to
+ # a limitation of Cython's typed memoryviews. Instead we can use
+ # the slightly slower Cython ndarray type directly.
+
+ cdef:
+ Py_ssize_t i, n, idx
+ int32_t fv
+
+ n = indexer.shape[0]
+
+ fv = fill_value
+
+ with nogil:
+ for i in range(n):
+ idx = indexer[i]
+ if idx == -1:
+ out[i] = fv
+ else:
+ out[i] = values[idx]
+
+
+
+cdef inline take_2d_axis0_int32_int32_memview(int32_t[:, :] values,
+ int64_t[:] indexer,
+ int32_t[:, :] out,
+ fill_value=np.nan):
+ cdef:
+ Py_ssize_t i, j, k, n, idx
+ int32_t fv
+
+ n = len(indexer)
+ k = values.shape[1]
+
+ fv = fill_value
+
+ IF True:
+ cdef:
+ int32_t *v
+ int32_t *o
+
+ #GH3130
+ if (values.strides[1] == out.strides[1] and
+ values.strides[1] == sizeof(int32_t) and
+ sizeof(int32_t) * n >= 256):
+
+ for i in range(n):
+ idx = indexer[i]
+ if idx == -1:
+ for j in range(k):
+ out[i, j] = fv
+ else:
+ v = &values[idx, 0]
+ o = &out[i, 0]
+ memmove(o, v, <size_t>(sizeof(int32_t) * k))
+ return
+
+ for i in range(n):
+ idx = indexer[i]
+ if idx == -1:
+ for j in range(k):
+ out[i, j] = fv
+ else:
+ for j in range(k):
+ out[i, j] = values[idx, j]
+
+
+
+def take_2d_axis0_int32_int32(ndarray[int32_t, ndim=2] values,
+ ndarray[int64_t] indexer,
+ int32_t[:, :] out,
+ fill_value=np.nan):
+ if values.flags.writeable:
+ # We can call the memoryview version of the code
+ take_2d_axis0_int32_int32_memview(values, indexer, out,
+ fill_value=fill_value)
+ return
+
+ # We cannot use the memoryview version on readonly-buffers due to
+ # a limitation of Cython's typed memoryviews. Instead we can use
+ # the slightly slower Cython ndarray type directly.
+ cdef:
+ Py_ssize_t i, j, k, n, idx
+ int32_t fv
+
+ n = len(indexer)
+ k = values.shape[1]
+
+ fv = fill_value
+
+ IF True:
+ cdef:
+ int32_t *v
+ int32_t *o
+
+ #GH3130
+ if (values.strides[1] == out.strides[1] and
+ values.strides[1] == sizeof(int32_t) and
+ sizeof(int32_t) * n >= 256):
+
+ for i in range(n):
+ idx = indexer[i]
+ if idx == -1:
+ for j in range(k):
+ out[i, j] = fv
+ else:
+ v = &values[idx, 0]
+ o = &out[i, 0]
+ memmove(o, v, <size_t>(sizeof(int32_t) * k))
+ return
+
+ for i in range(n):
+ idx = indexer[i]
+ if idx == -1:
+ for j in range(k):
+ out[i, j] = fv
+ else:
+ for j in range(k):
+ out[i, j] = values[idx, j]
+
+
+
+cdef inline take_2d_axis1_int32_int32_memview(int32_t[:, :] values,
+ int64_t[:] indexer,
+ int32_t[:, :] out,
+ fill_value=np.nan):
+ cdef:
+ Py_ssize_t i, j, k, n, idx
+ int32_t fv
+
+ n = len(values)
+ k = len(indexer)
+
+ if n == 0 or k == 0:
+ return
+
+ fv = fill_value
+
+ for i in range(n):
+ for j in range(k):
+ idx = indexer[j]
+ if idx == -1:
+ out[i, j] = fv
+ else:
+ out[i, j] = values[i, idx]
+
+
+
+def take_2d_axis1_int32_int32(ndarray[int32_t, ndim=2] values,
+ ndarray[int64_t] indexer,
+ int32_t[:, :] out,
+ fill_value=np.nan):
+
+ if values.flags.writeable:
+ # We can call the memoryview version of the code
+ take_2d_axis1_int32_int32_memview(values, indexer, out,
+ fill_value=fill_value)
+ return
+
+ # We cannot use the memoryview version on readonly-buffers due to
+ # a limitation of Cython's typed memoryviews. Instead we can use
+ # the slightly slower Cython ndarray type directly.
+ cdef:
+ Py_ssize_t i, j, k, n, idx
+ int32_t fv
+
+ n = len(values)
+ k = len(indexer)
+
+ if n == 0 or k == 0:
+ return
+
+ fv = fill_value
+
+ for i in range(n):
+ for j in range(k):
+ idx = indexer[j]
+ if idx == -1:
+ out[i, j] = fv
+ else:
+ out[i, j] = values[i, idx]
+
+
+
+def take_2d_multi_int32_int32(ndarray[int32_t, ndim=2] values,
+ indexer,
+ ndarray[int32_t, ndim=2] out,
+ fill_value=np.nan):
+ cdef:
+ Py_ssize_t i, j, k, n, idx
+ ndarray[int64_t] idx0 = indexer[0]
+ ndarray[int64_t] idx1 = indexer[1]
+ int32_t fv
+
+ n = len(idx0)
+ k = len(idx1)
+
+ fv = fill_value
+ for i in range(n):
+ idx = idx0[i]
+ if idx == -1:
+ for j in range(k):
+ out[i, j] = fv
+ else:
+ for j in range(k):
+ if idx1[j] == -1:
+ out[i, j] = fv
+ else:
+ out[i, j] = values[idx, idx1[j]]
+
+
+cdef inline take_1d_int32_int64_memview(int32_t[:] values,
+ int64_t[:] indexer,
+ int64_t[:] out,
+ fill_value=np.nan):
+
+
+
+ cdef:
+ Py_ssize_t i, n, idx
+ int64_t fv
+
+ n = indexer.shape[0]
+
+ fv = fill_value
+
+ with nogil:
+ for i in range(n):
+ idx = indexer[i]
+ if idx == -1:
+ out[i] = fv
+ else:
+ out[i] = values[idx]
+
+
+
+def take_1d_int32_int64(ndarray[int32_t, ndim=1] values,
+ int64_t[:] indexer,
+ int64_t[:] out,
+ fill_value=np.nan):
+
+ if values.flags.writeable:
+ # We can call the memoryview version of the code
+ take_1d_int32_int64_memview(values, indexer, out,
+ fill_value=fill_value)
+ return
+
+ # We cannot use the memoryview version on readonly-buffers due to
+ # a limitation of Cython's typed memoryviews. Instead we can use
+ # the slightly slower Cython ndarray type directly.
+
+ cdef:
+ Py_ssize_t i, n, idx
+ int64_t fv
+
+ n = indexer.shape[0]
+
+ fv = fill_value
+
+ with nogil:
+ for i in range(n):
+ idx = indexer[i]
+ if idx == -1:
+ out[i] = fv
+ else:
+ out[i] = values[idx]
+
+
+
+cdef inline take_2d_axis0_int32_int64_memview(int32_t[:, :] values,
+ int64_t[:] indexer,
+ int64_t[:, :] out,
+ fill_value=np.nan):
+ cdef:
+ Py_ssize_t i, j, k, n, idx
+ int64_t fv
+
+ n = len(indexer)
+ k = values.shape[1]
+
+ fv = fill_value
+
+ IF False:
+ cdef:
+ int64_t *v
+ int64_t *o
+
+ #GH3130
+ if (values.strides[1] == out.strides[1] and
+ values.strides[1] == sizeof(int64_t) and
+ sizeof(int64_t) * n >= 256):
+
+ for i in range(n):
+ idx = indexer[i]
+ if idx == -1:
+ for j in range(k):
+ out[i, j] = fv
+ else:
+ v = &values[idx, 0]
+ o = &out[i, 0]
+ memmove(o, v, <size_t>(sizeof(int64_t) * k))
+ return
+
+ for i in range(n):
+ idx = indexer[i]
+ if idx == -1:
+ for j in range(k):
+ out[i, j] = fv
+ else:
+ for j in range(k):
+ out[i, j] = values[idx, j]
+
+
+
+def take_2d_axis0_int32_int64(ndarray[int32_t, ndim=2] values,
+ ndarray[int64_t] indexer,
+ int64_t[:, :] out,
+ fill_value=np.nan):
+ if values.flags.writeable:
+ # We can call the memoryview version of the code
+ take_2d_axis0_int32_int64_memview(values, indexer, out,
+ fill_value=fill_value)
+ return
+
+ # We cannot use the memoryview version on readonly-buffers due to
+ # a limitation of Cython's typed memoryviews. Instead we can use
+ # the slightly slower Cython ndarray type directly.
+ cdef:
+ Py_ssize_t i, j, k, n, idx
+ int64_t fv
+
+ n = len(indexer)
+ k = values.shape[1]
+
+ fv = fill_value
+
+ IF False:
+ cdef:
+ int64_t *v
+ int64_t *o
+
+ #GH3130
+ if (values.strides[1] == out.strides[1] and
+ values.strides[1] == sizeof(int64_t) and
+ sizeof(int64_t) * n >= 256):
+
+ for i in range(n):
+ idx = indexer[i]
+ if idx == -1:
+ for j in range(k):
+ out[i, j] = fv
+ else:
+ v = &values[idx, 0]
+ o = &out[i, 0]
+ memmove(o, v, <size_t>(sizeof(int64_t) * k))
+ return
+
+ for i in range(n):
+ idx = indexer[i]
+ if idx == -1:
+ for j in range(k):
+ out[i, j] = fv
+ else:
+ for j in range(k):
+ out[i, j] = values[idx, j]
+
+
+
+cdef inline take_2d_axis1_int32_int64_memview(int32_t[:, :] values,
+ int64_t[:] indexer,
+ int64_t[:, :] out,
+ fill_value=np.nan):
+ cdef:
+ Py_ssize_t i, j, k, n, idx
+ int64_t fv
+
+ n = len(values)
+ k = len(indexer)
+
+ if n == 0 or k == 0:
+ return
+
+ fv = fill_value
+
+ for i in range(n):
+ for j in range(k):
+ idx = indexer[j]
+ if idx == -1:
+ out[i, j] = fv
+ else:
+ out[i, j] = values[i, idx]
+
+
+
+def take_2d_axis1_int32_int64(ndarray[int32_t, ndim=2] values,
+ ndarray[int64_t] indexer,
+ int64_t[:, :] out,
+ fill_value=np.nan):
+
+ if values.flags.writeable:
+ # We can call the memoryview version of the code
+ take_2d_axis1_int32_int64_memview(values, indexer, out,
+ fill_value=fill_value)
+ return
+
+ # We cannot use the memoryview version on readonly-buffers due to
+ # a limitation of Cython's typed memoryviews. Instead we can use
+ # the slightly slower Cython ndarray type directly.
+ cdef:
+ Py_ssize_t i, j, k, n, idx
+ int64_t fv
+
+ n = len(values)
+ k = len(indexer)
+
+ if n == 0 or k == 0:
+ return
+
+ fv = fill_value
+
+ for i in range(n):
+ for j in range(k):
+ idx = indexer[j]
+ if idx == -1:
+ out[i, j] = fv
+ else:
+ out[i, j] = values[i, idx]
+
+
+
+def take_2d_multi_int32_int64(ndarray[int32_t, ndim=2] values,
+ indexer,
+ ndarray[int64_t, ndim=2] out,
+ fill_value=np.nan):
+ cdef:
+ Py_ssize_t i, j, k, n, idx
+ ndarray[int64_t] idx0 = indexer[0]
+ ndarray[int64_t] idx1 = indexer[1]
+ int64_t fv
+
+ n = len(idx0)
+ k = len(idx1)
+
+ fv = fill_value
+ for i in range(n):
+ idx = idx0[i]
+ if idx == -1:
+ for j in range(k):
+ out[i, j] = fv
+ else:
+ for j in range(k):
+ if idx1[j] == -1:
+ out[i, j] = fv
+ else:
+ out[i, j] = values[idx, idx1[j]]
+
+
+cdef inline take_1d_int32_float64_memview(int32_t[:] values,
+ int64_t[:] indexer,
+ float64_t[:] out,
+ fill_value=np.nan):
+
+
+
+ cdef:
+ Py_ssize_t i, n, idx
+ float64_t fv
+
+ n = indexer.shape[0]
+
+ fv = fill_value
+
+ with nogil:
+ for i in range(n):
+ idx = indexer[i]
+ if idx == -1:
+ out[i] = fv
+ else:
+ out[i] = values[idx]
+
+
+
+def take_1d_int32_float64(ndarray[int32_t, ndim=1] values,
+ int64_t[:] indexer,
+ float64_t[:] out,
+ fill_value=np.nan):
+
+ if values.flags.writeable:
+ # We can call the memoryview version of the code
+ take_1d_int32_float64_memview(values, indexer, out,
+ fill_value=fill_value)
+ return
+
+ # We cannot use the memoryview version on readonly-buffers due to
+ # a limitation of Cython's typed memoryviews. Instead we can use
+ # the slightly slower Cython ndarray type directly.
+
+ cdef:
+ Py_ssize_t i, n, idx
+ float64_t fv
+
+ n = indexer.shape[0]
+
+ fv = fill_value
+
+ with nogil:
+ for i in range(n):
+ idx = indexer[i]
+ if idx == -1:
+ out[i] = fv
+ else:
+ out[i] = values[idx]
+
+
+
+cdef inline take_2d_axis0_int32_float64_memview(int32_t[:, :] values,
+ int64_t[:] indexer,
+ float64_t[:, :] out,
+ fill_value=np.nan):
+ cdef:
+ Py_ssize_t i, j, k, n, idx
+ float64_t fv
+
+ n = len(indexer)
+ k = values.shape[1]
+
+ fv = fill_value
+
+ IF False:
+ cdef:
+ float64_t *v
+ float64_t *o
+
+ #GH3130
+ if (values.strides[1] == out.strides[1] and
+ values.strides[1] == sizeof(float64_t) and
+ sizeof(float64_t) * n >= 256):
+
+ for i in range(n):
+ idx = indexer[i]
+ if idx == -1:
+ for j in range(k):
+ out[i, j] = fv
+ else:
+ v = &values[idx, 0]
+ o = &out[i, 0]
+ memmove(o, v, <size_t>(sizeof(float64_t) * k))
+ return
+
+ for i in range(n):
+ idx = indexer[i]
+ if idx == -1:
+ for j in range(k):
+ out[i, j] = fv
+ else:
+ for j in range(k):
+ out[i, j] = values[idx, j]
+
+
+
+def take_2d_axis0_int32_float64(ndarray[int32_t, ndim=2] values,
+ ndarray[int64_t] indexer,
+ float64_t[:, :] out,
+ fill_value=np.nan):
+ if values.flags.writeable:
+ # We can call the memoryview version of the code
+ take_2d_axis0_int32_float64_memview(values, indexer, out,
+ fill_value=fill_value)
+ return
+
+ # We cannot use the memoryview version on readonly-buffers due to
+ # a limitation of Cython's typed memoryviews. Instead we can use
+ # the slightly slower Cython ndarray type directly.
+ cdef:
+ Py_ssize_t i, j, k, n, idx
+ float64_t fv
+
+ n = len(indexer)
+ k = values.shape[1]
+
+ fv = fill_value
+
+ IF False:
+ cdef:
+ float64_t *v
+ float64_t *o
+
+ #GH3130
+ if (values.strides[1] == out.strides[1] and
+ values.strides[1] == sizeof(float64_t) and
+ sizeof(float64_t) * n >= 256):
+
+ for i in range(n):
+ idx = indexer[i]
+ if idx == -1:
+ for j in range(k):
+ out[i, j] = fv
+ else:
+ v = &values[idx, 0]
+ o = &out[i, 0]
+ memmove(o, v, <size_t>(sizeof(float64_t) * k))
+ return
+
+ for i in range(n):
+ idx = indexer[i]
+ if idx == -1:
+ for j in range(k):
+ out[i, j] = fv
+ else:
+ for j in range(k):
+ out[i, j] = values[idx, j]
+
+
+
+cdef inline take_2d_axis1_int32_float64_memview(int32_t[:, :] values,
+ int64_t[:] indexer,
+ float64_t[:, :] out,
+ fill_value=np.nan):
+ cdef:
+ Py_ssize_t i, j, k, n, idx
+ float64_t fv
+
+ n = len(values)
+ k = len(indexer)
+
+ if n == 0 or k == 0:
+ return
+
+ fv = fill_value
+
+ for i in range(n):
+ for j in range(k):
+ idx = indexer[j]
+ if idx == -1:
+ out[i, j] = fv
+ else:
+ out[i, j] = values[i, idx]
+
+
+
+def take_2d_axis1_int32_float64(ndarray[int32_t, ndim=2] values,
+ ndarray[int64_t] indexer,
+ float64_t[:, :] out,
+ fill_value=np.nan):
+
+ if values.flags.writeable:
+ # We can call the memoryview version of the code
+ take_2d_axis1_int32_float64_memview(values, indexer, out,
+ fill_value=fill_value)
+ return
+
+ # We cannot use the memoryview version on readonly-buffers due to
+ # a limitation of Cython's typed memoryviews. Instead we can use
+ # the slightly slower Cython ndarray type directly.
+ cdef:
+ Py_ssize_t i, j, k, n, idx
+ float64_t fv
+
+ n = len(values)
+ k = len(indexer)
+
+ if n == 0 or k == 0:
+ return
+
+ fv = fill_value
+
+ for i in range(n):
+ for j in range(k):
+ idx = indexer[j]
+ if idx == -1:
+ out[i, j] = fv
+ else:
+ out[i, j] = values[i, idx]
+
+
+
+def take_2d_multi_int32_float64(ndarray[int32_t, ndim=2] values,
+ indexer,
+ ndarray[float64_t, ndim=2] out,
+ fill_value=np.nan):
+ cdef:
+ Py_ssize_t i, j, k, n, idx
+ ndarray[int64_t] idx0 = indexer[0]
+ ndarray[int64_t] idx1 = indexer[1]
+ float64_t fv
+
+ n = len(idx0)
+ k = len(idx1)
+
+ fv = fill_value
+ for i in range(n):
+ idx = idx0[i]
+ if idx == -1:
+ for j in range(k):
+ out[i, j] = fv
+ else:
+ for j in range(k):
+ if idx1[j] == -1:
+ out[i, j] = fv
+ else:
+ out[i, j] = values[idx, idx1[j]]
+
+
+cdef inline take_1d_int64_int64_memview(int64_t[:] values,
+ int64_t[:] indexer,
+ int64_t[:] out,
+ fill_value=np.nan):
+
+
+
+ cdef:
+ Py_ssize_t i, n, idx
+ int64_t fv
+
+ n = indexer.shape[0]
+
+ fv = fill_value
+
+ with nogil:
+ for i in range(n):
+ idx = indexer[i]
+ if idx == -1:
+ out[i] = fv
+ else:
+ out[i] = values[idx]
+
+
+
+def take_1d_int64_int64(ndarray[int64_t, ndim=1] values,
+ int64_t[:] indexer,
+ int64_t[:] out,
+ fill_value=np.nan):
+
+ if values.flags.writeable:
+ # We can call the memoryview version of the code
+ take_1d_int64_int64_memview(values, indexer, out,
+ fill_value=fill_value)
+ return
+
+ # We cannot use the memoryview version on readonly-buffers due to
+ # a limitation of Cython's typed memoryviews. Instead we can use
+ # the slightly slower Cython ndarray type directly.
+
+ cdef:
+ Py_ssize_t i, n, idx
+ int64_t fv
+
+ n = indexer.shape[0]
+
+ fv = fill_value
+
+ with nogil:
+ for i in range(n):
+ idx = indexer[i]
+ if idx == -1:
+ out[i] = fv
+ else:
+ out[i] = values[idx]
+
+
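One caveat that applies to all of these kernels: fv = fill_value coerces the Python-level fill_value to the output dtype, so the np.nan default only works when the output is a float type; the integer-output variants need an integer-compatible fill_value. Sketch, same assumption as above:

    import numpy as np
    from pandas._libs import algos  # assumed layout, as in upstream pandas

    values = np.array([1, 2, 3], dtype=np.int64)
    indexer = np.array([0, -1, 2], dtype=np.int64)

    out_f = np.empty(len(indexer), dtype=np.float64)
    algos.take_1d_int64_float64(values, indexer, out_f)  # NaN default is fine here

    out_i = np.empty(len(indexer), dtype=np.int64)
    # the NaN default cannot be converted to int64, so give an integer fill
    algos.take_1d_int64_int64(values, indexer, out_i, fill_value=-9999)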
+
+cdef inline take_2d_axis0_int64_int64_memview(int64_t[:, :] values,
+ int64_t[:] indexer,
+ int64_t[:, :] out,
+ fill_value=np.nan):
+ cdef:
+ Py_ssize_t i, j, k, n, idx
+ int64_t fv
+
+ n = len(indexer)
+ k = values.shape[1]
+
+ fv = fill_value
+
+ IF True:
+ cdef:
+ int64_t *v
+ int64_t *o
+
+ #GH3130
+ if (values.strides[1] == out.strides[1] and
+ values.strides[1] == sizeof(int64_t) and
+ sizeof(int64_t) * n >= 256):
+
+ for i in range(n):
+ idx = indexer[i]
+ if idx == -1:
+ for j in range(k):
+ out[i, j] = fv
+ else:
+ v = &values[idx, 0]
+ o = &out[i, 0]
+ memmove(o, v, <size_t>(sizeof(int64_t) * k))
+ return
+
+ for i in range(n):
+ idx = indexer[i]
+ if idx == -1:
+ for j in range(k):
+ out[i, j] = fv
+ else:
+ for j in range(k):
+ out[i, j] = values[idx, j]
+
+
+
+def take_2d_axis0_int64_int64(ndarray[int64_t, ndim=2] values,
+ ndarray[int64_t] indexer,
+ int64_t[:, :] out,
+ fill_value=np.nan):
+ if values.flags.writeable:
+ # We can call the memoryview version of the code
+ take_2d_axis0_int64_int64_memview(values, indexer, out,
+ fill_value=fill_value)
+ return
+
+ # We cannot use the memoryview version on readonly-buffers due to
+ # a limitation of Cython's typed memoryviews. Instead we can use
+ # the slightly slower Cython ndarray type directly.
+ cdef:
+ Py_ssize_t i, j, k, n, idx
+ int64_t fv
+
+ n = len(indexer)
+ k = values.shape[1]
+
+ fv = fill_value
+
+ IF True:
+ cdef:
+ int64_t *v
+ int64_t *o
+
+ #GH3130
+ if (values.strides[1] == out.strides[1] and
+ values.strides[1] == sizeof(int64_t) and
+ sizeof(int64_t) * n >= 256):
+
+ for i in range(n):
+ idx = indexer[i]
+ if idx == -1:
+ for j in range(k):
+ out[i, j] = fv
+ else:
+ v = &values[idx, 0]
+ o = &out[i, 0]
+ memmove(o, v, <size_t>(sizeof(int64_t) * k))
+ return
+
+ for i in range(n):
+ idx = indexer[i]
+ if idx == -1:
+ for j in range(k):
+ out[i, j] = fv
+ else:
+ for j in range(k):
+ out[i, j] = values[idx, j]
+
+
+
+cdef inline take_2d_axis1_int64_int64_memview(int64_t[:, :] values,
+ int64_t[:] indexer,
+ int64_t[:, :] out,
+ fill_value=np.nan):
+ cdef:
+ Py_ssize_t i, j, k, n, idx
+ int64_t fv
+
+ n = len(values)
+ k = len(indexer)
+
+ if n == 0 or k == 0:
+ return
+
+ fv = fill_value
+
+ for i in range(n):
+ for j in range(k):
+ idx = indexer[j]
+ if idx == -1:
+ out[i, j] = fv
+ else:
+ out[i, j] = values[i, idx]
+
+
+
+def take_2d_axis1_int64_int64(ndarray[int64_t, ndim=2] values,
+ ndarray[int64_t] indexer,
+ int64_t[:, :] out,
+ fill_value=np.nan):
+
+ if values.flags.writeable:
+ # We can call the memoryview version of the code
+ take_2d_axis1_int64_int64_memview(values, indexer, out,
+ fill_value=fill_value)
+ return
+
+ # We cannot use the memoryview version on readonly-buffers due to
+ # a limitation of Cython's typed memoryviews. Instead we can use
+ # the slightly slower Cython ndarray type directly.
+ cdef:
+ Py_ssize_t i, j, k, n, idx
+ int64_t fv
+
+ n = len(values)
+ k = len(indexer)
+
+ if n == 0 or k == 0:
+ return
+
+ fv = fill_value
+
+ for i in range(n):
+ for j in range(k):
+ idx = indexer[j]
+ if idx == -1:
+ out[i, j] = fv
+ else:
+ out[i, j] = values[i, idx]
+
+
+
+def take_2d_multi_int64_int64(ndarray[int64_t, ndim=2] values,
+ indexer,
+ ndarray[int64_t, ndim=2] out,
+ fill_value=np.nan):
+ cdef:
+ Py_ssize_t i, j, k, n, idx
+ ndarray[int64_t] idx0 = indexer[0]
+ ndarray[int64_t] idx1 = indexer[1]
+ int64_t fv
+
+ n = len(idx0)
+ k = len(idx1)
+
+ fv = fill_value
+ for i in range(n):
+ idx = idx0[i]
+ if idx == -1:
+ for j in range(k):
+ out[i, j] = fv
+ else:
+ for j in range(k):
+ if idx1[j] == -1:
+ out[i, j] = fv
+ else:
+ out[i, j] = values[idx, idx1[j]]
+
+
+cdef inline take_1d_int64_float64_memview(int64_t[:] values,
+ int64_t[:] indexer,
+ float64_t[:] out,
+ fill_value=np.nan):
+
+
+
+ cdef:
+ Py_ssize_t i, n, idx
+ float64_t fv
+
+ n = indexer.shape[0]
+
+ fv = fill_value
+
+ with nogil:
+ for i in range(n):
+ idx = indexer[i]
+ if idx == -1:
+ out[i] = fv
+ else:
+ out[i] = values[idx]
+
+
+
+def take_1d_int64_float64(ndarray[int64_t, ndim=1] values,
+ int64_t[:] indexer,
+ float64_t[:] out,
+ fill_value=np.nan):
+
+ if values.flags.writeable:
+ # We can call the memoryview version of the code
+ take_1d_int64_float64_memview(values, indexer, out,
+ fill_value=fill_value)
+ return
+
+ # We cannot use the memoryview version on readonly-buffers due to
+ # a limitation of Cython's typed memoryviews. Instead we can use
+ # the slightly slower Cython ndarray type directly.
+
+ cdef:
+ Py_ssize_t i, n, idx
+ float64_t fv
+
+ n = indexer.shape[0]
+
+ fv = fill_value
+
+ with nogil:
+ for i in range(n):
+ idx = indexer[i]
+ if idx == -1:
+ out[i] = fv
+ else:
+ out[i] = values[idx]
+
+
+
+cdef inline take_2d_axis0_int64_float64_memview(int64_t[:, :] values,
+ int64_t[:] indexer,
+ float64_t[:, :] out,
+ fill_value=np.nan):
+ cdef:
+ Py_ssize_t i, j, k, n, idx
+ float64_t fv
+
+ n = len(indexer)
+ k = values.shape[1]
+
+ fv = fill_value
+
+ IF False:
+ cdef:
+ float64_t *v
+ float64_t *o
+
+ #GH3130
+ if (values.strides[1] == out.strides[1] and
+ values.strides[1] == sizeof(float64_t) and
+ sizeof(float64_t) * n >= 256):
+
+ for i in range(n):
+ idx = indexer[i]
+ if idx == -1:
+ for j in range(k):
+ out[i, j] = fv
+ else:
+ v = &values[idx, 0]
+ o = &out[i, 0]
+ memmove(o, v, <size_t>(sizeof(float64_t) * k))
+ return
+
+ for i in range(n):
+ idx = indexer[i]
+ if idx == -1:
+ for j in range(k):
+ out[i, j] = fv
+ else:
+ for j in range(k):
+ out[i, j] = values[idx, j]
+
+
+
+def take_2d_axis0_int64_float64(ndarray[int64_t, ndim=2] values,
+ ndarray[int64_t] indexer,
+ float64_t[:, :] out,
+ fill_value=np.nan):
+ if values.flags.writeable:
+ # We can call the memoryview version of the code
+ take_2d_axis0_int64_float64_memview(values, indexer, out,
+ fill_value=fill_value)
+ return
+
+ # We cannot use the memoryview version on readonly-buffers due to
+ # a limitation of Cython's typed memoryviews. Instead we can use
+ # the slightly slower Cython ndarray type directly.
+ cdef:
+ Py_ssize_t i, j, k, n, idx
+ float64_t fv
+
+ n = len(indexer)
+ k = values.shape[1]
+
+ fv = fill_value
+
+ IF False:
+ cdef:
+ float64_t *v
+ float64_t *o
+
+ #GH3130
+ if (values.strides[1] == out.strides[1] and
+ values.strides[1] == sizeof(float64_t) and
+ sizeof(float64_t) * n >= 256):
+
+ for i in range(n):
+ idx = indexer[i]
+ if idx == -1:
+ for j in range(k):
+ out[i, j] = fv
+ else:
+ v = &values[idx, 0]
+ o = &out[i, 0]
+ memmove(o, v, <size_t>(sizeof(float64_t) * k))
+ return
+
+ for i in range(n):
+ idx = indexer[i]
+ if idx == -1:
+ for j in range(k):
+ out[i, j] = fv
+ else:
+ for j in range(k):
+ out[i, j] = values[idx, j]
+
+
+
+cdef inline take_2d_axis1_int64_float64_memview(int64_t[:, :] values,
+ int64_t[:] indexer,
+ float64_t[:, :] out,
+ fill_value=np.nan):
+ cdef:
+ Py_ssize_t i, j, k, n, idx
+ float64_t fv
+
+ n = len(values)
+ k = len(indexer)
+
+ if n == 0 or k == 0:
+ return
+
+ fv = fill_value
+
+ for i in range(n):
+ for j in range(k):
+ idx = indexer[j]
+ if idx == -1:
+ out[i, j] = fv
+ else:
+ out[i, j] = values[i, idx]
+
+
+
+def take_2d_axis1_int64_float64(ndarray[int64_t, ndim=2] values,
+ ndarray[int64_t] indexer,
+ float64_t[:, :] out,
+ fill_value=np.nan):
+
+ if values.flags.writeable:
+ # We can call the memoryview version of the code
+ take_2d_axis1_int64_float64_memview(values, indexer, out,
+ fill_value=fill_value)
+ return
+
+ # We cannot use the memoryview version on readonly-buffers due to
+ # a limitation of Cython's typed memoryviews. Instead we can use
+ # the slightly slower Cython ndarray type directly.
+ cdef:
+ Py_ssize_t i, j, k, n, idx
+ float64_t fv
+
+ n = len(values)
+ k = len(indexer)
+
+ if n == 0 or k == 0:
+ return
+
+ fv = fill_value
+
+ for i in range(n):
+ for j in range(k):
+ idx = indexer[j]
+ if idx == -1:
+ out[i, j] = fv
+ else:
+ out[i, j] = values[i, idx]
+
+
+
+def take_2d_multi_int64_float64(ndarray[int64_t, ndim=2] values,
+ indexer,
+ ndarray[float64_t, ndim=2] out,
+ fill_value=np.nan):
+ cdef:
+ Py_ssize_t i, j, k, n, idx
+ ndarray[int64_t] idx0 = indexer[0]
+ ndarray[int64_t] idx1 = indexer[1]
+ float64_t fv
+
+ n = len(idx0)
+ k = len(idx1)
+
+ fv = fill_value
+ for i in range(n):
+ idx = idx0[i]
+ if idx == -1:
+ for j in range(k):
+ out[i, j] = fv
+ else:
+ for j in range(k):
+ if idx1[j] == -1:
+ out[i, j] = fv
+ else:
+ out[i, j] = values[idx, idx1[j]]
+
+
+cdef inline take_1d_float32_float32_memview(float32_t[:] values,
+ int64_t[:] indexer,
+ float32_t[:] out,
+ fill_value=np.nan):
+
+
+
+ cdef:
+ Py_ssize_t i, n, idx
+ float32_t fv
+
+ n = indexer.shape[0]
+
+ fv = fill_value
+
+ with nogil:
+ for i in range(n):
+ idx = indexer[i]
+ if idx == -1:
+ out[i] = fv
+ else:
+ out[i] = values[idx]
+
+
+
+def take_1d_float32_float32(ndarray[float32_t, ndim=1] values,
+ int64_t[:] indexer,
+ float32_t[:] out,
+ fill_value=np.nan):
+
+ if values.flags.writeable:
+ # We can call the memoryview version of the code
+ take_1d_float32_float32_memview(values, indexer, out,
+ fill_value=fill_value)
+ return
+
+ # We cannot use the memoryview version on readonly-buffers due to
+ # a limitation of Cython's typed memoryviews. Instead we can use
+ # the slightly slower Cython ndarray type directly.
+
+ cdef:
+ Py_ssize_t i, n, idx
+ float32_t fv
+
+ n = indexer.shape[0]
+
+ fv = fill_value
+
+ with nogil:
+ for i in range(n):
+ idx = indexer[i]
+ if idx == -1:
+ out[i] = fv
+ else:
+ out[i] = values[idx]
+
+
+
+cdef inline take_2d_axis0_float32_float32_memview(float32_t[:, :] values,
+ int64_t[:] indexer,
+ float32_t[:, :] out,
+ fill_value=np.nan):
+ cdef:
+ Py_ssize_t i, j, k, n, idx
+ float32_t fv
+
+ n = len(indexer)
+ k = values.shape[1]
+
+ fv = fill_value
+
+ IF True:
+ cdef:
+ float32_t *v
+ float32_t *o
+
+ #GH3130
+ if (values.strides[1] == out.strides[1] and
+ values.strides[1] == sizeof(float32_t) and
+ sizeof(float32_t) * n >= 256):
+
+ for i in range(n):
+ idx = indexer[i]
+ if idx == -1:
+ for j in range(k):
+ out[i, j] = fv
+ else:
+ v = &values[idx, 0]
+ o = &out[i, 0]
+ memmove(o, v, <size_t>(sizeof(float32_t) * k))
+ return
+
+ for i in range(n):
+ idx = indexer[i]
+ if idx == -1:
+ for j in range(k):
+ out[i, j] = fv
+ else:
+ for j in range(k):
+ out[i, j] = values[idx, j]
+
+
+
+def take_2d_axis0_float32_float32(ndarray[float32_t, ndim=2] values,
+ ndarray[int64_t] indexer,
+ float32_t[:, :] out,
+ fill_value=np.nan):
+ if values.flags.writeable:
+ # We can call the memoryview version of the code
+ take_2d_axis0_float32_float32_memview(values, indexer, out,
+ fill_value=fill_value)
+ return
+
+ # We cannot use the memoryview version on readonly-buffers due to
+ # a limitation of Cython's typed memoryviews. Instead we can use
+ # the slightly slower Cython ndarray type directly.
+ cdef:
+ Py_ssize_t i, j, k, n, idx
+ float32_t fv
+
+ n = len(indexer)
+ k = values.shape[1]
+
+ fv = fill_value
+
+ IF True:
+ cdef:
+ float32_t *v
+ float32_t *o
+
+ #GH3130
+ if (values.strides[1] == out.strides[1] and
+ values.strides[1] == sizeof(float32_t) and
+ sizeof(float32_t) * n >= 256):
+
+ for i in range(n):
+ idx = indexer[i]
+ if idx == -1:
+ for j in range(k):
+ out[i, j] = fv
+ else:
+ v = &values[idx, 0]
+ o = &out[i, 0]
+ memmove(o, v, <size_t>(sizeof(float32_t) * k))
+ return
+
+ for i in range(n):
+ idx = indexer[i]
+ if idx == -1:
+ for j in range(k):
+ out[i, j] = fv
+ else:
+ for j in range(k):
+ out[i, j] = values[idx, j]
+
+
+
+cdef inline take_2d_axis1_float32_float32_memview(float32_t[:, :] values,
+ int64_t[:] indexer,
+ float32_t[:, :] out,
+ fill_value=np.nan):
+ cdef:
+ Py_ssize_t i, j, k, n, idx
+ float32_t fv
+
+ n = len(values)
+ k = len(indexer)
+
+ if n == 0 or k == 0:
+ return
+
+ fv = fill_value
+
+ for i in range(n):
+ for j in range(k):
+ idx = indexer[j]
+ if idx == -1:
+ out[i, j] = fv
+ else:
+ out[i, j] = values[i, idx]
+
+
+
+def take_2d_axis1_float32_float32(ndarray[float32_t, ndim=2] values,
+ ndarray[int64_t] indexer,
+ float32_t[:, :] out,
+ fill_value=np.nan):
+
+ if values.flags.writeable:
+ # We can call the memoryview version of the code
+ take_2d_axis1_float32_float32_memview(values, indexer, out,
+ fill_value=fill_value)
+ return
+
+ # We cannot use the memoryview version on readonly-buffers due to
+ # a limitation of Cython's typed memoryviews. Instead we can use
+ # the slightly slower Cython ndarray type directly.
+ cdef:
+ Py_ssize_t i, j, k, n, idx
+ float32_t fv
+
+ n = len(values)
+ k = len(indexer)
+
+ if n == 0 or k == 0:
+ return
+
+ fv = fill_value
+
+ for i in range(n):
+ for j in range(k):
+ idx = indexer[j]
+ if idx == -1:
+ out[i, j] = fv
+ else:
+ out[i, j] = values[i, idx]
+
+
+
+def take_2d_multi_float32_float32(ndarray[float32_t, ndim=2] values,
+ indexer,
+ ndarray[float32_t, ndim=2] out,
+ fill_value=np.nan):
+ cdef:
+ Py_ssize_t i, j, k, n, idx
+ ndarray[int64_t] idx0 = indexer[0]
+ ndarray[int64_t] idx1 = indexer[1]
+ float32_t fv
+
+ n = len(idx0)
+ k = len(idx1)
+
+ fv = fill_value
+ for i in range(n):
+ idx = idx0[i]
+ if idx == -1:
+ for j in range(k):
+ out[i, j] = fv
+ else:
+ for j in range(k):
+ if idx1[j] == -1:
+ out[i, j] = fv
+ else:
+ out[i, j] = values[idx, idx1[j]]
+
+
+cdef inline take_1d_float32_float64_memview(float32_t[:] values,
+ int64_t[:] indexer,
+ float64_t[:] out,
+ fill_value=np.nan):
+
+
+
+ cdef:
+ Py_ssize_t i, n, idx
+ float64_t fv
+
+ n = indexer.shape[0]
+
+ fv = fill_value
+
+ with nogil:
+ for i in range(n):
+ idx = indexer[i]
+ if idx == -1:
+ out[i] = fv
+ else:
+ out[i] = values[idx]
+
+
+
+def take_1d_float32_float64(ndarray[float32_t, ndim=1] values,
+ int64_t[:] indexer,
+ float64_t[:] out,
+ fill_value=np.nan):
+
+ if values.flags.writeable:
+ # We can call the memoryview version of the code
+ take_1d_float32_float64_memview(values, indexer, out,
+ fill_value=fill_value)
+ return
+
+ # We cannot use the memoryview version on readonly-buffers due to
+ # a limitation of Cython's typed memoryviews. Instead we can use
+ # the slightly slower Cython ndarray type directly.
+
+ cdef:
+ Py_ssize_t i, n, idx
+ float64_t fv
+
+ n = indexer.shape[0]
+
+ fv = fill_value
+
+ with nogil:
+ for i in range(n):
+ idx = indexer[i]
+ if idx == -1:
+ out[i] = fv
+ else:
+ out[i] = values[idx]
+
+
+
+cdef inline take_2d_axis0_float32_float64_memview(float32_t[:, :] values,
+ int64_t[:] indexer,
+ float64_t[:, :] out,
+ fill_value=np.nan):
+ cdef:
+ Py_ssize_t i, j, k, n, idx
+ float64_t fv
+
+ n = len(indexer)
+ k = values.shape[1]
+
+ fv = fill_value
+
+ IF False:
+ cdef:
+ float64_t *v
+ float64_t *o
+
+ #GH3130
+ if (values.strides[1] == out.strides[1] and
+ values.strides[1] == sizeof(float64_t) and
+ sizeof(float64_t) * n >= 256):
+
+ for i in range(n):
+ idx = indexer[i]
+ if idx == -1:
+ for j in range(k):
+ out[i, j] = fv
+ else:
+ v = &values[idx, 0]
+ o = &out[i, 0]
+ memmove(o, v, <size_t>(sizeof(float64_t) * k))
+ return
+
+ for i in range(n):
+ idx = indexer[i]
+ if idx == -1:
+ for j in range(k):
+ out[i, j] = fv
+ else:
+ for j in range(k):
+ out[i, j] = values[idx, j]
+
+
+
+def take_2d_axis0_float32_float64(ndarray[float32_t, ndim=2] values,
+ ndarray[int64_t] indexer,
+ float64_t[:, :] out,
+ fill_value=np.nan):
+ if values.flags.writeable:
+ # We can call the memoryview version of the code
+ take_2d_axis0_float32_float64_memview(values, indexer, out,
+ fill_value=fill_value)
+ return
+
+ # We cannot use the memoryview version on readonly-buffers due to
+ # a limitation of Cython's typed memoryviews. Instead we can use
+ # the slightly slower Cython ndarray type directly.
+ cdef:
+ Py_ssize_t i, j, k, n, idx
+ float64_t fv
+
+ n = len(indexer)
+ k = values.shape[1]
+
+ fv = fill_value
+
+ IF False:
+ cdef:
+ float64_t *v
+ float64_t *o
+
+ #GH3130
+ if (values.strides[1] == out.strides[1] and
+ values.strides[1] == sizeof(float64_t) and
+ sizeof(float64_t) * n >= 256):
+
+ for i in range(n):
+ idx = indexer[i]
+ if idx == -1:
+ for j in range(k):
+ out[i, j] = fv
+ else:
+ v = &values[idx, 0]
+ o = &out[i, 0]
+ memmove(o, v, <size_t>(sizeof(float64_t) * k))
+ return
+
+ for i in range(n):
+ idx = indexer[i]
+ if idx == -1:
+ for j in range(k):
+ out[i, j] = fv
+ else:
+ for j in range(k):
+ out[i, j] = values[idx, j]
+
+
+
+cdef inline take_2d_axis1_float32_float64_memview(float32_t[:, :] values,
+ int64_t[:] indexer,
+ float64_t[:, :] out,
+ fill_value=np.nan):
+ cdef:
+ Py_ssize_t i, j, k, n, idx
+ float64_t fv
+
+ n = len(values)
+ k = len(indexer)
+
+ if n == 0 or k == 0:
+ return
+
+ fv = fill_value
+
+ for i in range(n):
+ for j in range(k):
+ idx = indexer[j]
+ if idx == -1:
+ out[i, j] = fv
+ else:
+ out[i, j] = values[i, idx]
+
+
+
+def take_2d_axis1_float32_float64(ndarray[float32_t, ndim=2] values,
+ ndarray[int64_t] indexer,
+ float64_t[:, :] out,
+ fill_value=np.nan):
+
+ if values.flags.writeable:
+ # We can call the memoryview version of the code
+ take_2d_axis1_float32_float64_memview(values, indexer, out,
+ fill_value=fill_value)
+ return
+
+ # We cannot use the memoryview version on readonly-buffers due to
+ # a limitation of Cython's typed memoryviews. Instead we can use
+ # the slightly slower Cython ndarray type directly.
+ cdef:
+ Py_ssize_t i, j, k, n, idx
+ float64_t fv
+
+ n = len(values)
+ k = len(indexer)
+
+ if n == 0 or k == 0:
+ return
+
+ fv = fill_value
+
+ for i in range(n):
+ for j in range(k):
+ idx = indexer[j]
+ if idx == -1:
+ out[i, j] = fv
+ else:
+ out[i, j] = values[i, idx]
+
+
+
+def take_2d_multi_float32_float64(ndarray[float32_t, ndim=2] values,
+ indexer,
+ ndarray[float64_t, ndim=2] out,
+ fill_value=np.nan):
+ cdef:
+ Py_ssize_t i, j, k, n, idx
+ ndarray[int64_t] idx0 = indexer[0]
+ ndarray[int64_t] idx1 = indexer[1]
+ float64_t fv
+
+ n = len(idx0)
+ k = len(idx1)
+
+ fv = fill_value
+ for i in range(n):
+ idx = idx0[i]
+ if idx == -1:
+ for j in range(k):
+ out[i, j] = fv
+ else:
+ for j in range(k):
+ if idx1[j] == -1:
+ out[i, j] = fv
+ else:
+ out[i, j] = values[idx, idx1[j]]
+
+
+cdef inline take_1d_float64_float64_memview(float64_t[:] values,
+ int64_t[:] indexer,
+ float64_t[:] out,
+ fill_value=np.nan):
+
+
+
+ cdef:
+ Py_ssize_t i, n, idx
+ float64_t fv
+
+ n = indexer.shape[0]
+
+ fv = fill_value
+
+ with nogil:
+ for i in range(n):
+ idx = indexer[i]
+ if idx == -1:
+ out[i] = fv
+ else:
+ out[i] = values[idx]
+
+
+
+def take_1d_float64_float64(ndarray[float64_t, ndim=1] values,
+ int64_t[:] indexer,
+ float64_t[:] out,
+ fill_value=np.nan):
+
+ if values.flags.writeable:
+ # We can call the memoryview version of the code
+ take_1d_float64_float64_memview(values, indexer, out,
+ fill_value=fill_value)
+ return
+
+ # We cannot use the memoryview version on readonly-buffers due to
+ # a limitation of Cython's typed memoryviews. Instead we can use
+ # the slightly slower Cython ndarray type directly.
+
+ cdef:
+ Py_ssize_t i, n, idx
+ float64_t fv
+
+ n = indexer.shape[0]
+
+ fv = fill_value
+
+ with nogil:
+ for i in range(n):
+ idx = indexer[i]
+ if idx == -1:
+ out[i] = fv
+ else:
+ out[i] = values[idx]
+
+
+
+cdef inline take_2d_axis0_float64_float64_memview(float64_t[:, :] values,
+ int64_t[:] indexer,
+ float64_t[:, :] out,
+ fill_value=np.nan):
+ cdef:
+ Py_ssize_t i, j, k, n, idx
+ float64_t fv
+
+ n = len(indexer)
+ k = values.shape[1]
+
+ fv = fill_value
+
+ IF True:
+ cdef:
+ float64_t *v
+ float64_t *o
+
+ #GH3130
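+        # fast path: when both buffers are contiguous along axis 1 and rows
+        # are wide enough, copy each row with a single memmove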
+ if (values.strides[1] == out.strides[1] and
+ values.strides[1] == sizeof(float64_t) and
+ sizeof(float64_t) * n >= 256):
+
+ for i in range(n):
+ idx = indexer[i]
+ if idx == -1:
+ for j in range(k):
+ out[i, j] = fv
+ else:
+ v = &values[idx, 0]
+ o = &out[i, 0]
+ memmove(o, v, <size_t>(sizeof(float64_t) * k))
+ return
+
+ for i in range(n):
+ idx = indexer[i]
+ if idx == -1:
+ for j in range(k):
+ out[i, j] = fv
+ else:
+ for j in range(k):
+ out[i, j] = values[idx, j]
+
+
+
+def take_2d_axis0_float64_float64(ndarray[float64_t, ndim=2] values,
+ ndarray[int64_t] indexer,
+ float64_t[:, :] out,
+ fill_value=np.nan):
+ if values.flags.writeable:
+ # We can call the memoryview version of the code
+ take_2d_axis0_float64_float64_memview(values, indexer, out,
+ fill_value=fill_value)
+ return
+
+ # We cannot use the memoryview version on readonly-buffers due to
+ # a limitation of Cython's typed memoryviews. Instead we can use
+ # the slightly slower Cython ndarray type directly.
+ cdef:
+ Py_ssize_t i, j, k, n, idx
+ float64_t fv
+
+ n = len(indexer)
+ k = values.shape[1]
+
+ fv = fill_value
+
+ IF True:
+ cdef:
+ float64_t *v
+ float64_t *o
+
+ #GH3130
+ if (values.strides[1] == out.strides[1] and
+ values.strides[1] == sizeof(float64_t) and
+ sizeof(float64_t) * n >= 256):
+
+ for i in range(n):
+ idx = indexer[i]
+ if idx == -1:
+ for j in range(k):
+ out[i, j] = fv
+ else:
+ v = &values[idx, 0]
+ o = &out[i, 0]
+ memmove(o, v, <size_t>(sizeof(float64_t) * k))
+ return
+
+ for i in range(n):
+ idx = indexer[i]
+ if idx == -1:
+ for j in range(k):
+ out[i, j] = fv
+ else:
+ for j in range(k):
+ out[i, j] = values[idx, j]
+
+
+
+cdef inline take_2d_axis1_float64_float64_memview(float64_t[:, :] values,
+ int64_t[:] indexer,
+ float64_t[:, :] out,
+ fill_value=np.nan):
+ cdef:
+ Py_ssize_t i, j, k, n, idx
+ float64_t fv
+
+ n = len(values)
+ k = len(indexer)
+
+ if n == 0 or k == 0:
+ return
+
+ fv = fill_value
+
+ for i in range(n):
+ for j in range(k):
+ idx = indexer[j]
+ if idx == -1:
+ out[i, j] = fv
+ else:
+ out[i, j] = values[i, idx]
+
+
+
+def take_2d_axis1_float64_float64(ndarray[float64_t, ndim=2] values,
+ ndarray[int64_t] indexer,
+ float64_t[:, :] out,
+ fill_value=np.nan):
+
+ if values.flags.writeable:
+ # We can call the memoryview version of the code
+ take_2d_axis1_float64_float64_memview(values, indexer, out,
+ fill_value=fill_value)
+ return
+
+ # We cannot use the memoryview version on readonly-buffers due to
+ # a limitation of Cython's typed memoryviews. Instead we can use
+ # the slightly slower Cython ndarray type directly.
+ cdef:
+ Py_ssize_t i, j, k, n, idx
+ float64_t fv
+
+ n = len(values)
+ k = len(indexer)
+
+ if n == 0 or k == 0:
+ return
+
+ fv = fill_value
+
+ for i in range(n):
+ for j in range(k):
+ idx = indexer[j]
+ if idx == -1:
+ out[i, j] = fv
+ else:
+ out[i, j] = values[i, idx]
+
+
+
+def take_2d_multi_float64_float64(ndarray[float64_t, ndim=2] values,
+ indexer,
+ ndarray[float64_t, ndim=2] out,
+ fill_value=np.nan):
+ cdef:
+ Py_ssize_t i, j, k, n, idx
+ ndarray[int64_t] idx0 = indexer[0]
+ ndarray[int64_t] idx1 = indexer[1]
+ float64_t fv
+
+ n = len(idx0)
+ k = len(idx1)
+
+ fv = fill_value
+ for i in range(n):
+ idx = idx0[i]
+ if idx == -1:
+ for j in range(k):
+ out[i, j] = fv
+ else:
+ for j in range(k):
+ if idx1[j] == -1:
+ out[i, j] = fv
+ else:
+ out[i, j] = values[idx, idx1[j]]
+
+
+cdef inline take_1d_object_object_memview(object[:] values,
+ int64_t[:] indexer,
+ object[:] out,
+ fill_value=np.nan):
+
+
+
+ cdef:
+ Py_ssize_t i, n, idx
+ object fv
+
+ n = indexer.shape[0]
+
+ fv = fill_value
+
+
+ for i in range(n):
+ idx = indexer[i]
+ if idx == -1:
+ out[i] = fv
+ else:
+ out[i] = values[idx]
+
+
+
+def take_1d_object_object(ndarray[object, ndim=1] values,
+ int64_t[:] indexer,
+ object[:] out,
+ fill_value=np.nan):
+
+ if values.flags.writeable:
+ # We can call the memoryview version of the code
+ take_1d_object_object_memview(values, indexer, out,
+ fill_value=fill_value)
+ return
+
+ # We cannot use the memoryview version on readonly-buffers due to
+ # a limitation of Cython's typed memoryviews. Instead we can use
+ # the slightly slower Cython ndarray type directly.
+
+ cdef:
+ Py_ssize_t i, n, idx
+ object fv
+
+ n = indexer.shape[0]
+
+ fv = fill_value
+
+
+ for i in range(n):
+ idx = indexer[i]
+ if idx == -1:
+ out[i] = fv
+ else:
+ out[i] = values[idx]
+
+
+
+cdef inline take_2d_axis0_object_object_memview(object[:, :] values,
+ int64_t[:] indexer,
+ object[:, :] out,
+ fill_value=np.nan):
+ cdef:
+ Py_ssize_t i, j, k, n, idx
+ object fv
+
+ n = len(indexer)
+ k = values.shape[1]
+
+ fv = fill_value
+
+ IF False:
+ cdef:
+ object *v
+ object *o
+
+ #GH3130
+ if (values.strides[1] == out.strides[1] and
+ values.strides[1] == sizeof(object) and
+ sizeof(object) * n >= 256):
+
+ for i in range(n):
+ idx = indexer[i]
+ if idx == -1:
+ for j in range(k):
+ out[i, j] = fv
+ else:
+ v = &values[idx, 0]
+ o = &out[i, 0]
+ memmove(o, v, <size_t>(sizeof(object) * k))
+ return
+
+ for i in range(n):
+ idx = indexer[i]
+ if idx == -1:
+ for j in range(k):
+ out[i, j] = fv
+ else:
+ for j in range(k):
+ out[i, j] = values[idx, j]
+
+
+
+def take_2d_axis0_object_object(ndarray[object, ndim=2] values,
+ ndarray[int64_t] indexer,
+ object[:, :] out,
+ fill_value=np.nan):
+ if values.flags.writeable:
+ # We can call the memoryview version of the code
+ take_2d_axis0_object_object_memview(values, indexer, out,
+ fill_value=fill_value)
+ return
+
+ # We cannot use the memoryview version on readonly-buffers due to
+ # a limitation of Cython's typed memoryviews. Instead we can use
+ # the slightly slower Cython ndarray type directly.
+ cdef:
+ Py_ssize_t i, j, k, n, idx
+ object fv
+
+ n = len(indexer)
+ k = values.shape[1]
+
+ fv = fill_value
+
+ IF False:
+ cdef:
+ object *v
+ object *o
+
+ #GH3130
+ if (values.strides[1] == out.strides[1] and
+ values.strides[1] == sizeof(object) and
+ sizeof(object) * n >= 256):
+
+ for i in range(n):
+ idx = indexer[i]
+ if idx == -1:
+ for j in range(k):
+ out[i, j] = fv
+ else:
+ v = &values[idx, 0]
+ o = &out[i, 0]
+ memmove(o, v, <size_t>(sizeof(object) * k))
+ return
+
+ for i in range(n):
+ idx = indexer[i]
+ if idx == -1:
+ for j in range(k):
+ out[i, j] = fv
+ else:
+ for j in range(k):
+ out[i, j] = values[idx, j]
+
+
+
+cdef inline take_2d_axis1_object_object_memview(object[:, :] values,
+ int64_t[:] indexer,
+ object[:, :] out,
+ fill_value=np.nan):
+ cdef:
+ Py_ssize_t i, j, k, n, idx
+ object fv
+
+ n = len(values)
+ k = len(indexer)
+
+ if n == 0 or k == 0:
+ return
+
+ fv = fill_value
+
+ for i in range(n):
+ for j in range(k):
+ idx = indexer[j]
+ if idx == -1:
+ out[i, j] = fv
+ else:
+ out[i, j] = values[i, idx]
+
+
+
+def take_2d_axis1_object_object(ndarray[object, ndim=2] values,
+ ndarray[int64_t] indexer,
+ object[:, :] out,
+ fill_value=np.nan):
+
+ if values.flags.writeable:
+ # We can call the memoryview version of the code
+ take_2d_axis1_object_object_memview(values, indexer, out,
+ fill_value=fill_value)
+ return
+
+ # We cannot use the memoryview version on readonly-buffers due to
+ # a limitation of Cython's typed memoryviews. Instead we can use
+ # the slightly slower Cython ndarray type directly.
+ cdef:
+ Py_ssize_t i, j, k, n, idx
+ object fv
+
+ n = len(values)
+ k = len(indexer)
+
+ if n == 0 or k == 0:
+ return
+
+ fv = fill_value
+
+ for i in range(n):
+ for j in range(k):
+ idx = indexer[j]
+ if idx == -1:
+ out[i, j] = fv
+ else:
+ out[i, j] = values[i, idx]
+
+
+
+def take_2d_multi_object_object(ndarray[object, ndim=2] values,
+ indexer,
+ ndarray[object, ndim=2] out,
+ fill_value=np.nan):
+ cdef:
+ Py_ssize_t i, j, k, n, idx
+ ndarray[int64_t] idx0 = indexer[0]
+ ndarray[int64_t] idx1 = indexer[1]
+ object fv
+
+ n = len(idx0)
+ k = len(idx1)
+
+ fv = fill_value
+ for i in range(n):
+ idx = idx0[i]
+ if idx == -1:
+ for j in range(k):
+ out[i, j] = fv
+ else:
+ for j in range(k):
+ if idx1[j] == -1:
+ out[i, j] = fv
+ else:
+ out[i, j] = values[idx, idx1[j]]
+
+# ----------------------------------------------------------------------
+# take_2d internal function
+# ----------------------------------------------------------------------
+
+ctypedef fused take_t:
+ float64_t
+ uint64_t
+ int64_t
+ object
+
+
+cdef _take_2d(ndarray[take_t, ndim=2] values, object idx):
+ cdef:
+ Py_ssize_t i, j, N, K
+ ndarray[Py_ssize_t, ndim=2, cast=True] indexer = idx
+ ndarray[take_t, ndim=2] result
+ object val
+
+ N, K = (<object>values).shape
+
+ if take_t is object:
+ # evaluated at compile-time
+ result = values.copy()
+ else:
+ result = np.empty_like(values)
+
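+    # row-wise gather: result[i, j] = values[i, indexer[i, j]]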
+ for i in range(N):
+ for j in range(K):
+ result[i, j] = values[i, indexer[i, j]]
+ return result
+
+
+_take_2d_object = _take_2d[object]
+_take_2d_float64 = _take_2d[float64_t]
+_take_2d_int64 = _take_2d[int64_t]
+_take_2d_uint64 = _take_2d[uint64_t]
diff --git a/contrib/python/pandas/py2/pandas/_libs/groupby.pyx b/contrib/python/pandas/py2/pandas/_libs/groupby.pyx
new file mode 100644
index 00000000000..e6036654c71
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/_libs/groupby.pyx
@@ -0,0 +1,386 @@
+# -*- coding: utf-8 -*-
+
+import cython
+from cython import Py_ssize_t
+
+from libc.stdlib cimport malloc, free
+
+import numpy as np
+cimport numpy as cnp
+from numpy cimport (ndarray,
+ int8_t, int16_t, int32_t, int64_t, uint8_t, uint16_t,
+ uint32_t, uint64_t, float32_t, float64_t)
+cnp.import_array()
+
+
+from pandas._libs.util cimport numeric, get_nat
+
+from pandas._libs.algos cimport (swap, TiebreakEnumType, TIEBREAK_AVERAGE,
+ TIEBREAK_MIN, TIEBREAK_MAX, TIEBREAK_FIRST,
+ TIEBREAK_DENSE)
+from pandas._libs.algos import (take_2d_axis1_float64_float64,
+ groupsort_indexer, tiebreakers)
+
+cdef int64_t NPY_NAT = get_nat()
+
+cdef float64_t NaN = <float64_t>np.NaN
+
+
+cdef inline float64_t median_linear(float64_t* a, int n) nogil:
+ cdef:
+ int i, j, na_count = 0
+ float64_t result
+ float64_t* tmp
+
+ if n == 0:
+ return NaN
+
+ # count NAs
+ for i in range(n):
+ if a[i] != a[i]:
+ na_count += 1
+
+ if na_count:
+ if na_count == n:
+ return NaN
+
+ tmp = <float64_t*>malloc((n - na_count) * sizeof(float64_t))
+
+ j = 0
+ for i in range(n):
+ if a[i] == a[i]:
+ tmp[j] = a[i]
+ j += 1
+
+ a = tmp
+ n -= na_count
+
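+    # median of the non-NA values: the middle element for odd n, the
+    # average of the two middle elements for even n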
+ if n % 2:
+        result = kth_smallest_c(a, n / 2, n)
+ else:
+ result = (kth_smallest_c(a, n / 2, n) +
+ kth_smallest_c(a, n / 2 - 1, n)) / 2
+
+ if na_count:
+ free(a)
+
+ return result
+
+
+# TODO: Is this redundant with algos.kth_smallest
+cdef inline float64_t kth_smallest_c(float64_t* a,
+ Py_ssize_t k,
+ Py_ssize_t n) nogil:
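+    # Hoare-style quickselect: partially sorts `a` in place until the k-th
+    # smallest element sits at index k, then returns it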
+ cdef:
+ Py_ssize_t i, j, l, m
+ float64_t x, t
+
+ l = 0
+ m = n - 1
+ while l < m:
+ x = a[k]
+ i = l
+ j = m
+
+ while 1:
+ while a[i] < x: i += 1
+ while x < a[j]: j -= 1
+ if i <= j:
+ swap(&a[i], &a[j])
+ i += 1; j -= 1
+
+ if i > j: break
+
+ if j < k: l = i
+ if k < i: m = j
+ return a[k]
+
+
+def group_median_float64(ndarray[float64_t, ndim=2] out,
+ ndarray[int64_t] counts,
+ ndarray[float64_t, ndim=2] values,
+ ndarray[int64_t] labels,
+ Py_ssize_t min_count=-1):
+ """
+ Only aggregates on axis=0
+ """
+ cdef:
+ Py_ssize_t i, j, N, K, ngroups, size
+ ndarray[int64_t] _counts
+ ndarray[float64_t, ndim=2] data
+ float64_t* ptr
+
+ assert min_count == -1, "'min_count' only used in add and prod"
+
+ ngroups = len(counts)
+ N, K = (<object>values).shape
+
+ indexer, _counts = groupsort_indexer(labels, ngroups)
+ counts[:] = _counts[1:]
+
+ data = np.empty((K, N), dtype=np.float64)
+ ptr = <float64_t*>cnp.PyArray_DATA(data)
+
+ take_2d_axis1_float64_float64(values.T, indexer, out=data)
+
+ with nogil:
+
+ for i in range(K):
+ # exclude NA group
+ ptr += _counts[0]
+ for j in range(ngroups):
+ size = _counts[j + 1]
+ out[j, i] = median_linear(ptr, size)
+ ptr += size
+
+
+def group_cumprod_float64(float64_t[:, :] out,
+ const float64_t[:, :] values,
+ const int64_t[:] labels,
+ bint is_datetimelike,
+ bint skipna=True):
+ """
+ Only transforms on axis=0
+ """
+ cdef:
+ Py_ssize_t i, j, N, K, size
+ float64_t val
+ float64_t[:, :] accum
+ int64_t lab
+
+ N, K = (<object>values).shape
+ accum = np.ones_like(values)
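+    # accum holds the running product per (group, column); a NaN poisons
+    # the accumulator for the rest of the group when skipna is False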
+
+ with nogil:
+ for i in range(N):
+ lab = labels[i]
+
+ if lab < 0:
+ continue
+ for j in range(K):
+ val = values[i, j]
+ if val == val:
+ accum[lab, j] *= val
+ out[i, j] = accum[lab, j]
+ else:
+ out[i, j] = NaN
+ if not skipna:
+ accum[lab, j] = NaN
+ break
+
+
+def group_cumsum(numeric[:, :] out,
+ numeric[:, :] values,
+ const int64_t[:] labels,
+ is_datetimelike,
+ bint skipna=True):
+ """
+ Only transforms on axis=0
+ """
+ cdef:
+ Py_ssize_t i, j, N, K, size
+ numeric val
+ numeric[:, :] accum
+ int64_t lab
+
+ N, K = (<object>values).shape
+ accum = np.zeros_like(values)
+
+ with nogil:
+ for i in range(N):
+ lab = labels[i]
+
+ if lab < 0:
+ continue
+ for j in range(K):
+ val = values[i, j]
+
+ if numeric == float32_t or numeric == float64_t:
+ if val == val:
+ accum[lab, j] += val
+ out[i, j] = accum[lab, j]
+ else:
+ out[i, j] = NaN
+ if not skipna:
+ accum[lab, j] = NaN
+ break
+ else:
+ accum[lab, j] += val
+ out[i, j] = accum[lab, j]
+
+
+def group_shift_indexer(int64_t[:] out, const int64_t[:] labels,
+ int ngroups, int periods):
+ cdef:
+ Py_ssize_t N, i, j, ii
+ int offset, sign
+ int64_t lab, idxer, idxer_slot
+ int64_t[:] label_seen = np.zeros(ngroups, dtype=np.int64)
+ int64_t[:, :] label_indexer
+
+ N, = (<object>labels).shape
+
+ if periods < 0:
+ periods = -periods
+ offset = N - 1
+ sign = -1
+ elif periods > 0:
+ offset = 0
+ sign = 1
+
+ if periods == 0:
+ with nogil:
+ for i in range(N):
+ out[i] = i
+ else:
+ # array of each previous indexer seen
+ label_indexer = np.zeros((ngroups, periods), dtype=np.int64)
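+        # label_indexer is a per-group ring buffer holding the positions of
+        # the last `periods` rows seen for each label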
+ with nogil:
+ for i in range(N):
+ # reverse iterator if shifting backwards
+ ii = offset + sign * i
+ lab = labels[ii]
+
+ # Skip null keys
+ if lab == -1:
+ out[ii] = -1
+ continue
+
+ label_seen[lab] += 1
+
+ idxer_slot = label_seen[lab] % periods
+ idxer = label_indexer[lab, idxer_slot]
+
+ if label_seen[lab] > periods:
+ out[ii] = idxer
+ else:
+ out[ii] = -1
+
+ label_indexer[lab, idxer_slot] = ii
+
+
+def group_fillna_indexer(ndarray[int64_t] out, ndarray[int64_t] labels,
+ ndarray[uint8_t] mask, object direction,
+ int64_t limit):
+ """Indexes how to fill values forwards or backwards within a group
+
+ Parameters
+ ----------
+    out : array of int64_t values to which this method will write its results
+        Positions that cannot be filled are assigned -1
+ labels : array containing unique label for each group, with its ordering
+ matching up to the corresponding record in `values`
+    mask : array of uint8_t values where a 1 indicates a missing value
+ direction : {'ffill', 'bfill'}
+ Direction for fill to be applied (forwards or backwards, respectively)
+ limit : Consecutive values to fill before stopping, or -1 for no limit
+
+ Notes
+ -----
+ This method modifies the `out` parameter rather than returning an object
+ """
+ cdef:
+ Py_ssize_t i, N
+ int64_t[:] sorted_labels
+ int64_t idx, curr_fill_idx=-1, filled_vals=0
+
+ N = len(out)
+
+ # Make sure all arrays are the same size
+ assert N == len(labels) == len(mask)
+
+ sorted_labels = np.argsort(labels, kind='mergesort').astype(
+ np.int64, copy=False)
+ if direction == 'bfill':
+ sorted_labels = sorted_labels[::-1]
+
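+    # Walk rows in label-sorted order (reversed for 'bfill'), carrying the
+    # position of the last non-missing row until `limit` consecutive rows
+    # have been filled or a new group starts.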
+ with nogil:
+ for i in range(N):
+ idx = sorted_labels[i]
+ if mask[idx] == 1: # is missing
+ # Stop filling once we've hit the limit
+ if filled_vals >= limit and limit != -1:
+ curr_fill_idx = -1
+ filled_vals += 1
+ else: # reset items when not missing
+ filled_vals = 0
+ curr_fill_idx = idx
+
+ out[idx] = curr_fill_idx
+
+ # If we move to the next group, reset
+ # the fill_idx and counter
+ if i == N - 1 or labels[idx] != labels[sorted_labels[i + 1]]:
+ curr_fill_idx = -1
+ filled_vals = 0
+
+
+def group_any_all(uint8_t[:] out,
+ const int64_t[:] labels,
+ const uint8_t[:] values,
+ const uint8_t[:] mask,
+ object val_test,
+ bint skipna):
+ """Aggregated boolean values to show truthfulness of group elements
+
+ Parameters
+ ----------
+ out : array of values which this method will write its results to
+ labels : array containing unique label for each group, with its
+ ordering matching up to the corresponding record in `values`
+ values : array containing the truth value of each element
+ mask : array indicating whether a value is na or not
+ val_test : str {'any', 'all'}
+ String object dictating whether to use any or all truth testing
+ skipna : boolean
+ Flag to ignore nan values during truth testing
+
+ Notes
+ -----
+ This method modifies the `out` parameter rather than returning an object.
+ The returned values will either be 0 or 1 (False or True, respectively).
+ """
+ cdef:
+ Py_ssize_t i, N = len(labels)
+ int64_t lab
+ uint8_t flag_val
+
+ if val_test == 'all':
+ # Because the 'all' value of an empty iterable in Python is True we can
+ # start with an array full of ones and set to zero when a False value
+ # is encountered
+ flag_val = 0
+ elif val_test == 'any':
+ # Because the 'any' value of an empty iterable in Python is False we
+ # can start with an array full of zeros and set to one only if any
+ # value encountered is True
+ flag_val = 1
+ else:
+        raise ValueError("'val_test' must be either 'any' or 'all'!")
+
+ out[:] = 1 - flag_val
+
+ with nogil:
+ for i in range(N):
+ lab = labels[i]
+ if lab < 0 or (skipna and mask[i]):
+ continue
+
+ if values[i] == flag_val:
+ out[lab] = flag_val
+
+
+# generated from template
+include "groupby_helper.pxi"
diff --git a/contrib/python/pandas/py2/pandas/_libs/groupby_helper.pxi b/contrib/python/pandas/py2/pandas/_libs/groupby_helper.pxi
new file mode 100644
index 00000000000..59cc5b4fb6b
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/_libs/groupby_helper.pxi
@@ -0,0 +1,1638 @@
+"""
+Template for each `dtype` helper function using groupby
+
+WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in
+"""
+
+cdef extern from "numpy/npy_math.h":
+ float64_t NAN "NPY_NAN"
+_int64_max = np.iinfo(np.int64).max
+
+# ----------------------------------------------------------------------
+# group_add, group_prod, group_var, group_mean, group_ohlc
+# ----------------------------------------------------------------------
+
+
+def group_add_float64(ndarray[float64_t, ndim=2] out,
+ ndarray[int64_t] counts,
+ ndarray[float64_t, ndim=2] values,
+ ndarray[int64_t] labels,
+ Py_ssize_t min_count=0):
+ """
+ Only aggregates on axis=0
+ """
+ cdef:
+ Py_ssize_t i, j, N, K, lab, ncounts = len(counts)
+ float64_t val, count
+ ndarray[float64_t, ndim=2] sumx, nobs
+
+ if not len(values) == len(labels):
+ raise AssertionError("len(index) != len(labels)")
+
+ nobs = np.zeros_like(out)
+ sumx = np.zeros_like(out)
+
+ N, K = (<object>values).shape
+
+ with nogil:
+
+ for i in range(N):
+ lab = labels[i]
+ if lab < 0:
+ continue
+
+ counts[lab] += 1
+ for j in range(K):
+ val = values[i, j]
+
+ # not nan
+ if val == val:
+ nobs[lab, j] += 1
+ sumx[lab, j] += val
+
+ for i in range(ncounts):
+ for j in range(K):
+ if nobs[i, j] < min_count:
+ out[i, j] = NAN
+ else:
+ out[i, j] = sumx[i, j]
+
+
+def group_prod_float64(ndarray[float64_t, ndim=2] out,
+ ndarray[int64_t] counts,
+ ndarray[float64_t, ndim=2] values,
+ ndarray[int64_t] labels,
+ Py_ssize_t min_count=0):
+ """
+ Only aggregates on axis=0
+ """
+ cdef:
+ Py_ssize_t i, j, N, K, lab, ncounts = len(counts)
+ float64_t val, count
+ ndarray[float64_t, ndim=2] prodx, nobs
+
+ if not len(values) == len(labels):
+ raise AssertionError("len(index) != len(labels)")
+
+ nobs = np.zeros_like(out)
+ prodx = np.ones_like(out)
+
+ N, K = (<object>values).shape
+
+ with nogil:
+ for i in range(N):
+ lab = labels[i]
+ if lab < 0:
+ continue
+
+ counts[lab] += 1
+ for j in range(K):
+ val = values[i, j]
+
+ # not nan
+ if val == val:
+ nobs[lab, j] += 1
+ prodx[lab, j] *= val
+
+ for i in range(ncounts):
+ for j in range(K):
+ if nobs[i, j] < min_count:
+ out[i, j] = NAN
+ else:
+ out[i, j] = prodx[i, j]
+
+
+def group_var_float64(ndarray[float64_t, ndim=2] out,
+ ndarray[int64_t] counts,
+ ndarray[float64_t, ndim=2] values,
+ ndarray[int64_t] labels,
+ Py_ssize_t min_count=-1):
+ cdef:
+ Py_ssize_t i, j, N, K, lab, ncounts = len(counts)
+ float64_t val, ct, oldmean
+ ndarray[float64_t, ndim=2] nobs, mean
+
+ assert min_count == -1, "'min_count' only used in add and prod"
+
+ if not len(values) == len(labels):
+ raise AssertionError("len(index) != len(labels)")
+
+ nobs = np.zeros_like(out)
+ mean = np.zeros_like(out)
+
+ N, K = (<object>values).shape
+
+ out[:, :] = 0.0
+
+ with nogil:
+ for i in range(N):
+ lab = labels[i]
+ if lab < 0:
+ continue
+
+ counts[lab] += 1
+
+ for j in range(K):
+ val = values[i, j]
+
+ # not nan
+ if val == val:
+ nobs[lab, j] += 1
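+                    # Welford-style online update of the running mean and
+                    # sum of squared deviations (scaled by ct - 1 below)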
+ oldmean = mean[lab, j]
+ mean[lab, j] += (val - oldmean) / nobs[lab, j]
+ out[lab, j] += (val - mean[lab, j]) * (val - oldmean)
+
+ for i in range(ncounts):
+ for j in range(K):
+ ct = nobs[i, j]
+ if ct < 2:
+ out[i, j] = NAN
+ else:
+ out[i, j] /= (ct - 1)
+# add passing bin edges, instead of labels
+
+
+def group_mean_float64(ndarray[float64_t, ndim=2] out,
+ ndarray[int64_t] counts,
+ ndarray[float64_t, ndim=2] values,
+ ndarray[int64_t] labels,
+ Py_ssize_t min_count=-1):
+ cdef:
+ Py_ssize_t i, j, N, K, lab, ncounts = len(counts)
+ float64_t val, count
+ ndarray[float64_t, ndim=2] sumx, nobs
+
+ assert min_count == -1, "'min_count' only used in add and prod"
+
+ if not len(values) == len(labels):
+ raise AssertionError("len(index) != len(labels)")
+
+ nobs = np.zeros_like(out)
+ sumx = np.zeros_like(out)
+
+ N, K = (<object>values).shape
+
+ with nogil:
+ for i in range(N):
+ lab = labels[i]
+ if lab < 0:
+ continue
+
+ counts[lab] += 1
+ for j in range(K):
+ val = values[i, j]
+ # not nan
+ if val == val:
+ nobs[lab, j] += 1
+ sumx[lab, j] += val
+
+ for i in range(ncounts):
+ for j in range(K):
+ count = nobs[i, j]
+ if nobs[i, j] == 0:
+ out[i, j] = NAN
+ else:
+ out[i, j] = sumx[i, j] / count
+
+
+def group_ohlc_float64(ndarray[float64_t, ndim=2] out,
+ ndarray[int64_t] counts,
+ ndarray[float64_t, ndim=2] values,
+ ndarray[int64_t] labels,
+ Py_ssize_t min_count=-1):
+ """
+ Only aggregates on axis=0
+ """
+ cdef:
+ Py_ssize_t i, j, N, K, lab
+ float64_t val, count
+ Py_ssize_t ngroups = len(counts)
+
+ assert min_count == -1, "'min_count' only used in add and prod"
+
+ if len(labels) == 0:
+ return
+
+ N, K = (<object>values).shape
+
+ if out.shape[1] != 4:
+ raise ValueError('Output array must have 4 columns')
+
+ if K > 1:
+ raise NotImplementedError("Argument 'values' must have only "
+ "one dimension")
+ out[:] = np.nan
+
+ with nogil:
+ for i in range(N):
+ lab = labels[i]
+ if lab == -1:
+ continue
+
+ counts[lab] += 1
+ val = values[i, 0]
+ if val != val:
+ continue
+
+ if out[lab, 0] != out[lab, 0]:
+ out[lab, 0] = out[lab, 1] = out[lab, 2] = out[lab, 3] = val
+ else:
+ out[lab, 1] = max(out[lab, 1], val)
+ out[lab, 2] = min(out[lab, 2], val)
+ out[lab, 3] = val
+
+
+def group_add_float32(ndarray[float32_t, ndim=2] out,
+ ndarray[int64_t] counts,
+ ndarray[float32_t, ndim=2] values,
+ ndarray[int64_t] labels,
+ Py_ssize_t min_count=0):
+ """
+ Only aggregates on axis=0
+ """
+ cdef:
+ Py_ssize_t i, j, N, K, lab, ncounts = len(counts)
+ float32_t val, count
+ ndarray[float32_t, ndim=2] sumx, nobs
+
+ if not len(values) == len(labels):
+ raise AssertionError("len(index) != len(labels)")
+
+ nobs = np.zeros_like(out)
+ sumx = np.zeros_like(out)
+
+ N, K = (<object>values).shape
+
+ with nogil:
+
+ for i in range(N):
+ lab = labels[i]
+ if lab < 0:
+ continue
+
+ counts[lab] += 1
+ for j in range(K):
+ val = values[i, j]
+
+ # not nan
+ if val == val:
+ nobs[lab, j] += 1
+ sumx[lab, j] += val
+
+ for i in range(ncounts):
+ for j in range(K):
+ if nobs[i, j] < min_count:
+ out[i, j] = NAN
+ else:
+ out[i, j] = sumx[i, j]
+
+
+def group_prod_float32(ndarray[float32_t, ndim=2] out,
+ ndarray[int64_t] counts,
+ ndarray[float32_t, ndim=2] values,
+ ndarray[int64_t] labels,
+ Py_ssize_t min_count=0):
+ """
+ Only aggregates on axis=0
+ """
+ cdef:
+ Py_ssize_t i, j, N, K, lab, ncounts = len(counts)
+ float32_t val, count
+ ndarray[float32_t, ndim=2] prodx, nobs
+
+ if not len(values) == len(labels):
+ raise AssertionError("len(index) != len(labels)")
+
+ nobs = np.zeros_like(out)
+ prodx = np.ones_like(out)
+
+ N, K = (<object>values).shape
+
+ with nogil:
+ for i in range(N):
+ lab = labels[i]
+ if lab < 0:
+ continue
+
+ counts[lab] += 1
+ for j in range(K):
+ val = values[i, j]
+
+ # not nan
+ if val == val:
+ nobs[lab, j] += 1
+ prodx[lab, j] *= val
+
+ for i in range(ncounts):
+ for j in range(K):
+ if nobs[i, j] < min_count:
+ out[i, j] = NAN
+ else:
+ out[i, j] = prodx[i, j]
+
+
+def group_var_float32(ndarray[float32_t, ndim=2] out,
+ ndarray[int64_t] counts,
+ ndarray[float32_t, ndim=2] values,
+ ndarray[int64_t] labels,
+ Py_ssize_t min_count=-1):
+ cdef:
+ Py_ssize_t i, j, N, K, lab, ncounts = len(counts)
+ float32_t val, ct, oldmean
+ ndarray[float32_t, ndim=2] nobs, mean
+
+ assert min_count == -1, "'min_count' only used in add and prod"
+
+ if not len(values) == len(labels):
+ raise AssertionError("len(index) != len(labels)")
+
+ nobs = np.zeros_like(out)
+ mean = np.zeros_like(out)
+
+ N, K = (<object>values).shape
+
+ out[:, :] = 0.0
+
+ with nogil:
+ for i in range(N):
+ lab = labels[i]
+ if lab < 0:
+ continue
+
+ counts[lab] += 1
+
+ for j in range(K):
+ val = values[i, j]
+
+ # not nan
+ if val == val:
+ nobs[lab, j] += 1
+ oldmean = mean[lab, j]
+ mean[lab, j] += (val - oldmean) / nobs[lab, j]
+ out[lab, j] += (val - mean[lab, j]) * (val - oldmean)
+
+ for i in range(ncounts):
+ for j in range(K):
+ ct = nobs[i, j]
+ if ct < 2:
+ out[i, j] = NAN
+ else:
+ out[i, j] /= (ct - 1)
+# add passing bin edges, instead of labels
+
+
+def group_mean_float32(ndarray[float32_t, ndim=2] out,
+ ndarray[int64_t] counts,
+ ndarray[float32_t, ndim=2] values,
+ ndarray[int64_t] labels,
+ Py_ssize_t min_count=-1):
+ cdef:
+ Py_ssize_t i, j, N, K, lab, ncounts = len(counts)
+ float32_t val, count
+ ndarray[float32_t, ndim=2] sumx, nobs
+
+ assert min_count == -1, "'min_count' only used in add and prod"
+
+ if not len(values) == len(labels):
+ raise AssertionError("len(index) != len(labels)")
+
+ nobs = np.zeros_like(out)
+ sumx = np.zeros_like(out)
+
+ N, K = (<object>values).shape
+
+ with nogil:
+ for i in range(N):
+ lab = labels[i]
+ if lab < 0:
+ continue
+
+ counts[lab] += 1
+ for j in range(K):
+ val = values[i, j]
+ # not nan
+ if val == val:
+ nobs[lab, j] += 1
+ sumx[lab, j] += val
+
+ for i in range(ncounts):
+ for j in range(K):
+ count = nobs[i, j]
+ if nobs[i, j] == 0:
+ out[i, j] = NAN
+ else:
+ out[i, j] = sumx[i, j] / count
+
+
+def group_ohlc_float32(ndarray[float32_t, ndim=2] out,
+ ndarray[int64_t] counts,
+ ndarray[float32_t, ndim=2] values,
+ ndarray[int64_t] labels,
+ Py_ssize_t min_count=-1):
+ """
+ Only aggregates on axis=0
+ """
+ cdef:
+ Py_ssize_t i, j, N, K, lab
+ float32_t val, count
+ Py_ssize_t ngroups = len(counts)
+
+ assert min_count == -1, "'min_count' only used in add and prod"
+
+ if len(labels) == 0:
+ return
+
+ N, K = (<object>values).shape
+
+ if out.shape[1] != 4:
+ raise ValueError('Output array must have 4 columns')
+
+ if K > 1:
+ raise NotImplementedError("Argument 'values' must have only "
+ "one dimension")
+ out[:] = np.nan
+
+ with nogil:
+ for i in range(N):
+ lab = labels[i]
+ if lab == -1:
+ continue
+
+ counts[lab] += 1
+ val = values[i, 0]
+ if val != val:
+ continue
+
+ if out[lab, 0] != out[lab, 0]:
+ out[lab, 0] = out[lab, 1] = out[lab, 2] = out[lab, 3] = val
+ else:
+ out[lab, 1] = max(out[lab, 1], val)
+ out[lab, 2] = min(out[lab, 2], val)
+ out[lab, 3] = val
+
+# ----------------------------------------------------------------------
+# group_nth, group_last, group_rank
+# ----------------------------------------------------------------------
+
+
+def group_last_float64(ndarray[float64_t, ndim=2] out,
+ ndarray[int64_t] counts,
+ ndarray[float64_t, ndim=2] values,
+ ndarray[int64_t] labels,
+ Py_ssize_t min_count=-1):
+ """
+ Only aggregates on axis=0
+ """
+ cdef:
+ Py_ssize_t i, j, N, K, lab, ncounts = len(counts)
+ float64_t val
+ ndarray[float64_t, ndim=2] resx
+ ndarray[int64_t, ndim=2] nobs
+
+ assert min_count == -1, "'min_count' only used in add and prod"
+
+ if not len(values) == len(labels):
+ raise AssertionError("len(index) != len(labels)")
+
+ nobs = np.zeros((<object>out).shape, dtype=np.int64)
+ resx = np.empty_like(out)
+
+ N, K = (<object>values).shape
+
+ with nogil:
+ for i in range(N):
+ lab = labels[i]
+ if lab < 0:
+ continue
+
+ counts[lab] += 1
+ for j in range(K):
+ val = values[i, j]
+
+ # not nan
+ if val == val and val != NAN:
+ nobs[lab, j] += 1
+ resx[lab, j] = val
+
+ for i in range(ncounts):
+ for j in range(K):
+ if nobs[i, j] == 0:
+ out[i, j] = NAN
+ else:
+ out[i, j] = resx[i, j]
+
+
+def group_nth_float64(ndarray[float64_t, ndim=2] out,
+ ndarray[int64_t] counts,
+ ndarray[float64_t, ndim=2] values,
+ ndarray[int64_t] labels, int64_t rank,
+ Py_ssize_t min_count=-1):
+ """
+ Only aggregates on axis=0
+ """
+ cdef:
+ Py_ssize_t i, j, N, K, lab, ncounts = len(counts)
+ float64_t val
+ ndarray[float64_t, ndim=2] resx
+ ndarray[int64_t, ndim=2] nobs
+
+ assert min_count == -1, "'min_count' only used in add and prod"
+
+ if not len(values) == len(labels):
+ raise AssertionError("len(index) != len(labels)")
+
+ nobs = np.zeros((<object>out).shape, dtype=np.int64)
+ resx = np.empty_like(out)
+
+ N, K = (<object>values).shape
+
+ with nogil:
+ for i in range(N):
+ lab = labels[i]
+ if lab < 0:
+ continue
+
+ counts[lab] += 1
+ for j in range(K):
+ val = values[i, j]
+
+ # not nan
+ if val == val and val != NAN:
+ nobs[lab, j] += 1
+ if nobs[lab, j] == rank:
+ resx[lab, j] = val
+
+ for i in range(ncounts):
+ for j in range(K):
+ if nobs[i, j] == 0:
+ out[i, j] = NAN
+ else:
+ out[i, j] = resx[i, j]
+
+
+
+def group_rank_float64(ndarray[float64_t, ndim=2] out,
+ ndarray[float64_t, ndim=2] values,
+ ndarray[int64_t] labels,
+ bint is_datetimelike, object ties_method,
+ bint ascending, bint pct, object na_option):
+ """
+ Provides the rank of values within each group.
+
+ Parameters
+ ----------
+ out : array of float64_t values which this method will write its results to
+ values : array of float64_t values to be ranked
+ labels : array containing unique label for each group, with its ordering
+ matching up to the corresponding record in `values`
+ is_datetimelike : bool, default False
+ unused in this method but provided for call compatibility with other
+ Cython transformations
+ ties_method : {'average', 'min', 'max', 'first', 'dense'}, default
+ 'average'
+ * average: average rank of group
+ * min: lowest rank in group
+ * max: highest rank in group
+ * first: ranks assigned in order they appear in the array
+ * dense: like 'min', but rank always increases by 1 between groups
+ ascending : boolean, default True
+        False for ranks from high (1) to low (N)
+ pct : boolean, default False
+ Compute percentage rank of data within each group
+ na_option : {'keep', 'top', 'bottom'}, default 'keep'
+ * keep: leave NA values where they are
+ * top: smallest rank if ascending
+ * bottom: smallest rank if descending
+
+ Notes
+ -----
+ This method modifies the `out` parameter rather than returning an object
+ """
+ cdef:
+ TiebreakEnumType tiebreak
+ Py_ssize_t i, j, N, K, grp_start=0, dups=0, sum_ranks=0
+ Py_ssize_t grp_vals_seen=1, grp_na_count=0, grp_tie_count=0
+ ndarray[int64_t] _as
+ ndarray[float64_t, ndim=2] grp_sizes
+ ndarray[float64_t] masked_vals
+ ndarray[uint8_t] mask
+ bint keep_na
+ float64_t nan_fill_val
+
+ tiebreak = tiebreakers[ties_method]
+ keep_na = na_option == 'keep'
+ N, K = (<object>values).shape
+ grp_sizes = np.ones_like(out)
+
+ # Copy values into new array in order to fill missing data
+ # with mask, without obfuscating location of missing data
+ # in values array
+ masked_vals = np.array(values[:, 0], copy=True)
+ mask = np.isnan(masked_vals).astype(np.uint8)
+
+ if ascending ^ (na_option == 'top'):
+ nan_fill_val = np.inf
+ order = (masked_vals, mask, labels)
+ else:
+ nan_fill_val = -np.inf
+ order = (masked_vals, ~mask, labels)
+ np.putmask(masked_vals, mask, nan_fill_val)
+
+ # lexsort using labels, then mask, then actual values
+ # each label corresponds to a different group value,
+ # the mask helps you differentiate missing values before
+ # performing sort on the actual values
+ _as = np.lexsort(order).astype(np.int64, copy=False)
+
+ if not ascending:
+ _as = _as[::-1]
+
+ with nogil:
+ # Loop over the length of the value array
+ # each incremental i value can be looked up in the _as array
+ # that we sorted previously, which gives us the location of
+ # that sorted value for retrieval back from the original
+ # values / masked_vals arrays
+ for i in range(N):
+ # dups and sum_ranks will be incremented each loop where
+ # the value / group remains the same, and should be reset
+ # when either of those change
+ # Used to calculate tiebreakers
+ dups += 1
+ sum_ranks += i - grp_start + 1
+
+ # Update out only when there is a transition of values or labels.
+            # When a new value or group is encountered, go back #dups steps
+            # (the number of occurrences of the current value) and assign the
+            # ranks based on the starting index of the current group
+            # (grp_start) and the current index
+ if (i == N - 1 or
+ (masked_vals[_as[i]] != masked_vals[_as[i+1]]) or
+ (mask[_as[i]] ^ mask[_as[i+1]]) or
+ (labels[_as[i]] != labels[_as[i+1]])):
+ # if keep_na, check for missing values and assign back
+ # to the result where appropriate
+ if keep_na and mask[_as[i]]:
+ for j in range(i - dups + 1, i + 1):
+ out[_as[j], 0] = NaN
+ grp_na_count = dups
+ elif tiebreak == TIEBREAK_AVERAGE:
+ for j in range(i - dups + 1, i + 1):
+ out[_as[j], 0] = sum_ranks / <float64_t>dups
+ elif tiebreak == TIEBREAK_MIN:
+ for j in range(i - dups + 1, i + 1):
+ out[_as[j], 0] = i - grp_start - dups + 2
+ elif tiebreak == TIEBREAK_MAX:
+ for j in range(i - dups + 1, i + 1):
+ out[_as[j], 0] = i - grp_start + 1
+ elif tiebreak == TIEBREAK_FIRST:
+ for j in range(i - dups + 1, i + 1):
+ if ascending:
+ out[_as[j], 0] = j + 1 - grp_start
+ else:
+ out[_as[j], 0] = 2 * i - j - dups + 2 - grp_start
+ elif tiebreak == TIEBREAK_DENSE:
+ for j in range(i - dups + 1, i + 1):
+ out[_as[j], 0] = grp_vals_seen
+
+ # look forward to the next value (using the sorting in _as)
+ # if the value does not equal the current value then we need to
+ # reset the dups and sum_ranks, knowing that a new value is
+ # coming up. the conditional also needs to handle nan equality
+ # and the end of iteration
+ if (i == N - 1 or
+ (masked_vals[_as[i]] != masked_vals[_as[i+1]]) or
+ (mask[_as[i]] ^ mask[_as[i+1]])):
+ dups = sum_ranks = 0
+ grp_vals_seen += 1
+ grp_tie_count += 1
+
+ # Similar to the previous conditional, check now if we are
+ # moving to a new group. If so, keep track of the index where
+ # the new group occurs, so the tiebreaker calculations can
+ # decrement that from their position. fill in the size of each
+ # group encountered (used by pct calculations later). also be
+ # sure to reset any of the items helping to calculate dups
+ if i == N - 1 or labels[_as[i]] != labels[_as[i+1]]:
+ if tiebreak != TIEBREAK_DENSE:
+ for j in range(grp_start, i + 1):
+ grp_sizes[_as[j], 0] = (i - grp_start + 1 -
+ grp_na_count)
+ else:
+ for j in range(grp_start, i + 1):
+ grp_sizes[_as[j], 0] = (grp_tie_count -
+ (grp_na_count > 0))
+ dups = sum_ranks = 0
+ grp_na_count = 0
+ grp_tie_count = 0
+ grp_start = i + 1
+ grp_vals_seen = 1
+
+ if pct:
+ for i in range(N):
+ # We don't include NaN values in percentage
+ # rankings, so we assign them percentages of NaN.
+ if out[i, 0] != out[i, 0] or out[i, 0] == NAN:
+ out[i, 0] = NAN
+ elif grp_sizes[i, 0] != 0:
+ out[i, 0] = out[i, 0] / grp_sizes[i, 0]
+
+
+def group_last_float32(ndarray[float32_t, ndim=2] out,
+ ndarray[int64_t] counts,
+ ndarray[float32_t, ndim=2] values,
+ ndarray[int64_t] labels,
+ Py_ssize_t min_count=-1):
+ """
+ Only aggregates on axis=0
+ """
+ cdef:
+ Py_ssize_t i, j, N, K, lab, ncounts = len(counts)
+ float32_t val
+ ndarray[float32_t, ndim=2] resx
+ ndarray[int64_t, ndim=2] nobs
+
+ assert min_count == -1, "'min_count' only used in add and prod"
+
+ if not len(values) == len(labels):
+ raise AssertionError("len(index) != len(labels)")
+
+ nobs = np.zeros((<object>out).shape, dtype=np.int64)
+ resx = np.empty_like(out)
+
+ N, K = (<object>values).shape
+
+ with nogil:
+ for i in range(N):
+ lab = labels[i]
+ if lab < 0:
+ continue
+
+ counts[lab] += 1
+ for j in range(K):
+ val = values[i, j]
+
+ # not nan
+ if val == val and val != NAN:
+ nobs[lab, j] += 1
+ resx[lab, j] = val
+
+ for i in range(ncounts):
+ for j in range(K):
+ if nobs[i, j] == 0:
+ out[i, j] = NAN
+ else:
+ out[i, j] = resx[i, j]
+
+
+def group_nth_float32(ndarray[float32_t, ndim=2] out,
+ ndarray[int64_t] counts,
+ ndarray[float32_t, ndim=2] values,
+ ndarray[int64_t] labels, int64_t rank,
+ Py_ssize_t min_count=-1):
+ """
+ Only aggregates on axis=0
+ """
+ cdef:
+ Py_ssize_t i, j, N, K, lab, ncounts = len(counts)
+ float32_t val
+ ndarray[float32_t, ndim=2] resx
+ ndarray[int64_t, ndim=2] nobs
+
+ assert min_count == -1, "'min_count' only used in add and prod"
+
+ if not len(values) == len(labels):
+ raise AssertionError("len(index) != len(labels)")
+
+ nobs = np.zeros((<object>out).shape, dtype=np.int64)
+ resx = np.empty_like(out)
+
+ N, K = (<object>values).shape
+
+ with nogil:
+ for i in range(N):
+ lab = labels[i]
+ if lab < 0:
+ continue
+
+ counts[lab] += 1
+ for j in range(K):
+ val = values[i, j]
+
+ # not nan
+ if val == val and val != NAN:
+ nobs[lab, j] += 1
+ if nobs[lab, j] == rank:
+ resx[lab, j] = val
+
+ for i in range(ncounts):
+ for j in range(K):
+ if nobs[i, j] == 0:
+ out[i, j] = NAN
+ else:
+ out[i, j] = resx[i, j]
+
+
+
+def group_rank_float32(ndarray[float64_t, ndim=2] out,
+ ndarray[float32_t, ndim=2] values,
+ ndarray[int64_t] labels,
+ bint is_datetimelike, object ties_method,
+ bint ascending, bint pct, object na_option):
+ """
+ Provides the rank of values within each group.
+
+ Parameters
+ ----------
+ out : array of float64_t values which this method will write its results to
+ values : array of float32_t values to be ranked
+ labels : array containing unique label for each group, with its ordering
+ matching up to the corresponding record in `values`
+ is_datetimelike : bool, default False
+ unused in this method but provided for call compatibility with other
+ Cython transformations
+ ties_method : {'average', 'min', 'max', 'first', 'dense'}, default
+ 'average'
+ * average: average rank of group
+ * min: lowest rank in group
+ * max: highest rank in group
+ * first: ranks assigned in order they appear in the array
+ * dense: like 'min', but rank always increases by 1 between groups
+ ascending : boolean, default True
+        False for ranks from high (1) to low (N)
+ pct : boolean, default False
+ Compute percentage rank of data within each group
+ na_option : {'keep', 'top', 'bottom'}, default 'keep'
+ * keep: leave NA values where they are
+ * top: smallest rank if ascending
+ * bottom: smallest rank if descending
+
+ Notes
+ -----
+ This method modifies the `out` parameter rather than returning an object
+ """
+ cdef:
+ TiebreakEnumType tiebreak
+ Py_ssize_t i, j, N, K, grp_start=0, dups=0, sum_ranks=0
+ Py_ssize_t grp_vals_seen=1, grp_na_count=0, grp_tie_count=0
+ ndarray[int64_t] _as
+ ndarray[float64_t, ndim=2] grp_sizes
+ ndarray[float32_t] masked_vals
+ ndarray[uint8_t] mask
+ bint keep_na
+ float32_t nan_fill_val
+
+ tiebreak = tiebreakers[ties_method]
+ keep_na = na_option == 'keep'
+ N, K = (<object>values).shape
+ grp_sizes = np.ones_like(out)
+
+ # Copy values into new array in order to fill missing data
+ # with mask, without obfuscating location of missing data
+ # in values array
+ masked_vals = np.array(values[:, 0], copy=True)
+ mask = np.isnan(masked_vals).astype(np.uint8)
+
+ if ascending ^ (na_option == 'top'):
+ nan_fill_val = np.inf
+ order = (masked_vals, mask, labels)
+ else:
+ nan_fill_val = -np.inf
+ order = (masked_vals, ~mask, labels)
+ np.putmask(masked_vals, mask, nan_fill_val)
+
+ # lexsort using labels, then mask, then actual values
+ # each label corresponds to a different group value,
+ # the mask helps you differentiate missing values before
+ # performing sort on the actual values
+ _as = np.lexsort(order).astype(np.int64, copy=False)
+
+ if not ascending:
+ _as = _as[::-1]
+
+ with nogil:
+ # Loop over the length of the value array
+ # each incremental i value can be looked up in the _as array
+ # that we sorted previously, which gives us the location of
+ # that sorted value for retrieval back from the original
+ # values / masked_vals arrays
+ for i in range(N):
+ # dups and sum_ranks will be incremented each loop where
+ # the value / group remains the same, and should be reset
+ # when either of those change
+ # Used to calculate tiebreakers
+ dups += 1
+ sum_ranks += i - grp_start + 1
+
+ # Update out only when there is a transition of values or labels.
+            # When a new value or group is encountered, go back #dups steps
+            # (the number of occurrences of the current value) and assign the
+            # ranks based on the starting index of the current group
+            # (grp_start) and the current index
+ if (i == N - 1 or
+ (masked_vals[_as[i]] != masked_vals[_as[i+1]]) or
+ (mask[_as[i]] ^ mask[_as[i+1]]) or
+ (labels[_as[i]] != labels[_as[i+1]])):
+ # if keep_na, check for missing values and assign back
+ # to the result where appropriate
+ if keep_na and mask[_as[i]]:
+ for j in range(i - dups + 1, i + 1):
+ out[_as[j], 0] = NaN
+ grp_na_count = dups
+ elif tiebreak == TIEBREAK_AVERAGE:
+ for j in range(i - dups + 1, i + 1):
+ out[_as[j], 0] = sum_ranks / <float64_t>dups
+ elif tiebreak == TIEBREAK_MIN:
+ for j in range(i - dups + 1, i + 1):
+ out[_as[j], 0] = i - grp_start - dups + 2
+ elif tiebreak == TIEBREAK_MAX:
+ for j in range(i - dups + 1, i + 1):
+ out[_as[j], 0] = i - grp_start + 1
+ elif tiebreak == TIEBREAK_FIRST:
+ for j in range(i - dups + 1, i + 1):
+ if ascending:
+ out[_as[j], 0] = j + 1 - grp_start
+ else:
+ out[_as[j], 0] = 2 * i - j - dups + 2 - grp_start
+ elif tiebreak == TIEBREAK_DENSE:
+ for j in range(i - dups + 1, i + 1):
+ out[_as[j], 0] = grp_vals_seen
+
+ # look forward to the next value (using the sorting in _as)
+ # if the value does not equal the current value then we need to
+ # reset the dups and sum_ranks, knowing that a new value is
+ # coming up. the conditional also needs to handle nan equality
+ # and the end of iteration
+ if (i == N - 1 or
+ (masked_vals[_as[i]] != masked_vals[_as[i+1]]) or
+ (mask[_as[i]] ^ mask[_as[i+1]])):
+ dups = sum_ranks = 0
+ grp_vals_seen += 1
+ grp_tie_count += 1
+
+ # Similar to the previous conditional, check now if we are
+ # moving to a new group. If so, keep track of the index where
+ # the new group occurs, so the tiebreaker calculations can
+ # decrement that from their position. fill in the size of each
+ # group encountered (used by pct calculations later). also be
+ # sure to reset any of the items helping to calculate dups
+ if i == N - 1 or labels[_as[i]] != labels[_as[i+1]]:
+ if tiebreak != TIEBREAK_DENSE:
+ for j in range(grp_start, i + 1):
+ grp_sizes[_as[j], 0] = (i - grp_start + 1 -
+ grp_na_count)
+ else:
+ for j in range(grp_start, i + 1):
+ grp_sizes[_as[j], 0] = (grp_tie_count -
+ (grp_na_count > 0))
+ dups = sum_ranks = 0
+ grp_na_count = 0
+ grp_tie_count = 0
+ grp_start = i + 1
+ grp_vals_seen = 1
+
+ if pct:
+ for i in range(N):
+ # We don't include NaN values in percentage
+ # rankings, so we assign them percentages of NaN.
+ if out[i, 0] != out[i, 0] or out[i, 0] == NAN:
+ out[i, 0] = NAN
+ elif grp_sizes[i, 0] != 0:
+ out[i, 0] = out[i, 0] / grp_sizes[i, 0]
+
+
+def group_last_int64(ndarray[int64_t, ndim=2] out,
+ ndarray[int64_t] counts,
+ ndarray[int64_t, ndim=2] values,
+ ndarray[int64_t] labels,
+ Py_ssize_t min_count=-1):
+ """
+ Only aggregates on axis=0
+ """
+ cdef:
+ Py_ssize_t i, j, N, K, lab, ncounts = len(counts)
+ int64_t val
+ ndarray[int64_t, ndim=2] resx
+ ndarray[int64_t, ndim=2] nobs
+
+ assert min_count == -1, "'min_count' only used in add and prod"
+
+ if not len(values) == len(labels):
+ raise AssertionError("len(index) != len(labels)")
+
+ nobs = np.zeros((<object>out).shape, dtype=np.int64)
+ resx = np.empty_like(out)
+
+ N, K = (<object>values).shape
+
+ with nogil:
+ for i in range(N):
+ lab = labels[i]
+ if lab < 0:
+ continue
+
+ counts[lab] += 1
+ for j in range(K):
+ val = values[i, j]
+
+ # not nan
+ if val == val and val != NPY_NAT:
+ nobs[lab, j] += 1
+ resx[lab, j] = val
+
+ for i in range(ncounts):
+ for j in range(K):
+ if nobs[i, j] == 0:
+ out[i, j] = NPY_NAT
+ else:
+ out[i, j] = resx[i, j]
+
+
+def group_nth_int64(ndarray[int64_t, ndim=2] out,
+ ndarray[int64_t] counts,
+ ndarray[int64_t, ndim=2] values,
+ ndarray[int64_t] labels, int64_t rank,
+ Py_ssize_t min_count=-1):
+ """
+ Only aggregates on axis=0
+ """
+ cdef:
+ Py_ssize_t i, j, N, K, lab, ncounts = len(counts)
+ int64_t val
+ ndarray[int64_t, ndim=2] resx
+ ndarray[int64_t, ndim=2] nobs
+
+ assert min_count == -1, "'min_count' only used in add and prod"
+
+ if not len(values) == len(labels):
+ raise AssertionError("len(index) != len(labels)")
+
+ nobs = np.zeros((<object>out).shape, dtype=np.int64)
+ resx = np.empty_like(out)
+
+ N, K = (<object>values).shape
+
+ with nogil:
+ for i in range(N):
+ lab = labels[i]
+ if lab < 0:
+ continue
+
+ counts[lab] += 1
+ for j in range(K):
+ val = values[i, j]
+
+ # not nan
+ if val == val and val != NPY_NAT:
+ nobs[lab, j] += 1
+ if nobs[lab, j] == rank:
+ resx[lab, j] = val
+
+ for i in range(ncounts):
+ for j in range(K):
+ if nobs[i, j] == 0:
+ out[i, j] = NPY_NAT
+ else:
+ out[i, j] = resx[i, j]
+
+
+
+def group_rank_int64(ndarray[float64_t, ndim=2] out,
+ ndarray[int64_t, ndim=2] values,
+ ndarray[int64_t] labels,
+ bint is_datetimelike, object ties_method,
+ bint ascending, bint pct, object na_option):
+ """
+ Provides the rank of values within each group.
+
+ Parameters
+ ----------
+ out : array of float64_t values which this method will write its results to
+ values : array of int64_t values to be ranked
+ labels : array containing unique label for each group, with its ordering
+ matching up to the corresponding record in `values`
+ is_datetimelike : bool, default False
+ unused in this method but provided for call compatibility with other
+ Cython transformations
+ ties_method : {'average', 'min', 'max', 'first', 'dense'}, default
+ 'average'
+ * average: average rank of group
+ * min: lowest rank in group
+ * max: highest rank in group
+ * first: ranks assigned in order they appear in the array
+ * dense: like 'min', but rank always increases by 1 between groups
+ ascending : boolean, default True
+        False for ranks from high (1) to low (N)
+ pct : boolean, default False
+ Compute percentage rank of data within each group
+ na_option : {'keep', 'top', 'bottom'}, default 'keep'
+ * keep: leave NA values where they are
+ * top: smallest rank if ascending
+ * bottom: smallest rank if descending
+
+ Notes
+ -----
+ This method modifies the `out` parameter rather than returning an object
+ """
+ cdef:
+ TiebreakEnumType tiebreak
+ Py_ssize_t i, j, N, K, grp_start=0, dups=0, sum_ranks=0
+ Py_ssize_t grp_vals_seen=1, grp_na_count=0, grp_tie_count=0
+ ndarray[int64_t] _as
+ ndarray[float64_t, ndim=2] grp_sizes
+ ndarray[int64_t] masked_vals
+ ndarray[uint8_t] mask
+ bint keep_na
+ int64_t nan_fill_val
+
+ tiebreak = tiebreakers[ties_method]
+ keep_na = na_option == 'keep'
+ N, K = (<object>values).shape
+ grp_sizes = np.ones_like(out)
+
+ # Copy values into new array in order to fill missing data
+ # with mask, without obfuscating location of missing data
+ # in values array
+ masked_vals = np.array(values[:, 0], copy=True)
+ mask = (masked_vals == NPY_NAT).astype(np.uint8)
+
+ if ascending ^ (na_option == 'top'):
+ nan_fill_val = np.iinfo(np.int64).max
+ order = (masked_vals, mask, labels)
+ else:
+ nan_fill_val = np.iinfo(np.int64).min
+ order = (masked_vals, ~mask, labels)
+ np.putmask(masked_vals, mask, nan_fill_val)
+
+ # lexsort using labels, then mask, then actual values
+ # each label corresponds to a different group value,
+ # the mask helps you differentiate missing values before
+ # performing sort on the actual values
+ _as = np.lexsort(order).astype(np.int64, copy=False)
+
+ if not ascending:
+ _as = _as[::-1]
+
+ with nogil:
+ # Loop over the length of the value array
+ # each incremental i value can be looked up in the _as array
+ # that we sorted previously, which gives us the location of
+ # that sorted value for retrieval back from the original
+ # values / masked_vals arrays
+ for i in range(N):
+ # dups and sum_ranks will be incremented each loop where
+ # the value / group remains the same, and should be reset
+ # when either of those change
+ # Used to calculate tiebreakers
+ dups += 1
+ sum_ranks += i - grp_start + 1
+
+ # Update out only when there is a transition of values or labels.
+            # When a new value or group is encountered, go back #dups steps
+            # (the number of occurrences of the current value) and assign the
+            # ranks based on the starting index of the current group
+            # (grp_start) and the current index
+ if (i == N - 1 or
+ (masked_vals[_as[i]] != masked_vals[_as[i+1]]) or
+ (mask[_as[i]] ^ mask[_as[i+1]]) or
+ (labels[_as[i]] != labels[_as[i+1]])):
+ # if keep_na, check for missing values and assign back
+ # to the result where appropriate
+ if keep_na and mask[_as[i]]:
+ for j in range(i - dups + 1, i + 1):
+ out[_as[j], 0] = NaN
+ grp_na_count = dups
+ elif tiebreak == TIEBREAK_AVERAGE:
+ for j in range(i - dups + 1, i + 1):
+ out[_as[j], 0] = sum_ranks / <float64_t>dups
+ elif tiebreak == TIEBREAK_MIN:
+ for j in range(i - dups + 1, i + 1):
+ out[_as[j], 0] = i - grp_start - dups + 2
+ elif tiebreak == TIEBREAK_MAX:
+ for j in range(i - dups + 1, i + 1):
+ out[_as[j], 0] = i - grp_start + 1
+ elif tiebreak == TIEBREAK_FIRST:
+ for j in range(i - dups + 1, i + 1):
+ if ascending:
+ out[_as[j], 0] = j + 1 - grp_start
+ else:
+ out[_as[j], 0] = 2 * i - j - dups + 2 - grp_start
+ elif tiebreak == TIEBREAK_DENSE:
+ for j in range(i - dups + 1, i + 1):
+ out[_as[j], 0] = grp_vals_seen
+
+ # look forward to the next value (using the sorting in _as)
+ # if the value does not equal the current value then we need to
+ # reset the dups and sum_ranks, knowing that a new value is
+ # coming up. the conditional also needs to handle nan equality
+ # and the end of iteration
+ if (i == N - 1 or
+ (masked_vals[_as[i]] != masked_vals[_as[i+1]]) or
+ (mask[_as[i]] ^ mask[_as[i+1]])):
+ dups = sum_ranks = 0
+ grp_vals_seen += 1
+ grp_tie_count += 1
+
+ # Similar to the previous conditional, check now if we are
+ # moving to a new group. If so, keep track of the index where
+ # the new group occurs, so the tiebreaker calculations can
+ # decrement that from their position. fill in the size of each
+ # group encountered (used by pct calculations later). also be
+ # sure to reset any of the items helping to calculate dups
+ if i == N - 1 or labels[_as[i]] != labels[_as[i+1]]:
+ if tiebreak != TIEBREAK_DENSE:
+ for j in range(grp_start, i + 1):
+ grp_sizes[_as[j], 0] = (i - grp_start + 1 -
+ grp_na_count)
+ else:
+ for j in range(grp_start, i + 1):
+ grp_sizes[_as[j], 0] = (grp_tie_count -
+ (grp_na_count > 0))
+ dups = sum_ranks = 0
+ grp_na_count = 0
+ grp_tie_count = 0
+ grp_start = i + 1
+ grp_vals_seen = 1
+
+ if pct:
+ for i in range(N):
+ # We don't include NaN values in percentage
+ # rankings, so we assign them percentages of NaN.
+ if out[i, 0] != out[i, 0] or out[i, 0] == NAN:
+ out[i, 0] = NAN
+ elif grp_sizes[i, 0] != 0:
+ out[i, 0] = out[i, 0] / grp_sizes[i, 0]
+
+
+def group_last_object(ndarray[object, ndim=2] out,
+ ndarray[int64_t] counts,
+ ndarray[object, ndim=2] values,
+ ndarray[int64_t] labels,
+ Py_ssize_t min_count=-1):
+ """
+ Only aggregates on axis=0
+ """
+ cdef:
+ Py_ssize_t i, j, N, K, lab, ncounts = len(counts)
+ object val
+ ndarray[object, ndim=2] resx
+ ndarray[int64_t, ndim=2] nobs
+
+ assert min_count == -1, "'min_count' only used in add and prod"
+
+ if not len(values) == len(labels):
+ raise AssertionError("len(index) != len(labels)")
+
+ nobs = np.zeros((<object>out).shape, dtype=np.int64)
+ resx = np.empty((<object>out).shape, dtype=object)
+
+ N, K = (<object>values).shape
+
+ if True: # make templating happy
+ for i in range(N):
+ lab = labels[i]
+ if lab < 0:
+ continue
+
+ counts[lab] += 1
+ for j in range(K):
+ val = values[i, j]
+
+ # not nan
+ if val == val and val != NAN:
+ nobs[lab, j] += 1
+ resx[lab, j] = val
+
+ for i in range(ncounts):
+ for j in range(K):
+ if nobs[i, j] == 0:
+ out[i, j] = NAN
+ else:
+ out[i, j] = resx[i, j]
+
+
+def group_nth_object(ndarray[object, ndim=2] out,
+ ndarray[int64_t] counts,
+ ndarray[object, ndim=2] values,
+ ndarray[int64_t] labels, int64_t rank,
+ Py_ssize_t min_count=-1):
+ """
+ Only aggregates on axis=0
+ """
+ cdef:
+ Py_ssize_t i, j, N, K, lab, ncounts = len(counts)
+ object val
+ ndarray[object, ndim=2] resx
+ ndarray[int64_t, ndim=2] nobs
+
+ assert min_count == -1, "'min_count' only used in add and prod"
+
+ if not len(values) == len(labels):
+ raise AssertionError("len(index) != len(labels)")
+
+ nobs = np.zeros((<object>out).shape, dtype=np.int64)
+ resx = np.empty((<object>out).shape, dtype=object)
+
+ N, K = (<object>values).shape
+
+ if True: # make templating happy
+ for i in range(N):
+ lab = labels[i]
+ if lab < 0:
+ continue
+
+ counts[lab] += 1
+ for j in range(K):
+ val = values[i, j]
+
+ # not nan
+ if val == val and val != NAN:
+ nobs[lab, j] += 1
+ if nobs[lab, j] == rank:
+ resx[lab, j] = val
+
+ for i in range(ncounts):
+ for j in range(K):
+ if nobs[i, j] == 0:
+ out[i, j] = NAN
+ else:
+ out[i, j] = resx[i, j]
+
+
+
+# ----------------------------------------------------------------------
+# group_min, group_max
+# ----------------------------------------------------------------------
+
+# TODO: consider implementing for more dtypes
+ctypedef fused groupby_t:
+ float64_t
+ float32_t
+ int64_t
+
+
+def group_max(ndarray[groupby_t, ndim=2] out,
+ ndarray[int64_t] counts,
+ ndarray[groupby_t, ndim=2] values,
+ ndarray[int64_t] labels,
+ Py_ssize_t min_count=-1):
+ """
+ Only aggregates on axis=0
+ """
+ cdef:
+ Py_ssize_t i, j, N, K, lab, ncounts = len(counts)
+ groupby_t val, count, nan_val
+ ndarray[groupby_t, ndim=2] maxx, nobs
+
+ assert min_count == -1, "'min_count' only used in add and prod"
+
+ if not len(values) == len(labels):
+ raise AssertionError("len(index) != len(labels)")
+
+ nobs = np.zeros_like(out)
+
+ maxx = np.empty_like(out)
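+    # seed the running maxima with a minimal sentinel (-inf or
+    # -_int64_max) so any observed value replaces it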
+ if groupby_t is int64_t:
+ # Note: evaluated at compile-time
+ maxx[:] = -_int64_max
+ nan_val = NPY_NAT
+ else:
+ maxx[:] = -np.inf
+ nan_val = NAN
+
+ N, K = (<object>values).shape
+
+ with nogil:
+ for i in range(N):
+ lab = labels[i]
+ if lab < 0:
+ continue
+
+ counts[lab] += 1
+ for j in range(K):
+ val = values[i, j]
+
+ # not nan
+ if groupby_t is int64_t:
+ if val != nan_val:
+ nobs[lab, j] += 1
+ if val > maxx[lab, j]:
+ maxx[lab, j] = val
+ else:
+ if val == val and val != nan_val:
+ nobs[lab, j] += 1
+ if val > maxx[lab, j]:
+ maxx[lab, j] = val
+
+ for i in range(ncounts):
+ for j in range(K):
+ if nobs[i, j] == 0:
+ out[i, j] = nan_val
+ else:
+ out[i, j] = maxx[i, j]
+
+
+def group_min(ndarray[groupby_t, ndim=2] out,
+ ndarray[int64_t] counts,
+ ndarray[groupby_t, ndim=2] values,
+ ndarray[int64_t] labels,
+ Py_ssize_t min_count=-1):
+ """
+ Only aggregates on axis=0
+ """
+ cdef:
+ Py_ssize_t i, j, N, K, lab, ncounts = len(counts)
+ groupby_t val, count, nan_val
+ ndarray[groupby_t, ndim=2] minx, nobs
+
+ assert min_count == -1, "'min_count' only used in add and prod"
+
+ if not len(values) == len(labels):
+ raise AssertionError("len(index) != len(labels)")
+
+ nobs = np.zeros_like(out)
+
+ minx = np.empty_like(out)
+ if groupby_t is int64_t:
+ minx[:] = _int64_max
+ nan_val = NPY_NAT
+ else:
+ minx[:] = np.inf
+ nan_val = NAN
+
+ N, K = (<object>values).shape
+
+ with nogil:
+ for i in range(N):
+ lab = labels[i]
+ if lab < 0:
+ continue
+
+ counts[lab] += 1
+ for j in range(K):
+ val = values[i, j]
+
+ # not nan
+ if groupby_t is int64_t:
+ if val != nan_val:
+ nobs[lab, j] += 1
+ if val < minx[lab, j]:
+ minx[lab, j] = val
+ else:
+ if val == val and val != nan_val:
+ nobs[lab, j] += 1
+ if val < minx[lab, j]:
+ minx[lab, j] = val
+
+ for i in range(ncounts):
+ for j in range(K):
+ if nobs[i, j] == 0:
+ out[i, j] = nan_val
+ else:
+ out[i, j] = minx[i, j]
+
+
+def group_cummin(ndarray[groupby_t, ndim=2] out,
+ ndarray[groupby_t, ndim=2] values,
+ ndarray[int64_t] labels,
+ bint is_datetimelike):
+ """
+ Only transforms on axis=0
+ """
+ cdef:
+ Py_ssize_t i, j, N, K, size
+ groupby_t val, mval
+ ndarray[groupby_t, ndim=2] accum
+ int64_t lab
+
+ N, K = (<object>values).shape
+ accum = np.empty_like(values)
+ if groupby_t is int64_t:
+ accum[:] = _int64_max
+ else:
+ accum[:] = np.inf
+
+ with nogil:
+ for i in range(N):
+ lab = labels[i]
+
+ if lab < 0:
+ continue
+ for j in range(K):
+ val = values[i, j]
+
+ # val = nan
+ if groupby_t is int64_t:
+ if is_datetimelike and val == NPY_NAT:
+ out[i, j] = NPY_NAT
+ else:
+ mval = accum[lab, j]
+ if val < mval:
+ accum[lab, j] = mval = val
+ out[i, j] = mval
+ else:
+ if val == val:
+ mval = accum[lab, j]
+ if val < mval:
+ accum[lab, j] = mval = val
+ out[i, j] = mval
+
+
+def group_cummax(ndarray[groupby_t, ndim=2] out,
+ ndarray[groupby_t, ndim=2] values,
+ ndarray[int64_t] labels,
+ bint is_datetimelike):
+ """
+ Only transforms on axis=0
+ """
+ cdef:
+ Py_ssize_t i, j, N, K, size
+ groupby_t val, mval
+ ndarray[groupby_t, ndim=2] accum
+ int64_t lab
+
+ N, K = (<object>values).shape
+ accum = np.empty_like(values)
+ if groupby_t is int64_t:
+ accum[:] = -_int64_max
+ else:
+ accum[:] = -np.inf
+
+ with nogil:
+ for i in range(N):
+ lab = labels[i]
+
+ if lab < 0:
+ continue
+ for j in range(K):
+ val = values[i, j]
+
+ if groupby_t is int64_t:
+ if is_datetimelike and val == NPY_NAT:
+ out[i, j] = NPY_NAT
+ else:
+ mval = accum[lab, j]
+ if val > mval:
+ accum[lab, j] = mval = val
+ out[i, j] = mval
+ else:
+ if val == val:
+ mval = accum[lab, j]
+ if val > mval:
+ accum[lab, j] = mval = val
+ out[i, j] = mval
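+
+
+# Illustrative usage of group_cummax (a minimal sketch; assumes this helper is
+# importable via pandas._libs.groupby; group_cummin is symmetric):
+#
+#   import numpy as np
+#   values = np.array([[1.0], [3.0], [2.0], [5.0]], dtype=np.float64)
+#   labels = np.array([0, 1, 0, 1], dtype=np.int64)
+#   out = np.empty_like(values)
+#   group_cummax(out, values, labels, is_datetimelike=False)
+#   # out is now [[1.0], [3.0], [2.0], [5.0]]: a running per-group maximum,
+#   # written row by row in the original order of `values`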
diff --git a/contrib/python/pandas/py2/pandas/_libs/hashing.pyx b/contrib/python/pandas/py2/pandas/_libs/hashing.pyx
new file mode 100644
index 00000000000..21d6c5378e1
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/_libs/hashing.pyx
@@ -0,0 +1,181 @@
+# -*- coding: utf-8 -*-
+# Translated from the reference implementation
+# at https://github.com/veorq/SipHash
+
+import cython
+from libc.stdlib cimport malloc, free
+
+import numpy as np
+from numpy cimport uint8_t, uint32_t, uint64_t, import_array
+import_array()
+
+from pandas._libs.util cimport is_nan
+
+DEF cROUNDS = 2
+DEF dROUNDS = 4
+
+
+def hash_object_array(object[:] arr, object key, object encoding='utf8'):
+ """
+ Parameters
+ ----------
+ arr : 1-d object ndarray of objects
+    key : hash key; must encode to exactly 16 bytes
+ encoding : encoding for key & arr, default to 'utf8'
+
+ Returns
+ -------
+ 1-d uint64 ndarray of hashes
+
+ Notes
+ -----
+    Allowed values must be strings or nulls;
+    mixed array types will raise a TypeError.
+
+ """
+ cdef:
+ Py_ssize_t i, l, n
+ uint64_t[:] result
+ bytes data, k
+ uint8_t *kb
+ uint64_t *lens
+ char **vecs
+ char *cdata
+ object val
+ list datas = []
+
+ k = <bytes>key.encode(encoding)
+ kb = <uint8_t *>k
+ if len(k) != 16:
+ raise ValueError("key should be a 16-byte string encoded, "
+ "got {key} (len {klen})".format(key=k, klen=len(k)))
+
+ n = len(arr)
+
+ # create an array of bytes
+ vecs = <char **>malloc(n * sizeof(char *))
+ lens = <uint64_t*>malloc(n * sizeof(uint64_t))
+
+ for i in range(n):
+ val = arr[i]
+ if isinstance(val, bytes):
+ data = <bytes>val
+ elif isinstance(val, unicode):
+ data = <bytes>val.encode(encoding)
+ elif val is None or is_nan(val):
+ # null, stringify and encode
+ data = <bytes>str(val).encode(encoding)
+
+ else:
+ raise TypeError("{val} of type {typ} is not a valid type "
+ "for hashing, must be string or null"
+ .format(val=val, typ=type(val)))
+
+ l = len(data)
+ lens[i] = l
+ cdata = data
+
+        # keep the references alive through the end of the function
+ datas.append(data)
+ vecs[i] = cdata
+
+ result = np.empty(n, dtype=np.uint64)
+ with nogil:
+ for i in range(n):
+ result[i] = low_level_siphash(<uint8_t *>vecs[i], lens[i], kb)
+
+ free(vecs)
+ free(lens)
+ return result.base # .base to retrieve underlying np.ndarray
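+
+
+# Illustrative usage (a minimal sketch; assumes this module is importable as
+# pandas._libs.hashing and that the key encodes to exactly 16 bytes):
+#
+#   import numpy as np
+#   from pandas._libs.hashing import hash_object_array
+#
+#   arr = np.array(['a', 'b', None], dtype=object)
+#   hashes = hash_object_array(arr, key='0123456789123456')
+#   # hashes is a uint64 ndarray of length 3; equal inputs always hash to
+#   # the same value for a given key, and None is hashed via str(None)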
+
+
+cdef inline uint64_t _rotl(uint64_t x, uint64_t b) nogil:
+ return (x << b) | (x >> (64 - b))
+
+
+cdef inline void u32to8_le(uint8_t* p, uint32_t v) nogil:
+ p[0] = <uint8_t>(v)
+ p[1] = <uint8_t>(v >> 8)
+ p[2] = <uint8_t>(v >> 16)
+ p[3] = <uint8_t>(v >> 24)
+
+
+cdef inline uint64_t u8to64_le(uint8_t* p) nogil:
+ return (<uint64_t>p[0] |
+ <uint64_t>p[1] << 8 |
+ <uint64_t>p[2] << 16 |
+ <uint64_t>p[3] << 24 |
+ <uint64_t>p[4] << 32 |
+ <uint64_t>p[5] << 40 |
+ <uint64_t>p[6] << 48 |
+ <uint64_t>p[7] << 56)
+
+
+cdef inline void _sipround(uint64_t* v0, uint64_t* v1,
+ uint64_t* v2, uint64_t* v3) nogil:
+ v0[0] += v1[0]
+ v1[0] = _rotl(v1[0], 13)
+ v1[0] ^= v0[0]
+ v0[0] = _rotl(v0[0], 32)
+ v2[0] += v3[0]
+ v3[0] = _rotl(v3[0], 16)
+ v3[0] ^= v2[0]
+ v0[0] += v3[0]
+ v3[0] = _rotl(v3[0], 21)
+ v3[0] ^= v0[0]
+ v2[0] += v1[0]
+ v1[0] = _rotl(v1[0], 17)
+ v1[0] ^= v2[0]
+ v2[0] = _rotl(v2[0], 32)
+
+
+cdef uint64_t low_level_siphash(uint8_t* data, size_t datalen,
+ uint8_t* key) nogil:
+ cdef uint64_t v0 = 0x736f6d6570736575ULL
+ cdef uint64_t v1 = 0x646f72616e646f6dULL
+ cdef uint64_t v2 = 0x6c7967656e657261ULL
+ cdef uint64_t v3 = 0x7465646279746573ULL
+ cdef uint64_t b
+ cdef uint64_t k0 = u8to64_le(key)
+ cdef uint64_t k1 = u8to64_le(key + 8)
+ cdef uint64_t m
+ cdef int i
+ cdef uint8_t* end = data + datalen - (datalen % sizeof(uint64_t))
+ cdef int left = datalen & 7
+ cdef int left_byte
+
+ b = (<uint64_t>datalen) << 56
+ v3 ^= k1
+ v2 ^= k0
+ v1 ^= k1
+ v0 ^= k0
+
+ while (data != end):
+ m = u8to64_le(data)
+ v3 ^= m
+ for i in range(cROUNDS):
+ _sipround(&v0, &v1, &v2, &v3)
+ v0 ^= m
+
+ data += sizeof(uint64_t)
+
+ for i in range(left-1, -1, -1):
+ b |= (<uint64_t>data[i]) << (i * 8)
+
+ v3 ^= b
+
+ for i in range(cROUNDS):
+ _sipround(&v0, &v1, &v2, &v3)
+
+ v0 ^= b
+ v2 ^= 0xff
+
+ for i in range(dROUNDS):
+ _sipround(&v0, &v1, &v2, &v3)
+
+ b = v0 ^ v1 ^ v2 ^ v3
+
+ return b
diff --git a/contrib/python/pandas/py2/pandas/_libs/hashtable.pxd b/contrib/python/pandas/py2/pandas/_libs/hashtable.pxd
new file mode 100644
index 00000000000..609420f4297
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/_libs/hashtable.pxd
@@ -0,0 +1,54 @@
+from pandas._libs.khash cimport (
+ kh_int64_t, kh_uint64_t, kh_float64_t, kh_pymap_t, kh_str_t, uint64_t,
+ int64_t, float64_t)
+from numpy cimport ndarray
+
+# prototypes for sharing
+
+cdef class HashTable:
+ pass
+
+cdef class UInt64HashTable(HashTable):
+ cdef kh_uint64_t *table
+
+ cpdef get_item(self, uint64_t val)
+ cpdef set_item(self, uint64_t key, Py_ssize_t val)
+
+cdef class Int64HashTable(HashTable):
+ cdef kh_int64_t *table
+
+ cpdef get_item(self, int64_t val)
+ cpdef set_item(self, int64_t key, Py_ssize_t val)
+
+cdef class Float64HashTable(HashTable):
+ cdef kh_float64_t *table
+
+ cpdef get_item(self, float64_t val)
+ cpdef set_item(self, float64_t key, Py_ssize_t val)
+
+cdef class PyObjectHashTable(HashTable):
+ cdef kh_pymap_t *table
+
+ cpdef get_item(self, object val)
+ cpdef set_item(self, object key, Py_ssize_t val)
+
+
+cdef class StringHashTable(HashTable):
+ cdef kh_str_t *table
+
+ cpdef get_item(self, object val)
+ cpdef set_item(self, object key, Py_ssize_t val)
+
+cdef struct Int64VectorData:
+ int64_t *data
+ size_t n, m
+
+cdef class Int64Vector:
+ cdef Int64VectorData *data
+ cdef ndarray ao
+ cdef bint external_view_exists
+
+ cdef resize(self)
+ cpdef to_array(self)
+ cdef inline void append(self, int64_t x)
+ cdef extend(self, int64_t[:] x)
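+
+
+# Illustrative cimport usage from another Cython module (a minimal sketch;
+# the module path assumes the pandas._libs package layout declared above):
+#
+#   from pandas._libs.hashtable cimport Int64HashTable
+#
+#   cdef Int64HashTable table = Int64HashTable(8)
+#   table.set_item(42, 0)      # cpdef methods resolve without Python overhead
+#   idx = table.get_item(42)   # -> 0; raises KeyError for missing keys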
diff --git a/contrib/python/pandas/py2/pandas/_libs/hashtable.pyx b/contrib/python/pandas/py2/pandas/_libs/hashtable.pyx
new file mode 100644
index 00000000000..47fa5932290
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/_libs/hashtable.pyx
@@ -0,0 +1,173 @@
+# -*- coding: utf-8 -*-
+
+cimport cython
+
+from cpython cimport (PyObject, Py_INCREF,
+ PyMem_Malloc, PyMem_Realloc, PyMem_Free)
+
+from libc.stdlib cimport malloc, free
+
+import numpy as np
+cimport numpy as cnp
+from numpy cimport ndarray, uint8_t, uint32_t, float64_t
+cnp.import_array()
+
+cdef extern from "numpy/npy_math.h":
+ float64_t NAN "NPY_NAN"
+
+
+from pandas._libs.khash cimport (
+ khiter_t,
+
+ kh_str_t, kh_init_str, kh_put_str, kh_exist_str,
+ kh_get_str, kh_destroy_str, kh_resize_str,
+
+ kh_put_strbox, kh_get_strbox, kh_init_strbox,
+
+ kh_int64_t, kh_init_int64, kh_resize_int64, kh_destroy_int64,
+ kh_get_int64, kh_exist_int64, kh_put_int64,
+
+ kh_float64_t, kh_exist_float64, kh_put_float64, kh_init_float64,
+ kh_get_float64, kh_destroy_float64, kh_resize_float64,
+
+ kh_resize_uint64, kh_exist_uint64, kh_destroy_uint64, kh_put_uint64,
+ kh_get_uint64, kh_init_uint64,
+
+ kh_destroy_pymap, kh_exist_pymap, kh_init_pymap, kh_get_pymap,
+ kh_put_pymap, kh_resize_pymap)
+
+
+cimport pandas._libs.util as util
+
+from pandas._libs.missing cimport checknull
+
+
+cdef int64_t NPY_NAT = util.get_nat()
+_SIZE_HINT_LIMIT = (1 << 20) + 7
+
+
+cdef size_t _INIT_VEC_CAP = 128
+
+include "hashtable_class_helper.pxi"
+include "hashtable_func_helper.pxi"
+
+cdef class Factorizer:
+ cdef public PyObjectHashTable table
+ cdef public ObjectVector uniques
+ cdef public Py_ssize_t count
+
+ def __init__(self, size_hint):
+ self.table = PyObjectHashTable(size_hint)
+ self.uniques = ObjectVector()
+ self.count = 0
+
+ def get_count(self):
+ return self.count
+
+ def factorize(self, ndarray[object] values, sort=False, na_sentinel=-1,
+ na_value=None):
+ """
+        Factorize values with nans replaced by na_sentinel
+
+        >>> factorize(np.array([1,2,np.nan], dtype='O'), na_sentinel=20)
+ array([ 0, 1, 20])
+ """
+ if self.uniques.external_view_exists:
+ uniques = ObjectVector()
+ uniques.extend(self.uniques.to_array())
+ self.uniques = uniques
+ labels = self.table.get_labels(values, self.uniques,
+ self.count, na_sentinel, na_value)
+ mask = (labels == na_sentinel)
+        # recode labels to the sorted order of the uniques
+ if sort:
+ if labels.dtype != np.intp:
+ labels = labels.astype(np.intp)
+ sorter = self.uniques.to_array().argsort()
+ reverse_indexer = np.empty(len(sorter), dtype=np.intp)
+ reverse_indexer.put(sorter, np.arange(len(sorter)))
+ labels = reverse_indexer.take(labels, mode='clip')
+ labels[mask] = na_sentinel
+ self.count = len(self.uniques)
+ return labels
+
+ def unique(self, ndarray[object] values):
+ # just for fun
+ return self.table.unique(values)
+
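+# Illustrative usage (a minimal sketch; assumes this module is importable as
+# pandas._libs.hashtable):
+#
+#   import numpy as np
+#   from pandas._libs.hashtable import Factorizer
+#
+#   f = Factorizer(size_hint=8)
+#   labels = f.factorize(np.array(['a', 'b', 'a', None], dtype=object))
+#   # labels -> [0, 1, 0, -1]; missing values get na_sentinel and are not
+#   # added to f.uniques, so f.get_count() is 2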
+
+cdef class Int64Factorizer:
+ cdef public Int64HashTable table
+ cdef public Int64Vector uniques
+ cdef public Py_ssize_t count
+
+ def __init__(self, size_hint):
+ self.table = Int64HashTable(size_hint)
+ self.uniques = Int64Vector()
+ self.count = 0
+
+ def get_count(self):
+ return self.count
+
+ def factorize(self, int64_t[:] values, sort=False,
+ na_sentinel=-1, na_value=None):
+ """
+        Factorize values with nans replaced by na_sentinel
+
+        >>> factorize(np.array([1,2,np.nan], dtype='O'), na_sentinel=20)
+ array([ 0, 1, 20])
+ """
+ if self.uniques.external_view_exists:
+ uniques = Int64Vector()
+ uniques.extend(self.uniques.to_array())
+ self.uniques = uniques
+ labels = self.table.get_labels(values, self.uniques,
+ self.count, na_sentinel,
+ na_value=na_value)
+
+        # recode labels to the sorted order of the uniques
+ if sort:
+ if labels.dtype != np.intp:
+ labels = labels.astype(np.intp)
+
+ sorter = self.uniques.to_array().argsort()
+ reverse_indexer = np.empty(len(sorter), dtype=np.intp)
+ reverse_indexer.put(sorter, np.arange(len(sorter)))
+
+ labels = reverse_indexer.take(labels)
+
+ self.count = len(self.uniques)
+ return labels
+
+
+def unique_label_indices(ndarray[int64_t, ndim=1] labels):
+ """
+    Indices of the first occurrences of the unique labels
+    (*excluding* -1). Equivalent to:
+ np.unique(labels, return_index=True)[1]
+ """
+ cdef:
+ int ret = 0
+ Py_ssize_t i, n = len(labels)
+ kh_int64_t *table = kh_init_int64()
+ Int64Vector idx = Int64Vector()
+ ndarray[int64_t, ndim=1] arr
+ Int64VectorData *ud = idx.data
+
+ kh_resize_int64(table, min(n, _SIZE_HINT_LIMIT))
+
+ with nogil:
+ for i in range(n):
+ kh_put_int64(table, labels[i], &ret)
+ if ret != 0:
+ if needs_resize(ud):
+ with gil:
+ idx.resize()
+ append_data_int64(ud, i)
+
+ kh_destroy_int64(table)
+
+ arr = idx.to_array()
+ arr = arr[labels[arr].argsort()]
+
+ return arr[1:] if arr.size != 0 and labels[arr[0]] == -1 else arr
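+
+
+# Illustrative usage (a minimal sketch; assumes this module is importable as
+# pandas._libs.hashtable):
+#
+#   import numpy as np
+#   from pandas._libs.hashtable import unique_label_indices
+#
+#   labels = np.array([3, 3, -1, 2, 3, 2], dtype=np.int64)
+#   unique_label_indices(labels)
+#   # -> array([3, 0]): the first occurrence of each label in ascending label
+#   # order (label 2 first appears at index 3, label 3 at index 0); -1 dropped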
diff --git a/contrib/python/pandas/py2/pandas/_libs/hashtable_class_helper.pxi b/contrib/python/pandas/py2/pandas/_libs/hashtable_class_helper.pxi
new file mode 100644
index 00000000000..36dcae7cba0
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/_libs/hashtable_class_helper.pxi
@@ -0,0 +1,1802 @@
+"""
+Template for each `dtype` helper function for hashtable
+
+WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in
+"""
+
+
+# ----------------------------------------------------------------------
+# VectorData
+# ----------------------------------------------------------------------
+
+ctypedef struct Float64VectorData:
+ float64_t *data
+ Py_ssize_t n, m
+
+
+cdef inline void append_data_float64(Float64VectorData *data,
+ float64_t x) nogil:
+
+ data.data[data.n] = x
+ data.n += 1
+
+
+cdef inline void append_data_int64(Int64VectorData *data,
+ int64_t x) nogil:
+
+ data.data[data.n] = x
+ data.n += 1
+
+ctypedef struct StringVectorData:
+ char * *data
+ Py_ssize_t n, m
+
+
+cdef inline void append_data_string(StringVectorData *data,
+ char * x) nogil:
+
+ data.data[data.n] = x
+ data.n += 1
+
+ctypedef struct UInt64VectorData:
+ uint64_t *data
+ Py_ssize_t n, m
+
+
+cdef inline void append_data_uint64(UInt64VectorData *data,
+ uint64_t x) nogil:
+
+ data.data[data.n] = x
+ data.n += 1
+
+ctypedef fused vector_data:
+ Int64VectorData
+ UInt64VectorData
+ Float64VectorData
+ StringVectorData
+
+cdef inline bint needs_resize(vector_data *data) nogil:
+ return data.n == data.m
+
+# ----------------------------------------------------------------------
+# Vector
+# ----------------------------------------------------------------------
+
+cdef class Float64Vector:
+
+ cdef:
+ bint external_view_exists
+ Float64VectorData *data
+ ndarray ao
+
+ def __cinit__(self):
+ self.data = <Float64VectorData *>PyMem_Malloc(
+ sizeof(Float64VectorData))
+ if not self.data:
+ raise MemoryError()
+ self.external_view_exists = False
+ self.data.n = 0
+ self.data.m = _INIT_VEC_CAP
+ self.ao = np.empty(self.data.m, dtype=np.float64)
+ self.data.data = <float64_t*>self.ao.data
+
+ cdef resize(self):
+ self.data.m = max(self.data.m * 4, _INIT_VEC_CAP)
+ self.ao.resize(self.data.m, refcheck=False)
+ self.data.data = <float64_t*>self.ao.data
+
+ def __dealloc__(self):
+ if self.data is not NULL:
+ PyMem_Free(self.data)
+ self.data = NULL
+
+ def __len__(self):
+ return self.data.n
+
+ cpdef to_array(self):
+ if self.data.m != self.data.n:
+ if self.external_view_exists:
+ # should never happen
+ raise ValueError("should have raised on append()")
+ self.ao.resize(self.data.n, refcheck=False)
+ self.data.m = self.data.n
+ self.external_view_exists = True
+ return self.ao
+
+ cdef inline void append(self, float64_t x):
+
+ if needs_resize(self.data):
+ if self.external_view_exists:
+ raise ValueError("external reference but "
+ "Vector.resize() needed")
+ self.resize()
+
+ append_data_float64(self.data, x)
+
+ cdef extend(self, const float64_t[:] x):
+ for i in range(len(x)):
+ self.append(x[i])
+
+cdef class UInt64Vector:
+
+ cdef:
+ bint external_view_exists
+ UInt64VectorData *data
+ ndarray ao
+
+ def __cinit__(self):
+ self.data = <UInt64VectorData *>PyMem_Malloc(
+ sizeof(UInt64VectorData))
+ if not self.data:
+ raise MemoryError()
+ self.external_view_exists = False
+ self.data.n = 0
+ self.data.m = _INIT_VEC_CAP
+ self.ao = np.empty(self.data.m, dtype=np.uint64)
+ self.data.data = <uint64_t*>self.ao.data
+
+ cdef resize(self):
+ self.data.m = max(self.data.m * 4, _INIT_VEC_CAP)
+ self.ao.resize(self.data.m, refcheck=False)
+ self.data.data = <uint64_t*>self.ao.data
+
+ def __dealloc__(self):
+ if self.data is not NULL:
+ PyMem_Free(self.data)
+ self.data = NULL
+
+ def __len__(self):
+ return self.data.n
+
+ cpdef to_array(self):
+ if self.data.m != self.data.n:
+ if self.external_view_exists:
+ # should never happen
+ raise ValueError("should have raised on append()")
+ self.ao.resize(self.data.n, refcheck=False)
+ self.data.m = self.data.n
+ self.external_view_exists = True
+ return self.ao
+
+ cdef inline void append(self, uint64_t x):
+
+ if needs_resize(self.data):
+ if self.external_view_exists:
+ raise ValueError("external reference but "
+ "Vector.resize() needed")
+ self.resize()
+
+ append_data_uint64(self.data, x)
+
+ cdef extend(self, const uint64_t[:] x):
+ for i in range(len(x)):
+ self.append(x[i])
+
+cdef class Int64Vector:
+
+
+ def __cinit__(self):
+ self.data = <Int64VectorData *>PyMem_Malloc(
+ sizeof(Int64VectorData))
+ if not self.data:
+ raise MemoryError()
+ self.external_view_exists = False
+ self.data.n = 0
+ self.data.m = _INIT_VEC_CAP
+ self.ao = np.empty(self.data.m, dtype=np.int64)
+ self.data.data = <int64_t*>self.ao.data
+
+ cdef resize(self):
+ self.data.m = max(self.data.m * 4, _INIT_VEC_CAP)
+ self.ao.resize(self.data.m, refcheck=False)
+ self.data.data = <int64_t*>self.ao.data
+
+ def __dealloc__(self):
+ if self.data is not NULL:
+ PyMem_Free(self.data)
+ self.data = NULL
+
+ def __len__(self):
+ return self.data.n
+
+ cpdef to_array(self):
+ if self.data.m != self.data.n:
+ if self.external_view_exists:
+ # should never happen
+ raise ValueError("should have raised on append()")
+ self.ao.resize(self.data.n, refcheck=False)
+ self.data.m = self.data.n
+ self.external_view_exists = True
+ return self.ao
+
+ cdef inline void append(self, int64_t x):
+
+ if needs_resize(self.data):
+ if self.external_view_exists:
+ raise ValueError("external reference but "
+ "Vector.resize() needed")
+ self.resize()
+
+ append_data_int64(self.data, x)
+
+ cdef extend(self, const int64_t[:] x):
+ for i in range(len(x)):
+ self.append(x[i])
+
+cdef class StringVector:
+
+ cdef:
+ StringVectorData *data
+ bint external_view_exists
+
+ def __cinit__(self):
+ self.data = <StringVectorData *>PyMem_Malloc(sizeof(StringVectorData))
+ if not self.data:
+ raise MemoryError()
+ self.external_view_exists = False
+ self.data.n = 0
+ self.data.m = _INIT_VEC_CAP
+ self.data.data = <char **>malloc(self.data.m * sizeof(char *))
+ if not self.data.data:
+ raise MemoryError()
+
+ cdef resize(self):
+ cdef:
+ char **orig_data
+ Py_ssize_t i, m
+
+ m = self.data.m
+ self.data.m = max(self.data.m * 4, _INIT_VEC_CAP)
+
+ orig_data = self.data.data
+ self.data.data = <char **>malloc(self.data.m * sizeof(char *))
+ if not self.data.data:
+ raise MemoryError()
+ for i in range(m):
+ self.data.data[i] = orig_data[i]
+
+ def __dealloc__(self):
+ if self.data is not NULL:
+ if self.data.data is not NULL:
+ free(self.data.data)
+ PyMem_Free(self.data)
+ self.data = NULL
+
+ def __len__(self):
+ return self.data.n
+
+ def to_array(self):
+ cdef:
+ ndarray ao
+ Py_ssize_t n
+ object val
+
+ ao = np.empty(self.data.n, dtype=np.object)
+ for i in range(self.data.n):
+ val = self.data.data[i]
+ ao[i] = val
+ self.external_view_exists = True
+ self.data.m = self.data.n
+ return ao
+
+ cdef inline void append(self, char *x):
+
+ if needs_resize(self.data):
+ self.resize()
+
+ append_data_string(self.data, x)
+
+ cdef extend(self, ndarray[:] x):
+ for i in range(len(x)):
+ self.append(x[i])
+
+
+cdef class ObjectVector:
+
+ cdef:
+ PyObject **data
+ Py_ssize_t n, m
+ ndarray ao
+ bint external_view_exists
+
+ def __cinit__(self):
+ self.external_view_exists = False
+ self.n = 0
+ self.m = _INIT_VEC_CAP
+ self.ao = np.empty(_INIT_VEC_CAP, dtype=object)
+ self.data = <PyObject**>self.ao.data
+
+ def __len__(self):
+ return self.n
+
+ cdef inline append(self, object obj):
+ if self.n == self.m:
+ if self.external_view_exists:
+ raise ValueError("external reference but "
+ "Vector.resize() needed")
+ self.m = max(self.m * 2, _INIT_VEC_CAP)
+ self.ao.resize(self.m, refcheck=False)
+ self.data = <PyObject**>self.ao.data
+
+ Py_INCREF(obj)
+ self.data[self.n] = <PyObject*>obj
+ self.n += 1
+
+ def to_array(self):
+ if self.m != self.n:
+ if self.external_view_exists:
+ raise ValueError("should have raised on append()")
+ self.ao.resize(self.n, refcheck=False)
+ self.m = self.n
+ self.external_view_exists = True
+ return self.ao
+
+ cdef extend(self, ndarray[:] x):
+ for i in range(len(x)):
+ self.append(x[i])
+
+# ----------------------------------------------------------------------
+# HashTable
+# ----------------------------------------------------------------------
+
+
+cdef class HashTable:
+
+ pass
+
+cdef class Float64HashTable(HashTable):
+
+ def __cinit__(self, int64_t size_hint=1):
+ self.table = kh_init_float64()
+ if size_hint is not None:
+ size_hint = min(size_hint, _SIZE_HINT_LIMIT)
+ kh_resize_float64(self.table, size_hint)
+
+ def __len__(self):
+ return self.table.size
+
+ def __dealloc__(self):
+ if self.table is not NULL:
+ kh_destroy_float64(self.table)
+ self.table = NULL
+
+ def __contains__(self, object key):
+ cdef khiter_t k
+ k = kh_get_float64(self.table, key)
+ return k != self.table.n_buckets
+
+ def sizeof(self, deep=False):
+ """ return the size of my table in bytes """
+ return self.table.n_buckets * (sizeof(float64_t) + # keys
+ sizeof(Py_ssize_t) + # vals
+ sizeof(uint32_t)) # flags
+
+ cpdef get_item(self, float64_t val):
+ cdef khiter_t k
+ k = kh_get_float64(self.table, val)
+ if k != self.table.n_buckets:
+ return self.table.vals[k]
+ else:
+ raise KeyError(val)
+
+ cpdef set_item(self, float64_t key, Py_ssize_t val):
+ cdef:
+ khiter_t k
+ int ret = 0
+
+ k = kh_put_float64(self.table, key, &ret)
+ self.table.keys[k] = key
+ if kh_exist_float64(self.table, k):
+ self.table.vals[k] = val
+ else:
+ raise KeyError(key)
+
+ @cython.boundscheck(False)
+ def map(self, const float64_t[:] keys, const int64_t[:] values):
+ cdef:
+ Py_ssize_t i, n = len(values)
+ int ret = 0
+ float64_t key
+ khiter_t k
+
+ with nogil:
+ for i in range(n):
+ key = keys[i]
+ k = kh_put_float64(self.table, key, &ret)
+ self.table.vals[k] = <Py_ssize_t>values[i]
+
+ @cython.boundscheck(False)
+ def map_locations(self, ndarray[float64_t, ndim=1] values):
+ cdef:
+ Py_ssize_t i, n = len(values)
+ int ret = 0
+ float64_t val
+ khiter_t k
+
+ with nogil:
+ for i in range(n):
+ val = values[i]
+ k = kh_put_float64(self.table, val, &ret)
+ self.table.vals[k] = i
+
+ @cython.boundscheck(False)
+ def lookup(self, const float64_t[:] values):
+ cdef:
+ Py_ssize_t i, n = len(values)
+ int ret = 0
+ float64_t val
+ khiter_t k
+ int64_t[:] locs = np.empty(n, dtype=np.int64)
+
+ with nogil:
+ for i in range(n):
+ val = values[i]
+ k = kh_get_float64(self.table, val)
+ if k != self.table.n_buckets:
+ locs[i] = self.table.vals[k]
+ else:
+ locs[i] = -1
+
+ return np.asarray(locs)
+
+ @cython.boundscheck(False)
+ @cython.wraparound(False)
+ def _unique(self, const float64_t[:] values, Float64Vector uniques,
+ Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1,
+ object na_value=None, bint ignore_na=False,
+ bint return_inverse=False):
+ """
+ Calculate unique values and labels (no sorting!)
+
+ Parameters
+ ----------
+ values : ndarray[float64]
+ Array of values of which unique will be calculated
+ uniques : Float64Vector
+ Vector into which uniques will be written
+ count_prior : Py_ssize_t, default 0
+ Number of existing entries in uniques
+ na_sentinel : Py_ssize_t, default -1
+ Sentinel value used for all NA-values in inverse
+ na_value : object, default None
+ Value to identify as missing. If na_value is None, then
+ any value "val" satisfying val != val is considered missing.
+ If na_value is not None, then _additionally_, any value "val"
+ satisfying val == na_value is considered missing.
+ ignore_na : boolean, default False
+ Whether NA-values should be ignored for calculating the uniques. If
+ True, the labels corresponding to missing values will be set to
+ na_sentinel.
+ return_inverse : boolean, default False
+ Whether the mapping of the original array values to their location
+ in the vector of uniques should be returned.
+
+ Returns
+ -------
+ uniques : ndarray[float64]
+ Unique values of input, not sorted
+ labels : ndarray[int64] (if return_inverse=True)
+ The labels from values to uniques
+ """
+ cdef:
+ Py_ssize_t i, idx, count = count_prior, n = len(values)
+ int64_t[:] labels
+ int ret = 0
+ float64_t val, na_value2
+ khiter_t k
+ Float64VectorData *ud
+ bint use_na_value
+
+ if return_inverse:
+ labels = np.empty(n, dtype=np.int64)
+ ud = uniques.data
+ use_na_value = na_value is not None
+
+ if use_na_value:
+ # We need this na_value2 because we want to allow users
+ # to *optionally* specify an NA sentinel *of the correct* type.
+ # We use None, to make it optional, which requires `object` type
+ # for the parameter. To please the compiler, we use na_value2,
+ # which is only used if it's *specified*.
+ na_value2 = <float64_t>na_value
+ else:
+ na_value2 = np.nan
+
+ with nogil:
+ for i in range(n):
+ val = values[i]
+
+ if ignore_na and (val != val
+ or (use_na_value and val == na_value2)):
+ # if missing values do not count as unique values (i.e. if
+ # ignore_na is True), skip the hashtable entry for them,
+ # and replace the corresponding label with na_sentinel
+ labels[i] = na_sentinel
+ continue
+
+ k = kh_get_float64(self.table, val)
+
+ if k == self.table.n_buckets:
+ # k hasn't been seen yet
+ k = kh_put_float64(self.table, val, &ret)
+
+ if needs_resize(ud):
+ with gil:
+ if uniques.external_view_exists:
+ raise ValueError("external reference to "
+ "uniques held, but "
+ "Vector.resize() needed")
+ uniques.resize()
+ append_data_float64(ud, val)
+ if return_inverse:
+ self.table.vals[k] = count
+ labels[i] = count
+ count += 1
+ elif return_inverse:
+ # k falls into a previous bucket
+ # only relevant in case we need to construct the inverse
+ idx = self.table.vals[k]
+ labels[i] = idx
+
+ if return_inverse:
+ return uniques.to_array(), np.asarray(labels)
+ return uniques.to_array()
+
+ def unique(self, const float64_t[:] values, bint return_inverse=False):
+ """
+ Calculate unique values and labels (no sorting!)
+
+ Parameters
+ ----------
+ values : ndarray[float64]
+ Array of values of which unique will be calculated
+ return_inverse : boolean, default False
+ Whether the mapping of the original array values to their location
+ in the vector of uniques should be returned.
+
+ Returns
+ -------
+ uniques : ndarray[float64]
+ Unique values of input, not sorted
+ labels : ndarray[int64] (if return_inverse)
+ The labels from values to uniques
+ """
+ uniques = Float64Vector()
+ return self._unique(values, uniques, ignore_na=False,
+ return_inverse=return_inverse)
+
+ def factorize(self, const float64_t[:] values, Py_ssize_t na_sentinel=-1,
+ object na_value=None):
+ """
+ Calculate unique values and labels (no sorting!)
+
+ Missing values are not included in the "uniques" for this method.
+ The labels for any missing values will be set to "na_sentinel"
+
+ Parameters
+ ----------
+ values : ndarray[float64]
+ Array of values of which unique will be calculated
+ na_sentinel : Py_ssize_t, default -1
+ Sentinel value used for all NA-values in inverse
+ na_value : object, default None
+ Value to identify as missing. If na_value is None, then
+ any value "val" satisfying val != val is considered missing.
+ If na_value is not None, then _additionally_, any value "val"
+ satisfying val == na_value is considered missing.
+
+ Returns
+ -------
+ uniques : ndarray[float64]
+ Unique values of input, not sorted
+ labels : ndarray[int64]
+ The labels from values to uniques
+ """
+ uniques_vector = Float64Vector()
+ return self._unique(values, uniques_vector, na_sentinel=na_sentinel,
+ na_value=na_value, ignore_na=True,
+ return_inverse=True)
+
+ def get_labels(self, const float64_t[:] values, Float64Vector uniques,
+ Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1,
+ object na_value=None):
+ _, labels = self._unique(values, uniques, count_prior=count_prior,
+ na_sentinel=na_sentinel, na_value=na_value,
+ ignore_na=True, return_inverse=True)
+ return labels
+
+ @cython.boundscheck(False)
+ def get_labels_groupby(self, const float64_t[:] values):
+ cdef:
+ Py_ssize_t i, n = len(values)
+ int64_t[:] labels
+ Py_ssize_t idx, count = 0
+ int ret = 0
+ float64_t val
+ khiter_t k
+ Float64Vector uniques = Float64Vector()
+ Float64VectorData *ud
+
+ labels = np.empty(n, dtype=np.int64)
+ ud = uniques.data
+
+ with nogil:
+ for i in range(n):
+ val = values[i]
+
+ # specific for groupby
+ if val < 0:
+ labels[i] = -1
+ continue
+
+ k = kh_get_float64(self.table, val)
+ if k != self.table.n_buckets:
+ idx = self.table.vals[k]
+ labels[i] = idx
+ else:
+ k = kh_put_float64(self.table, val, &ret)
+ self.table.vals[k] = count
+
+ if needs_resize(ud):
+ with gil:
+ uniques.resize()
+ append_data_float64(ud, val)
+ labels[i] = count
+ count += 1
+
+ arr_uniques = uniques.to_array()
+
+ return np.asarray(labels), arr_uniques
+
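+# Illustrative usage (a minimal sketch; assumes this module is importable as
+# pandas._libs.hashtable; the UInt64/Int64/String/PyObject tables below follow
+# the same pattern):
+#
+#   import numpy as np
+#   from pandas._libs.hashtable import Float64HashTable
+#
+#   table = Float64HashTable()
+#   uniques, labels = table.factorize(np.array([1.5, np.nan, 1.5, 2.0]))
+#   # uniques -> [1.5, 2.0]; labels -> [0, -1, 0, 1]: NaN maps to the
+#   # na_sentinel and is excluded from the uniques
+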
+cdef class UInt64HashTable(HashTable):
+
+ def __cinit__(self, int64_t size_hint=1):
+ self.table = kh_init_uint64()
+ if size_hint is not None:
+ size_hint = min(size_hint, _SIZE_HINT_LIMIT)
+ kh_resize_uint64(self.table, size_hint)
+
+ def __len__(self):
+ return self.table.size
+
+ def __dealloc__(self):
+ if self.table is not NULL:
+ kh_destroy_uint64(self.table)
+ self.table = NULL
+
+ def __contains__(self, object key):
+ cdef khiter_t k
+ k = kh_get_uint64(self.table, key)
+ return k != self.table.n_buckets
+
+ def sizeof(self, deep=False):
+ """ return the size of my table in bytes """
+ return self.table.n_buckets * (sizeof(uint64_t) + # keys
+ sizeof(Py_ssize_t) + # vals
+ sizeof(uint32_t)) # flags
+
+ cpdef get_item(self, uint64_t val):
+ cdef khiter_t k
+ k = kh_get_uint64(self.table, val)
+ if k != self.table.n_buckets:
+ return self.table.vals[k]
+ else:
+ raise KeyError(val)
+
+ cpdef set_item(self, uint64_t key, Py_ssize_t val):
+ cdef:
+ khiter_t k
+ int ret = 0
+
+ k = kh_put_uint64(self.table, key, &ret)
+ self.table.keys[k] = key
+ if kh_exist_uint64(self.table, k):
+ self.table.vals[k] = val
+ else:
+ raise KeyError(key)
+
+ @cython.boundscheck(False)
+ def map(self, const uint64_t[:] keys, const int64_t[:] values):
+ cdef:
+ Py_ssize_t i, n = len(values)
+ int ret = 0
+ uint64_t key
+ khiter_t k
+
+ with nogil:
+ for i in range(n):
+ key = keys[i]
+ k = kh_put_uint64(self.table, key, &ret)
+ self.table.vals[k] = <Py_ssize_t>values[i]
+
+ @cython.boundscheck(False)
+ def map_locations(self, ndarray[uint64_t, ndim=1] values):
+ cdef:
+ Py_ssize_t i, n = len(values)
+ int ret = 0
+ uint64_t val
+ khiter_t k
+
+ with nogil:
+ for i in range(n):
+ val = values[i]
+ k = kh_put_uint64(self.table, val, &ret)
+ self.table.vals[k] = i
+
+ @cython.boundscheck(False)
+ def lookup(self, const uint64_t[:] values):
+ cdef:
+ Py_ssize_t i, n = len(values)
+ int ret = 0
+ uint64_t val
+ khiter_t k
+ int64_t[:] locs = np.empty(n, dtype=np.int64)
+
+ with nogil:
+ for i in range(n):
+ val = values[i]
+ k = kh_get_uint64(self.table, val)
+ if k != self.table.n_buckets:
+ locs[i] = self.table.vals[k]
+ else:
+ locs[i] = -1
+
+ return np.asarray(locs)
+
+ @cython.boundscheck(False)
+ @cython.wraparound(False)
+ def _unique(self, const uint64_t[:] values, UInt64Vector uniques,
+ Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1,
+ object na_value=None, bint ignore_na=False,
+ bint return_inverse=False):
+ """
+ Calculate unique values and labels (no sorting!)
+
+ Parameters
+ ----------
+ values : ndarray[uint64]
+ Array of values of which unique will be calculated
+ uniques : UInt64Vector
+ Vector into which uniques will be written
+ count_prior : Py_ssize_t, default 0
+ Number of existing entries in uniques
+ na_sentinel : Py_ssize_t, default -1
+ Sentinel value used for all NA-values in inverse
+ na_value : object, default None
+ Value to identify as missing. If na_value is None, then
+ any value "val" satisfying val != val is considered missing.
+ If na_value is not None, then _additionally_, any value "val"
+ satisfying val == na_value is considered missing.
+ ignore_na : boolean, default False
+ Whether NA-values should be ignored for calculating the uniques. If
+ True, the labels corresponding to missing values will be set to
+ na_sentinel.
+ return_inverse : boolean, default False
+ Whether the mapping of the original array values to their location
+ in the vector of uniques should be returned.
+
+ Returns
+ -------
+ uniques : ndarray[uint64]
+ Unique values of input, not sorted
+ labels : ndarray[int64] (if return_inverse=True)
+ The labels from values to uniques
+ """
+ cdef:
+ Py_ssize_t i, idx, count = count_prior, n = len(values)
+ int64_t[:] labels
+ int ret = 0
+ uint64_t val, na_value2
+ khiter_t k
+ UInt64VectorData *ud
+ bint use_na_value
+
+ if return_inverse:
+ labels = np.empty(n, dtype=np.int64)
+ ud = uniques.data
+ use_na_value = na_value is not None
+
+ if use_na_value:
+ # We need this na_value2 because we want to allow users
+ # to *optionally* specify an NA sentinel *of the correct* type.
+ # We use None, to make it optional, which requires `object` type
+ # for the parameter. To please the compiler, we use na_value2,
+ # which is only used if it's *specified*.
+ na_value2 = <uint64_t>na_value
+ else:
+ na_value2 = 0
+
+ with nogil:
+ for i in range(n):
+ val = values[i]
+
+ if ignore_na and (val != val
+ or (use_na_value and val == na_value2)):
+ # if missing values do not count as unique values (i.e. if
+ # ignore_na is True), skip the hashtable entry for them,
+ # and replace the corresponding label with na_sentinel
+ labels[i] = na_sentinel
+ continue
+
+ k = kh_get_uint64(self.table, val)
+
+ if k == self.table.n_buckets:
+ # k hasn't been seen yet
+ k = kh_put_uint64(self.table, val, &ret)
+
+ if needs_resize(ud):
+ with gil:
+ if uniques.external_view_exists:
+ raise ValueError("external reference to "
+ "uniques held, but "
+ "Vector.resize() needed")
+ uniques.resize()
+ append_data_uint64(ud, val)
+ if return_inverse:
+ self.table.vals[k] = count
+ labels[i] = count
+ count += 1
+ elif return_inverse:
+ # k falls into a previous bucket
+ # only relevant in case we need to construct the inverse
+ idx = self.table.vals[k]
+ labels[i] = idx
+
+ if return_inverse:
+ return uniques.to_array(), np.asarray(labels)
+ return uniques.to_array()
+
+ def unique(self, const uint64_t[:] values, bint return_inverse=False):
+ """
+ Calculate unique values and labels (no sorting!)
+
+ Parameters
+ ----------
+ values : ndarray[uint64]
+ Array of values of which unique will be calculated
+ return_inverse : boolean, default False
+ Whether the mapping of the original array values to their location
+ in the vector of uniques should be returned.
+
+ Returns
+ -------
+ uniques : ndarray[uint64]
+ Unique values of input, not sorted
+ labels : ndarray[int64] (if return_inverse)
+ The labels from values to uniques
+ """
+ uniques = UInt64Vector()
+ return self._unique(values, uniques, ignore_na=False,
+ return_inverse=return_inverse)
+
+ def factorize(self, const uint64_t[:] values, Py_ssize_t na_sentinel=-1,
+ object na_value=None):
+ """
+ Calculate unique values and labels (no sorting!)
+
+ Missing values are not included in the "uniques" for this method.
+ The labels for any missing values will be set to "na_sentinel"
+
+ Parameters
+ ----------
+ values : ndarray[uint64]
+ Array of values of which unique will be calculated
+ na_sentinel : Py_ssize_t, default -1
+ Sentinel value used for all NA-values in inverse
+ na_value : object, default None
+ Value to identify as missing. If na_value is None, then
+ any value "val" satisfying val != val is considered missing.
+ If na_value is not None, then _additionally_, any value "val"
+ satisfying val == na_value is considered missing.
+
+ Returns
+ -------
+ uniques : ndarray[uint64]
+ Unique values of input, not sorted
+ labels : ndarray[int64]
+ The labels from values to uniques
+ """
+ uniques_vector = UInt64Vector()
+ return self._unique(values, uniques_vector, na_sentinel=na_sentinel,
+ na_value=na_value, ignore_na=True,
+ return_inverse=True)
+
+ def get_labels(self, const uint64_t[:] values, UInt64Vector uniques,
+ Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1,
+ object na_value=None):
+ _, labels = self._unique(values, uniques, count_prior=count_prior,
+ na_sentinel=na_sentinel, na_value=na_value,
+ ignore_na=True, return_inverse=True)
+ return labels
+
+ @cython.boundscheck(False)
+ def get_labels_groupby(self, const uint64_t[:] values):
+ cdef:
+ Py_ssize_t i, n = len(values)
+ int64_t[:] labels
+ Py_ssize_t idx, count = 0
+ int ret = 0
+ uint64_t val
+ khiter_t k
+ UInt64Vector uniques = UInt64Vector()
+ UInt64VectorData *ud
+
+ labels = np.empty(n, dtype=np.int64)
+ ud = uniques.data
+
+ with nogil:
+ for i in range(n):
+ val = values[i]
+
+ # specific for groupby
+
+ k = kh_get_uint64(self.table, val)
+ if k != self.table.n_buckets:
+ idx = self.table.vals[k]
+ labels[i] = idx
+ else:
+ k = kh_put_uint64(self.table, val, &ret)
+ self.table.vals[k] = count
+
+ if needs_resize(ud):
+ with gil:
+ uniques.resize()
+ append_data_uint64(ud, val)
+ labels[i] = count
+ count += 1
+
+ arr_uniques = uniques.to_array()
+
+ return np.asarray(labels), arr_uniques
+
+cdef class Int64HashTable(HashTable):
+
+ def __cinit__(self, int64_t size_hint=1):
+ self.table = kh_init_int64()
+ if size_hint is not None:
+ size_hint = min(size_hint, _SIZE_HINT_LIMIT)
+ kh_resize_int64(self.table, size_hint)
+
+ def __len__(self):
+ return self.table.size
+
+ def __dealloc__(self):
+ if self.table is not NULL:
+ kh_destroy_int64(self.table)
+ self.table = NULL
+
+ def __contains__(self, object key):
+ cdef khiter_t k
+ k = kh_get_int64(self.table, key)
+ return k != self.table.n_buckets
+
+ def sizeof(self, deep=False):
+ """ return the size of my table in bytes """
+ return self.table.n_buckets * (sizeof(int64_t) + # keys
+ sizeof(Py_ssize_t) + # vals
+ sizeof(uint32_t)) # flags
+
+ cpdef get_item(self, int64_t val):
+ cdef khiter_t k
+ k = kh_get_int64(self.table, val)
+ if k != self.table.n_buckets:
+ return self.table.vals[k]
+ else:
+ raise KeyError(val)
+
+ cpdef set_item(self, int64_t key, Py_ssize_t val):
+ cdef:
+ khiter_t k
+ int ret = 0
+
+ k = kh_put_int64(self.table, key, &ret)
+ self.table.keys[k] = key
+ if kh_exist_int64(self.table, k):
+ self.table.vals[k] = val
+ else:
+ raise KeyError(key)
+
+ @cython.boundscheck(False)
+ def map(self, const int64_t[:] keys, const int64_t[:] values):
+ cdef:
+ Py_ssize_t i, n = len(values)
+ int ret = 0
+ int64_t key
+ khiter_t k
+
+ with nogil:
+ for i in range(n):
+ key = keys[i]
+ k = kh_put_int64(self.table, key, &ret)
+ self.table.vals[k] = <Py_ssize_t>values[i]
+
+ @cython.boundscheck(False)
+ def map_locations(self, ndarray[int64_t, ndim=1] values):
+ cdef:
+ Py_ssize_t i, n = len(values)
+ int ret = 0
+ int64_t val
+ khiter_t k
+
+ with nogil:
+ for i in range(n):
+ val = values[i]
+ k = kh_put_int64(self.table, val, &ret)
+ self.table.vals[k] = i
+
+ @cython.boundscheck(False)
+ def lookup(self, const int64_t[:] values):
+ cdef:
+ Py_ssize_t i, n = len(values)
+ int ret = 0
+ int64_t val
+ khiter_t k
+ int64_t[:] locs = np.empty(n, dtype=np.int64)
+
+ with nogil:
+ for i in range(n):
+ val = values[i]
+ k = kh_get_int64(self.table, val)
+ if k != self.table.n_buckets:
+ locs[i] = self.table.vals[k]
+ else:
+ locs[i] = -1
+
+ return np.asarray(locs)
+
+ @cython.boundscheck(False)
+ @cython.wraparound(False)
+ def _unique(self, const int64_t[:] values, Int64Vector uniques,
+ Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1,
+ object na_value=None, bint ignore_na=False,
+ bint return_inverse=False):
+ """
+ Calculate unique values and labels (no sorting!)
+
+ Parameters
+ ----------
+ values : ndarray[int64]
+ Array of values of which unique will be calculated
+ uniques : Int64Vector
+ Vector into which uniques will be written
+ count_prior : Py_ssize_t, default 0
+ Number of existing entries in uniques
+ na_sentinel : Py_ssize_t, default -1
+ Sentinel value used for all NA-values in inverse
+ na_value : object, default None
+ Value to identify as missing. If na_value is None, then
+ any value "val" satisfying val != val is considered missing.
+ If na_value is not None, then _additionally_, any value "val"
+ satisfying val == na_value is considered missing.
+ ignore_na : boolean, default False
+ Whether NA-values should be ignored for calculating the uniques. If
+ True, the labels corresponding to missing values will be set to
+ na_sentinel.
+ return_inverse : boolean, default False
+ Whether the mapping of the original array values to their location
+ in the vector of uniques should be returned.
+
+ Returns
+ -------
+ uniques : ndarray[int64]
+ Unique values of input, not sorted
+ labels : ndarray[int64] (if return_inverse=True)
+ The labels from values to uniques
+ """
+ cdef:
+ Py_ssize_t i, idx, count = count_prior, n = len(values)
+ int64_t[:] labels
+ int ret = 0
+ int64_t val, na_value2
+ khiter_t k
+ Int64VectorData *ud
+ bint use_na_value
+
+ if return_inverse:
+ labels = np.empty(n, dtype=np.int64)
+ ud = uniques.data
+ use_na_value = na_value is not None
+
+ if use_na_value:
+ # We need this na_value2 because we want to allow users
+ # to *optionally* specify an NA sentinel *of the correct* type.
+ # We use None, to make it optional, which requires `object` type
+ # for the parameter. To please the compiler, we use na_value2,
+ # which is only used if it's *specified*.
+ na_value2 = <int64_t>na_value
+ else:
+ na_value2 = NPY_NAT
+
+ with nogil:
+ for i in range(n):
+ val = values[i]
+
+ if ignore_na and (val != val
+ or (use_na_value and val == na_value2)):
+ # if missing values do not count as unique values (i.e. if
+ # ignore_na is True), skip the hashtable entry for them,
+ # and replace the corresponding label with na_sentinel
+ labels[i] = na_sentinel
+ continue
+
+ k = kh_get_int64(self.table, val)
+
+ if k == self.table.n_buckets:
+ # k hasn't been seen yet
+ k = kh_put_int64(self.table, val, &ret)
+
+ if needs_resize(ud):
+ with gil:
+ if uniques.external_view_exists:
+ raise ValueError("external reference to "
+ "uniques held, but "
+ "Vector.resize() needed")
+ uniques.resize()
+ append_data_int64(ud, val)
+ if return_inverse:
+ self.table.vals[k] = count
+ labels[i] = count
+ count += 1
+ elif return_inverse:
+ # k falls into a previous bucket
+ # only relevant in case we need to construct the inverse
+ idx = self.table.vals[k]
+ labels[i] = idx
+
+ if return_inverse:
+ return uniques.to_array(), np.asarray(labels)
+ return uniques.to_array()
+
+ def unique(self, const int64_t[:] values, bint return_inverse=False):
+ """
+ Calculate unique values and labels (no sorting!)
+
+ Parameters
+ ----------
+ values : ndarray[int64]
+ Array of values of which unique will be calculated
+ return_inverse : boolean, default False
+ Whether the mapping of the original array values to their location
+ in the vector of uniques should be returned.
+
+ Returns
+ -------
+ uniques : ndarray[int64]
+ Unique values of input, not sorted
+ labels : ndarray[int64] (if return_inverse)
+ The labels from values to uniques
+ """
+ uniques = Int64Vector()
+ return self._unique(values, uniques, ignore_na=False,
+ return_inverse=return_inverse)
+
+ def factorize(self, const int64_t[:] values, Py_ssize_t na_sentinel=-1,
+ object na_value=None):
+ """
+ Calculate unique values and labels (no sorting!)
+
+ Missing values are not included in the "uniques" for this method.
+ The labels for any missing values will be set to "na_sentinel"
+
+ Parameters
+ ----------
+ values : ndarray[int64]
+ Array of values of which unique will be calculated
+ na_sentinel : Py_ssize_t, default -1
+ Sentinel value used for all NA-values in inverse
+ na_value : object, default None
+ Value to identify as missing. If na_value is None, then
+ any value "val" satisfying val != val is considered missing.
+ If na_value is not None, then _additionally_, any value "val"
+ satisfying val == na_value is considered missing.
+
+ Returns
+ -------
+ uniques : ndarray[int64]
+ Unique values of input, not sorted
+ labels : ndarray[int64]
+ The labels from values to uniques
+ """
+ uniques_vector = Int64Vector()
+ return self._unique(values, uniques_vector, na_sentinel=na_sentinel,
+ na_value=na_value, ignore_na=True,
+ return_inverse=True)
+
+ def get_labels(self, const int64_t[:] values, Int64Vector uniques,
+ Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1,
+ object na_value=None):
+ _, labels = self._unique(values, uniques, count_prior=count_prior,
+ na_sentinel=na_sentinel, na_value=na_value,
+ ignore_na=True, return_inverse=True)
+ return labels
+
+ @cython.boundscheck(False)
+ def get_labels_groupby(self, const int64_t[:] values):
+ cdef:
+ Py_ssize_t i, n = len(values)
+ int64_t[:] labels
+ Py_ssize_t idx, count = 0
+ int ret = 0
+ int64_t val
+ khiter_t k
+ Int64Vector uniques = Int64Vector()
+ Int64VectorData *ud
+
+ labels = np.empty(n, dtype=np.int64)
+ ud = uniques.data
+
+ with nogil:
+ for i in range(n):
+ val = values[i]
+
+ # specific for groupby
+ if val < 0:
+ labels[i] = -1
+ continue
+
+ k = kh_get_int64(self.table, val)
+ if k != self.table.n_buckets:
+ idx = self.table.vals[k]
+ labels[i] = idx
+ else:
+ k = kh_put_int64(self.table, val, &ret)
+ self.table.vals[k] = count
+
+ if needs_resize(ud):
+ with gil:
+ uniques.resize()
+ append_data_int64(ud, val)
+ labels[i] = count
+ count += 1
+
+ arr_uniques = uniques.to_array()
+
+ return np.asarray(labels), arr_uniques
+
+
+cdef class StringHashTable(HashTable):
+ # these by-definition *must* be strings
+ # or a sentinel np.nan / None missing value
+ na_string_sentinel = '__nan__'
+
+ def __init__(self, int64_t size_hint=1):
+ self.table = kh_init_str()
+ if size_hint is not None:
+ size_hint = min(size_hint, _SIZE_HINT_LIMIT)
+ kh_resize_str(self.table, size_hint)
+
+ def __dealloc__(self):
+ if self.table is not NULL:
+ kh_destroy_str(self.table)
+ self.table = NULL
+
+ def sizeof(self, deep=False):
+ """ return the size of my table in bytes """
+ return self.table.n_buckets * (sizeof(char *) + # keys
+ sizeof(Py_ssize_t) + # vals
+ sizeof(uint32_t)) # flags
+
+ cpdef get_item(self, object val):
+ cdef:
+ khiter_t k
+ const char *v
+ v = util.get_c_string(val)
+
+ k = kh_get_str(self.table, v)
+ if k != self.table.n_buckets:
+ return self.table.vals[k]
+ else:
+ raise KeyError(val)
+
+ cpdef set_item(self, object key, Py_ssize_t val):
+ cdef:
+ khiter_t k
+ int ret = 0
+ const char *v
+
+ v = util.get_c_string(val)
+
+ k = kh_put_str(self.table, v, &ret)
+ self.table.keys[k] = key
+ if kh_exist_str(self.table, k):
+ self.table.vals[k] = val
+ else:
+ raise KeyError(key)
+
+ @cython.boundscheck(False)
+ def get_indexer(self, ndarray[object] values):
+ cdef:
+ Py_ssize_t i, n = len(values)
+ ndarray[int64_t] labels = np.empty(n, dtype=np.int64)
+ int64_t *resbuf = <int64_t*>labels.data
+ khiter_t k
+ kh_str_t *table = self.table
+ const char *v
+ const char **vecs
+
+ vecs = <const char **>malloc(n * sizeof(char *))
+ for i in range(n):
+ val = values[i]
+ v = util.get_c_string(val)
+ vecs[i] = v
+
+ with nogil:
+ for i in range(n):
+ k = kh_get_str(table, vecs[i])
+ if k != table.n_buckets:
+ resbuf[i] = table.vals[k]
+ else:
+ resbuf[i] = -1
+
+ free(vecs)
+ return labels
+
+ @cython.boundscheck(False)
+ def lookup(self, ndarray[object] values):
+ cdef:
+ Py_ssize_t i, n = len(values)
+ int ret = 0
+ object val
+ const char *v
+ khiter_t k
+ int64_t[:] locs = np.empty(n, dtype=np.int64)
+
+ # these by-definition *must* be strings
+ vecs = <char **>malloc(n * sizeof(char *))
+ for i in range(n):
+ val = values[i]
+
+ if isinstance(val, (str, unicode)):
+ v = util.get_c_string(val)
+ else:
+ v = util.get_c_string(self.na_string_sentinel)
+ vecs[i] = v
+
+ with nogil:
+ for i in range(n):
+ v = vecs[i]
+ k = kh_get_str(self.table, v)
+ if k != self.table.n_buckets:
+ locs[i] = self.table.vals[k]
+ else:
+ locs[i] = -1
+
+ free(vecs)
+ return np.asarray(locs)
+
+ @cython.boundscheck(False)
+ def map_locations(self, ndarray[object] values):
+ cdef:
+ Py_ssize_t i, n = len(values)
+ int ret = 0
+ object val
+ const char *v
+ const char **vecs
+ khiter_t k
+
+ # these by-definition *must* be strings
+ vecs = <const char **>malloc(n * sizeof(char *))
+ for i in range(n):
+ val = values[i]
+
+ if isinstance(val, (str, unicode)):
+ v = util.get_c_string(val)
+ else:
+ v = util.get_c_string(self.na_string_sentinel)
+ vecs[i] = v
+
+ with nogil:
+ for i in range(n):
+ v = vecs[i]
+ k = kh_put_str(self.table, v, &ret)
+ self.table.vals[k] = i
+ free(vecs)
+
+ @cython.boundscheck(False)
+ @cython.wraparound(False)
+ def _unique(self, ndarray[object] values, ObjectVector uniques,
+ Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1,
+ object na_value=None, bint ignore_na=False,
+ bint return_inverse=False):
+ """
+ Calculate unique values and labels (no sorting!)
+
+ Parameters
+ ----------
+ values : ndarray[object]
+ Array of values of which unique will be calculated
+ uniques : ObjectVector
+ Vector into which uniques will be written
+ count_prior : Py_ssize_t, default 0
+ Number of existing entries in uniques
+ na_sentinel : Py_ssize_t, default -1
+ Sentinel value used for all NA-values in inverse
+ na_value : object, default None
+ Value to identify as missing. If na_value is None, then any value
+ that is not a string is considered missing. If na_value is
+ not None, then _additionally_ any value "val" satisfying
+ val == na_value is considered missing.
+ ignore_na : boolean, default False
+ Whether NA-values should be ignored for calculating the uniques. If
+ True, the labels corresponding to missing values will be set to
+ na_sentinel.
+ return_inverse : boolean, default False
+ Whether the mapping of the original array values to their location
+ in the vector of uniques should be returned.
+
+ Returns
+ -------
+ uniques : ndarray[object]
+ Unique values of input, not sorted
+ labels : ndarray[int64] (if return_inverse=True)
+ The labels from values to uniques
+ """
+ cdef:
+ Py_ssize_t i, idx, count = count_prior, n = len(values)
+ int64_t[:] labels
+ int64_t[:] uindexer
+ int ret = 0
+ object val
+ const char *v
+ const char **vecs
+ khiter_t k
+ bint use_na_value
+
+ if return_inverse:
+ labels = np.zeros(n, dtype=np.int64)
+ uindexer = np.empty(n, dtype=np.int64)
+ use_na_value = na_value is not None
+
+ # assign pointers and pre-filter out missing (if ignore_na)
+ vecs = <const char **>malloc(n * sizeof(char *))
+ for i in range(n):
+ val = values[i]
+
+ if (ignore_na
+ and (not isinstance(val, (str, unicode))
+ or (use_na_value and val == na_value))):
+ # if missing values do not count as unique values (i.e. if
+ # ignore_na is True), we can skip the actual value, and
+ # replace the label with na_sentinel directly
+ labels[i] = na_sentinel
+ else:
+ # if ignore_na is False, we also stringify NaN/None/etc.
+ v = util.get_c_string(val)
+ vecs[i] = v
+
+ # compute
+ with nogil:
+ for i in range(n):
+ if ignore_na and labels[i] == na_sentinel:
+ # skip entries for ignored missing values (see above)
+ continue
+
+ v = vecs[i]
+ k = kh_get_str(self.table, v)
+ if k == self.table.n_buckets:
+ # k hasn't been seen yet
+ k = kh_put_str(self.table, v, &ret)
+ uindexer[count] = i
+ if return_inverse:
+ self.table.vals[k] = count
+ labels[i] = <int64_t>count
+ count += 1
+ elif return_inverse:
+ # k falls into a previous bucket
+ # only relevant in case we need to construct the inverse
+ idx = self.table.vals[k]
+ labels[i] = <int64_t>idx
+
+ free(vecs)
+
+ # uniques
+ for i in range(count):
+ uniques.append(values[uindexer[i]])
+
+ if return_inverse:
+ return uniques.to_array(), np.asarray(labels)
+ return uniques.to_array()
+
+ def unique(self, ndarray[object] values, bint return_inverse=False):
+ """
+ Calculate unique values and labels (no sorting!)
+
+ Parameters
+ ----------
+ values : ndarray[object]
+ Array of values of which unique will be calculated
+ return_inverse : boolean, default False
+ Whether the mapping of the original array values to their location
+ in the vector of uniques should be returned.
+
+ Returns
+ -------
+ uniques : ndarray[object]
+ Unique values of input, not sorted
+ labels : ndarray[int64] (if return_inverse)
+ The labels from values to uniques
+ """
+ uniques = ObjectVector()
+ return self._unique(values, uniques, ignore_na=False,
+ return_inverse=return_inverse)
+
+ def factorize(self, ndarray[object] values, Py_ssize_t na_sentinel=-1,
+ object na_value=None):
+ """
+ Calculate unique values and labels (no sorting!)
+
+ Missing values are not included in the "uniques" for this method.
+ The labels for any missing values will be set to "na_sentinel"
+
+ Parameters
+ ----------
+ values : ndarray[object]
+ Array of values of which unique will be calculated
+ na_sentinel : Py_ssize_t, default -1
+ Sentinel value used for all NA-values in inverse
+ na_value : object, default None
+ Value to identify as missing. If na_value is None, then any value
+ that is not a string is considered missing. If na_value is
+ not None, then _additionally_ any value "val" satisfying
+ val == na_value is considered missing.
+
+ Returns
+ -------
+ uniques : ndarray[object]
+ Unique values of input, not sorted
+ labels : ndarray[int64]
+ The labels from values to uniques
+ """
+ uniques_vector = ObjectVector()
+ return self._unique(values, uniques_vector, na_sentinel=na_sentinel,
+ na_value=na_value, ignore_na=True,
+ return_inverse=True)
+
+ def get_labels(self, ndarray[object] values, ObjectVector uniques,
+ Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1,
+ object na_value=None):
+ _, labels = self._unique(values, uniques, count_prior=count_prior,
+ na_sentinel=na_sentinel, na_value=na_value,
+ ignore_na=True, return_inverse=True)
+ return labels
+
+
+cdef class PyObjectHashTable(HashTable):
+
+ def __init__(self, int64_t size_hint=1):
+ self.table = kh_init_pymap()
+ if size_hint is not None:
+ size_hint = min(size_hint, _SIZE_HINT_LIMIT)
+ kh_resize_pymap(self.table, size_hint)
+
+ def __dealloc__(self):
+ if self.table is not NULL:
+ kh_destroy_pymap(self.table)
+ self.table = NULL
+
+ def __len__(self):
+ return self.table.size
+
+ def __contains__(self, object key):
+ cdef khiter_t k
+ hash(key)
+
+ k = kh_get_pymap(self.table, <PyObject*>key)
+ return k != self.table.n_buckets
+
+ def sizeof(self, deep=False):
+ """ return the size of my table in bytes """
+ return self.table.n_buckets * (sizeof(PyObject *) + # keys
+ sizeof(Py_ssize_t) + # vals
+ sizeof(uint32_t)) # flags
+
+ cpdef get_item(self, object val):
+ cdef khiter_t k
+
+ k = kh_get_pymap(self.table, <PyObject*>val)
+ if k != self.table.n_buckets:
+ return self.table.vals[k]
+ else:
+ raise KeyError(val)
+
+ cpdef set_item(self, object key, Py_ssize_t val):
+ cdef:
+ khiter_t k
+ int ret = 0
+ char* buf
+
+ hash(key)
+
+ k = kh_put_pymap(self.table, <PyObject*>key, &ret)
+ # self.table.keys[k] = key
+ if kh_exist_pymap(self.table, k):
+ self.table.vals[k] = val
+ else:
+ raise KeyError(key)
+
+ def map_locations(self, ndarray[object] values):
+ cdef:
+ Py_ssize_t i, n = len(values)
+ int ret = 0
+ object val
+ khiter_t k
+
+ for i in range(n):
+ val = values[i]
+ hash(val)
+
+ k = kh_put_pymap(self.table, <PyObject*>val, &ret)
+ self.table.vals[k] = i
+
+ def lookup(self, ndarray[object] values):
+ cdef:
+ Py_ssize_t i, n = len(values)
+ int ret = 0
+ object val
+ khiter_t k
+ int64_t[:] locs = np.empty(n, dtype=np.int64)
+
+ for i in range(n):
+ val = values[i]
+ hash(val)
+
+ k = kh_get_pymap(self.table, <PyObject*>val)
+ if k != self.table.n_buckets:
+ locs[i] = self.table.vals[k]
+ else:
+ locs[i] = -1
+
+ return np.asarray(locs)
+
+ @cython.boundscheck(False)
+ @cython.wraparound(False)
+ def _unique(self, ndarray[object] values, ObjectVector uniques,
+ Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1,
+ object na_value=None, bint ignore_na=False,
+ bint return_inverse=False):
+ """
+ Calculate unique values and labels (no sorting!)
+
+ Parameters
+ ----------
+ values : ndarray[object]
+ Array of values of which unique will be calculated
+ uniques : ObjectVector
+ Vector into which uniques will be written
+ count_prior : Py_ssize_t, default 0
+ Number of existing entries in uniques
+ na_sentinel : Py_ssize_t, default -1
+ Sentinel value used for all NA-values in inverse
+ na_value : object, default None
+ Value to identify as missing. If na_value is None, then None _plus_
+ any value "val" satisfying val != val is considered missing.
+ If na_value is not None, then _additionally_, any value "val"
+ satisfying val == na_value is considered missing.
+ ignore_na : boolean, default False
+ Whether NA-values should be ignored for calculating the uniques. If
+ True, the labels corresponding to missing values will be set to
+ na_sentinel.
+ return_inverse : boolean, default False
+ Whether the mapping of the original array values to their location
+ in the vector of uniques should be returned.
+
+ Returns
+ -------
+ uniques : ndarray[object]
+ Unique values of input, not sorted
+ labels : ndarray[int64] (if return_inverse=True)
+ The labels from values to uniques
+ """
+ cdef:
+ Py_ssize_t i, idx, count = count_prior, n = len(values)
+ int64_t[:] labels
+ int ret = 0
+ object val
+ khiter_t k
+ bint use_na_value
+
+ if return_inverse:
+ labels = np.empty(n, dtype=np.int64)
+ use_na_value = na_value is not None
+
+ for i in range(n):
+ val = values[i]
+ hash(val)
+
+ if ignore_na and ((val != val or val is None)
+ or (use_na_value and val == na_value)):
+ # if missing values do not count as unique values (i.e. if
+ # ignore_na is True), skip the hashtable entry for them, and
+ # replace the corresponding label with na_sentinel
+ labels[i] = na_sentinel
+ continue
+
+ k = kh_get_pymap(self.table, <PyObject*>val)
+ if k == self.table.n_buckets:
+ # k hasn't been seen yet
+ k = kh_put_pymap(self.table, <PyObject*>val, &ret)
+ uniques.append(val)
+ if return_inverse:
+ self.table.vals[k] = count
+ labels[i] = count
+ count += 1
+ elif return_inverse:
+ # k falls into a previous bucket
+ # only relevant in case we need to construct the inverse
+ idx = self.table.vals[k]
+ labels[i] = idx
+
+ if return_inverse:
+ return uniques.to_array(), np.asarray(labels)
+ return uniques.to_array()
+
+ def unique(self, ndarray[object] values, bint return_inverse=False):
+ """
+ Calculate unique values and labels (no sorting!)
+
+ Parameters
+ ----------
+ values : ndarray[object]
+ Array of values of which unique will be calculated
+ return_inverse : boolean, default False
+ Whether the mapping of the original array values to their location
+ in the vector of uniques should be returned.
+
+ Returns
+ -------
+ uniques : ndarray[object]
+ Unique values of input, not sorted
+ labels : ndarray[int64] (if return_inverse)
+ The labels from values to uniques
+ """
+ uniques = ObjectVector()
+ return self._unique(values, uniques, ignore_na=False,
+ return_inverse=return_inverse)
+
+ def factorize(self, ndarray[object] values, Py_ssize_t na_sentinel=-1,
+ object na_value=None):
+ """
+ Calculate unique values and labels (no sorting!)
+
+ Missing values are not included in the "uniques" for this method.
+ The labels for any missing values will be set to "na_sentinel"
+
+ Parameters
+ ----------
+ values : ndarray[object]
+ Array of values of which unique will be calculated
+ na_sentinel : Py_ssize_t, default -1
+ Sentinel value used for all NA-values in inverse
+ na_value : object, default None
+ Value to identify as missing. If na_value is None, then None _plus_
+ any value "val" satisfying val != val is considered missing.
+ If na_value is not None, then _additionally_, any value "val"
+ satisfying val == na_value is considered missing.
+
+ Returns
+ -------
+ uniques : ndarray[object]
+ Unique values of input, not sorted
+ labels : ndarray[int64]
+ The labels from values to uniques
+ """
+ uniques_vector = ObjectVector()
+ return self._unique(values, uniques_vector, na_sentinel=na_sentinel,
+ na_value=na_value, ignore_na=True,
+ return_inverse=True)
+
+ def get_labels(self, ndarray[object] values, ObjectVector uniques,
+ Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1,
+ object na_value=None):
+ _, labels = self._unique(values, uniques, count_prior=count_prior,
+ na_sentinel=na_sentinel, na_value=na_value,
+ ignore_na=True, return_inverse=True)
+ return labels
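+
+# Illustrative usage sketch (not part of the vendored pandas sources). Assuming
+# the compiled extension is importable as ``pandas._libs.hashtable``, the
+# object-dtype factorize path above is expected to behave roughly as follows:
+#
+#   import numpy as np
+#   from pandas._libs import hashtable as _hash
+#
+#   values = np.array(['a', 'b', 'a', None, 'b'], dtype=object)
+#   uniques, labels = _hash.PyObjectHashTable().factorize(values)
+#   # uniques -> array(['a', 'b'], dtype=object)   (missing values are skipped)
+#   # labels  -> array([ 0,  1,  0, -1,  1])       (-1 is the na_sentinel)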
diff --git a/contrib/python/pandas/py2/pandas/_libs/hashtable_func_helper.pxi b/contrib/python/pandas/py2/pandas/_libs/hashtable_func_helper.pxi
new file mode 100644
index 00000000000..07b6f267bec
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/_libs/hashtable_func_helper.pxi
@@ -0,0 +1,764 @@
+"""
+Template for each `dtype` helper function for hashtable
+
+WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in
+"""
+
+# ----------------------------------------------------------------------
+# VectorData
+# ----------------------------------------------------------------------
+
+
+cdef build_count_table_float64(float64_t[:] values,
+ kh_float64_t *table, bint dropna):
+ cdef:
+ khiter_t k
+ Py_ssize_t i, n = len(values)
+
+ float64_t val
+
+ int ret = 0
+
+ with nogil:
+ kh_resize_float64(table, n)
+
+ for i in range(n):
+ val = values[i]
+
+ if val == val or not dropna:
+ k = kh_get_float64(table, val)
+ if k != table.n_buckets:
+ table.vals[k] += 1
+ else:
+ k = kh_put_float64(table, val, &ret)
+ table.vals[k] = 1
+
+
+cpdef value_count_float64(float64_t[:] values, bint dropna):
+ cdef:
+ Py_ssize_t i = 0
+ kh_float64_t *table
+
+ float64_t[:] result_keys
+ int64_t[:] result_counts
+
+ Py_ssize_t k
+
+ table = kh_init_float64()
+ build_count_table_float64(values, table, dropna)
+
+ result_keys = np.empty(table.n_occupied, dtype=np.float64)
+ result_counts = np.zeros(table.n_occupied, dtype=np.int64)
+
+ with nogil:
+ for k in range(table.n_buckets):
+ if kh_exist_float64(table, k):
+ result_keys[i] = table.keys[k]
+ result_counts[i] = table.vals[k]
+ i += 1
+
+ kh_destroy_float64(table)
+
+ return np.asarray(result_keys), np.asarray(result_counts)
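+
+# Illustrative sketch (not part of the vendored sources): value_count_float64
+# returns the distinct keys together with their occurrence counts, in
+# hash-table (i.e. unspecified) order. Assuming the compiled extension is
+# importable as ``pandas._libs.hashtable``:
+#
+#   import numpy as np
+#   from pandas._libs import hashtable as _hash
+#
+#   keys, counts = _hash.value_count_float64(
+#       np.array([1.0, 2.0, 1.0, np.nan]), True)   # dropna=True
+#   # keys -> array([1., 2.]), counts -> array([2, 1])   (order may vary)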
+
+
+
+
+def duplicated_float64(float64_t[:] values, object keep='first'):
+ cdef:
+ int ret = 0
+ float64_t value
+ Py_ssize_t k, i, n = len(values)
+ kh_float64_t *table = kh_init_float64()
+ ndarray[uint8_t, ndim=1, cast=True] out = np.empty(n, dtype='bool')
+
+ kh_resize_float64(table, min(n, _SIZE_HINT_LIMIT))
+
+ if keep not in ('last', 'first', False):
+ raise ValueError('keep must be either "first", "last" or False')
+
+ if keep == 'last':
+ with nogil:
+ for i from n > i >= 0:
+ kh_put_float64(table, values[i], &ret)
+ out[i] = ret == 0
+ elif keep == 'first':
+ with nogil:
+ for i in range(n):
+ kh_put_float64(table, values[i], &ret)
+ out[i] = ret == 0
+ else:
+ with nogil:
+ for i in range(n):
+ value = values[i]
+ k = kh_get_float64(table, value)
+ if k != table.n_buckets:
+ out[table.vals[k]] = 1
+ out[i] = 1
+ else:
+ k = kh_put_float64(table, value, &ret)
+ table.keys[k] = value
+ table.vals[k] = i
+ out[i] = 0
+ kh_destroy_float64(table)
+ return out
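+
+# Illustrative sketch (not part of the vendored sources): duplicated_float64
+# flags repeated values, leaving the kept occurrence unflagged:
+#
+#   import numpy as np
+#   from pandas._libs import hashtable as _hash
+#
+#   _hash.duplicated_float64(np.array([1.0, 2.0, 1.0]), keep='first')
+#   # -> array([False, False,  True])
+#   _hash.duplicated_float64(np.array([1.0, 2.0, 1.0]), keep='last')
+#   # -> array([ True, False, False])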
+
+
+# ----------------------------------------------------------------------
+# Membership
+# ----------------------------------------------------------------------
+
+
+
+
+def ismember_float64(float64_t[:] arr, float64_t[:] values):
+
+ """
+    Return a boolean array indicating, element by element,
+    whether each value in `arr` is contained in `values`.
+
+ Parameters
+ ----------
+ arr : float64 ndarray
+ values : float64 ndarray
+
+ Returns
+ -------
+    boolean ndarray of length len(arr)
+ """
+ cdef:
+ Py_ssize_t i, n, k
+ int ret = 0
+ ndarray[uint8_t] result
+ float64_t val
+ kh_float64_t *table = kh_init_float64()
+
+ # construct the table
+ n = len(values)
+ kh_resize_float64(table, min(n, len(values)))
+
+ with nogil:
+ for i in range(n):
+ kh_put_float64(table, values[i], &ret)
+
+ # test membership
+ n = len(arr)
+ result = np.empty(n, dtype=np.uint8)
+
+ with nogil:
+ for i in range(n):
+ val = arr[i]
+ k = kh_get_float64(table, val)
+ result[i] = (k != table.n_buckets)
+
+ kh_destroy_float64(table)
+ return result.view(np.bool_)
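+
+# Illustrative sketch (not part of the vendored sources):
+#
+#   import numpy as np
+#   from pandas._libs import hashtable as _hash
+#
+#   _hash.ismember_float64(np.array([1.0, 2.0, 3.0]), np.array([2.0, 4.0]))
+#   # -> array([False,  True, False])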
+
+
+cdef build_count_table_uint64(uint64_t[:] values,
+ kh_uint64_t *table, bint dropna):
+ cdef:
+ khiter_t k
+ Py_ssize_t i, n = len(values)
+
+ uint64_t val
+
+ int ret = 0
+
+ with nogil:
+ kh_resize_uint64(table, n)
+
+ for i in range(n):
+ val = values[i]
+
+ if True:
+ k = kh_get_uint64(table, val)
+ if k != table.n_buckets:
+ table.vals[k] += 1
+ else:
+ k = kh_put_uint64(table, val, &ret)
+ table.vals[k] = 1
+
+
+cpdef value_count_uint64(uint64_t[:] values, bint dropna):
+ cdef:
+ Py_ssize_t i = 0
+ kh_uint64_t *table
+
+ uint64_t[:] result_keys
+ int64_t[:] result_counts
+
+ Py_ssize_t k
+
+ table = kh_init_uint64()
+ build_count_table_uint64(values, table, dropna)
+
+ result_keys = np.empty(table.n_occupied, dtype=np.uint64)
+ result_counts = np.zeros(table.n_occupied, dtype=np.int64)
+
+ with nogil:
+ for k in range(table.n_buckets):
+ if kh_exist_uint64(table, k):
+ result_keys[i] = table.keys[k]
+ result_counts[i] = table.vals[k]
+ i += 1
+
+ kh_destroy_uint64(table)
+
+ return np.asarray(result_keys), np.asarray(result_counts)
+
+
+
+
+def duplicated_uint64(uint64_t[:] values, object keep='first'):
+ cdef:
+ int ret = 0
+ uint64_t value
+ Py_ssize_t k, i, n = len(values)
+ kh_uint64_t *table = kh_init_uint64()
+ ndarray[uint8_t, ndim=1, cast=True] out = np.empty(n, dtype='bool')
+
+ kh_resize_uint64(table, min(n, _SIZE_HINT_LIMIT))
+
+ if keep not in ('last', 'first', False):
+ raise ValueError('keep must be either "first", "last" or False')
+
+ if keep == 'last':
+ with nogil:
+ for i from n > i >= 0:
+ kh_put_uint64(table, values[i], &ret)
+ out[i] = ret == 0
+ elif keep == 'first':
+ with nogil:
+ for i in range(n):
+ kh_put_uint64(table, values[i], &ret)
+ out[i] = ret == 0
+ else:
+ with nogil:
+ for i in range(n):
+ value = values[i]
+ k = kh_get_uint64(table, value)
+ if k != table.n_buckets:
+ out[table.vals[k]] = 1
+ out[i] = 1
+ else:
+ k = kh_put_uint64(table, value, &ret)
+ table.keys[k] = value
+ table.vals[k] = i
+ out[i] = 0
+ kh_destroy_uint64(table)
+ return out
+
+
+# ----------------------------------------------------------------------
+# Membership
+# ----------------------------------------------------------------------
+
+
+
+
+def ismember_uint64(uint64_t[:] arr, uint64_t[:] values):
+
+ """
+    Return a boolean array indicating, element by element,
+    whether each value in `arr` is contained in `values`.
+
+ Parameters
+ ----------
+ arr : uint64 ndarray
+ values : uint64 ndarray
+
+ Returns
+ -------
+    boolean ndarray of length len(arr)
+ """
+ cdef:
+ Py_ssize_t i, n, k
+ int ret = 0
+ ndarray[uint8_t] result
+ uint64_t val
+ kh_uint64_t *table = kh_init_uint64()
+
+ # construct the table
+ n = len(values)
+ kh_resize_uint64(table, min(n, len(values)))
+
+ with nogil:
+ for i in range(n):
+ kh_put_uint64(table, values[i], &ret)
+
+ # test membership
+ n = len(arr)
+ result = np.empty(n, dtype=np.uint8)
+
+ with nogil:
+ for i in range(n):
+ val = arr[i]
+ k = kh_get_uint64(table, val)
+ result[i] = (k != table.n_buckets)
+
+ kh_destroy_uint64(table)
+ return result.view(np.bool_)
+
+
+cdef build_count_table_object(ndarray[object] values,
+ kh_pymap_t *table, bint dropna):
+ cdef:
+ khiter_t k
+ Py_ssize_t i, n = len(values)
+
+ object val
+
+ int ret = 0
+
+ kh_resize_pymap(table, n // 10)
+
+ for i in range(n):
+ val = values[i]
+
+ if not checknull(val) or not dropna:
+ k = kh_get_pymap(table, <PyObject*>val)
+ if k != table.n_buckets:
+ table.vals[k] += 1
+ else:
+ k = kh_put_pymap(table, <PyObject*>val, &ret)
+ table.vals[k] = 1
+
+
+cpdef value_count_object(ndarray[object] values, bint dropna):
+ cdef:
+ Py_ssize_t i = 0
+ kh_pymap_t *table
+
+
+ Py_ssize_t k
+
+ table = kh_init_pymap()
+ build_count_table_object(values, table, 1)
+
+ result_keys = np.empty(table.n_occupied, dtype=np.object)
+ result_counts = np.zeros(table.n_occupied, dtype=np.int64)
+
+ for k in range(table.n_buckets):
+ if kh_exist_pymap(table, k):
+ result_keys[i] = <object>table.keys[k]
+ result_counts[i] = table.vals[k]
+ i += 1
+
+ kh_destroy_pymap(table)
+
+ return result_keys, result_counts
+
+
+
+
+def duplicated_object(ndarray[object] values, object keep='first'):
+ cdef:
+ int ret = 0
+ Py_ssize_t k, i, n = len(values)
+ kh_pymap_t *table = kh_init_pymap()
+ ndarray[uint8_t, ndim=1, cast=True] out = np.empty(n, dtype='bool')
+
+ kh_resize_pymap(table, min(n, _SIZE_HINT_LIMIT))
+
+ if keep not in ('last', 'first', False):
+ raise ValueError('keep must be either "first", "last" or False')
+
+ if keep == 'last':
+ for i from n > i >= 0:
+ kh_put_pymap(table, <PyObject*>values[i], &ret)
+ out[i] = ret == 0
+ elif keep == 'first':
+ for i in range(n):
+ kh_put_pymap(table, <PyObject*>values[i], &ret)
+ out[i] = ret == 0
+ else:
+ for i in range(n):
+ value = values[i]
+ k = kh_get_pymap(table, <PyObject*>value)
+ if k != table.n_buckets:
+ out[table.vals[k]] = 1
+ out[i] = 1
+ else:
+ k = kh_put_pymap(table, <PyObject*>value, &ret)
+ table.keys[k] = <PyObject*>value
+ table.vals[k] = i
+ out[i] = 0
+ kh_destroy_pymap(table)
+ return out
+
+
+# ----------------------------------------------------------------------
+# Membership
+# ----------------------------------------------------------------------
+
+
+
+
+def ismember_object(ndarray[object] arr, ndarray[object] values):
+
+ """
+    Return a boolean array indicating, element by element,
+    whether each value in `arr` is contained in `values`.
+
+ Parameters
+ ----------
+ arr : object ndarray
+ values : object ndarray
+
+ Returns
+ -------
+    boolean ndarray of length len(arr)
+ """
+ cdef:
+ Py_ssize_t i, n, k
+ int ret = 0
+ ndarray[uint8_t] result
+ object val
+ kh_pymap_t *table = kh_init_pymap()
+
+ # construct the table
+ n = len(values)
+ kh_resize_pymap(table, min(n, len(values)))
+
+ for i in range(n):
+ kh_put_pymap(table, <PyObject*>values[i], &ret)
+
+ # test membership
+ n = len(arr)
+ result = np.empty(n, dtype=np.uint8)
+
+ for i in range(n):
+ val = arr[i]
+ k = kh_get_pymap(table, <PyObject*>val)
+ result[i] = (k != table.n_buckets)
+
+ kh_destroy_pymap(table)
+ return result.view(np.bool_)
+
+
+cdef build_count_table_int64(int64_t[:] values,
+ kh_int64_t *table, bint dropna):
+ cdef:
+ khiter_t k
+ Py_ssize_t i, n = len(values)
+
+ int64_t val
+
+ int ret = 0
+
+ with nogil:
+ kh_resize_int64(table, n)
+
+ for i in range(n):
+ val = values[i]
+
+ if True:
+ k = kh_get_int64(table, val)
+ if k != table.n_buckets:
+ table.vals[k] += 1
+ else:
+ k = kh_put_int64(table, val, &ret)
+ table.vals[k] = 1
+
+
+cpdef value_count_int64(int64_t[:] values, bint dropna):
+ cdef:
+ Py_ssize_t i = 0
+ kh_int64_t *table
+
+ int64_t[:] result_keys
+ int64_t[:] result_counts
+
+ Py_ssize_t k
+
+ table = kh_init_int64()
+ build_count_table_int64(values, table, dropna)
+
+ result_keys = np.empty(table.n_occupied, dtype=np.int64)
+ result_counts = np.zeros(table.n_occupied, dtype=np.int64)
+
+ with nogil:
+ for k in range(table.n_buckets):
+ if kh_exist_int64(table, k):
+ result_keys[i] = table.keys[k]
+ result_counts[i] = table.vals[k]
+ i += 1
+
+ kh_destroy_int64(table)
+
+ return np.asarray(result_keys), np.asarray(result_counts)
+
+
+
+
+def duplicated_int64(int64_t[:] values, object keep='first'):
+ cdef:
+ int ret = 0
+ int64_t value
+ Py_ssize_t k, i, n = len(values)
+ kh_int64_t *table = kh_init_int64()
+ ndarray[uint8_t, ndim=1, cast=True] out = np.empty(n, dtype='bool')
+
+ kh_resize_int64(table, min(n, _SIZE_HINT_LIMIT))
+
+ if keep not in ('last', 'first', False):
+ raise ValueError('keep must be either "first", "last" or False')
+
+ if keep == 'last':
+ with nogil:
+ for i from n > i >= 0:
+ kh_put_int64(table, values[i], &ret)
+ out[i] = ret == 0
+ elif keep == 'first':
+ with nogil:
+ for i in range(n):
+ kh_put_int64(table, values[i], &ret)
+ out[i] = ret == 0
+ else:
+ with nogil:
+ for i in range(n):
+ value = values[i]
+ k = kh_get_int64(table, value)
+ if k != table.n_buckets:
+ out[table.vals[k]] = 1
+ out[i] = 1
+ else:
+ k = kh_put_int64(table, value, &ret)
+ table.keys[k] = value
+ table.vals[k] = i
+ out[i] = 0
+ kh_destroy_int64(table)
+ return out
+
+
+# ----------------------------------------------------------------------
+# Membership
+# ----------------------------------------------------------------------
+
+
+
+
+def ismember_int64(int64_t[:] arr, int64_t[:] values):
+
+ """
+    Return a boolean array indicating, element by element,
+    whether each value in `arr` is contained in `values`.
+
+ Parameters
+ ----------
+ arr : int64 ndarray
+ values : int64 ndarray
+
+ Returns
+ -------
+    boolean ndarray of length len(arr)
+ """
+ cdef:
+ Py_ssize_t i, n, k
+ int ret = 0
+ ndarray[uint8_t] result
+ int64_t val
+ kh_int64_t *table = kh_init_int64()
+
+ # construct the table
+ n = len(values)
+ kh_resize_int64(table, min(n, len(values)))
+
+ with nogil:
+ for i in range(n):
+ kh_put_int64(table, values[i], &ret)
+
+ # test membership
+ n = len(arr)
+ result = np.empty(n, dtype=np.uint8)
+
+ with nogil:
+ for i in range(n):
+ val = arr[i]
+ k = kh_get_int64(table, val)
+ result[i] = (k != table.n_buckets)
+
+ kh_destroy_int64(table)
+ return result.view(np.bool_)
+
+
+# ----------------------------------------------------------------------
+# Mode Computations
+# ----------------------------------------------------------------------
+
+
+
+
+def mode_float64(float64_t[:] values, bint dropna):
+ cdef:
+ int count, max_count = 1
+ int j = -1 # so you can do +=
+ Py_ssize_t k
+ kh_float64_t *table
+ ndarray[float64_t] modes
+
+ table = kh_init_float64()
+ build_count_table_float64(values, table, dropna)
+
+ modes = np.empty(table.n_buckets, dtype=np.float64)
+
+ with nogil:
+ for k in range(table.n_buckets):
+ if kh_exist_float64(table, k):
+ count = table.vals[k]
+ if count == max_count:
+ j += 1
+ elif count > max_count:
+ max_count = count
+ j = 0
+ else:
+ continue
+
+ modes[j] = table.keys[k]
+
+ kh_destroy_float64(table)
+
+ return modes[:j + 1]
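+
+# Illustrative sketch (not part of the vendored sources): mode_float64 returns
+# every value that reaches the maximal count (ties included), in hash-table
+# order:
+#
+#   import numpy as np
+#   from pandas._libs import hashtable as _hash
+#
+#   _hash.mode_float64(np.array([1.0, 1.0, 2.0, 2.0, 3.0]), True)  # dropna=True
+#   # -> array([1., 2.])   (order may vary)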
+
+
+
+
+def mode_int64(int64_t[:] values, bint dropna):
+ cdef:
+ int count, max_count = 1
+ int j = -1 # so you can do +=
+ Py_ssize_t k
+ kh_int64_t *table
+ ndarray[int64_t] modes
+
+ table = kh_init_int64()
+ build_count_table_int64(values, table, dropna)
+
+ modes = np.empty(table.n_buckets, dtype=np.int64)
+
+ with nogil:
+ for k in range(table.n_buckets):
+ if kh_exist_int64(table, k):
+ count = table.vals[k]
+ if count == max_count:
+ j += 1
+ elif count > max_count:
+ max_count = count
+ j = 0
+ else:
+ continue
+
+ modes[j] = table.keys[k]
+
+ kh_destroy_int64(table)
+
+ return modes[:j + 1]
+
+
+
+
+def mode_uint64(uint64_t[:] values, bint dropna):
+ cdef:
+ int count, max_count = 1
+ int j = -1 # so you can do +=
+ Py_ssize_t k
+ kh_uint64_t *table
+ ndarray[uint64_t] modes
+
+ table = kh_init_uint64()
+ build_count_table_uint64(values, table, dropna)
+
+ modes = np.empty(table.n_buckets, dtype=np.uint64)
+
+ with nogil:
+ for k in range(table.n_buckets):
+ if kh_exist_uint64(table, k):
+ count = table.vals[k]
+ if count == max_count:
+ j += 1
+ elif count > max_count:
+ max_count = count
+ j = 0
+ else:
+ continue
+
+ modes[j] = table.keys[k]
+
+ kh_destroy_uint64(table)
+
+ return modes[:j + 1]
+
+
+
+
+def mode_object(ndarray[object] values, bint dropna):
+ cdef:
+ int count, max_count = 1
+ int j = -1 # so you can do +=
+ Py_ssize_t k
+ kh_pymap_t *table
+ ndarray[object] modes
+
+ table = kh_init_pymap()
+ build_count_table_object(values, table, dropna)
+
+ modes = np.empty(table.n_buckets, dtype=np.object_)
+
+ for k in range(table.n_buckets):
+ if kh_exist_pymap(table, k):
+ count = table.vals[k]
+
+ if count == max_count:
+ j += 1
+ elif count > max_count:
+ max_count = count
+ j = 0
+ else:
+ continue
+
+ modes[j] = <object>table.keys[k]
+
+ kh_destroy_pymap(table)
+
+ return modes[:j + 1]
diff --git a/contrib/python/pandas/py2/pandas/_libs/index.pyx b/contrib/python/pandas/py2/pandas/_libs/index.pyx
new file mode 100644
index 00000000000..8cea529fbb0
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/_libs/index.pyx
@@ -0,0 +1,699 @@
+# -*- coding: utf-8 -*-
+from datetime import datetime, timedelta, date
+
+import cython
+
+import numpy as np
+cimport numpy as cnp
+from numpy cimport (ndarray, intp_t,
+ float64_t, float32_t,
+ int64_t, int32_t, int16_t, int8_t,
+ uint64_t, uint32_t, uint16_t, uint8_t,
+ # Note: NPY_DATETIME, NPY_TIMEDELTA are only available
+ # for cimport in cython>=0.27.3
+ NPY_DATETIME, NPY_TIMEDELTA)
+cnp.import_array()
+
+
+cimport pandas._libs.util as util
+
+from pandas._libs.tslibs.conversion cimport maybe_datetimelike_to_i8
+
+from pandas._libs.hashtable cimport HashTable
+
+from pandas._libs import algos, hashtable as _hash
+from pandas._libs.tslibs import Timestamp, Timedelta, period as periodlib
+from pandas._libs.missing import checknull
+
+cdef int64_t NPY_NAT = util.get_nat()
+
+
+cdef inline bint is_definitely_invalid_key(object val):
+ if isinstance(val, tuple):
+ try:
+ hash(val)
+ except TypeError:
+ return True
+
+    # having a _data attribute means we are an NDFrame
+ return (isinstance(val, slice) or util.is_array(val)
+ or isinstance(val, list) or hasattr(val, '_data'))
+
+
+cpdef get_value_at(ndarray arr, object loc, object tz=None):
+ if arr.descr.type_num == NPY_DATETIME:
+ return Timestamp(util.get_value_at(arr, loc), tz=tz)
+ elif arr.descr.type_num == NPY_TIMEDELTA:
+ return Timedelta(util.get_value_at(arr, loc))
+ return util.get_value_at(arr, loc)
+
+
+def get_value_box(arr: ndarray, loc: object) -> object:
+ return get_value_at(arr, loc, tz=None)
+
+
+# Don't populate hash tables in monotonic indexes larger than this
+_SIZE_CUTOFF = 1000000
+
+
+cdef class IndexEngine:
+
+ cdef readonly:
+ object vgetter
+ HashTable mapping
+ bint over_size_threshold
+
+ cdef:
+ bint unique, monotonic_inc, monotonic_dec
+ bint need_monotonic_check, need_unique_check
+
+ def __init__(self, vgetter, n):
+ self.vgetter = vgetter
+
+ self.over_size_threshold = n >= _SIZE_CUTOFF
+ self.clear_mapping()
+
+ def __contains__(self, object val):
+ self._ensure_mapping_populated()
+ hash(val)
+ return val in self.mapping
+
+ cpdef get_value(self, ndarray arr, object key, object tz=None):
+ """
+ arr : 1-dimensional ndarray
+ """
+ cdef:
+ object loc
+ void* data_ptr
+
+ loc = self.get_loc(key)
+ if isinstance(loc, slice) or util.is_array(loc):
+ return arr[loc]
+ else:
+ return get_value_at(arr, loc, tz=tz)
+
+ cpdef set_value(self, ndarray arr, object key, object value):
+ """
+ arr : 1-dimensional ndarray
+ """
+ cdef:
+ object loc
+ void* data_ptr
+
+ loc = self.get_loc(key)
+ value = convert_scalar(arr, value)
+
+ arr[loc] = value
+
+ cpdef get_loc(self, object val):
+ if is_definitely_invalid_key(val):
+ raise TypeError("'{val}' is an invalid key".format(val=val))
+
+ if self.over_size_threshold and self.is_monotonic_increasing:
+ if not self.is_unique:
+ return self._get_loc_duplicates(val)
+ values = self._get_index_values()
+
+ self._check_type(val)
+ loc = _bin_search(values, val) # .searchsorted(val, side='left')
+ if loc >= len(values):
+ raise KeyError(val)
+ if util.get_value_at(values, loc) != val:
+ raise KeyError(val)
+ return loc
+
+ self._ensure_mapping_populated()
+ if not self.unique:
+ return self._get_loc_duplicates(val)
+
+ self._check_type(val)
+
+ try:
+ return self.mapping.get_item(val)
+ except (TypeError, ValueError):
+ raise KeyError(val)
+
+ cdef inline _get_loc_duplicates(self, object val):
+ cdef:
+ Py_ssize_t diff
+
+ if self.is_monotonic_increasing:
+ values = self._get_index_values()
+ left = values.searchsorted(val, side='left')
+ right = values.searchsorted(val, side='right')
+
+ diff = right - left
+ if diff == 0:
+ raise KeyError(val)
+ elif diff == 1:
+ return left
+ else:
+ return slice(left, right)
+
+ return self._maybe_get_bool_indexer(val)
+
+ cdef _maybe_get_bool_indexer(self, object val):
+ cdef:
+ ndarray[uint8_t, ndim=1, cast=True] indexer
+ ndarray[intp_t, ndim=1] found
+ int count
+
+ indexer = self._get_index_values() == val
+ found = np.where(indexer)[0]
+ count = len(found)
+
+ if count > 1:
+ return indexer
+ if count == 1:
+ return int(found[0])
+
+ raise KeyError(val)
+
+ def sizeof(self, deep=False):
+ """ return the sizeof our mapping """
+ if not self.is_mapping_populated:
+ return 0
+ return self.mapping.sizeof(deep=deep)
+
+ def __sizeof__(self):
+ return self.sizeof()
+
+ @property
+ def is_unique(self):
+ if self.need_unique_check:
+ self._do_unique_check()
+
+ return self.unique == 1
+
+ cdef inline _do_unique_check(self):
+
+        # checking uniqueness is de facto the same as populating the mapping
+ self._ensure_mapping_populated()
+
+ @property
+ def is_monotonic_increasing(self):
+ if self.need_monotonic_check:
+ self._do_monotonic_check()
+
+ return self.monotonic_inc == 1
+
+ @property
+ def is_monotonic_decreasing(self):
+ if self.need_monotonic_check:
+ self._do_monotonic_check()
+
+ return self.monotonic_dec == 1
+
+ cdef inline _do_monotonic_check(self):
+ cdef object is_unique
+ try:
+ values = self._get_index_values()
+ self.monotonic_inc, self.monotonic_dec, is_unique = \
+ self._call_monotonic(values)
+ except TypeError:
+ self.monotonic_inc = 0
+ self.monotonic_dec = 0
+ is_unique = 0
+
+ self.need_monotonic_check = 0
+
+ # we can only be sure of uniqueness if is_unique=1
+ if is_unique:
+ self.unique = 1
+ self.need_unique_check = 0
+
+ cdef _get_index_values(self):
+ return self.vgetter()
+
+ def _call_monotonic(self, values):
+ return algos.is_monotonic(values, timelike=False)
+
+ def get_backfill_indexer(self, other, limit=None):
+ return algos.backfill(self._get_index_values(), other, limit=limit)
+
+ def get_pad_indexer(self, other, limit=None):
+ return algos.pad(self._get_index_values(), other, limit=limit)
+
+ cdef _make_hash_table(self, n):
+ raise NotImplementedError
+
+ cdef _check_type(self, object val):
+ hash(val)
+
+ @property
+ def is_mapping_populated(self):
+ return self.mapping is not None
+
+ cdef inline _ensure_mapping_populated(self):
+ # this populates the mapping
+ # if its not already populated
+ # also satisfies the need_unique_check
+
+ if not self.is_mapping_populated:
+
+ values = self._get_index_values()
+ self.mapping = self._make_hash_table(len(values))
+ self._call_map_locations(values)
+
+ if len(self.mapping) == len(values):
+ self.unique = 1
+
+ self.need_unique_check = 0
+
+ cpdef _call_map_locations(self, values):
+ self.mapping.map_locations(values)
+
+ def clear_mapping(self):
+ self.mapping = None
+ self.need_monotonic_check = 1
+ self.need_unique_check = 1
+
+ self.unique = 0
+ self.monotonic_inc = 0
+ self.monotonic_dec = 0
+
+ def get_indexer(self, values):
+ self._ensure_mapping_populated()
+ return self.mapping.lookup(values)
+
+ def get_indexer_non_unique(self, targets):
+ """ return an indexer suitable for takng from a non unique index
+ return the labels in the same order ast the target
+ and a missing indexer into the targets (which correspond
+ to the -1 indices in the results """
+
+ cdef:
+ ndarray values, x
+ ndarray[int64_t] result, missing
+ set stargets
+ dict d = {}
+ object val
+ int count = 0, count_missing = 0
+ Py_ssize_t i, j, n, n_t, n_alloc
+
+ self._ensure_mapping_populated()
+ values = np.array(self._get_index_values(), copy=False)
+ stargets = set(targets)
+ n = len(values)
+ n_t = len(targets)
+ if n > 10000:
+ n_alloc = 10000
+ else:
+ n_alloc = n
+
+ result = np.empty(n_alloc, dtype=np.int64)
+ missing = np.empty(n_t, dtype=np.int64)
+
+ # map each starget to its position in the index
+ if stargets and len(stargets) < 5 and self.is_monotonic_increasing:
+ # if there are few enough stargets and the index is monotonically
+ # increasing, then use binary search for each starget
+ for starget in stargets:
+ start = values.searchsorted(starget, side='left')
+ end = values.searchsorted(starget, side='right')
+ if start != end:
+ d[starget] = list(range(start, end))
+ else:
+ # otherwise, map by iterating through all items in the index
+ for i in range(n):
+ val = values[i]
+ if val in stargets:
+ if val not in d:
+ d[val] = []
+ d[val].append(i)
+
+ for i in range(n_t):
+ val = targets[i]
+
+ # found
+ if val in d:
+ for j in d[val]:
+
+ # realloc if needed
+ if count >= n_alloc:
+ n_alloc += 10000
+ result = np.resize(result, n_alloc)
+
+ result[count] = j
+ count += 1
+
+ # value not found
+ else:
+
+ if count >= n_alloc:
+ n_alloc += 10000
+ result = np.resize(result, n_alloc)
+ result[count] = -1
+ count += 1
+ missing[count_missing] = i
+ count_missing += 1
+
+ return result[0:count], missing[0:count_missing]
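+
+    # Illustrative sketch (not part of the vendored sources): for an engine
+    # (e.g. an Int64Engine) built over the values [1, 2, 2], looking up the
+    # non-unique targets [2, 3] is expected to yield every matching position
+    # plus -1 for misses, in target order, together with the positions of the
+    # missing targets:
+    #
+    #   result, missing = engine.get_indexer_non_unique(np.array([2, 3]))
+    #   # result  -> array([ 1,  2, -1])
+    #   # missing -> array([1])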
+
+
+cdef Py_ssize_t _bin_search(ndarray values, object val) except -1:
+ cdef:
+ Py_ssize_t mid, lo = 0, hi = len(values) - 1
+ object pval
+
+ if hi >= 0 and val > util.get_value_at(values, hi):
+ return len(values)
+
+ while lo < hi:
+ mid = (lo + hi) // 2
+ pval = util.get_value_at(values, mid)
+ if val < pval:
+ hi = mid
+ elif val > pval:
+ lo = mid + 1
+ else:
+ while mid > 0 and val == util.get_value_at(values, mid - 1):
+ mid -= 1
+ return mid
+
+ if val <= util.get_value_at(values, mid):
+ return mid
+ else:
+ return mid + 1
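+
+# Illustrative sketch (not part of the vendored sources): _bin_search returns
+# the leftmost position at which ``val`` occurs (or would be inserted), e.g.
+# for values = np.array([1, 3, 3, 5]):
+#   _bin_search(values, 3)  ->  1   (leftmost occurrence)
+#   _bin_search(values, 4)  ->  3   (insertion point)
+#   _bin_search(values, 6)  ->  4   (== len(values), past the end)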
+
+
+cdef class ObjectEngine(IndexEngine):
+ """
+ Index Engine for use with object-dtype Index, namely the base class Index
+ """
+ cdef _make_hash_table(self, n):
+ return _hash.PyObjectHashTable(n)
+
+
+cdef class DatetimeEngine(Int64Engine):
+
+ cdef _get_box_dtype(self):
+ return 'M8[ns]'
+
+ def __contains__(self, object val):
+ if self.over_size_threshold and self.is_monotonic_increasing:
+ if not self.is_unique:
+ return self._get_loc_duplicates(val)
+ values = self._get_index_values()
+ conv = maybe_datetimelike_to_i8(val)
+ loc = values.searchsorted(conv, side='left')
+ return util.get_value_at(values, loc) == conv
+
+ self._ensure_mapping_populated()
+ return maybe_datetimelike_to_i8(val) in self.mapping
+
+ cdef _get_index_values(self):
+ return self.vgetter().view('i8')
+
+ def _call_monotonic(self, values):
+ return algos.is_monotonic(values, timelike=True)
+
+ cpdef get_loc(self, object val):
+ if is_definitely_invalid_key(val):
+ raise TypeError
+
+ # Welcome to the spaghetti factory
+ if self.over_size_threshold and self.is_monotonic_increasing:
+ if not self.is_unique:
+ val = maybe_datetimelike_to_i8(val)
+ return self._get_loc_duplicates(val)
+ values = self._get_index_values()
+
+ try:
+ conv = maybe_datetimelike_to_i8(val)
+ loc = values.searchsorted(conv, side='left')
+ except TypeError:
+ self._date_check_type(val)
+ raise KeyError(val)
+
+ if loc == len(values) or util.get_value_at(values, loc) != conv:
+ raise KeyError(val)
+ return loc
+
+ self._ensure_mapping_populated()
+ if not self.unique:
+ val = maybe_datetimelike_to_i8(val)
+ return self._get_loc_duplicates(val)
+
+ try:
+ return self.mapping.get_item(val.value)
+ except KeyError:
+ raise KeyError(val)
+ except AttributeError:
+ pass
+
+ try:
+ val = maybe_datetimelike_to_i8(val)
+ return self.mapping.get_item(val)
+ except (TypeError, ValueError):
+ self._date_check_type(val)
+ raise KeyError(val)
+
+ cdef inline _date_check_type(self, object val):
+ hash(val)
+ if not util.is_integer_object(val):
+ raise KeyError(val)
+
+ def get_indexer(self, values):
+ self._ensure_mapping_populated()
+ if values.dtype != self._get_box_dtype():
+ return np.repeat(-1, len(values)).astype('i4')
+ values = np.asarray(values).view('i8')
+ return self.mapping.lookup(values)
+
+ def get_pad_indexer(self, other, limit=None):
+ if other.dtype != self._get_box_dtype():
+ return np.repeat(-1, len(other)).astype('i4')
+ other = np.asarray(other).view('i8')
+ return algos.pad(self._get_index_values(), other, limit=limit)
+
+ def get_backfill_indexer(self, other, limit=None):
+ if other.dtype != self._get_box_dtype():
+ return np.repeat(-1, len(other)).astype('i4')
+ other = np.asarray(other).view('i8')
+ return algos.backfill(self._get_index_values(), other, limit=limit)
+
+
+cdef class TimedeltaEngine(DatetimeEngine):
+
+ cdef _get_box_dtype(self):
+ return 'm8[ns]'
+
+
+cdef class PeriodEngine(Int64Engine):
+
+ cdef _get_index_values(self):
+ return super(PeriodEngine, self).vgetter()
+
+ cpdef _call_map_locations(self, values):
+ super(PeriodEngine, self)._call_map_locations(values.view('i8'))
+
+ def _call_monotonic(self, values):
+ return super(PeriodEngine, self)._call_monotonic(values.view('i8'))
+
+ def get_indexer(self, values):
+ cdef ndarray[int64_t, ndim=1] ordinals
+
+ super(PeriodEngine, self)._ensure_mapping_populated()
+
+ freq = super(PeriodEngine, self).vgetter().freq
+ ordinals = periodlib.extract_ordinals(values, freq)
+
+ return self.mapping.lookup(ordinals)
+
+ def get_pad_indexer(self, other, limit=None):
+ freq = super(PeriodEngine, self).vgetter().freq
+ ordinal = periodlib.extract_ordinals(other, freq)
+
+ return algos.pad(self._get_index_values(),
+ np.asarray(ordinal), limit=limit)
+
+ def get_backfill_indexer(self, other, limit=None):
+ freq = super(PeriodEngine, self).vgetter().freq
+ ordinal = periodlib.extract_ordinals(other, freq)
+
+ return algos.backfill(self._get_index_values(),
+ np.asarray(ordinal), limit=limit)
+
+ def get_indexer_non_unique(self, targets):
+ freq = super(PeriodEngine, self).vgetter().freq
+ ordinal = periodlib.extract_ordinals(targets, freq)
+ ordinal_array = np.asarray(ordinal)
+
+ return super(PeriodEngine, self).get_indexer_non_unique(ordinal_array)
+
+
+cpdef convert_scalar(ndarray arr, object value):
+ # we don't turn integers
+ # into datetimes/timedeltas
+
+ # we don't turn bools into int/float/complex
+
+ if arr.descr.type_num == NPY_DATETIME:
+ if util.is_array(value):
+ pass
+ elif isinstance(value, (datetime, np.datetime64, date)):
+ return Timestamp(value).value
+ elif value is None or value != value:
+ return NPY_NAT
+ elif util.is_string_object(value):
+ return Timestamp(value).value
+ raise ValueError("cannot set a Timestamp with a non-timestamp")
+
+ elif arr.descr.type_num == NPY_TIMEDELTA:
+ if util.is_array(value):
+ pass
+ elif isinstance(value, timedelta):
+ return Timedelta(value).value
+ elif value is None or value != value:
+ return NPY_NAT
+ elif util.is_string_object(value):
+ return Timedelta(value).value
+ raise ValueError("cannot set a Timedelta with a non-timedelta")
+
+ if (issubclass(arr.dtype.type, (np.integer, np.floating, np.complex)) and
+ not issubclass(arr.dtype.type, np.bool_)):
+ if util.is_bool_object(value):
+ raise ValueError('Cannot assign bool to float/integer series')
+
+ if issubclass(arr.dtype.type, (np.integer, np.bool_)):
+ if util.is_float_object(value) and value != value:
+ raise ValueError('Cannot assign nan to integer series')
+
+ return value
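+
+# Illustrative sketch (not part of the vendored sources): for a datetime64
+# array, convert_scalar coerces a datetime-like scalar to its i8 (nanosecond)
+# representation before assignment, e.g.
+#
+#   import numpy as np
+#   from datetime import datetime
+#   from pandas._libs import index as libindex
+#
+#   arr = np.array(['2020-01-01'], dtype='M8[ns]')
+#   libindex.convert_scalar(arr, datetime(2020, 1, 2))
+#   # -> 1577923200000000000   (i.e. Timestamp('2020-01-02').value)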
+
+
+cdef class BaseMultiIndexCodesEngine:
+ """
+ Base class for MultiIndexUIntEngine and MultiIndexPyIntEngine, which
+ represent each label in a MultiIndex as an integer, by juxtaposing the bits
+ encoding each level, with appropriate offsets.
+
+ For instance: if 3 levels have respectively 3, 6 and 1 possible values,
+ then their labels can be represented using respectively 2, 3 and 1 bits,
+ as follows:
+ _ _ _ _____ _ __ __ __
+ |0|0|0| ... |0| 0|a1|a0| -> offset 0 (first level)
+ — — — ————— — —— —— ——
+ |0|0|0| ... |0|b2|b1|b0| -> offset 2 (bits required for first level)
+ — — — ————— — —— —— ——
+ |0|0|0| ... |0| 0| 0|c0| -> offset 5 (bits required for first two levels)
+ ‾ ‾ ‾ ‾‾‾‾‾ ‾ ‾‾ ‾‾ ‾‾
+ and the resulting unsigned integer representation will be:
+ _ _ _ _____ _ __ __ __ __ __ __
+ |0|0|0| ... |0|c0|b2|b1|b0|a1|a0|
+ ‾ ‾ ‾ ‾‾‾‾‾ ‾ ‾‾ ‾‾ ‾‾ ‾‾ ‾‾ ‾‾
+
+ Offsets are calculated at initialization, labels are transformed by method
+ _codes_to_ints.
+
+ Keys are located by first locating each component against the respective
+ level, then locating (the integer representation of) codes.
+ """
+ def __init__(self, object levels, object labels,
+ ndarray[uint64_t, ndim=1] offsets):
+ """
+ Parameters
+ ----------
+ levels : list-like of numpy arrays
+ Levels of the MultiIndex
+ labels : list-like of numpy arrays of integer dtype
+ Labels of the MultiIndex
+ offsets : numpy array of uint64 dtype
+ Pre-calculated offsets, one for each level of the index
+ """
+
+ self.levels = levels
+ self.offsets = offsets
+
+ # Transform labels in a single array, and add 1 so that we are working
+ # with positive integers (-1 for NaN becomes 0):
+ codes = (np.array(labels, dtype='int64').T + 1).astype('uint64',
+ copy=False)
+
+ # Map each codes combination in the index to an integer unambiguously
+ # (no collisions possible), based on the "offsets", which describe the
+ # number of bits to switch labels for each level:
+ lab_ints = self._codes_to_ints(codes)
+
+ # Initialize underlying index (e.g. libindex.UInt64Engine) with
+ # integers representing labels: we will use its get_loc and get_indexer
+ self._base.__init__(self, lambda: lab_ints, len(lab_ints))
+
+ def _extract_level_codes(self, object target, object method=None):
+ """
+ Map the requested list of (tuple) keys to their integer representations
+ for searching in the underlying integer index.
+
+ Parameters
+ ----------
+ target : list-like of keys
+ Each key is a tuple, with a label for each level of the index.
+
+ Returns
+        -------
+ int_keys : 1-dimensional array of dtype uint64 or object
+ Integers representing one combination each
+ """
+
+ level_codes = [lev.get_indexer(codes) + 1 for lev, codes
+ in zip(self.levels, zip(*target))]
+ return self._codes_to_ints(np.array(level_codes, dtype='uint64').T)
+
+ def get_indexer(self, object target, object method=None,
+ object limit=None):
+ lab_ints = self._extract_level_codes(target)
+
+ # All methods (exact, backfill, pad) directly map to the respective
+ # methods of the underlying (integers) index...
+ if method is not None:
+ # but underlying backfill and pad methods require index and keys
+ # to be sorted. The index already is (checked in
+ # Index._get_fill_indexer), sort (integer representations of) keys:
+ order = np.argsort(lab_ints)
+ lab_ints = lab_ints[order]
+ indexer = (getattr(self._base, 'get_{}_indexer'.format(method))
+ (self, lab_ints, limit=limit))
+ indexer = indexer[order]
+ else:
+ indexer = self._base.get_indexer(self, lab_ints)
+
+ return indexer
+
+ def get_loc(self, object key):
+ if is_definitely_invalid_key(key):
+ raise TypeError("'{key}' is an invalid key".format(key=key))
+ if not isinstance(key, tuple):
+ raise KeyError(key)
+ try:
+ indices = [0 if checknull(v) else lev.get_loc(v) + 1
+ for lev, v in zip(self.levels, key)]
+ except KeyError:
+ raise KeyError(key)
+
+ # Transform indices into single integer:
+ lab_int = self._codes_to_ints(np.array(indices, dtype='uint64'))
+
+ return self._base.get_loc(self, lab_int)
+
+ def get_indexer_non_unique(self, object target):
+ # This needs to be overridden just because the default one works on
+ # target._values, and target can be itself a MultiIndex.
+
+ lab_ints = self._extract_level_codes(target)
+ indexer = self._base.get_indexer_non_unique(self, lab_ints)
+
+ return indexer
+
+ def __contains__(self, object val):
+ # Default __contains__ looks in the underlying mapping, which in this
+ # case only contains integer representations.
+ try:
+ self.get_loc(val)
+ return True
+ except (KeyError, TypeError, ValueError):
+ return False
+
+
+# Generated from template.
+include "index_class_helper.pxi"
diff --git a/contrib/python/pandas/py2/pandas/_libs/index_class_helper.pxi b/contrib/python/pandas/py2/pandas/_libs/index_class_helper.pxi
new file mode 100644
index 00000000000..f3c7efde5c0
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/_libs/index_class_helper.pxi
@@ -0,0 +1,409 @@
+"""
+Template for functions of IndexEngine subclasses.
+
+WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in
+"""
+
+# ----------------------------------------------------------------------
+# IndexEngine Subclass Methods
+# ----------------------------------------------------------------------
+
+
+cdef class Float64Engine(IndexEngine):
+
+ cdef _make_hash_table(self, n):
+ return _hash.Float64HashTable(n)
+
+
+ cpdef _call_map_locations(self, values):
+ # self.mapping is of type Float64HashTable,
+ # so convert dtype of values
+ self.mapping.map_locations(algos.ensure_float64(values))
+
+ cdef _get_index_values(self):
+ return algos.ensure_float64(self.vgetter())
+
+ cdef _maybe_get_bool_indexer(self, object val):
+ cdef:
+ ndarray[uint8_t, ndim=1, cast=True] indexer
+ ndarray[intp_t, ndim=1] found
+ ndarray[float64_t] values
+ int count = 0
+
+
+ # A view is needed for some subclasses, such as PeriodEngine:
+ values = self._get_index_values().view('float64')
+ indexer = values == val
+ found = np.where(indexer)[0]
+ count = len(found)
+
+ if count > 1:
+ return indexer
+ if count == 1:
+ return int(found[0])
+
+ raise KeyError(val)
+
+
+cdef class Float32Engine(IndexEngine):
+
+ cdef _make_hash_table(self, n):
+ return _hash.Float64HashTable(n)
+
+
+ cpdef _call_map_locations(self, values):
+ # self.mapping is of type Float64HashTable,
+ # so convert dtype of values
+ self.mapping.map_locations(algos.ensure_float64(values))
+
+ cdef _get_index_values(self):
+ return algos.ensure_float32(self.vgetter())
+
+ cdef _maybe_get_bool_indexer(self, object val):
+ cdef:
+ ndarray[uint8_t, ndim=1, cast=True] indexer
+ ndarray[intp_t, ndim=1] found
+ ndarray[float32_t] values
+ int count = 0
+
+
+ # A view is needed for some subclasses, such as PeriodEngine:
+ values = self._get_index_values().view('float32')
+ indexer = values == val
+ found = np.where(indexer)[0]
+ count = len(found)
+
+ if count > 1:
+ return indexer
+ if count == 1:
+ return int(found[0])
+
+ raise KeyError(val)
+
+
+cdef class Int64Engine(IndexEngine):
+
+ cdef _make_hash_table(self, n):
+ return _hash.Int64HashTable(n)
+
+ cdef _check_type(self, object val):
+ if not util.is_integer_object(val):
+ raise KeyError(val)
+
+ cpdef _call_map_locations(self, values):
+ # self.mapping is of type Int64HashTable,
+ # so convert dtype of values
+ self.mapping.map_locations(algos.ensure_int64(values))
+
+ cdef _get_index_values(self):
+ return algos.ensure_int64(self.vgetter())
+
+ cdef _maybe_get_bool_indexer(self, object val):
+ cdef:
+ ndarray[uint8_t, ndim=1, cast=True] indexer
+ ndarray[intp_t, ndim=1] found
+ ndarray[int64_t] values
+ int count = 0
+
+ if not util.is_integer_object(val):
+ raise KeyError(val)
+
+ # A view is needed for some subclasses, such as PeriodEngine:
+ values = self._get_index_values().view('int64')
+ indexer = values == val
+ found = np.where(indexer)[0]
+ count = len(found)
+
+ if count > 1:
+ return indexer
+ if count == 1:
+ return int(found[0])
+
+ raise KeyError(val)
+
+
+cdef class Int32Engine(IndexEngine):
+
+ cdef _make_hash_table(self, n):
+ return _hash.Int64HashTable(n)
+
+ cdef _check_type(self, object val):
+ if not util.is_integer_object(val):
+ raise KeyError(val)
+
+ cpdef _call_map_locations(self, values):
+ # self.mapping is of type Int64HashTable,
+ # so convert dtype of values
+ self.mapping.map_locations(algos.ensure_int64(values))
+
+ cdef _get_index_values(self):
+ return algos.ensure_int32(self.vgetter())
+
+ cdef _maybe_get_bool_indexer(self, object val):
+ cdef:
+ ndarray[uint8_t, ndim=1, cast=True] indexer
+ ndarray[intp_t, ndim=1] found
+ ndarray[int32_t] values
+ int count = 0
+
+ if not util.is_integer_object(val):
+ raise KeyError(val)
+
+ # A view is needed for some subclasses, such as PeriodEngine:
+ values = self._get_index_values().view('int32')
+ indexer = values == val
+ found = np.where(indexer)[0]
+ count = len(found)
+
+ if count > 1:
+ return indexer
+ if count == 1:
+ return int(found[0])
+
+ raise KeyError(val)
+
+
+cdef class Int16Engine(IndexEngine):
+
+ cdef _make_hash_table(self, n):
+ return _hash.Int64HashTable(n)
+
+ cdef _check_type(self, object val):
+ if not util.is_integer_object(val):
+ raise KeyError(val)
+
+ cpdef _call_map_locations(self, values):
+ # self.mapping is of type Int64HashTable,
+ # so convert dtype of values
+ self.mapping.map_locations(algos.ensure_int64(values))
+
+ cdef _get_index_values(self):
+ return algos.ensure_int16(self.vgetter())
+
+ cdef _maybe_get_bool_indexer(self, object val):
+ cdef:
+ ndarray[uint8_t, ndim=1, cast=True] indexer
+ ndarray[intp_t, ndim=1] found
+ ndarray[int16_t] values
+ int count = 0
+
+ if not util.is_integer_object(val):
+ raise KeyError(val)
+
+ # A view is needed for some subclasses, such as PeriodEngine:
+ values = self._get_index_values().view('int16')
+ indexer = values == val
+ found = np.where(indexer)[0]
+ count = len(found)
+
+ if count > 1:
+ return indexer
+ if count == 1:
+ return int(found[0])
+
+ raise KeyError(val)
+
+
+cdef class Int8Engine(IndexEngine):
+
+ cdef _make_hash_table(self, n):
+ return _hash.Int64HashTable(n)
+
+ cdef _check_type(self, object val):
+ if not util.is_integer_object(val):
+ raise KeyError(val)
+
+ cpdef _call_map_locations(self, values):
+ # self.mapping is of type Int64HashTable,
+ # so convert dtype of values
+ self.mapping.map_locations(algos.ensure_int64(values))
+
+ cdef _get_index_values(self):
+ return algos.ensure_int8(self.vgetter())
+
+ cdef _maybe_get_bool_indexer(self, object val):
+ cdef:
+ ndarray[uint8_t, ndim=1, cast=True] indexer
+ ndarray[intp_t, ndim=1] found
+ ndarray[int8_t] values
+ int count = 0
+
+ if not util.is_integer_object(val):
+ raise KeyError(val)
+
+ # A view is needed for some subclasses, such as PeriodEngine:
+ values = self._get_index_values().view('int8')
+ indexer = values == val
+ found = np.where(indexer)[0]
+ count = len(found)
+
+ if count > 1:
+ return indexer
+ if count == 1:
+ return int(found[0])
+
+ raise KeyError(val)
+
+
+cdef class UInt64Engine(IndexEngine):
+
+ cdef _make_hash_table(self, n):
+ return _hash.UInt64HashTable(n)
+
+ cdef _check_type(self, object val):
+ if not util.is_integer_object(val):
+ raise KeyError(val)
+
+ cpdef _call_map_locations(self, values):
+ # self.mapping is of type UInt64HashTable,
+ # so convert dtype of values
+ self.mapping.map_locations(algos.ensure_uint64(values))
+
+ cdef _get_index_values(self):
+ return algos.ensure_uint64(self.vgetter())
+
+ cdef _maybe_get_bool_indexer(self, object val):
+ cdef:
+ ndarray[uint8_t, ndim=1, cast=True] indexer
+ ndarray[intp_t, ndim=1] found
+ ndarray[uint64_t] values
+ int count = 0
+
+ if not util.is_integer_object(val):
+ raise KeyError(val)
+
+ # A view is needed for some subclasses, such as PeriodEngine:
+ values = self._get_index_values().view('uint64')
+ indexer = values == val
+ found = np.where(indexer)[0]
+ count = len(found)
+
+ if count > 1:
+ return indexer
+ if count == 1:
+ return int(found[0])
+
+ raise KeyError(val)
+
+
+cdef class UInt32Engine(IndexEngine):
+
+ cdef _make_hash_table(self, n):
+ return _hash.UInt64HashTable(n)
+
+ cdef _check_type(self, object val):
+ if not util.is_integer_object(val):
+ raise KeyError(val)
+
+ cpdef _call_map_locations(self, values):
+ # self.mapping is of type UInt64HashTable,
+ # so convert dtype of values
+ self.mapping.map_locations(algos.ensure_uint64(values))
+
+ cdef _get_index_values(self):
+ return algos.ensure_uint32(self.vgetter())
+
+ cdef _maybe_get_bool_indexer(self, object val):
+ cdef:
+ ndarray[uint8_t, ndim=1, cast=True] indexer
+ ndarray[intp_t, ndim=1] found
+ ndarray[uint32_t] values
+ int count = 0
+
+ if not util.is_integer_object(val):
+ raise KeyError(val)
+
+ # A view is needed for some subclasses, such as PeriodEngine:
+ values = self._get_index_values().view('uint32')
+ indexer = values == val
+ found = np.where(indexer)[0]
+ count = len(found)
+
+ if count > 1:
+ return indexer
+ if count == 1:
+ return int(found[0])
+
+ raise KeyError(val)
+
+
+cdef class UInt16Engine(IndexEngine):
+
+ cdef _make_hash_table(self, n):
+ return _hash.UInt64HashTable(n)
+
+ cdef _check_type(self, object val):
+ if not util.is_integer_object(val):
+ raise KeyError(val)
+
+ cpdef _call_map_locations(self, values):
+ # self.mapping is of type UInt64HashTable,
+ # so convert dtype of values
+ self.mapping.map_locations(algos.ensure_uint64(values))
+
+ cdef _get_index_values(self):
+ return algos.ensure_uint16(self.vgetter())
+
+ cdef _maybe_get_bool_indexer(self, object val):
+ cdef:
+ ndarray[uint8_t, ndim=1, cast=True] indexer
+ ndarray[intp_t, ndim=1] found
+ ndarray[uint16_t] values
+ int count = 0
+
+ if not util.is_integer_object(val):
+ raise KeyError(val)
+
+ # A view is needed for some subclasses, such as PeriodEngine:
+ values = self._get_index_values().view('uint16')
+ indexer = values == val
+ found = np.where(indexer)[0]
+ count = len(found)
+
+ if count > 1:
+ return indexer
+ if count == 1:
+ return int(found[0])
+
+ raise KeyError(val)
+
+
+cdef class UInt8Engine(IndexEngine):
+
+ cdef _make_hash_table(self, n):
+ return _hash.UInt64HashTable(n)
+
+ cdef _check_type(self, object val):
+ if not util.is_integer_object(val):
+ raise KeyError(val)
+
+ cpdef _call_map_locations(self, values):
+ # self.mapping is of type UInt64HashTable,
+ # so convert dtype of values
+ self.mapping.map_locations(algos.ensure_uint64(values))
+
+ cdef _get_index_values(self):
+ return algos.ensure_uint8(self.vgetter())
+
+ cdef _maybe_get_bool_indexer(self, object val):
+ cdef:
+ ndarray[uint8_t, ndim=1, cast=True] indexer
+ ndarray[intp_t, ndim=1] found
+ ndarray[uint8_t] values
+ int count = 0
+
+ if not util.is_integer_object(val):
+ raise KeyError(val)
+
+ # A view is needed for some subclasses, such as PeriodEngine:
+ values = self._get_index_values().view('uint8')
+ indexer = values == val
+ found = np.where(indexer)[0]
+ count = len(found)
+
+ if count > 1:
+ return indexer
+ if count == 1:
+ return int(found[0])
+
+ raise KeyError(val)
diff --git a/contrib/python/pandas/py2/pandas/_libs/indexing.pyx b/contrib/python/pandas/py2/pandas/_libs/indexing.pyx
new file mode 100644
index 00000000000..af6e00bad7f
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/_libs/indexing.pyx
@@ -0,0 +1,23 @@
+# -*- coding: utf-8 -*-
+
+
+cdef class _NDFrameIndexerBase:
+ """
+ A base class for _NDFrameIndexer for fast instantiation and attribute
+ access.
+ """
+ cdef public object obj, name, _ndim
+
+ def __init__(self, name, obj):
+ self.obj = obj
+ self.name = name
+ self._ndim = None
+
+ @property
+ def ndim(self):
+ # Delay `ndim` instantiation until required as reading it
+ # from `obj` isn't entirely cheap.
+ ndim = self._ndim
+ if ndim is None:
+ ndim = self._ndim = self.obj.ndim
+ return ndim
diff --git a/contrib/python/pandas/py2/pandas/_libs/internals.pyx b/contrib/python/pandas/py2/pandas/_libs/internals.pyx
new file mode 100644
index 00000000000..72a1cf16f96
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/_libs/internals.pyx
@@ -0,0 +1,465 @@
+# -*- coding: utf-8 -*-
+
+import cython
+from cython import Py_ssize_t
+
+from cpython cimport PyObject
+
+cdef extern from "Python.h":
+ Py_ssize_t PY_SSIZE_T_MAX
+
+import numpy as np
+from numpy cimport int64_t
+
+cdef extern from "compat_helper.h":
+ cdef int slice_get_indices(PyObject* s, Py_ssize_t length,
+ Py_ssize_t *start, Py_ssize_t *stop,
+ Py_ssize_t *step,
+ Py_ssize_t *slicelength) except -1
+
+
+from pandas._libs.algos import ensure_int64
+
+
+cdef class BlockPlacement:
+ # __slots__ = '_as_slice', '_as_array', '_len'
+ cdef slice _as_slice
+ cdef object _as_array
+
+ cdef bint _has_slice, _has_array, _is_known_slice_like
+
+ def __init__(self, val):
+ cdef:
+ slice slc
+
+ self._as_slice = None
+ self._as_array = None
+ self._has_slice = False
+ self._has_array = False
+
+ if isinstance(val, slice):
+ slc = slice_canonize(val)
+
+ if slc.start != slc.stop:
+ self._as_slice = slc
+ self._has_slice = True
+ else:
+ arr = np.empty(0, dtype=np.int64)
+ self._as_array = arr
+ self._has_array = True
+ else:
+ # Cython memoryview interface requires ndarray to be writeable.
+ arr = np.require(val, dtype=np.int64, requirements='W')
+ assert arr.ndim == 1
+ self._as_array = arr
+ self._has_array = True
+
+ def __str__(self):
+ cdef:
+ slice s = self._ensure_has_slice()
+ if s is not None:
+ v = self._as_slice
+ else:
+ v = self._as_array
+
+ return '%s(%r)' % (self.__class__.__name__, v)
+
+ def __repr__(self):
+ return str(self)
+
+ def __len__(self):
+ cdef:
+ slice s = self._ensure_has_slice()
+ if s is not None:
+ return slice_len(s)
+ else:
+ return len(self._as_array)
+
+ def __iter__(self):
+ cdef:
+ slice s = self._ensure_has_slice()
+ Py_ssize_t start, stop, step, _
+ if s is not None:
+ start, stop, step, _ = slice_get_indices_ex(s)
+ return iter(range(start, stop, step))
+ else:
+ return iter(self._as_array)
+
+ @property
+ def as_slice(self):
+ cdef:
+ slice s = self._ensure_has_slice()
+ if s is None:
+ raise TypeError('Not slice-like')
+ else:
+ return s
+
+ @property
+ def indexer(self):
+ cdef:
+ slice s = self._ensure_has_slice()
+ if s is not None:
+ return s
+ else:
+ return self._as_array
+
+ def isin(self, arr):
+ from pandas.core.index import Int64Index
+ return Int64Index(self.as_array, copy=False).isin(arr)
+
+ @property
+ def as_array(self):
+ cdef:
+ Py_ssize_t start, stop, end, _
+ if not self._has_array:
+ start, stop, step, _ = slice_get_indices_ex(self._as_slice)
+ self._as_array = np.arange(start, stop, step,
+ dtype=np.int64)
+ self._has_array = True
+ return self._as_array
+
+ @property
+ def is_slice_like(self):
+ cdef:
+ slice s = self._ensure_has_slice()
+ return s is not None
+
+ def __getitem__(self, loc):
+ cdef:
+ slice s = self._ensure_has_slice()
+ if s is not None:
+ val = slice_getitem(s, loc)
+ else:
+ val = self._as_array[loc]
+
+ if not isinstance(val, slice) and val.ndim == 0:
+ return val
+
+ return BlockPlacement(val)
+
+ def delete(self, loc):
+ return BlockPlacement(np.delete(self.as_array, loc, axis=0))
+
+ def append(self, others):
+ if len(others) == 0:
+ return self
+
+ return BlockPlacement(np.concatenate([self.as_array] +
+ [o.as_array for o in others]))
+
+ cdef iadd(self, other):
+ cdef:
+ slice s = self._ensure_has_slice()
+ Py_ssize_t other_int, start, stop, step, l
+
+ if isinstance(other, int) and s is not None:
+ other_int = <Py_ssize_t>other
+
+ if other_int == 0:
+ # BlockPlacement is treated as immutable
+ return self
+
+ start, stop, step, l = slice_get_indices_ex(s)
+ start += other_int
+ stop += other_int
+
+ if ((step > 0 and start < 0) or
+ (step < 0 and stop < step)):
+ raise ValueError("iadd causes length change")
+
+ if stop < 0:
+ val = slice(start, None, step)
+ else:
+ val = slice(start, stop, step)
+
+ return BlockPlacement(val)
+ else:
+ newarr = self.as_array + other
+ if (newarr < 0).any():
+ raise ValueError("iadd causes length change")
+
+ val = newarr
+ return BlockPlacement(val)
+
+ def add(self, other):
+ return self.iadd(other)
+
+ def sub(self, other):
+ return self.add(-other)
+
+ cdef slice _ensure_has_slice(self):
+ if not self._has_slice:
+ self._as_slice = indexer_as_slice(self._as_array)
+ self._has_slice = True
+ return self._as_slice
+
+
+cdef slice slice_canonize(slice s):
+ """
+ Convert slice to canonical bounded form.
+ """
+ cdef:
+ Py_ssize_t start = 0, stop = 0, step = 1, length
+
+ if s.step is None:
+ step = 1
+ else:
+ step = <Py_ssize_t>s.step
+ if step == 0:
+ raise ValueError("slice step cannot be zero")
+
+ if step > 0:
+ if s.stop is None:
+ raise ValueError("unbounded slice")
+
+ stop = <Py_ssize_t>s.stop
+ if s.start is None:
+ start = 0
+ else:
+ start = <Py_ssize_t>s.start
+ if start > stop:
+ start = stop
+ elif step < 0:
+ if s.start is None:
+ raise ValueError("unbounded slice")
+
+ start = <Py_ssize_t>s.start
+ if s.stop is None:
+ stop = -1
+ else:
+ stop = <Py_ssize_t>s.stop
+ if stop > start:
+ stop = start
+
+ if start < 0 or (stop < 0 and s.stop is not None):
+ raise ValueError("unbounded slice")
+
+ if stop < 0:
+ return slice(start, None, step)
+ else:
+ return slice(start, stop, step)
+
+
+cpdef Py_ssize_t slice_len(
+ slice slc, Py_ssize_t objlen=PY_SSIZE_T_MAX) except -1:
+ """
+ Get length of a bounded slice.
+
+ The slice must not have any "open" bounds that would create dependency on
+ container size, i.e.:
+ - if ``s.step is None or s.step > 0``, ``s.stop`` is not ``None``
+ - if ``s.step < 0``, ``s.start`` is not ``None``
+
+ Otherwise, the result is unreliable.
+
+ """
+ cdef:
+ Py_ssize_t start, stop, step, length
+
+ if slc is None:
+ raise TypeError("slc must be slice")
+
+ slice_get_indices(<PyObject *>slc, objlen,
+ &start, &stop, &step, &length)
+
+ return length
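+
+# Illustrative sketch (not part of the vendored sources):
+#
+#   from pandas._libs.internals import slice_len
+#
+#   slice_len(slice(2, 10, 3))   # -> 3   (elements 2, 5, 8)
+#   slice_len(slice(0, 0))       # -> 0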
+
+
+cdef slice_get_indices_ex(slice slc, Py_ssize_t objlen=PY_SSIZE_T_MAX):
+ """
+ Get (start, stop, step, length) tuple for a slice.
+
+ If `objlen` is not specified, slice must be bounded, otherwise the result
+ will be wrong.
+
+ """
+ cdef:
+ Py_ssize_t start, stop, step, length
+
+ if slc is None:
+ raise TypeError("slc should be a slice")
+
+ slice_get_indices(<PyObject *>slc, objlen,
+ &start, &stop, &step, &length)
+
+ return start, stop, step, length
+
+
+def slice_getitem(slice slc not None, ind):
+ cdef:
+ Py_ssize_t s_start, s_stop, s_step, s_len
+ Py_ssize_t ind_start, ind_stop, ind_step, ind_len
+
+ s_start, s_stop, s_step, s_len = slice_get_indices_ex(slc)
+
+ if isinstance(ind, slice):
+ ind_start, ind_stop, ind_step, ind_len = slice_get_indices_ex(ind,
+ s_len)
+
+        if ind_step > 0 and ind_len == s_len:
+            # short-cut for no-op slice
+            return slc
+
+ if ind_step < 0:
+ s_start = s_stop - s_step
+ ind_step = -ind_step
+
+ s_step *= ind_step
+ s_stop = s_start + ind_stop * s_step
+ s_start = s_start + ind_start * s_step
+
+ if s_step < 0 and s_stop < 0:
+ return slice(s_start, None, s_step)
+ else:
+ return slice(s_start, s_stop, s_step)
+
+ else:
+ return np.arange(s_start, s_stop, s_step, dtype=np.int64)[ind]
+
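+# Usage sketch for slice_getitem (illustrative): indexing a bounded slice with
+# another slice stays a slice, while fancy indexing falls back to an ndarray:
+#
+#   >>> slice_getitem(slice(2, 10, 2), slice(1, 3))
+#   slice(4, 8, 2)
+#   >>> slice_getitem(slice(2, 10, 2), [0, 3])
+#   array([2, 8])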
+
+cpdef slice indexer_as_slice(int64_t[:] vals):
+ cdef:
+ Py_ssize_t i, n, start, stop
+ int64_t d
+
+ if vals is None:
+ raise TypeError("vals must be ndarray")
+
+ n = vals.shape[0]
+
+ if n == 0 or vals[0] < 0:
+ return None
+
+ if n == 1:
+ return slice(vals[0], vals[0] + 1, 1)
+
+ if vals[1] < 0:
+ return None
+
+ # n > 2
+ d = vals[1] - vals[0]
+
+ if d == 0:
+ return None
+
+ for i in range(2, n):
+ if vals[i] < 0 or vals[i] - vals[i - 1] != d:
+ return None
+
+ start = vals[0]
+ stop = start + n * d
+ if stop < 0 and d < 0:
+ return slice(start, None, d)
+ else:
+ return slice(start, stop, d)
+
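+# Usage sketch for indexer_as_slice (illustrative): an evenly spaced,
+# non-negative int64 indexer collapses to a slice, anything else gives None:
+#
+#   >>> import numpy as np
+#   >>> indexer_as_slice(np.array([2, 4, 6], dtype=np.int64))
+#   slice(2, 8, 2)
+#   >>> indexer_as_slice(np.array([2, 4, 7], dtype=np.int64)) is None
+#   True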
+
+def get_blkno_indexers(int64_t[:] blknos, bint group=True):
+ """
+ Enumerate contiguous runs of integers in ndarray.
+
+ Iterate over elements of `blknos` yielding ``(blkno, slice(start, stop))``
+ pairs for each contiguous run found.
+
+    If `group` is True and there is more than one run for a given blkno,
+    a single ``(blkno, array)`` pair is yielded instead, with the array
+    containing the positions of all elements equal to that blkno.
+
+ Returns
+ -------
+ iter : iterator of (int, slice or array)
+
+ """
+ # There's blkno in this function's name because it's used in block &
+ # blockno handling.
+ cdef:
+ int64_t cur_blkno
+ Py_ssize_t i, start, stop, n, diff
+
+ object blkno
+ list group_order
+ dict group_slices
+ int64_t[:] res_view
+
+ n = blknos.shape[0]
+
+ if n == 0:
+ return
+
+ start = 0
+ cur_blkno = blknos[start]
+
+ if group is False:
+ for i in range(1, n):
+ if blknos[i] != cur_blkno:
+ yield cur_blkno, slice(start, i)
+
+ start = i
+ cur_blkno = blknos[i]
+
+ yield cur_blkno, slice(start, n)
+ else:
+ group_order = []
+ group_dict = {}
+
+ for i in range(1, n):
+ if blknos[i] != cur_blkno:
+ if cur_blkno not in group_dict:
+ group_order.append(cur_blkno)
+ group_dict[cur_blkno] = [(start, i)]
+ else:
+ group_dict[cur_blkno].append((start, i))
+
+ start = i
+ cur_blkno = blknos[i]
+
+ if cur_blkno not in group_dict:
+ group_order.append(cur_blkno)
+ group_dict[cur_blkno] = [(start, n)]
+ else:
+ group_dict[cur_blkno].append((start, n))
+
+ for blkno in group_order:
+ slices = group_dict[blkno]
+ if len(slices) == 1:
+ yield blkno, slice(slices[0][0], slices[0][1])
+ else:
+ tot_len = sum(stop - start for start, stop in slices)
+ result = np.empty(tot_len, dtype=np.int64)
+ res_view = result
+
+ i = 0
+ for start, stop in slices:
+ for diff in range(start, stop):
+ res_view[i] = diff
+ i += 1
+
+ yield blkno, result
+
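+# Usage sketch for get_blkno_indexers (illustrative): with grouping enabled, a
+# blkno that occurs in several runs is reported once with an integer array of
+# positions, while a single run keeps its slice:
+#
+#   >>> import numpy as np
+#   >>> blknos = np.array([0, 0, 1, 1, 0], dtype=np.int64)
+#   >>> list(get_blkno_indexers(blknos, group=True))
+#   [(0, array([0, 1, 4])), (1, slice(2, 4, None))]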
+
+def get_blkno_placements(blknos, blk_count, group=True):
+ """
+
+ Parameters
+ ----------
+ blknos : array of int64
+ blk_count : int
+ group : bool
+
+ Returns
+ -------
+    iterator
+        Yields ``(blkno, BlockPlacement)`` pairs.
+
+ """
+
+ blknos = ensure_int64(blknos)
+
+ # FIXME: blk_count is unused, but it may avoid the use of dicts in cython
+ for blkno, indexer in get_blkno_indexers(blknos, group):
+ yield blkno, BlockPlacement(indexer)
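+
+
+# Usage sketch for get_blkno_placements (illustrative): same ``blknos`` as in
+# the sketch above, but each indexer is wrapped in a BlockPlacement (blk_count
+# is currently unused, per the FIXME above):
+#
+#   >>> [(blkno, type(placement).__name__)
+#   ...  for blkno, placement in get_blkno_placements(blknos, 2, group=True)]
+#   [(0, 'BlockPlacement'), (1, 'BlockPlacement')]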
diff --git a/contrib/python/pandas/py2/pandas/_libs/interval.pyx b/contrib/python/pandas/py2/pandas/_libs/interval.pyx
new file mode 100644
index 00000000000..3147f36dcc8
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/_libs/interval.pyx
@@ -0,0 +1,488 @@
+# -*- coding: utf-8 -*-
+import numbers
+from operator import le, lt
+
+from cpython.object cimport (Py_EQ, Py_NE, Py_GT, Py_LT, Py_GE, Py_LE,
+ PyObject_RichCompare)
+
+import cython
+from cython import Py_ssize_t
+
+import numpy as np
+cimport numpy as cnp
+from numpy cimport (
+ int64_t, int32_t, float64_t, float32_t, uint64_t,
+ ndarray,
+ PyArray_ArgSort, NPY_QUICKSORT, PyArray_Take)
+cnp.import_array()
+
+
+cimport pandas._libs.util as util
+util.import_array()
+
+from pandas._libs.hashtable cimport Int64Vector, Int64VectorData
+
+from pandas._libs.tslibs import Timestamp
+from pandas._libs.tslibs.timezones cimport tz_compare
+
+
+_VALID_CLOSED = frozenset(['left', 'right', 'both', 'neither'])
+
+
+cdef class IntervalMixin(object):
+
+ @property
+ def closed_left(self):
+ """
+ Check if the interval is closed on the left side.
+
+ For the meaning of `closed` and `open` see :class:`~pandas.Interval`.
+
+ Returns
+ -------
+ bool
+ ``True`` if the Interval is closed on the left-side, else
+ ``False``.
+ """
+ return self.closed in ('left', 'both')
+
+ @property
+ def closed_right(self):
+ """
+ Check if the interval is closed on the right side.
+
+ For the meaning of `closed` and `open` see :class:`~pandas.Interval`.
+
+ Returns
+ -------
+ bool
+            ``True`` if the Interval is closed on the right-side, else
+ ``False``.
+ """
+ return self.closed in ('right', 'both')
+
+ @property
+ def open_left(self):
+ """
+ Check if the interval is open on the left side.
+
+ For the meaning of `closed` and `open` see :class:`~pandas.Interval`.
+
+ Returns
+ -------
+ bool
+            ``True`` if the Interval is not closed on the left-side, else
+ ``False``.
+ """
+ return not self.closed_left
+
+ @property
+ def open_right(self):
+ """
+ Check if the interval is open on the right side.
+
+ For the meaning of `closed` and `open` see :class:`~pandas.Interval`.
+
+ Returns
+ -------
+ bool
+            ``True`` if the Interval is not closed on the right-side, else
+ ``False``.
+ """
+ return not self.closed_right
+
+ @property
+ def mid(self):
+ """
+ Return the midpoint of the Interval
+ """
+ try:
+ return 0.5 * (self.left + self.right)
+ except TypeError:
+ # datetime safe version
+ return self.left + 0.5 * self.length
+
+ @property
+ def length(self):
+ """Return the length of the Interval"""
+ try:
+ return self.right - self.left
+ except TypeError:
+ # length not defined for some types, e.g. string
+ msg = 'cannot compute length between {left!r} and {right!r}'
+ raise TypeError(msg.format(left=self.left, right=self.right))
+
+ def _check_closed_matches(self, other, name='other'):
+ """Check if the closed attribute of `other` matches.
+
+ Note that 'left' and 'right' are considered different from 'both'.
+
+ Parameters
+ ----------
+ other : Interval, IntervalIndex, IntervalArray
+ name : str
+ Name to use for 'other' in the error message.
+
+ Raises
+ ------
+ ValueError
+ When `other` is not closed exactly the same as self.
+ """
+ if self.closed != other.closed:
+ msg = "'{}.closed' is '{}', expected '{}'."
+ raise ValueError(msg.format(name, other.closed, self.closed))
+
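+# Illustrative example for _check_closed_matches: mismatching sides raise, e.g.
+# calling Interval(0, 1, closed='left')._check_closed_matches(
+# Interval(0, 1, closed='right')) is expected to raise
+# ValueError: 'other.closed' is 'right', expected 'left'.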
+
+cdef _interval_like(other):
+ return (hasattr(other, 'left')
+ and hasattr(other, 'right')
+ and hasattr(other, 'closed'))
+
+
+cdef class Interval(IntervalMixin):
+ """
+ Immutable object implementing an Interval, a bounded slice-like interval.
+
+ .. versionadded:: 0.20.0
+
+ Parameters
+ ----------
+ left : orderable scalar
+ Left bound for the interval.
+ right : orderable scalar
+ Right bound for the interval.
+    closed : {'right', 'left', 'both', 'neither'}, default 'right'
+        Whether the interval is closed on the left-side, right-side, both or
+        neither. See the Notes for more detailed explanation.
+
+ See Also
+ --------
+ IntervalIndex : An Index of Interval objects that are all closed on the
+ same side.
+ cut : Convert continuous data into discrete bins (Categorical
+ of Interval objects).
+ qcut : Convert continuous data into bins (Categorical of Interval objects)
+ based on quantiles.
+ Period : Represents a period of time.
+
+ Notes
+ -----
+    The parameters `left` and `right` must be of the same type; you must be
+    able to compare them and they must satisfy ``left <= right``.
+
+ A closed interval (in mathematics denoted by square brackets) contains
+ its endpoints, i.e. the closed interval ``[0, 5]`` is characterized by the
+ conditions ``0 <= x <= 5``. This is what ``closed='both'`` stands for.
+ An open interval (in mathematics denoted by parentheses) does not contain
+ its endpoints, i.e. the open interval ``(0, 5)`` is characterized by the
+ conditions ``0 < x < 5``. This is what ``closed='neither'`` stands for.
+ Intervals can also be half-open or half-closed, i.e. ``[0, 5)`` is
+ described by ``0 <= x < 5`` (``closed='left'``) and ``(0, 5]`` is
+ described by ``0 < x <= 5`` (``closed='right'``).
+
+ Examples
+ --------
+ It is possible to build Intervals of different types, like numeric ones:
+
+ >>> iv = pd.Interval(left=0, right=5)
+ >>> iv
+ Interval(0, 5, closed='right')
+
+ You can check if an element belongs to it
+
+ >>> 2.5 in iv
+ True
+
+ You can test the bounds (``closed='right'``, so ``0 < x <= 5``):
+
+ >>> 0 in iv
+ False
+ >>> 5 in iv
+ True
+ >>> 0.0001 in iv
+ True
+
+ Calculate its length
+
+ >>> iv.length
+ 5
+
+ You can operate with `+` and `*` over an Interval and the operation
+ is applied to each of its bounds, so the result depends on the type
+ of the bound elements
+
+ >>> shifted_iv = iv + 3
+ >>> shifted_iv
+ Interval(3, 8, closed='right')
+ >>> extended_iv = iv * 10.0
+ >>> extended_iv
+ Interval(0.0, 50.0, closed='right')
+
+ To create a time interval you can use Timestamps as the bounds
+
+ >>> year_2017 = pd.Interval(pd.Timestamp('2017-01-01 00:00:00'),
+ ... pd.Timestamp('2018-01-01 00:00:00'),
+ ... closed='left')
+ >>> pd.Timestamp('2017-01-01 00:00') in year_2017
+ True
+ >>> year_2017.length
+ Timedelta('365 days 00:00:00')
+
+ And also you can create string intervals
+
+ >>> volume_1 = pd.Interval('Ant', 'Dog', closed='both')
+ >>> 'Bee' in volume_1
+ True
+ """
+ _typ = "interval"
+
+ cdef readonly object left
+ """Left bound for the interval"""
+
+ cdef readonly object right
+ """Right bound for the interval"""
+
+ cdef readonly str closed
+ """
+ Whether the interval is closed on the left-side, right-side, both or
+ neither
+ """
+
+ def __init__(self, left, right, str closed='right'):
+ # note: it is faster to just do these checks than to use a special
+ # constructor (__cinit__/__new__) to avoid them
+ if closed not in _VALID_CLOSED:
+ msg = "invalid option for 'closed': {closed}".format(closed=closed)
+ raise ValueError(msg)
+ if not left <= right:
+ raise ValueError('left side of interval must be <= right side')
+ if (isinstance(left, Timestamp) and
+ not tz_compare(left.tzinfo, right.tzinfo)):
+ # GH 18538
+ msg = ("left and right must have the same time zone, got "
+ "'{left_tz}' and '{right_tz}'")
+ raise ValueError(msg.format(left_tz=left.tzinfo,
+ right_tz=right.tzinfo))
+ self.left = left
+ self.right = right
+ self.closed = closed
+
+ def __hash__(self):
+ return hash((self.left, self.right, self.closed))
+
+ def __contains__(self, key):
+ if _interval_like(key):
+ raise TypeError('__contains__ not defined for two intervals')
+ return ((self.left < key if self.open_left else self.left <= key) and
+ (key < self.right if self.open_right else key <= self.right))
+
+ def __richcmp__(self, other, op: int):
+ if hasattr(other, 'ndim'):
+ # let numpy (or IntervalIndex) handle vectorization
+ return NotImplemented
+
+ if _interval_like(other):
+ self_tuple = (self.left, self.right, self.closed)
+ other_tuple = (other.left, other.right, other.closed)
+ return PyObject_RichCompare(self_tuple, other_tuple, op)
+
+ # nb. could just return NotImplemented now, but handling this
+ # explicitly allows us to opt into the Python 3 behavior, even on
+ # Python 2.
+ if op == Py_EQ or op == Py_NE:
+ return NotImplemented
+ else:
+ name = type(self).__name__
+ other = type(other).__name__
+ op_str = {Py_LT: '<', Py_LE: '<=', Py_GT: '>', Py_GE: '>='}[op]
+ raise TypeError('unorderable types: {name}() {op} {other}()'
+ .format(name=name, op=op_str, other=other))
+
+ def __reduce__(self):
+ args = (self.left, self.right, self.closed)
+ return (type(self), args)
+
+ def _repr_base(self):
+ left = self.left
+ right = self.right
+
+ # TODO: need more general formatting methodology here
+ if isinstance(left, Timestamp) and isinstance(right, Timestamp):
+ left = left._short_repr
+ right = right._short_repr
+
+ return left, right
+
+ def __repr__(self):
+
+ left, right = self._repr_base()
+ name = type(self).__name__
+ repr_str = '{name}({left!r}, {right!r}, closed={closed!r})'.format(
+ name=name, left=left, right=right, closed=self.closed)
+ return repr_str
+
+ def __str__(self):
+
+ left, right = self._repr_base()
+ start_symbol = '[' if self.closed_left else '('
+ end_symbol = ']' if self.closed_right else ')'
+ return '{start}{left}, {right}{end}'.format(
+ start=start_symbol, left=left, right=right, end=end_symbol)
+
+ def __add__(self, y):
+ if isinstance(y, numbers.Number):
+ return Interval(self.left + y, self.right + y, closed=self.closed)
+ elif isinstance(y, Interval) and isinstance(self, numbers.Number):
+ return Interval(y.left + self, y.right + self, closed=y.closed)
+ return NotImplemented
+
+ def __sub__(self, y):
+ if isinstance(y, numbers.Number):
+ return Interval(self.left - y, self.right - y, closed=self.closed)
+ return NotImplemented
+
+ def __mul__(self, y):
+ if isinstance(y, numbers.Number):
+ return Interval(self.left * y, self.right * y, closed=self.closed)
+ elif isinstance(y, Interval) and isinstance(self, numbers.Number):
+ return Interval(y.left * self, y.right * self, closed=y.closed)
+ return NotImplemented
+
+ def __div__(self, y):
+ if isinstance(y, numbers.Number):
+ return Interval(self.left / y, self.right / y, closed=self.closed)
+ return NotImplemented
+
+ def __truediv__(self, y):
+ if isinstance(y, numbers.Number):
+ return Interval(self.left / y, self.right / y, closed=self.closed)
+ return NotImplemented
+
+ def __floordiv__(self, y):
+ if isinstance(y, numbers.Number):
+ return Interval(
+ self.left // y, self.right // y, closed=self.closed)
+ return NotImplemented
+
+ def overlaps(self, other):
+ """
+ Check whether two Interval objects overlap.
+
+ Two intervals overlap if they share a common point, including closed
+ endpoints. Intervals that only have an open endpoint in common do not
+ overlap.
+
+ .. versionadded:: 0.24.0
+
+ Parameters
+ ----------
+ other : Interval
+ The interval to check against for an overlap.
+
+ Returns
+ -------
+ bool
+ ``True`` if the two intervals overlap, else ``False``.
+
+ See Also
+ --------
+ IntervalArray.overlaps : The corresponding method for IntervalArray.
+ IntervalIndex.overlaps : The corresponding method for IntervalIndex.
+
+ Examples
+ --------
+ >>> i1 = pd.Interval(0, 2)
+ >>> i2 = pd.Interval(1, 3)
+ >>> i1.overlaps(i2)
+ True
+ >>> i3 = pd.Interval(4, 5)
+ >>> i1.overlaps(i3)
+ False
+
+ Intervals that share closed endpoints overlap:
+
+ >>> i4 = pd.Interval(0, 1, closed='both')
+ >>> i5 = pd.Interval(1, 2, closed='both')
+ >>> i4.overlaps(i5)
+ True
+
+ Intervals that only have an open endpoint in common do not overlap:
+
+ >>> i6 = pd.Interval(1, 2, closed='neither')
+ >>> i4.overlaps(i6)
+ False
+ """
+ if not isinstance(other, Interval):
+ msg = '`other` must be an Interval, got {other}'
+ raise TypeError(msg.format(other=type(other).__name__))
+
+ # equality is okay if both endpoints are closed (overlap at a point)
+ op1 = le if (self.closed_left and other.closed_right) else lt
+ op2 = le if (other.closed_left and self.closed_right) else lt
+
+        # overlaps is equivalent to the negation of the two intervals being
+        # disjoint:
+        #   disjoint = (A.left > B.right) or (B.left > A.right)
+        # (simplifying the negation allows this to be done in fewer operations)
+ return op1(self.left, other.right) and op2(other.left, self.right)
+
+
+def intervals_to_interval_bounds(ndarray intervals,
+ bint validate_closed=True):
+ """
+ Parameters
+ ----------
+ intervals : ndarray
+ object array of Intervals / nulls
+
+    validate_closed : bool, default True
+        Whether all intervals must be closed on the same side.  If True,
+        mismatching closed sides raise a ValueError; otherwise ``closed``
+        is returned as None.
+
+ Returns
+ -------
+    tuple of (left : object ndarray,
+              right : object ndarray,
+              closed : str)
+
+ """
+
+ cdef:
+ object closed = None, interval
+ int64_t n = len(intervals)
+ ndarray left, right
+ bint seen_closed = False
+
+ left = np.empty(n, dtype=intervals.dtype)
+ right = np.empty(n, dtype=intervals.dtype)
+
+ for i in range(len(intervals)):
+ interval = intervals[i]
+ if interval is None or util.is_nan(interval):
+ left[i] = np.nan
+ right[i] = np.nan
+ continue
+
+ if not isinstance(interval, Interval):
+ raise TypeError("type {typ} with value {iv} is not an interval"
+ .format(typ=type(interval), iv=interval))
+
+ left[i] = interval.left
+ right[i] = interval.right
+ if not seen_closed:
+ seen_closed = True
+ closed = interval.closed
+ elif closed != interval.closed:
+ closed = None
+ if validate_closed:
+ msg = 'intervals must all be closed on the same side'
+ raise ValueError(msg)
+
+ return left, right, closed
+
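+# Usage sketch for intervals_to_interval_bounds (illustrative), assuming this
+# module is importable as ``pandas._libs.interval``:
+#
+#   >>> import numpy as np
+#   >>> arr = np.array([Interval(0, 1), Interval(1, 2)], dtype=object)
+#   >>> left, right, closed = intervals_to_interval_bounds(arr)
+#   >>> list(left), list(right), closed
+#   ([0, 1], [1, 2], 'right')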
+
+include "intervaltree.pxi"
diff --git a/contrib/python/pandas/py2/pandas/_libs/intervaltree.pxi b/contrib/python/pandas/py2/pandas/_libs/intervaltree.pxi
new file mode 100644
index 00000000000..5cab9024b15
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/_libs/intervaltree.pxi
@@ -0,0 +1,3618 @@
+"""
+Template for intervaltree
+
+WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in
+"""
+
+ctypedef fused scalar_t:
+ float64_t
+ float32_t
+ int64_t
+ int32_t
+ uint64_t
+
+# ----------------------------------------------------------------------
+# IntervalTree
+# ----------------------------------------------------------------------
+
+cdef class IntervalTree(IntervalMixin):
+ """A centered interval tree
+
+    Based on the algorithm described on Wikipedia:
+    http://en.wikipedia.org/wiki/Interval_tree
+
+    We are emulating the IndexEngine interface.
+ """
+ cdef:
+ readonly object left, right, root, dtype
+ readonly str closed
+ object _is_overlapping, _left_sorter, _right_sorter
+
+ def __init__(self, left, right, closed='right', leaf_size=100):
+ """
+ Parameters
+ ----------
+ left, right : np.ndarray[ndim=1]
+ Left and right bounds for each interval. Assumed to contain no
+ NaNs.
+ closed : {'left', 'right', 'both', 'neither'}, optional
+ Whether the intervals are closed on the left-side, right-side, both
+ or neither. Defaults to 'right'.
+ leaf_size : int, optional
+ Parameter that controls when the tree switches from creating nodes
+ to brute-force search. Tune this parameter to optimize query
+ performance.
+ """
+ if closed not in ['left', 'right', 'both', 'neither']:
+ raise ValueError("invalid option for 'closed': %s" % closed)
+
+ left = np.asarray(left)
+ right = np.asarray(right)
+ self.dtype = np.result_type(left, right)
+ self.left = np.asarray(left, dtype=self.dtype)
+ self.right = np.asarray(right, dtype=self.dtype)
+
+ indices = np.arange(len(left), dtype='int64')
+
+ self.closed = closed
+
+ # GH 23352: ensure no nan in nodes
+ mask = ~np.isnan(self.left)
+ self.left = self.left[mask]
+ self.right = self.right[mask]
+ indices = indices[mask]
+
+ node_cls = NODE_CLASSES[str(self.dtype), closed]
+ self.root = node_cls(self.left, self.right, indices, leaf_size)
+
+ @property
+ def left_sorter(self):
+ """How to sort the left labels; this is used for binary search
+ """
+ if self._left_sorter is None:
+ self._left_sorter = np.argsort(self.left)
+ return self._left_sorter
+
+ @property
+ def right_sorter(self):
+ """How to sort the right labels
+ """
+ if self._right_sorter is None:
+ self._right_sorter = np.argsort(self.right)
+ return self._right_sorter
+
+ @property
+ def is_overlapping(self):
+ """
+ Determine if the IntervalTree contains overlapping intervals.
+ Cached as self._is_overlapping.
+ """
+ if self._is_overlapping is not None:
+ return self._is_overlapping
+
+ # <= when both sides closed since endpoints can overlap
+ op = le if self.closed == 'both' else lt
+
+ # overlap if start of current interval < end of previous interval
+ # (current and previous in terms of sorted order by left/start side)
+ current = self.left[self.left_sorter[1:]]
+ previous = self.right[self.left_sorter[:-1]]
+ self._is_overlapping = bool(op(current, previous).any())
+
+ return self._is_overlapping
+
+ def get_loc(self, scalar_t key):
+ """Return all positions corresponding to intervals that overlap with
+ the given scalar key
+ """
+ result = Int64Vector()
+ self.root.query(result, key)
+ if not result.data.n:
+ raise KeyError(key)
+ return result.to_array().astype('intp')
+
+ def _get_partial_overlap(self, key_left, key_right, side):
+ """Return all positions corresponding to intervals with the given side
+ falling between the left and right bounds of an interval query
+ """
+ if side == 'left':
+ values = self.left
+ sorter = self.left_sorter
+ else:
+ values = self.right
+ sorter = self.right_sorter
+ key = [key_left, key_right]
+ i, j = values.searchsorted(key, sorter=sorter)
+ return sorter[i:j]
+
+ def get_loc_interval(self, key_left, key_right):
+ """Lookup the intervals enclosed in the given interval bounds
+
+ The given interval is presumed to have closed bounds.
+ """
+ import pandas as pd
+ left_overlap = self._get_partial_overlap(key_left, key_right, 'left')
+ right_overlap = self._get_partial_overlap(key_left, key_right, 'right')
+ enclosing = self.get_loc(0.5 * (key_left + key_right))
+ combined = np.concatenate([left_overlap, right_overlap, enclosing])
+ uniques = pd.unique(combined)
+ return uniques.astype('intp')
+
+ def get_indexer(self, scalar_t[:] target):
+ """Return the positions corresponding to unique intervals that overlap
+ with the given array of scalar targets.
+ """
+
+ # TODO: write get_indexer_intervals
+ cdef:
+ size_t old_len
+ Py_ssize_t i
+ Int64Vector result
+
+ result = Int64Vector()
+ old_len = 0
+ for i in range(len(target)):
+ self.root.query(result, target[i])
+ if result.data.n == old_len:
+ result.append(-1)
+ elif result.data.n > old_len + 1:
+ raise KeyError(
+ 'indexer does not intersect a unique set of intervals')
+ old_len = result.data.n
+ return result.to_array().astype('intp')
+
+ def get_indexer_non_unique(self, scalar_t[:] target):
+ """Return the positions corresponding to intervals that overlap with
+ the given array of scalar targets. Non-unique positions are repeated.
+ """
+ cdef:
+ size_t old_len
+ Py_ssize_t i
+ Int64Vector result, missing
+
+ result = Int64Vector()
+ missing = Int64Vector()
+ old_len = 0
+ for i in range(len(target)):
+ self.root.query(result, target[i])
+ if result.data.n == old_len:
+ result.append(-1)
+ missing.append(i)
+ old_len = result.data.n
+ return (result.to_array().astype('intp'),
+ missing.to_array().astype('intp'))
+
+ def __repr__(self):
+ return ('<IntervalTree[{dtype},{closed}]: '
+ '{n_elements} elements>'.format(
+ dtype=self.dtype, closed=self.closed,
+ n_elements=self.root.n_elements))
+
+ # compat with IndexEngine interface
+ def clear_mapping(self):
+ pass
+
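+# Usage sketch for IntervalTree (illustrative); with float64 bounds the
+# Float64ClosedRightIntervalNode class defined below is used as the root:
+#
+#   >>> import numpy as np
+#   >>> tree = IntervalTree([0.0, 2.0], [1.0, 3.0], closed='right')
+#   >>> tree.get_loc(0.5)
+#   array([0])
+#   >>> tree.get_indexer(np.array([2.5]))
+#   array([1])
+#   >>> tree.is_overlapping
+#   False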
+
+cdef take(ndarray source, ndarray indices):
+ """Take the given positions from a 1D ndarray
+ """
+ return PyArray_Take(source, indices, 0)
+
+
+cdef sort_values_and_indices(all_values, all_indices, subset):
+ indices = take(all_indices, subset)
+ values = take(all_values, subset)
+ sorter = PyArray_ArgSort(values, 0, NPY_QUICKSORT)
+ sorted_values = take(values, sorter)
+ sorted_indices = take(indices, sorter)
+ return sorted_values, sorted_indices
+
+
+# ----------------------------------------------------------------------
+# Nodes
+# ----------------------------------------------------------------------
+
+# we need specialized nodes and leaves to optimize for different dtype and
+# closed values
+
+NODE_CLASSES = {}
+
+cdef class Float32ClosedLeftIntervalNode:
+ """Non-terminal node for an IntervalTree
+
+ Categorizes intervals by those that fall to the left, those that fall to
+ the right, and those that overlap with the pivot.
+ """
+ cdef:
+ Float32ClosedLeftIntervalNode left_node, right_node
+ float32_t[:] center_left_values, center_right_values, left, right
+ int64_t[:] center_left_indices, center_right_indices, indices
+ float32_t min_left, max_right
+ readonly float32_t pivot
+ readonly int64_t n_elements, n_center, leaf_size
+ readonly bint is_leaf_node
+
+ def __init__(self,
+ ndarray[float32_t, ndim=1] left,
+ ndarray[float32_t, ndim=1] right,
+ ndarray[int64_t, ndim=1] indices,
+ int64_t leaf_size):
+
+ self.n_elements = len(left)
+ self.leaf_size = leaf_size
+
+        # min_left and max_right are used to speed up queries by skipping
+        # queries on sub-nodes. If this node has size 0, queries are cheap,
+ # so these values don't matter.
+ if left.size > 0:
+ self.min_left = left.min()
+ self.max_right = right.max()
+ else:
+ self.min_left = 0
+ self.max_right = 0
+
+ if self.n_elements <= leaf_size:
+ # make this a terminal (leaf) node
+ self.is_leaf_node = True
+ self.left = left
+ self.right = right
+ self.indices = indices
+ self.n_center = 0
+ else:
+ # calculate a pivot so we can create child nodes
+ self.is_leaf_node = False
+ self.pivot = np.median(left / 2 + right / 2)
+ left_set, right_set, center_set = self.classify_intervals(
+ left, right)
+
+ self.left_node = self.new_child_node(left, right,
+ indices, left_set)
+ self.right_node = self.new_child_node(left, right,
+ indices, right_set)
+
+ self.center_left_values, self.center_left_indices = \
+ sort_values_and_indices(left, indices, center_set)
+ self.center_right_values, self.center_right_indices = \
+ sort_values_and_indices(right, indices, center_set)
+ self.n_center = len(self.center_left_indices)
+
+ @cython.wraparound(False)
+ @cython.boundscheck(False)
+ cdef classify_intervals(self, float32_t[:] left, float32_t[:] right):
+ """Classify the given intervals based upon whether they fall to the
+ left, right, or overlap with this node's pivot.
+ """
+ cdef:
+ Int64Vector left_ind, right_ind, overlapping_ind
+ Py_ssize_t i
+
+ left_ind = Int64Vector()
+ right_ind = Int64Vector()
+ overlapping_ind = Int64Vector()
+
+ for i in range(self.n_elements):
+ if right[i] <= self.pivot:
+ left_ind.append(i)
+ elif self.pivot < left[i]:
+ right_ind.append(i)
+ else:
+ overlapping_ind.append(i)
+
+ return (left_ind.to_array(),
+ right_ind.to_array(),
+ overlapping_ind.to_array())
+
+ cdef new_child_node(self,
+ ndarray[float32_t, ndim=1] left,
+ ndarray[float32_t, ndim=1] right,
+ ndarray[int64_t, ndim=1] indices,
+ ndarray[int64_t, ndim=1] subset):
+ """Create a new child node.
+ """
+ left = take(left, subset)
+ right = take(right, subset)
+ indices = take(indices, subset)
+ return Float32ClosedLeftIntervalNode(
+ left, right, indices, self.leaf_size)
+
+ @cython.wraparound(False)
+ @cython.boundscheck(False)
+ @cython.initializedcheck(False)
+ cpdef query(self, Int64Vector result, scalar_t point):
+ """Recursively query this node and its sub-nodes for intervals that
+ overlap with the query point.
+ """
+ cdef:
+ int64_t[:] indices
+ float32_t[:] values
+ Py_ssize_t i
+
+ if self.is_leaf_node:
+ # Once we get down to a certain size, it doesn't make sense to
+ # continue the binary tree structure. Instead, we use linear
+ # search.
+ for i in range(self.n_elements):
+ if self.left[i] <= point < self.right[i]:
+ result.append(self.indices[i])
+ else:
+ # There are child nodes. Based on comparing our query to the pivot,
+ # look at the center values, then go to the relevant child.
+ if point < self.pivot:
+ values = self.center_left_values
+ indices = self.center_left_indices
+ for i in range(self.n_center):
+ if not values[i] <= point:
+ break
+ result.append(indices[i])
+ if point < self.left_node.max_right:
+ self.left_node.query(result, point)
+ elif point > self.pivot:
+ values = self.center_right_values
+ indices = self.center_right_indices
+ for i in range(self.n_center - 1, -1, -1):
+ if not point < values[i]:
+ break
+ result.append(indices[i])
+ if self.right_node.min_left <= point:
+ self.right_node.query(result, point)
+ else:
+ result.extend(self.center_left_indices)
+
+ def __repr__(self):
+ if self.is_leaf_node:
+ return ('<Float32ClosedLeftIntervalNode: '
+ '%s elements (terminal)>' % self.n_elements)
+ else:
+ n_left = self.left_node.n_elements
+ n_right = self.right_node.n_elements
+ n_center = self.n_elements - n_left - n_right
+ return ('<Float32ClosedLeftIntervalNode: '
+ 'pivot %s, %s elements (%s left, %s right, %s '
+ 'overlapping)>' % (self.pivot, self.n_elements,
+ n_left, n_right, n_center))
+
+ def counts(self):
+ """
+ Inspect counts on this node
+ useful for debugging purposes
+ """
+ if self.is_leaf_node:
+ return self.n_elements
+ else:
+ m = len(self.center_left_values)
+ l = self.left_node.counts()
+ r = self.right_node.counts()
+ return (m, (l, r))
+
+NODE_CLASSES['float32',
+ 'left'] = Float32ClosedLeftIntervalNode
+
+cdef class Float32ClosedRightIntervalNode:
+ """Non-terminal node for an IntervalTree
+
+ Categorizes intervals by those that fall to the left, those that fall to
+ the right, and those that overlap with the pivot.
+ """
+ cdef:
+ Float32ClosedRightIntervalNode left_node, right_node
+ float32_t[:] center_left_values, center_right_values, left, right
+ int64_t[:] center_left_indices, center_right_indices, indices
+ float32_t min_left, max_right
+ readonly float32_t pivot
+ readonly int64_t n_elements, n_center, leaf_size
+ readonly bint is_leaf_node
+
+ def __init__(self,
+ ndarray[float32_t, ndim=1] left,
+ ndarray[float32_t, ndim=1] right,
+ ndarray[int64_t, ndim=1] indices,
+ int64_t leaf_size):
+
+ self.n_elements = len(left)
+ self.leaf_size = leaf_size
+
+        # min_left and max_right are used to speed up queries by skipping
+        # queries on sub-nodes. If this node has size 0, queries are cheap,
+ # so these values don't matter.
+ if left.size > 0:
+ self.min_left = left.min()
+ self.max_right = right.max()
+ else:
+ self.min_left = 0
+ self.max_right = 0
+
+ if self.n_elements <= leaf_size:
+ # make this a terminal (leaf) node
+ self.is_leaf_node = True
+ self.left = left
+ self.right = right
+ self.indices = indices
+ self.n_center = 0
+ else:
+ # calculate a pivot so we can create child nodes
+ self.is_leaf_node = False
+ self.pivot = np.median(left / 2 + right / 2)
+ left_set, right_set, center_set = self.classify_intervals(
+ left, right)
+
+ self.left_node = self.new_child_node(left, right,
+ indices, left_set)
+ self.right_node = self.new_child_node(left, right,
+ indices, right_set)
+
+ self.center_left_values, self.center_left_indices = \
+ sort_values_and_indices(left, indices, center_set)
+ self.center_right_values, self.center_right_indices = \
+ sort_values_and_indices(right, indices, center_set)
+ self.n_center = len(self.center_left_indices)
+
+ @cython.wraparound(False)
+ @cython.boundscheck(False)
+ cdef classify_intervals(self, float32_t[:] left, float32_t[:] right):
+ """Classify the given intervals based upon whether they fall to the
+ left, right, or overlap with this node's pivot.
+ """
+ cdef:
+ Int64Vector left_ind, right_ind, overlapping_ind
+ Py_ssize_t i
+
+ left_ind = Int64Vector()
+ right_ind = Int64Vector()
+ overlapping_ind = Int64Vector()
+
+ for i in range(self.n_elements):
+ if right[i] < self.pivot:
+ left_ind.append(i)
+ elif self.pivot <= left[i]:
+ right_ind.append(i)
+ else:
+ overlapping_ind.append(i)
+
+ return (left_ind.to_array(),
+ right_ind.to_array(),
+ overlapping_ind.to_array())
+
+ cdef new_child_node(self,
+ ndarray[float32_t, ndim=1] left,
+ ndarray[float32_t, ndim=1] right,
+ ndarray[int64_t, ndim=1] indices,
+ ndarray[int64_t, ndim=1] subset):
+ """Create a new child node.
+ """
+ left = take(left, subset)
+ right = take(right, subset)
+ indices = take(indices, subset)
+ return Float32ClosedRightIntervalNode(
+ left, right, indices, self.leaf_size)
+
+ @cython.wraparound(False)
+ @cython.boundscheck(False)
+ @cython.initializedcheck(False)
+ cpdef query(self, Int64Vector result, scalar_t point):
+ """Recursively query this node and its sub-nodes for intervals that
+ overlap with the query point.
+ """
+ cdef:
+ int64_t[:] indices
+ float32_t[:] values
+ Py_ssize_t i
+
+ if self.is_leaf_node:
+ # Once we get down to a certain size, it doesn't make sense to
+ # continue the binary tree structure. Instead, we use linear
+ # search.
+ for i in range(self.n_elements):
+ if self.left[i] < point <= self.right[i]:
+ result.append(self.indices[i])
+ else:
+ # There are child nodes. Based on comparing our query to the pivot,
+ # look at the center values, then go to the relevant child.
+ if point < self.pivot:
+ values = self.center_left_values
+ indices = self.center_left_indices
+ for i in range(self.n_center):
+ if not values[i] < point:
+ break
+ result.append(indices[i])
+ if point <= self.left_node.max_right:
+ self.left_node.query(result, point)
+ elif point > self.pivot:
+ values = self.center_right_values
+ indices = self.center_right_indices
+ for i in range(self.n_center - 1, -1, -1):
+ if not point <= values[i]:
+ break
+ result.append(indices[i])
+ if self.right_node.min_left < point:
+ self.right_node.query(result, point)
+ else:
+ result.extend(self.center_left_indices)
+
+ def __repr__(self):
+ if self.is_leaf_node:
+ return ('<Float32ClosedRightIntervalNode: '
+ '%s elements (terminal)>' % self.n_elements)
+ else:
+ n_left = self.left_node.n_elements
+ n_right = self.right_node.n_elements
+ n_center = self.n_elements - n_left - n_right
+ return ('<Float32ClosedRightIntervalNode: '
+ 'pivot %s, %s elements (%s left, %s right, %s '
+ 'overlapping)>' % (self.pivot, self.n_elements,
+ n_left, n_right, n_center))
+
+ def counts(self):
+ """
+ Inspect counts on this node
+ useful for debugging purposes
+ """
+ if self.is_leaf_node:
+ return self.n_elements
+ else:
+ m = len(self.center_left_values)
+ l = self.left_node.counts()
+ r = self.right_node.counts()
+ return (m, (l, r))
+
+NODE_CLASSES['float32',
+ 'right'] = Float32ClosedRightIntervalNode
+
+cdef class Float32ClosedBothIntervalNode:
+ """Non-terminal node for an IntervalTree
+
+ Categorizes intervals by those that fall to the left, those that fall to
+ the right, and those that overlap with the pivot.
+ """
+ cdef:
+ Float32ClosedBothIntervalNode left_node, right_node
+ float32_t[:] center_left_values, center_right_values, left, right
+ int64_t[:] center_left_indices, center_right_indices, indices
+ float32_t min_left, max_right
+ readonly float32_t pivot
+ readonly int64_t n_elements, n_center, leaf_size
+ readonly bint is_leaf_node
+
+ def __init__(self,
+ ndarray[float32_t, ndim=1] left,
+ ndarray[float32_t, ndim=1] right,
+ ndarray[int64_t, ndim=1] indices,
+ int64_t leaf_size):
+
+ self.n_elements = len(left)
+ self.leaf_size = leaf_size
+
+        # min_left and max_right are used to speed up queries by skipping
+        # queries on sub-nodes. If this node has size 0, queries are cheap,
+ # so these values don't matter.
+ if left.size > 0:
+ self.min_left = left.min()
+ self.max_right = right.max()
+ else:
+ self.min_left = 0
+ self.max_right = 0
+
+ if self.n_elements <= leaf_size:
+ # make this a terminal (leaf) node
+ self.is_leaf_node = True
+ self.left = left
+ self.right = right
+ self.indices = indices
+ self.n_center = 0
+ else:
+ # calculate a pivot so we can create child nodes
+ self.is_leaf_node = False
+ self.pivot = np.median(left / 2 + right / 2)
+ left_set, right_set, center_set = self.classify_intervals(
+ left, right)
+
+ self.left_node = self.new_child_node(left, right,
+ indices, left_set)
+ self.right_node = self.new_child_node(left, right,
+ indices, right_set)
+
+ self.center_left_values, self.center_left_indices = \
+ sort_values_and_indices(left, indices, center_set)
+ self.center_right_values, self.center_right_indices = \
+ sort_values_and_indices(right, indices, center_set)
+ self.n_center = len(self.center_left_indices)
+
+ @cython.wraparound(False)
+ @cython.boundscheck(False)
+ cdef classify_intervals(self, float32_t[:] left, float32_t[:] right):
+ """Classify the given intervals based upon whether they fall to the
+ left, right, or overlap with this node's pivot.
+ """
+ cdef:
+ Int64Vector left_ind, right_ind, overlapping_ind
+ Py_ssize_t i
+
+ left_ind = Int64Vector()
+ right_ind = Int64Vector()
+ overlapping_ind = Int64Vector()
+
+ for i in range(self.n_elements):
+ if right[i] < self.pivot:
+ left_ind.append(i)
+ elif self.pivot < left[i]:
+ right_ind.append(i)
+ else:
+ overlapping_ind.append(i)
+
+ return (left_ind.to_array(),
+ right_ind.to_array(),
+ overlapping_ind.to_array())
+
+ cdef new_child_node(self,
+ ndarray[float32_t, ndim=1] left,
+ ndarray[float32_t, ndim=1] right,
+ ndarray[int64_t, ndim=1] indices,
+ ndarray[int64_t, ndim=1] subset):
+ """Create a new child node.
+ """
+ left = take(left, subset)
+ right = take(right, subset)
+ indices = take(indices, subset)
+ return Float32ClosedBothIntervalNode(
+ left, right, indices, self.leaf_size)
+
+ @cython.wraparound(False)
+ @cython.boundscheck(False)
+ @cython.initializedcheck(False)
+ cpdef query(self, Int64Vector result, scalar_t point):
+ """Recursively query this node and its sub-nodes for intervals that
+ overlap with the query point.
+ """
+ cdef:
+ int64_t[:] indices
+ float32_t[:] values
+ Py_ssize_t i
+
+ if self.is_leaf_node:
+ # Once we get down to a certain size, it doesn't make sense to
+ # continue the binary tree structure. Instead, we use linear
+ # search.
+ for i in range(self.n_elements):
+ if self.left[i] <= point <= self.right[i]:
+ result.append(self.indices[i])
+ else:
+ # There are child nodes. Based on comparing our query to the pivot,
+ # look at the center values, then go to the relevant child.
+ if point < self.pivot:
+ values = self.center_left_values
+ indices = self.center_left_indices
+ for i in range(self.n_center):
+ if not values[i] <= point:
+ break
+ result.append(indices[i])
+ if point <= self.left_node.max_right:
+ self.left_node.query(result, point)
+ elif point > self.pivot:
+ values = self.center_right_values
+ indices = self.center_right_indices
+ for i in range(self.n_center - 1, -1, -1):
+ if not point <= values[i]:
+ break
+ result.append(indices[i])
+ if self.right_node.min_left <= point:
+ self.right_node.query(result, point)
+ else:
+ result.extend(self.center_left_indices)
+
+ def __repr__(self):
+ if self.is_leaf_node:
+ return ('<Float32ClosedBothIntervalNode: '
+ '%s elements (terminal)>' % self.n_elements)
+ else:
+ n_left = self.left_node.n_elements
+ n_right = self.right_node.n_elements
+ n_center = self.n_elements - n_left - n_right
+ return ('<Float32ClosedBothIntervalNode: '
+ 'pivot %s, %s elements (%s left, %s right, %s '
+ 'overlapping)>' % (self.pivot, self.n_elements,
+ n_left, n_right, n_center))
+
+ def counts(self):
+ """
+ Inspect counts on this node
+ useful for debugging purposes
+ """
+ if self.is_leaf_node:
+ return self.n_elements
+ else:
+ m = len(self.center_left_values)
+ l = self.left_node.counts()
+ r = self.right_node.counts()
+ return (m, (l, r))
+
+NODE_CLASSES['float32',
+ 'both'] = Float32ClosedBothIntervalNode
+
+cdef class Float32ClosedNeitherIntervalNode:
+ """Non-terminal node for an IntervalTree
+
+ Categorizes intervals by those that fall to the left, those that fall to
+ the right, and those that overlap with the pivot.
+ """
+ cdef:
+ Float32ClosedNeitherIntervalNode left_node, right_node
+ float32_t[:] center_left_values, center_right_values, left, right
+ int64_t[:] center_left_indices, center_right_indices, indices
+ float32_t min_left, max_right
+ readonly float32_t pivot
+ readonly int64_t n_elements, n_center, leaf_size
+ readonly bint is_leaf_node
+
+ def __init__(self,
+ ndarray[float32_t, ndim=1] left,
+ ndarray[float32_t, ndim=1] right,
+ ndarray[int64_t, ndim=1] indices,
+ int64_t leaf_size):
+
+ self.n_elements = len(left)
+ self.leaf_size = leaf_size
+
+        # min_left and max_right are used to speed up queries by skipping
+        # queries on sub-nodes. If this node has size 0, queries are cheap,
+ # so these values don't matter.
+ if left.size > 0:
+ self.min_left = left.min()
+ self.max_right = right.max()
+ else:
+ self.min_left = 0
+ self.max_right = 0
+
+ if self.n_elements <= leaf_size:
+ # make this a terminal (leaf) node
+ self.is_leaf_node = True
+ self.left = left
+ self.right = right
+ self.indices = indices
+ self.n_center = 0
+ else:
+ # calculate a pivot so we can create child nodes
+ self.is_leaf_node = False
+ self.pivot = np.median(left / 2 + right / 2)
+ left_set, right_set, center_set = self.classify_intervals(
+ left, right)
+
+ self.left_node = self.new_child_node(left, right,
+ indices, left_set)
+ self.right_node = self.new_child_node(left, right,
+ indices, right_set)
+
+ self.center_left_values, self.center_left_indices = \
+ sort_values_and_indices(left, indices, center_set)
+ self.center_right_values, self.center_right_indices = \
+ sort_values_and_indices(right, indices, center_set)
+ self.n_center = len(self.center_left_indices)
+
+ @cython.wraparound(False)
+ @cython.boundscheck(False)
+ cdef classify_intervals(self, float32_t[:] left, float32_t[:] right):
+ """Classify the given intervals based upon whether they fall to the
+ left, right, or overlap with this node's pivot.
+ """
+ cdef:
+ Int64Vector left_ind, right_ind, overlapping_ind
+ Py_ssize_t i
+
+ left_ind = Int64Vector()
+ right_ind = Int64Vector()
+ overlapping_ind = Int64Vector()
+
+ for i in range(self.n_elements):
+ if right[i] <= self.pivot:
+ left_ind.append(i)
+ elif self.pivot <= left[i]:
+ right_ind.append(i)
+ else:
+ overlapping_ind.append(i)
+
+ return (left_ind.to_array(),
+ right_ind.to_array(),
+ overlapping_ind.to_array())
+
+ cdef new_child_node(self,
+ ndarray[float32_t, ndim=1] left,
+ ndarray[float32_t, ndim=1] right,
+ ndarray[int64_t, ndim=1] indices,
+ ndarray[int64_t, ndim=1] subset):
+ """Create a new child node.
+ """
+ left = take(left, subset)
+ right = take(right, subset)
+ indices = take(indices, subset)
+ return Float32ClosedNeitherIntervalNode(
+ left, right, indices, self.leaf_size)
+
+ @cython.wraparound(False)
+ @cython.boundscheck(False)
+ @cython.initializedcheck(False)
+ cpdef query(self, Int64Vector result, scalar_t point):
+ """Recursively query this node and its sub-nodes for intervals that
+ overlap with the query point.
+ """
+ cdef:
+ int64_t[:] indices
+ float32_t[:] values
+ Py_ssize_t i
+
+ if self.is_leaf_node:
+ # Once we get down to a certain size, it doesn't make sense to
+ # continue the binary tree structure. Instead, we use linear
+ # search.
+ for i in range(self.n_elements):
+ if self.left[i] < point < self.right[i]:
+ result.append(self.indices[i])
+ else:
+ # There are child nodes. Based on comparing our query to the pivot,
+ # look at the center values, then go to the relevant child.
+ if point < self.pivot:
+ values = self.center_left_values
+ indices = self.center_left_indices
+ for i in range(self.n_center):
+ if not values[i] < point:
+ break
+ result.append(indices[i])
+ if point < self.left_node.max_right:
+ self.left_node.query(result, point)
+ elif point > self.pivot:
+ values = self.center_right_values
+ indices = self.center_right_indices
+ for i in range(self.n_center - 1, -1, -1):
+ if not point < values[i]:
+ break
+ result.append(indices[i])
+ if self.right_node.min_left < point:
+ self.right_node.query(result, point)
+ else:
+ result.extend(self.center_left_indices)
+
+ def __repr__(self):
+ if self.is_leaf_node:
+ return ('<Float32ClosedNeitherIntervalNode: '
+ '%s elements (terminal)>' % self.n_elements)
+ else:
+ n_left = self.left_node.n_elements
+ n_right = self.right_node.n_elements
+ n_center = self.n_elements - n_left - n_right
+ return ('<Float32ClosedNeitherIntervalNode: '
+ 'pivot %s, %s elements (%s left, %s right, %s '
+ 'overlapping)>' % (self.pivot, self.n_elements,
+ n_left, n_right, n_center))
+
+ def counts(self):
+ """
+ Inspect counts on this node
+ useful for debugging purposes
+ """
+ if self.is_leaf_node:
+ return self.n_elements
+ else:
+ m = len(self.center_left_values)
+ l = self.left_node.counts()
+ r = self.right_node.counts()
+ return (m, (l, r))
+
+NODE_CLASSES['float32',
+ 'neither'] = Float32ClosedNeitherIntervalNode
+
+cdef class Float64ClosedLeftIntervalNode:
+ """Non-terminal node for an IntervalTree
+
+ Categorizes intervals by those that fall to the left, those that fall to
+ the right, and those that overlap with the pivot.
+ """
+ cdef:
+ Float64ClosedLeftIntervalNode left_node, right_node
+ float64_t[:] center_left_values, center_right_values, left, right
+ int64_t[:] center_left_indices, center_right_indices, indices
+ float64_t min_left, max_right
+ readonly float64_t pivot
+ readonly int64_t n_elements, n_center, leaf_size
+ readonly bint is_leaf_node
+
+ def __init__(self,
+ ndarray[float64_t, ndim=1] left,
+ ndarray[float64_t, ndim=1] right,
+ ndarray[int64_t, ndim=1] indices,
+ int64_t leaf_size):
+
+ self.n_elements = len(left)
+ self.leaf_size = leaf_size
+
+        # min_left and max_right are used to speed up queries by skipping
+        # queries on sub-nodes. If this node has size 0, queries are cheap,
+ # so these values don't matter.
+ if left.size > 0:
+ self.min_left = left.min()
+ self.max_right = right.max()
+ else:
+ self.min_left = 0
+ self.max_right = 0
+
+ if self.n_elements <= leaf_size:
+ # make this a terminal (leaf) node
+ self.is_leaf_node = True
+ self.left = left
+ self.right = right
+ self.indices = indices
+ self.n_center = 0
+ else:
+ # calculate a pivot so we can create child nodes
+ self.is_leaf_node = False
+ self.pivot = np.median(left / 2 + right / 2)
+ left_set, right_set, center_set = self.classify_intervals(
+ left, right)
+
+ self.left_node = self.new_child_node(left, right,
+ indices, left_set)
+ self.right_node = self.new_child_node(left, right,
+ indices, right_set)
+
+ self.center_left_values, self.center_left_indices = \
+ sort_values_and_indices(left, indices, center_set)
+ self.center_right_values, self.center_right_indices = \
+ sort_values_and_indices(right, indices, center_set)
+ self.n_center = len(self.center_left_indices)
+
+ @cython.wraparound(False)
+ @cython.boundscheck(False)
+ cdef classify_intervals(self, float64_t[:] left, float64_t[:] right):
+ """Classify the given intervals based upon whether they fall to the
+ left, right, or overlap with this node's pivot.
+ """
+ cdef:
+ Int64Vector left_ind, right_ind, overlapping_ind
+ Py_ssize_t i
+
+ left_ind = Int64Vector()
+ right_ind = Int64Vector()
+ overlapping_ind = Int64Vector()
+
+ for i in range(self.n_elements):
+ if right[i] <= self.pivot:
+ left_ind.append(i)
+ elif self.pivot < left[i]:
+ right_ind.append(i)
+ else:
+ overlapping_ind.append(i)
+
+ return (left_ind.to_array(),
+ right_ind.to_array(),
+ overlapping_ind.to_array())
+
+ cdef new_child_node(self,
+ ndarray[float64_t, ndim=1] left,
+ ndarray[float64_t, ndim=1] right,
+ ndarray[int64_t, ndim=1] indices,
+ ndarray[int64_t, ndim=1] subset):
+ """Create a new child node.
+ """
+ left = take(left, subset)
+ right = take(right, subset)
+ indices = take(indices, subset)
+ return Float64ClosedLeftIntervalNode(
+ left, right, indices, self.leaf_size)
+
+ @cython.wraparound(False)
+ @cython.boundscheck(False)
+ @cython.initializedcheck(False)
+ cpdef query(self, Int64Vector result, scalar_t point):
+ """Recursively query this node and its sub-nodes for intervals that
+ overlap with the query point.
+ """
+ cdef:
+ int64_t[:] indices
+ float64_t[:] values
+ Py_ssize_t i
+
+ if self.is_leaf_node:
+ # Once we get down to a certain size, it doesn't make sense to
+ # continue the binary tree structure. Instead, we use linear
+ # search.
+ for i in range(self.n_elements):
+ if self.left[i] <= point < self.right[i]:
+ result.append(self.indices[i])
+ else:
+ # There are child nodes. Based on comparing our query to the pivot,
+ # look at the center values, then go to the relevant child.
+ if point < self.pivot:
+ values = self.center_left_values
+ indices = self.center_left_indices
+ for i in range(self.n_center):
+ if not values[i] <= point:
+ break
+ result.append(indices[i])
+ if point < self.left_node.max_right:
+ self.left_node.query(result, point)
+ elif point > self.pivot:
+ values = self.center_right_values
+ indices = self.center_right_indices
+ for i in range(self.n_center - 1, -1, -1):
+ if not point < values[i]:
+ break
+ result.append(indices[i])
+ if self.right_node.min_left <= point:
+ self.right_node.query(result, point)
+ else:
+ result.extend(self.center_left_indices)
+
+ def __repr__(self):
+ if self.is_leaf_node:
+ return ('<Float64ClosedLeftIntervalNode: '
+ '%s elements (terminal)>' % self.n_elements)
+ else:
+ n_left = self.left_node.n_elements
+ n_right = self.right_node.n_elements
+ n_center = self.n_elements - n_left - n_right
+ return ('<Float64ClosedLeftIntervalNode: '
+ 'pivot %s, %s elements (%s left, %s right, %s '
+ 'overlapping)>' % (self.pivot, self.n_elements,
+ n_left, n_right, n_center))
+
+ def counts(self):
+ """
+ Inspect counts on this node
+ useful for debugging purposes
+ """
+ if self.is_leaf_node:
+ return self.n_elements
+ else:
+ m = len(self.center_left_values)
+ l = self.left_node.counts()
+ r = self.right_node.counts()
+ return (m, (l, r))
+
+NODE_CLASSES['float64',
+ 'left'] = Float64ClosedLeftIntervalNode
+
+cdef class Float64ClosedRightIntervalNode:
+ """Non-terminal node for an IntervalTree
+
+ Categorizes intervals by those that fall to the left, those that fall to
+ the right, and those that overlap with the pivot.
+ """
+ cdef:
+ Float64ClosedRightIntervalNode left_node, right_node
+ float64_t[:] center_left_values, center_right_values, left, right
+ int64_t[:] center_left_indices, center_right_indices, indices
+ float64_t min_left, max_right
+ readonly float64_t pivot
+ readonly int64_t n_elements, n_center, leaf_size
+ readonly bint is_leaf_node
+
+ def __init__(self,
+ ndarray[float64_t, ndim=1] left,
+ ndarray[float64_t, ndim=1] right,
+ ndarray[int64_t, ndim=1] indices,
+ int64_t leaf_size):
+
+ self.n_elements = len(left)
+ self.leaf_size = leaf_size
+
+        # min_left and max_right are used to speed up queries by skipping
+        # queries on sub-nodes. If this node has size 0, queries are cheap,
+ # so these values don't matter.
+ if left.size > 0:
+ self.min_left = left.min()
+ self.max_right = right.max()
+ else:
+ self.min_left = 0
+ self.max_right = 0
+
+ if self.n_elements <= leaf_size:
+ # make this a terminal (leaf) node
+ self.is_leaf_node = True
+ self.left = left
+ self.right = right
+ self.indices = indices
+ self.n_center = 0
+ else:
+ # calculate a pivot so we can create child nodes
+ self.is_leaf_node = False
+ self.pivot = np.median(left / 2 + right / 2)
+ left_set, right_set, center_set = self.classify_intervals(
+ left, right)
+
+ self.left_node = self.new_child_node(left, right,
+ indices, left_set)
+ self.right_node = self.new_child_node(left, right,
+ indices, right_set)
+
+ self.center_left_values, self.center_left_indices = \
+ sort_values_and_indices(left, indices, center_set)
+ self.center_right_values, self.center_right_indices = \
+ sort_values_and_indices(right, indices, center_set)
+ self.n_center = len(self.center_left_indices)
+
+ @cython.wraparound(False)
+ @cython.boundscheck(False)
+ cdef classify_intervals(self, float64_t[:] left, float64_t[:] right):
+ """Classify the given intervals based upon whether they fall to the
+ left, right, or overlap with this node's pivot.
+ """
+ cdef:
+ Int64Vector left_ind, right_ind, overlapping_ind
+ Py_ssize_t i
+
+ left_ind = Int64Vector()
+ right_ind = Int64Vector()
+ overlapping_ind = Int64Vector()
+
+ for i in range(self.n_elements):
+ if right[i] < self.pivot:
+ left_ind.append(i)
+ elif self.pivot <= left[i]:
+ right_ind.append(i)
+ else:
+ overlapping_ind.append(i)
+
+ return (left_ind.to_array(),
+ right_ind.to_array(),
+ overlapping_ind.to_array())
+
+ cdef new_child_node(self,
+ ndarray[float64_t, ndim=1] left,
+ ndarray[float64_t, ndim=1] right,
+ ndarray[int64_t, ndim=1] indices,
+ ndarray[int64_t, ndim=1] subset):
+ """Create a new child node.
+ """
+ left = take(left, subset)
+ right = take(right, subset)
+ indices = take(indices, subset)
+ return Float64ClosedRightIntervalNode(
+ left, right, indices, self.leaf_size)
+
+ @cython.wraparound(False)
+ @cython.boundscheck(False)
+ @cython.initializedcheck(False)
+ cpdef query(self, Int64Vector result, scalar_t point):
+ """Recursively query this node and its sub-nodes for intervals that
+ overlap with the query point.
+ """
+ cdef:
+ int64_t[:] indices
+ float64_t[:] values
+ Py_ssize_t i
+
+ if self.is_leaf_node:
+ # Once we get down to a certain size, it doesn't make sense to
+ # continue the binary tree structure. Instead, we use linear
+ # search.
+ for i in range(self.n_elements):
+ if self.left[i] < point <= self.right[i]:
+ result.append(self.indices[i])
+ else:
+ # There are child nodes. Based on comparing our query to the pivot,
+ # look at the center values, then go to the relevant child.
+ if point < self.pivot:
+ values = self.center_left_values
+ indices = self.center_left_indices
+ for i in range(self.n_center):
+ if not values[i] < point:
+ break
+ result.append(indices[i])
+ if point <= self.left_node.max_right:
+ self.left_node.query(result, point)
+ elif point > self.pivot:
+ values = self.center_right_values
+ indices = self.center_right_indices
+ for i in range(self.n_center - 1, -1, -1):
+ if not point <= values[i]:
+ break
+ result.append(indices[i])
+ if self.right_node.min_left < point:
+ self.right_node.query(result, point)
+ else:
+ result.extend(self.center_left_indices)
+
+ def __repr__(self):
+ if self.is_leaf_node:
+ return ('<Float64ClosedRightIntervalNode: '
+ '%s elements (terminal)>' % self.n_elements)
+ else:
+ n_left = self.left_node.n_elements
+ n_right = self.right_node.n_elements
+ n_center = self.n_elements - n_left - n_right
+ return ('<Float64ClosedRightIntervalNode: '
+ 'pivot %s, %s elements (%s left, %s right, %s '
+ 'overlapping)>' % (self.pivot, self.n_elements,
+ n_left, n_right, n_center))
+
+ def counts(self):
+ """
+ Inspect counts on this node
+ useful for debugging purposes
+ """
+ if self.is_leaf_node:
+ return self.n_elements
+ else:
+ m = len(self.center_left_values)
+ l = self.left_node.counts()
+ r = self.right_node.counts()
+ return (m, (l, r))
+
+NODE_CLASSES['float64',
+ 'right'] = Float64ClosedRightIntervalNode
+
+cdef class Float64ClosedBothIntervalNode:
+ """Non-terminal node for an IntervalTree
+
+ Categorizes intervals by those that fall to the left, those that fall to
+ the right, and those that overlap with the pivot.
+ """
+ cdef:
+ Float64ClosedBothIntervalNode left_node, right_node
+ float64_t[:] center_left_values, center_right_values, left, right
+ int64_t[:] center_left_indices, center_right_indices, indices
+ float64_t min_left, max_right
+ readonly float64_t pivot
+ readonly int64_t n_elements, n_center, leaf_size
+ readonly bint is_leaf_node
+
+ def __init__(self,
+ ndarray[float64_t, ndim=1] left,
+ ndarray[float64_t, ndim=1] right,
+ ndarray[int64_t, ndim=1] indices,
+ int64_t leaf_size):
+
+ self.n_elements = len(left)
+ self.leaf_size = leaf_size
+
+        # min_left and max_right are used to speed up queries by skipping
+        # queries on sub-nodes. If this node has size 0, queries are cheap,
+ # so these values don't matter.
+ if left.size > 0:
+ self.min_left = left.min()
+ self.max_right = right.max()
+ else:
+ self.min_left = 0
+ self.max_right = 0
+
+ if self.n_elements <= leaf_size:
+ # make this a terminal (leaf) node
+ self.is_leaf_node = True
+ self.left = left
+ self.right = right
+ self.indices = indices
+ self.n_center = 0
+ else:
+ # calculate a pivot so we can create child nodes
+ self.is_leaf_node = False
+ self.pivot = np.median(left / 2 + right / 2)
+ left_set, right_set, center_set = self.classify_intervals(
+ left, right)
+
+ self.left_node = self.new_child_node(left, right,
+ indices, left_set)
+ self.right_node = self.new_child_node(left, right,
+ indices, right_set)
+
+ self.center_left_values, self.center_left_indices = \
+ sort_values_and_indices(left, indices, center_set)
+ self.center_right_values, self.center_right_indices = \
+ sort_values_and_indices(right, indices, center_set)
+ self.n_center = len(self.center_left_indices)
+
+ @cython.wraparound(False)
+ @cython.boundscheck(False)
+ cdef classify_intervals(self, float64_t[:] left, float64_t[:] right):
+ """Classify the given intervals based upon whether they fall to the
+ left, right, or overlap with this node's pivot.
+ """
+ cdef:
+ Int64Vector left_ind, right_ind, overlapping_ind
+ Py_ssize_t i
+
+ left_ind = Int64Vector()
+ right_ind = Int64Vector()
+ overlapping_ind = Int64Vector()
+
+ for i in range(self.n_elements):
+ if right[i] < self.pivot:
+ left_ind.append(i)
+ elif self.pivot < left[i]:
+ right_ind.append(i)
+ else:
+ overlapping_ind.append(i)
+
+ return (left_ind.to_array(),
+ right_ind.to_array(),
+ overlapping_ind.to_array())
+
+ cdef new_child_node(self,
+ ndarray[float64_t, ndim=1] left,
+ ndarray[float64_t, ndim=1] right,
+ ndarray[int64_t, ndim=1] indices,
+ ndarray[int64_t, ndim=1] subset):
+ """Create a new child node.
+ """
+ left = take(left, subset)
+ right = take(right, subset)
+ indices = take(indices, subset)
+ return Float64ClosedBothIntervalNode(
+ left, right, indices, self.leaf_size)
+
+ @cython.wraparound(False)
+ @cython.boundscheck(False)
+ @cython.initializedcheck(False)
+ cpdef query(self, Int64Vector result, scalar_t point):
+ """Recursively query this node and its sub-nodes for intervals that
+ overlap with the query point.
+ """
+ cdef:
+ int64_t[:] indices
+ float64_t[:] values
+ Py_ssize_t i
+
+ if self.is_leaf_node:
+ # Once we get down to a certain size, it doesn't make sense to
+ # continue the binary tree structure. Instead, we use linear
+ # search.
+ for i in range(self.n_elements):
+ if self.left[i] <= point <= self.right[i]:
+ result.append(self.indices[i])
+ else:
+ # There are child nodes. Based on comparing our query to the pivot,
+ # look at the center values, then go to the relevant child.
+ if point < self.pivot:
+ values = self.center_left_values
+ indices = self.center_left_indices
+ for i in range(self.n_center):
+ if not values[i] <= point:
+ break
+ result.append(indices[i])
+ if point <= self.left_node.max_right:
+ self.left_node.query(result, point)
+ elif point > self.pivot:
+ values = self.center_right_values
+ indices = self.center_right_indices
+ for i in range(self.n_center - 1, -1, -1):
+ if not point <= values[i]:
+ break
+ result.append(indices[i])
+ if self.right_node.min_left <= point:
+ self.right_node.query(result, point)
+ else:
+ result.extend(self.center_left_indices)
+
+ def __repr__(self):
+ if self.is_leaf_node:
+ return ('<Float64ClosedBothIntervalNode: '
+ '%s elements (terminal)>' % self.n_elements)
+ else:
+ n_left = self.left_node.n_elements
+ n_right = self.right_node.n_elements
+ n_center = self.n_elements - n_left - n_right
+ return ('<Float64ClosedBothIntervalNode: '
+ 'pivot %s, %s elements (%s left, %s right, %s '
+ 'overlapping)>' % (self.pivot, self.n_elements,
+ n_left, n_right, n_center))
+
+ def counts(self):
+ """
+ Inspect counts on this node
+ useful for debugging purposes
+ """
+ if self.is_leaf_node:
+ return self.n_elements
+ else:
+ m = len(self.center_left_values)
+ l = self.left_node.counts()
+ r = self.right_node.counts()
+ return (m, (l, r))
+
+NODE_CLASSES['float64',
+ 'both'] = Float64ClosedBothIntervalNode
+
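+# For reference, the closed side only changes the comparison operators used
+# by the leaf-node scan in query() and by classify_intervals(); the generated
+# variants repeat the same pattern for every dtype:
+#
+#     closed      leaf containment test           classify_intervals split
+#     'left'      left[i] <= point <  right[i]    right <= pivot | pivot <  left
+#     'right'     left[i] <  point <= right[i]    right <  pivot | pivot <= left
+#     'both'      left[i] <= point <= right[i]    right <  pivot | pivot <  left
+#     'neither'   left[i] <  point <  right[i]    right <= pivot | pivot <= left
+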
+cdef class Float64ClosedNeitherIntervalNode:
+ """Non-terminal node for an IntervalTree
+
+ Categorizes intervals by those that fall to the left, those that fall to
+ the right, and those that overlap with the pivot.
+ """
+ cdef:
+ Float64ClosedNeitherIntervalNode left_node, right_node
+ float64_t[:] center_left_values, center_right_values, left, right
+ int64_t[:] center_left_indices, center_right_indices, indices
+ float64_t min_left, max_right
+ readonly float64_t pivot
+ readonly int64_t n_elements, n_center, leaf_size
+ readonly bint is_leaf_node
+
+ def __init__(self,
+ ndarray[float64_t, ndim=1] left,
+ ndarray[float64_t, ndim=1] right,
+ ndarray[int64_t, ndim=1] indices,
+ int64_t leaf_size):
+
+ self.n_elements = len(left)
+ self.leaf_size = leaf_size
+
+        # min_left and max_right are used to speed up queries by skipping
+        # queries on sub-nodes. If this node has size 0, the query is cheap,
+        # so these values don't matter.
+ if left.size > 0:
+ self.min_left = left.min()
+ self.max_right = right.max()
+ else:
+ self.min_left = 0
+ self.max_right = 0
+
+ if self.n_elements <= leaf_size:
+ # make this a terminal (leaf) node
+ self.is_leaf_node = True
+ self.left = left
+ self.right = right
+ self.indices = indices
+ self.n_center = 0
+ else:
+ # calculate a pivot so we can create child nodes
+ self.is_leaf_node = False
+ self.pivot = np.median(left / 2 + right / 2)
+ left_set, right_set, center_set = self.classify_intervals(
+ left, right)
+
+ self.left_node = self.new_child_node(left, right,
+ indices, left_set)
+ self.right_node = self.new_child_node(left, right,
+ indices, right_set)
+
+ self.center_left_values, self.center_left_indices = \
+ sort_values_and_indices(left, indices, center_set)
+ self.center_right_values, self.center_right_indices = \
+ sort_values_and_indices(right, indices, center_set)
+ self.n_center = len(self.center_left_indices)
+
+ @cython.wraparound(False)
+ @cython.boundscheck(False)
+ cdef classify_intervals(self, float64_t[:] left, float64_t[:] right):
+ """Classify the given intervals based upon whether they fall to the
+ left, right, or overlap with this node's pivot.
+ """
+ cdef:
+ Int64Vector left_ind, right_ind, overlapping_ind
+ Py_ssize_t i
+
+ left_ind = Int64Vector()
+ right_ind = Int64Vector()
+ overlapping_ind = Int64Vector()
+
+ for i in range(self.n_elements):
+ if right[i] <= self.pivot:
+ left_ind.append(i)
+ elif self.pivot <= left[i]:
+ right_ind.append(i)
+ else:
+ overlapping_ind.append(i)
+
+ return (left_ind.to_array(),
+ right_ind.to_array(),
+ overlapping_ind.to_array())
+
+ cdef new_child_node(self,
+ ndarray[float64_t, ndim=1] left,
+ ndarray[float64_t, ndim=1] right,
+ ndarray[int64_t, ndim=1] indices,
+ ndarray[int64_t, ndim=1] subset):
+ """Create a new child node.
+ """
+ left = take(left, subset)
+ right = take(right, subset)
+ indices = take(indices, subset)
+ return Float64ClosedNeitherIntervalNode(
+ left, right, indices, self.leaf_size)
+
+ @cython.wraparound(False)
+ @cython.boundscheck(False)
+ @cython.initializedcheck(False)
+ cpdef query(self, Int64Vector result, scalar_t point):
+ """Recursively query this node and its sub-nodes for intervals that
+ overlap with the query point.
+ """
+ cdef:
+ int64_t[:] indices
+ float64_t[:] values
+ Py_ssize_t i
+
+ if self.is_leaf_node:
+ # Once we get down to a certain size, it doesn't make sense to
+ # continue the binary tree structure. Instead, we use linear
+ # search.
+ for i in range(self.n_elements):
+ if self.left[i] < point < self.right[i]:
+ result.append(self.indices[i])
+ else:
+ # There are child nodes. Based on comparing our query to the pivot,
+ # look at the center values, then go to the relevant child.
+ if point < self.pivot:
+ values = self.center_left_values
+ indices = self.center_left_indices
+ for i in range(self.n_center):
+ if not values[i] < point:
+ break
+ result.append(indices[i])
+ if point < self.left_node.max_right:
+ self.left_node.query(result, point)
+ elif point > self.pivot:
+ values = self.center_right_values
+ indices = self.center_right_indices
+ for i in range(self.n_center - 1, -1, -1):
+ if not point < values[i]:
+ break
+ result.append(indices[i])
+ if self.right_node.min_left < point:
+ self.right_node.query(result, point)
+ else:
+ result.extend(self.center_left_indices)
+
+ def __repr__(self):
+ if self.is_leaf_node:
+ return ('<Float64ClosedNeitherIntervalNode: '
+ '%s elements (terminal)>' % self.n_elements)
+ else:
+ n_left = self.left_node.n_elements
+ n_right = self.right_node.n_elements
+ n_center = self.n_elements - n_left - n_right
+ return ('<Float64ClosedNeitherIntervalNode: '
+ 'pivot %s, %s elements (%s left, %s right, %s '
+ 'overlapping)>' % (self.pivot, self.n_elements,
+ n_left, n_right, n_center))
+
+ def counts(self):
+ """
+ Inspect counts on this node
+ useful for debugging purposes
+ """
+ if self.is_leaf_node:
+ return self.n_elements
+ else:
+ m = len(self.center_left_values)
+ l = self.left_node.counts()
+ r = self.right_node.counts()
+ return (m, (l, r))
+
+NODE_CLASSES['float64',
+ 'neither'] = Float64ClosedNeitherIntervalNode
+
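+# Worked example for classify_intervals (hypothetical data): for a
+# closed='right' node with pivot 2.0 and intervals (0, 1], (1, 3], (3, 5],
+#   (0, 1]: right 1 < pivot          -> left_ind
+#   (1, 3]: spans the pivot          -> overlapping_ind (kept on this node)
+#   (3, 5]: pivot <= left 3          -> right_ind
+# The overlapping set is then stored twice via sort_values_and_indices,
+# ordered by left endpoint (center_left_values) and by right endpoint
+# (center_right_values), which is what lets query() stop scanning as soon
+# as a center interval can no longer contain the point.
+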
+cdef class Int32ClosedLeftIntervalNode:
+ """Non-terminal node for an IntervalTree
+
+ Categorizes intervals by those that fall to the left, those that fall to
+ the right, and those that overlap with the pivot.
+ """
+ cdef:
+ Int32ClosedLeftIntervalNode left_node, right_node
+ int32_t[:] center_left_values, center_right_values, left, right
+ int64_t[:] center_left_indices, center_right_indices, indices
+ int32_t min_left, max_right
+ readonly int32_t pivot
+ readonly int64_t n_elements, n_center, leaf_size
+ readonly bint is_leaf_node
+
+ def __init__(self,
+ ndarray[int32_t, ndim=1] left,
+ ndarray[int32_t, ndim=1] right,
+ ndarray[int64_t, ndim=1] indices,
+ int64_t leaf_size):
+
+ self.n_elements = len(left)
+ self.leaf_size = leaf_size
+
+        # min_left and max_right are used to speed up queries by skipping
+        # queries on sub-nodes. If this node has size 0, the query is cheap,
+        # so these values don't matter.
+ if left.size > 0:
+ self.min_left = left.min()
+ self.max_right = right.max()
+ else:
+ self.min_left = 0
+ self.max_right = 0
+
+ if self.n_elements <= leaf_size:
+ # make this a terminal (leaf) node
+ self.is_leaf_node = True
+ self.left = left
+ self.right = right
+ self.indices = indices
+ self.n_center = 0
+ else:
+ # calculate a pivot so we can create child nodes
+ self.is_leaf_node = False
+ self.pivot = np.median(left / 2 + right / 2)
+ left_set, right_set, center_set = self.classify_intervals(
+ left, right)
+
+ self.left_node = self.new_child_node(left, right,
+ indices, left_set)
+ self.right_node = self.new_child_node(left, right,
+ indices, right_set)
+
+ self.center_left_values, self.center_left_indices = \
+ sort_values_and_indices(left, indices, center_set)
+ self.center_right_values, self.center_right_indices = \
+ sort_values_and_indices(right, indices, center_set)
+ self.n_center = len(self.center_left_indices)
+
+ @cython.wraparound(False)
+ @cython.boundscheck(False)
+ cdef classify_intervals(self, int32_t[:] left, int32_t[:] right):
+ """Classify the given intervals based upon whether they fall to the
+ left, right, or overlap with this node's pivot.
+ """
+ cdef:
+ Int64Vector left_ind, right_ind, overlapping_ind
+ Py_ssize_t i
+
+ left_ind = Int64Vector()
+ right_ind = Int64Vector()
+ overlapping_ind = Int64Vector()
+
+ for i in range(self.n_elements):
+ if right[i] <= self.pivot:
+ left_ind.append(i)
+ elif self.pivot < left[i]:
+ right_ind.append(i)
+ else:
+ overlapping_ind.append(i)
+
+ return (left_ind.to_array(),
+ right_ind.to_array(),
+ overlapping_ind.to_array())
+
+ cdef new_child_node(self,
+ ndarray[int32_t, ndim=1] left,
+ ndarray[int32_t, ndim=1] right,
+ ndarray[int64_t, ndim=1] indices,
+ ndarray[int64_t, ndim=1] subset):
+ """Create a new child node.
+ """
+ left = take(left, subset)
+ right = take(right, subset)
+ indices = take(indices, subset)
+ return Int32ClosedLeftIntervalNode(
+ left, right, indices, self.leaf_size)
+
+ @cython.wraparound(False)
+ @cython.boundscheck(False)
+ @cython.initializedcheck(False)
+ cpdef query(self, Int64Vector result, scalar_t point):
+ """Recursively query this node and its sub-nodes for intervals that
+ overlap with the query point.
+ """
+ cdef:
+ int64_t[:] indices
+ int32_t[:] values
+ Py_ssize_t i
+
+ if self.is_leaf_node:
+ # Once we get down to a certain size, it doesn't make sense to
+ # continue the binary tree structure. Instead, we use linear
+ # search.
+ for i in range(self.n_elements):
+ if self.left[i] <= point < self.right[i]:
+ result.append(self.indices[i])
+ else:
+ # There are child nodes. Based on comparing our query to the pivot,
+ # look at the center values, then go to the relevant child.
+ if point < self.pivot:
+ values = self.center_left_values
+ indices = self.center_left_indices
+ for i in range(self.n_center):
+ if not values[i] <= point:
+ break
+ result.append(indices[i])
+ if point < self.left_node.max_right:
+ self.left_node.query(result, point)
+ elif point > self.pivot:
+ values = self.center_right_values
+ indices = self.center_right_indices
+ for i in range(self.n_center - 1, -1, -1):
+ if not point < values[i]:
+ break
+ result.append(indices[i])
+ if self.right_node.min_left <= point:
+ self.right_node.query(result, point)
+ else:
+ result.extend(self.center_left_indices)
+
+ def __repr__(self):
+ if self.is_leaf_node:
+ return ('<Int32ClosedLeftIntervalNode: '
+ '%s elements (terminal)>' % self.n_elements)
+ else:
+ n_left = self.left_node.n_elements
+ n_right = self.right_node.n_elements
+ n_center = self.n_elements - n_left - n_right
+ return ('<Int32ClosedLeftIntervalNode: '
+ 'pivot %s, %s elements (%s left, %s right, %s '
+ 'overlapping)>' % (self.pivot, self.n_elements,
+ n_left, n_right, n_center))
+
+ def counts(self):
+ """
+ Inspect counts on this node
+ useful for debugging purposes
+ """
+ if self.is_leaf_node:
+ return self.n_elements
+ else:
+ m = len(self.center_left_values)
+ l = self.left_node.counts()
+ r = self.right_node.counts()
+ return (m, (l, r))
+
+NODE_CLASSES['int32',
+ 'left'] = Int32ClosedLeftIntervalNode
+
+cdef class Int32ClosedRightIntervalNode:
+ """Non-terminal node for an IntervalTree
+
+ Categorizes intervals by those that fall to the left, those that fall to
+ the right, and those that overlap with the pivot.
+ """
+ cdef:
+ Int32ClosedRightIntervalNode left_node, right_node
+ int32_t[:] center_left_values, center_right_values, left, right
+ int64_t[:] center_left_indices, center_right_indices, indices
+ int32_t min_left, max_right
+ readonly int32_t pivot
+ readonly int64_t n_elements, n_center, leaf_size
+ readonly bint is_leaf_node
+
+ def __init__(self,
+ ndarray[int32_t, ndim=1] left,
+ ndarray[int32_t, ndim=1] right,
+ ndarray[int64_t, ndim=1] indices,
+ int64_t leaf_size):
+
+ self.n_elements = len(left)
+ self.leaf_size = leaf_size
+
+        # min_left and max_right are used to speed up queries by skipping
+        # queries on sub-nodes. If this node has size 0, the query is cheap,
+        # so these values don't matter.
+ if left.size > 0:
+ self.min_left = left.min()
+ self.max_right = right.max()
+ else:
+ self.min_left = 0
+ self.max_right = 0
+
+ if self.n_elements <= leaf_size:
+ # make this a terminal (leaf) node
+ self.is_leaf_node = True
+ self.left = left
+ self.right = right
+ self.indices = indices
+ self.n_center = 0
+ else:
+ # calculate a pivot so we can create child nodes
+ self.is_leaf_node = False
+ self.pivot = np.median(left / 2 + right / 2)
+ left_set, right_set, center_set = self.classify_intervals(
+ left, right)
+
+ self.left_node = self.new_child_node(left, right,
+ indices, left_set)
+ self.right_node = self.new_child_node(left, right,
+ indices, right_set)
+
+ self.center_left_values, self.center_left_indices = \
+ sort_values_and_indices(left, indices, center_set)
+ self.center_right_values, self.center_right_indices = \
+ sort_values_and_indices(right, indices, center_set)
+ self.n_center = len(self.center_left_indices)
+
+ @cython.wraparound(False)
+ @cython.boundscheck(False)
+ cdef classify_intervals(self, int32_t[:] left, int32_t[:] right):
+ """Classify the given intervals based upon whether they fall to the
+ left, right, or overlap with this node's pivot.
+ """
+ cdef:
+ Int64Vector left_ind, right_ind, overlapping_ind
+ Py_ssize_t i
+
+ left_ind = Int64Vector()
+ right_ind = Int64Vector()
+ overlapping_ind = Int64Vector()
+
+ for i in range(self.n_elements):
+ if right[i] < self.pivot:
+ left_ind.append(i)
+ elif self.pivot <= left[i]:
+ right_ind.append(i)
+ else:
+ overlapping_ind.append(i)
+
+ return (left_ind.to_array(),
+ right_ind.to_array(),
+ overlapping_ind.to_array())
+
+ cdef new_child_node(self,
+ ndarray[int32_t, ndim=1] left,
+ ndarray[int32_t, ndim=1] right,
+ ndarray[int64_t, ndim=1] indices,
+ ndarray[int64_t, ndim=1] subset):
+ """Create a new child node.
+ """
+ left = take(left, subset)
+ right = take(right, subset)
+ indices = take(indices, subset)
+ return Int32ClosedRightIntervalNode(
+ left, right, indices, self.leaf_size)
+
+ @cython.wraparound(False)
+ @cython.boundscheck(False)
+ @cython.initializedcheck(False)
+ cpdef query(self, Int64Vector result, scalar_t point):
+ """Recursively query this node and its sub-nodes for intervals that
+ overlap with the query point.
+ """
+ cdef:
+ int64_t[:] indices
+ int32_t[:] values
+ Py_ssize_t i
+
+ if self.is_leaf_node:
+ # Once we get down to a certain size, it doesn't make sense to
+ # continue the binary tree structure. Instead, we use linear
+ # search.
+ for i in range(self.n_elements):
+ if self.left[i] < point <= self.right[i]:
+ result.append(self.indices[i])
+ else:
+ # There are child nodes. Based on comparing our query to the pivot,
+ # look at the center values, then go to the relevant child.
+ if point < self.pivot:
+ values = self.center_left_values
+ indices = self.center_left_indices
+ for i in range(self.n_center):
+ if not values[i] < point:
+ break
+ result.append(indices[i])
+ if point <= self.left_node.max_right:
+ self.left_node.query(result, point)
+ elif point > self.pivot:
+ values = self.center_right_values
+ indices = self.center_right_indices
+ for i in range(self.n_center - 1, -1, -1):
+ if not point <= values[i]:
+ break
+ result.append(indices[i])
+ if self.right_node.min_left < point:
+ self.right_node.query(result, point)
+ else:
+ result.extend(self.center_left_indices)
+
+ def __repr__(self):
+ if self.is_leaf_node:
+ return ('<Int32ClosedRightIntervalNode: '
+ '%s elements (terminal)>' % self.n_elements)
+ else:
+ n_left = self.left_node.n_elements
+ n_right = self.right_node.n_elements
+ n_center = self.n_elements - n_left - n_right
+ return ('<Int32ClosedRightIntervalNode: '
+ 'pivot %s, %s elements (%s left, %s right, %s '
+ 'overlapping)>' % (self.pivot, self.n_elements,
+ n_left, n_right, n_center))
+
+ def counts(self):
+ """
+ Inspect counts on this node
+ useful for debugging purposes
+ """
+ if self.is_leaf_node:
+ return self.n_elements
+ else:
+ m = len(self.center_left_values)
+ l = self.left_node.counts()
+ r = self.right_node.counts()
+ return (m, (l, r))
+
+NODE_CLASSES['int32',
+ 'right'] = Int32ClosedRightIntervalNode
+
+cdef class Int32ClosedBothIntervalNode:
+ """Non-terminal node for an IntervalTree
+
+ Categorizes intervals by those that fall to the left, those that fall to
+ the right, and those that overlap with the pivot.
+ """
+ cdef:
+ Int32ClosedBothIntervalNode left_node, right_node
+ int32_t[:] center_left_values, center_right_values, left, right
+ int64_t[:] center_left_indices, center_right_indices, indices
+ int32_t min_left, max_right
+ readonly int32_t pivot
+ readonly int64_t n_elements, n_center, leaf_size
+ readonly bint is_leaf_node
+
+ def __init__(self,
+ ndarray[int32_t, ndim=1] left,
+ ndarray[int32_t, ndim=1] right,
+ ndarray[int64_t, ndim=1] indices,
+ int64_t leaf_size):
+
+ self.n_elements = len(left)
+ self.leaf_size = leaf_size
+
+        # min_left and max_right are used to speed up queries by skipping
+        # queries on sub-nodes. If this node has size 0, the query is cheap,
+        # so these values don't matter.
+ if left.size > 0:
+ self.min_left = left.min()
+ self.max_right = right.max()
+ else:
+ self.min_left = 0
+ self.max_right = 0
+
+ if self.n_elements <= leaf_size:
+ # make this a terminal (leaf) node
+ self.is_leaf_node = True
+ self.left = left
+ self.right = right
+ self.indices = indices
+ self.n_center = 0
+ else:
+ # calculate a pivot so we can create child nodes
+ self.is_leaf_node = False
+ self.pivot = np.median(left / 2 + right / 2)
+ left_set, right_set, center_set = self.classify_intervals(
+ left, right)
+
+ self.left_node = self.new_child_node(left, right,
+ indices, left_set)
+ self.right_node = self.new_child_node(left, right,
+ indices, right_set)
+
+ self.center_left_values, self.center_left_indices = \
+ sort_values_and_indices(left, indices, center_set)
+ self.center_right_values, self.center_right_indices = \
+ sort_values_and_indices(right, indices, center_set)
+ self.n_center = len(self.center_left_indices)
+
+ @cython.wraparound(False)
+ @cython.boundscheck(False)
+ cdef classify_intervals(self, int32_t[:] left, int32_t[:] right):
+ """Classify the given intervals based upon whether they fall to the
+ left, right, or overlap with this node's pivot.
+ """
+ cdef:
+ Int64Vector left_ind, right_ind, overlapping_ind
+ Py_ssize_t i
+
+ left_ind = Int64Vector()
+ right_ind = Int64Vector()
+ overlapping_ind = Int64Vector()
+
+ for i in range(self.n_elements):
+ if right[i] < self.pivot:
+ left_ind.append(i)
+ elif self.pivot < left[i]:
+ right_ind.append(i)
+ else:
+ overlapping_ind.append(i)
+
+ return (left_ind.to_array(),
+ right_ind.to_array(),
+ overlapping_ind.to_array())
+
+ cdef new_child_node(self,
+ ndarray[int32_t, ndim=1] left,
+ ndarray[int32_t, ndim=1] right,
+ ndarray[int64_t, ndim=1] indices,
+ ndarray[int64_t, ndim=1] subset):
+ """Create a new child node.
+ """
+ left = take(left, subset)
+ right = take(right, subset)
+ indices = take(indices, subset)
+ return Int32ClosedBothIntervalNode(
+ left, right, indices, self.leaf_size)
+
+ @cython.wraparound(False)
+ @cython.boundscheck(False)
+ @cython.initializedcheck(False)
+ cpdef query(self, Int64Vector result, scalar_t point):
+ """Recursively query this node and its sub-nodes for intervals that
+ overlap with the query point.
+ """
+ cdef:
+ int64_t[:] indices
+ int32_t[:] values
+ Py_ssize_t i
+
+ if self.is_leaf_node:
+ # Once we get down to a certain size, it doesn't make sense to
+ # continue the binary tree structure. Instead, we use linear
+ # search.
+ for i in range(self.n_elements):
+ if self.left[i] <= point <= self.right[i]:
+ result.append(self.indices[i])
+ else:
+ # There are child nodes. Based on comparing our query to the pivot,
+ # look at the center values, then go to the relevant child.
+ if point < self.pivot:
+ values = self.center_left_values
+ indices = self.center_left_indices
+ for i in range(self.n_center):
+ if not values[i] <= point:
+ break
+ result.append(indices[i])
+ if point <= self.left_node.max_right:
+ self.left_node.query(result, point)
+ elif point > self.pivot:
+ values = self.center_right_values
+ indices = self.center_right_indices
+ for i in range(self.n_center - 1, -1, -1):
+ if not point <= values[i]:
+ break
+ result.append(indices[i])
+ if self.right_node.min_left <= point:
+ self.right_node.query(result, point)
+ else:
+ result.extend(self.center_left_indices)
+
+ def __repr__(self):
+ if self.is_leaf_node:
+ return ('<Int32ClosedBothIntervalNode: '
+ '%s elements (terminal)>' % self.n_elements)
+ else:
+ n_left = self.left_node.n_elements
+ n_right = self.right_node.n_elements
+ n_center = self.n_elements - n_left - n_right
+ return ('<Int32ClosedBothIntervalNode: '
+ 'pivot %s, %s elements (%s left, %s right, %s '
+ 'overlapping)>' % (self.pivot, self.n_elements,
+ n_left, n_right, n_center))
+
+ def counts(self):
+ """
+ Inspect counts on this node
+ useful for debugging purposes
+ """
+ if self.is_leaf_node:
+ return self.n_elements
+ else:
+ m = len(self.center_left_values)
+ l = self.left_node.counts()
+ r = self.right_node.counts()
+ return (m, (l, r))
+
+NODE_CLASSES['int32',
+ 'both'] = Int32ClosedBothIntervalNode
+
+cdef class Int32ClosedNeitherIntervalNode:
+ """Non-terminal node for an IntervalTree
+
+ Categorizes intervals by those that fall to the left, those that fall to
+ the right, and those that overlap with the pivot.
+ """
+ cdef:
+ Int32ClosedNeitherIntervalNode left_node, right_node
+ int32_t[:] center_left_values, center_right_values, left, right
+ int64_t[:] center_left_indices, center_right_indices, indices
+ int32_t min_left, max_right
+ readonly int32_t pivot
+ readonly int64_t n_elements, n_center, leaf_size
+ readonly bint is_leaf_node
+
+ def __init__(self,
+ ndarray[int32_t, ndim=1] left,
+ ndarray[int32_t, ndim=1] right,
+ ndarray[int64_t, ndim=1] indices,
+ int64_t leaf_size):
+
+ self.n_elements = len(left)
+ self.leaf_size = leaf_size
+
+        # min_left and max_right are used to speed up queries by skipping
+        # queries on sub-nodes. If this node has size 0, the query is cheap,
+        # so these values don't matter.
+ if left.size > 0:
+ self.min_left = left.min()
+ self.max_right = right.max()
+ else:
+ self.min_left = 0
+ self.max_right = 0
+
+ if self.n_elements <= leaf_size:
+ # make this a terminal (leaf) node
+ self.is_leaf_node = True
+ self.left = left
+ self.right = right
+ self.indices = indices
+ self.n_center = 0
+ else:
+ # calculate a pivot so we can create child nodes
+ self.is_leaf_node = False
+ self.pivot = np.median(left / 2 + right / 2)
+ left_set, right_set, center_set = self.classify_intervals(
+ left, right)
+
+ self.left_node = self.new_child_node(left, right,
+ indices, left_set)
+ self.right_node = self.new_child_node(left, right,
+ indices, right_set)
+
+ self.center_left_values, self.center_left_indices = \
+ sort_values_and_indices(left, indices, center_set)
+ self.center_right_values, self.center_right_indices = \
+ sort_values_and_indices(right, indices, center_set)
+ self.n_center = len(self.center_left_indices)
+
+ @cython.wraparound(False)
+ @cython.boundscheck(False)
+ cdef classify_intervals(self, int32_t[:] left, int32_t[:] right):
+ """Classify the given intervals based upon whether they fall to the
+ left, right, or overlap with this node's pivot.
+ """
+ cdef:
+ Int64Vector left_ind, right_ind, overlapping_ind
+ Py_ssize_t i
+
+ left_ind = Int64Vector()
+ right_ind = Int64Vector()
+ overlapping_ind = Int64Vector()
+
+ for i in range(self.n_elements):
+ if right[i] <= self.pivot:
+ left_ind.append(i)
+ elif self.pivot <= left[i]:
+ right_ind.append(i)
+ else:
+ overlapping_ind.append(i)
+
+ return (left_ind.to_array(),
+ right_ind.to_array(),
+ overlapping_ind.to_array())
+
+ cdef new_child_node(self,
+ ndarray[int32_t, ndim=1] left,
+ ndarray[int32_t, ndim=1] right,
+ ndarray[int64_t, ndim=1] indices,
+ ndarray[int64_t, ndim=1] subset):
+ """Create a new child node.
+ """
+ left = take(left, subset)
+ right = take(right, subset)
+ indices = take(indices, subset)
+ return Int32ClosedNeitherIntervalNode(
+ left, right, indices, self.leaf_size)
+
+ @cython.wraparound(False)
+ @cython.boundscheck(False)
+ @cython.initializedcheck(False)
+ cpdef query(self, Int64Vector result, scalar_t point):
+ """Recursively query this node and its sub-nodes for intervals that
+ overlap with the query point.
+ """
+ cdef:
+ int64_t[:] indices
+ int32_t[:] values
+ Py_ssize_t i
+
+ if self.is_leaf_node:
+ # Once we get down to a certain size, it doesn't make sense to
+ # continue the binary tree structure. Instead, we use linear
+ # search.
+ for i in range(self.n_elements):
+ if self.left[i] < point < self.right[i]:
+ result.append(self.indices[i])
+ else:
+ # There are child nodes. Based on comparing our query to the pivot,
+ # look at the center values, then go to the relevant child.
+ if point < self.pivot:
+ values = self.center_left_values
+ indices = self.center_left_indices
+ for i in range(self.n_center):
+ if not values[i] < point:
+ break
+ result.append(indices[i])
+ if point < self.left_node.max_right:
+ self.left_node.query(result, point)
+ elif point > self.pivot:
+ values = self.center_right_values
+ indices = self.center_right_indices
+ for i in range(self.n_center - 1, -1, -1):
+ if not point < values[i]:
+ break
+ result.append(indices[i])
+ if self.right_node.min_left < point:
+ self.right_node.query(result, point)
+ else:
+ result.extend(self.center_left_indices)
+
+ def __repr__(self):
+ if self.is_leaf_node:
+ return ('<Int32ClosedNeitherIntervalNode: '
+ '%s elements (terminal)>' % self.n_elements)
+ else:
+ n_left = self.left_node.n_elements
+ n_right = self.right_node.n_elements
+ n_center = self.n_elements - n_left - n_right
+ return ('<Int32ClosedNeitherIntervalNode: '
+ 'pivot %s, %s elements (%s left, %s right, %s '
+ 'overlapping)>' % (self.pivot, self.n_elements,
+ n_left, n_right, n_center))
+
+ def counts(self):
+ """
+ Inspect counts on this node
+ useful for debugging purposes
+ """
+ if self.is_leaf_node:
+ return self.n_elements
+ else:
+ m = len(self.center_left_values)
+ l = self.left_node.counts()
+ r = self.right_node.counts()
+ return (m, (l, r))
+
+NODE_CLASSES['int32',
+ 'neither'] = Int32ClosedNeitherIntervalNode
+
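+# Note on leaf_size (illustrative numbers, not from the sources): a node
+# with n_elements <= leaf_size keeps its arrays as-is and query() falls back
+# to a linear scan, while larger nodes split around pivot = median(left / 2 +
+# right / 2) and recurse. For example, five intervals with leaf_size=2 give
+# an internal root whose counts() returns (n_center, (left_counts,
+# right_counts)), whereas a leaf's counts() is simply its n_elements.
+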
+cdef class Int64ClosedLeftIntervalNode:
+ """Non-terminal node for an IntervalTree
+
+ Categorizes intervals by those that fall to the left, those that fall to
+ the right, and those that overlap with the pivot.
+ """
+ cdef:
+ Int64ClosedLeftIntervalNode left_node, right_node
+ int64_t[:] center_left_values, center_right_values, left, right
+ int64_t[:] center_left_indices, center_right_indices, indices
+ int64_t min_left, max_right
+ readonly int64_t pivot
+ readonly int64_t n_elements, n_center, leaf_size
+ readonly bint is_leaf_node
+
+ def __init__(self,
+ ndarray[int64_t, ndim=1] left,
+ ndarray[int64_t, ndim=1] right,
+ ndarray[int64_t, ndim=1] indices,
+ int64_t leaf_size):
+
+ self.n_elements = len(left)
+ self.leaf_size = leaf_size
+
+        # min_left and max_right are used to speed up queries by skipping
+        # queries on sub-nodes. If this node has size 0, the query is cheap,
+        # so these values don't matter.
+ if left.size > 0:
+ self.min_left = left.min()
+ self.max_right = right.max()
+ else:
+ self.min_left = 0
+ self.max_right = 0
+
+ if self.n_elements <= leaf_size:
+ # make this a terminal (leaf) node
+ self.is_leaf_node = True
+ self.left = left
+ self.right = right
+ self.indices = indices
+ self.n_center = 0
+ else:
+ # calculate a pivot so we can create child nodes
+ self.is_leaf_node = False
+ self.pivot = np.median(left / 2 + right / 2)
+ left_set, right_set, center_set = self.classify_intervals(
+ left, right)
+
+ self.left_node = self.new_child_node(left, right,
+ indices, left_set)
+ self.right_node = self.new_child_node(left, right,
+ indices, right_set)
+
+ self.center_left_values, self.center_left_indices = \
+ sort_values_and_indices(left, indices, center_set)
+ self.center_right_values, self.center_right_indices = \
+ sort_values_and_indices(right, indices, center_set)
+ self.n_center = len(self.center_left_indices)
+
+ @cython.wraparound(False)
+ @cython.boundscheck(False)
+ cdef classify_intervals(self, int64_t[:] left, int64_t[:] right):
+ """Classify the given intervals based upon whether they fall to the
+ left, right, or overlap with this node's pivot.
+ """
+ cdef:
+ Int64Vector left_ind, right_ind, overlapping_ind
+ Py_ssize_t i
+
+ left_ind = Int64Vector()
+ right_ind = Int64Vector()
+ overlapping_ind = Int64Vector()
+
+ for i in range(self.n_elements):
+ if right[i] <= self.pivot:
+ left_ind.append(i)
+ elif self.pivot < left[i]:
+ right_ind.append(i)
+ else:
+ overlapping_ind.append(i)
+
+ return (left_ind.to_array(),
+ right_ind.to_array(),
+ overlapping_ind.to_array())
+
+ cdef new_child_node(self,
+ ndarray[int64_t, ndim=1] left,
+ ndarray[int64_t, ndim=1] right,
+ ndarray[int64_t, ndim=1] indices,
+ ndarray[int64_t, ndim=1] subset):
+ """Create a new child node.
+ """
+ left = take(left, subset)
+ right = take(right, subset)
+ indices = take(indices, subset)
+ return Int64ClosedLeftIntervalNode(
+ left, right, indices, self.leaf_size)
+
+ @cython.wraparound(False)
+ @cython.boundscheck(False)
+ @cython.initializedcheck(False)
+ cpdef query(self, Int64Vector result, scalar_t point):
+ """Recursively query this node and its sub-nodes for intervals that
+ overlap with the query point.
+ """
+ cdef:
+ int64_t[:] indices
+ int64_t[:] values
+ Py_ssize_t i
+
+ if self.is_leaf_node:
+ # Once we get down to a certain size, it doesn't make sense to
+ # continue the binary tree structure. Instead, we use linear
+ # search.
+ for i in range(self.n_elements):
+ if self.left[i] <= point < self.right[i]:
+ result.append(self.indices[i])
+ else:
+ # There are child nodes. Based on comparing our query to the pivot,
+ # look at the center values, then go to the relevant child.
+ if point < self.pivot:
+ values = self.center_left_values
+ indices = self.center_left_indices
+ for i in range(self.n_center):
+ if not values[i] <= point:
+ break
+ result.append(indices[i])
+ if point < self.left_node.max_right:
+ self.left_node.query(result, point)
+ elif point > self.pivot:
+ values = self.center_right_values
+ indices = self.center_right_indices
+ for i in range(self.n_center - 1, -1, -1):
+ if not point < values[i]:
+ break
+ result.append(indices[i])
+ if self.right_node.min_left <= point:
+ self.right_node.query(result, point)
+ else:
+ result.extend(self.center_left_indices)
+
+ def __repr__(self):
+ if self.is_leaf_node:
+ return ('<Int64ClosedLeftIntervalNode: '
+ '%s elements (terminal)>' % self.n_elements)
+ else:
+ n_left = self.left_node.n_elements
+ n_right = self.right_node.n_elements
+ n_center = self.n_elements - n_left - n_right
+ return ('<Int64ClosedLeftIntervalNode: '
+ 'pivot %s, %s elements (%s left, %s right, %s '
+ 'overlapping)>' % (self.pivot, self.n_elements,
+ n_left, n_right, n_center))
+
+ def counts(self):
+ """
+ Inspect counts on this node
+ useful for debugging purposes
+ """
+ if self.is_leaf_node:
+ return self.n_elements
+ else:
+ m = len(self.center_left_values)
+ l = self.left_node.counts()
+ r = self.right_node.counts()
+ return (m, (l, r))
+
+NODE_CLASSES['int64',
+ 'left'] = Int64ClosedLeftIntervalNode
+
+cdef class Int64ClosedRightIntervalNode:
+ """Non-terminal node for an IntervalTree
+
+ Categorizes intervals by those that fall to the left, those that fall to
+ the right, and those that overlap with the pivot.
+ """
+ cdef:
+ Int64ClosedRightIntervalNode left_node, right_node
+ int64_t[:] center_left_values, center_right_values, left, right
+ int64_t[:] center_left_indices, center_right_indices, indices
+ int64_t min_left, max_right
+ readonly int64_t pivot
+ readonly int64_t n_elements, n_center, leaf_size
+ readonly bint is_leaf_node
+
+ def __init__(self,
+ ndarray[int64_t, ndim=1] left,
+ ndarray[int64_t, ndim=1] right,
+ ndarray[int64_t, ndim=1] indices,
+ int64_t leaf_size):
+
+ self.n_elements = len(left)
+ self.leaf_size = leaf_size
+
+        # min_left and max_right are used to speed up queries by skipping
+        # queries on sub-nodes. If this node has size 0, the query is cheap,
+        # so these values don't matter.
+ if left.size > 0:
+ self.min_left = left.min()
+ self.max_right = right.max()
+ else:
+ self.min_left = 0
+ self.max_right = 0
+
+ if self.n_elements <= leaf_size:
+ # make this a terminal (leaf) node
+ self.is_leaf_node = True
+ self.left = left
+ self.right = right
+ self.indices = indices
+ self.n_center = 0
+ else:
+ # calculate a pivot so we can create child nodes
+ self.is_leaf_node = False
+ self.pivot = np.median(left / 2 + right / 2)
+ left_set, right_set, center_set = self.classify_intervals(
+ left, right)
+
+ self.left_node = self.new_child_node(left, right,
+ indices, left_set)
+ self.right_node = self.new_child_node(left, right,
+ indices, right_set)
+
+ self.center_left_values, self.center_left_indices = \
+ sort_values_and_indices(left, indices, center_set)
+ self.center_right_values, self.center_right_indices = \
+ sort_values_and_indices(right, indices, center_set)
+ self.n_center = len(self.center_left_indices)
+
+ @cython.wraparound(False)
+ @cython.boundscheck(False)
+ cdef classify_intervals(self, int64_t[:] left, int64_t[:] right):
+ """Classify the given intervals based upon whether they fall to the
+ left, right, or overlap with this node's pivot.
+ """
+ cdef:
+ Int64Vector left_ind, right_ind, overlapping_ind
+ Py_ssize_t i
+
+ left_ind = Int64Vector()
+ right_ind = Int64Vector()
+ overlapping_ind = Int64Vector()
+
+ for i in range(self.n_elements):
+ if right[i] < self.pivot:
+ left_ind.append(i)
+ elif self.pivot <= left[i]:
+ right_ind.append(i)
+ else:
+ overlapping_ind.append(i)
+
+ return (left_ind.to_array(),
+ right_ind.to_array(),
+ overlapping_ind.to_array())
+
+ cdef new_child_node(self,
+ ndarray[int64_t, ndim=1] left,
+ ndarray[int64_t, ndim=1] right,
+ ndarray[int64_t, ndim=1] indices,
+ ndarray[int64_t, ndim=1] subset):
+ """Create a new child node.
+ """
+ left = take(left, subset)
+ right = take(right, subset)
+ indices = take(indices, subset)
+ return Int64ClosedRightIntervalNode(
+ left, right, indices, self.leaf_size)
+
+ @cython.wraparound(False)
+ @cython.boundscheck(False)
+ @cython.initializedcheck(False)
+ cpdef query(self, Int64Vector result, scalar_t point):
+ """Recursively query this node and its sub-nodes for intervals that
+ overlap with the query point.
+ """
+ cdef:
+ int64_t[:] indices
+ int64_t[:] values
+ Py_ssize_t i
+
+ if self.is_leaf_node:
+ # Once we get down to a certain size, it doesn't make sense to
+ # continue the binary tree structure. Instead, we use linear
+ # search.
+ for i in range(self.n_elements):
+ if self.left[i] < point <= self.right[i]:
+ result.append(self.indices[i])
+ else:
+ # There are child nodes. Based on comparing our query to the pivot,
+ # look at the center values, then go to the relevant child.
+ if point < self.pivot:
+ values = self.center_left_values
+ indices = self.center_left_indices
+ for i in range(self.n_center):
+ if not values[i] < point:
+ break
+ result.append(indices[i])
+ if point <= self.left_node.max_right:
+ self.left_node.query(result, point)
+ elif point > self.pivot:
+ values = self.center_right_values
+ indices = self.center_right_indices
+ for i in range(self.n_center - 1, -1, -1):
+ if not point <= values[i]:
+ break
+ result.append(indices[i])
+ if self.right_node.min_left < point:
+ self.right_node.query(result, point)
+ else:
+ result.extend(self.center_left_indices)
+
+ def __repr__(self):
+ if self.is_leaf_node:
+ return ('<Int64ClosedRightIntervalNode: '
+ '%s elements (terminal)>' % self.n_elements)
+ else:
+ n_left = self.left_node.n_elements
+ n_right = self.right_node.n_elements
+ n_center = self.n_elements - n_left - n_right
+ return ('<Int64ClosedRightIntervalNode: '
+ 'pivot %s, %s elements (%s left, %s right, %s '
+ 'overlapping)>' % (self.pivot, self.n_elements,
+ n_left, n_right, n_center))
+
+ def counts(self):
+ """
+ Inspect counts on this node
+ useful for debugging purposes
+ """
+ if self.is_leaf_node:
+ return self.n_elements
+ else:
+ m = len(self.center_left_values)
+ l = self.left_node.counts()
+ r = self.right_node.counts()
+ return (m, (l, r))
+
+NODE_CLASSES['int64',
+ 'right'] = Int64ClosedRightIntervalNode
+
+cdef class Int64ClosedBothIntervalNode:
+ """Non-terminal node for an IntervalTree
+
+ Categorizes intervals by those that fall to the left, those that fall to
+ the right, and those that overlap with the pivot.
+ """
+ cdef:
+ Int64ClosedBothIntervalNode left_node, right_node
+ int64_t[:] center_left_values, center_right_values, left, right
+ int64_t[:] center_left_indices, center_right_indices, indices
+ int64_t min_left, max_right
+ readonly int64_t pivot
+ readonly int64_t n_elements, n_center, leaf_size
+ readonly bint is_leaf_node
+
+ def __init__(self,
+ ndarray[int64_t, ndim=1] left,
+ ndarray[int64_t, ndim=1] right,
+ ndarray[int64_t, ndim=1] indices,
+ int64_t leaf_size):
+
+ self.n_elements = len(left)
+ self.leaf_size = leaf_size
+
+        # min_left and max_right are used to speed up queries by skipping
+        # queries on sub-nodes. If this node has size 0, the query is cheap,
+        # so these values don't matter.
+ if left.size > 0:
+ self.min_left = left.min()
+ self.max_right = right.max()
+ else:
+ self.min_left = 0
+ self.max_right = 0
+
+ if self.n_elements <= leaf_size:
+ # make this a terminal (leaf) node
+ self.is_leaf_node = True
+ self.left = left
+ self.right = right
+ self.indices = indices
+ self.n_center = 0
+ else:
+ # calculate a pivot so we can create child nodes
+ self.is_leaf_node = False
+ self.pivot = np.median(left / 2 + right / 2)
+ left_set, right_set, center_set = self.classify_intervals(
+ left, right)
+
+ self.left_node = self.new_child_node(left, right,
+ indices, left_set)
+ self.right_node = self.new_child_node(left, right,
+ indices, right_set)
+
+ self.center_left_values, self.center_left_indices = \
+ sort_values_and_indices(left, indices, center_set)
+ self.center_right_values, self.center_right_indices = \
+ sort_values_and_indices(right, indices, center_set)
+ self.n_center = len(self.center_left_indices)
+
+ @cython.wraparound(False)
+ @cython.boundscheck(False)
+ cdef classify_intervals(self, int64_t[:] left, int64_t[:] right):
+ """Classify the given intervals based upon whether they fall to the
+ left, right, or overlap with this node's pivot.
+ """
+ cdef:
+ Int64Vector left_ind, right_ind, overlapping_ind
+ Py_ssize_t i
+
+ left_ind = Int64Vector()
+ right_ind = Int64Vector()
+ overlapping_ind = Int64Vector()
+
+ for i in range(self.n_elements):
+ if right[i] < self.pivot:
+ left_ind.append(i)
+ elif self.pivot < left[i]:
+ right_ind.append(i)
+ else:
+ overlapping_ind.append(i)
+
+ return (left_ind.to_array(),
+ right_ind.to_array(),
+ overlapping_ind.to_array())
+
+ cdef new_child_node(self,
+ ndarray[int64_t, ndim=1] left,
+ ndarray[int64_t, ndim=1] right,
+ ndarray[int64_t, ndim=1] indices,
+ ndarray[int64_t, ndim=1] subset):
+ """Create a new child node.
+ """
+ left = take(left, subset)
+ right = take(right, subset)
+ indices = take(indices, subset)
+ return Int64ClosedBothIntervalNode(
+ left, right, indices, self.leaf_size)
+
+ @cython.wraparound(False)
+ @cython.boundscheck(False)
+ @cython.initializedcheck(False)
+ cpdef query(self, Int64Vector result, scalar_t point):
+ """Recursively query this node and its sub-nodes for intervals that
+ overlap with the query point.
+ """
+ cdef:
+ int64_t[:] indices
+ int64_t[:] values
+ Py_ssize_t i
+
+ if self.is_leaf_node:
+ # Once we get down to a certain size, it doesn't make sense to
+ # continue the binary tree structure. Instead, we use linear
+ # search.
+ for i in range(self.n_elements):
+ if self.left[i] <= point <= self.right[i]:
+ result.append(self.indices[i])
+ else:
+ # There are child nodes. Based on comparing our query to the pivot,
+ # look at the center values, then go to the relevant child.
+ if point < self.pivot:
+ values = self.center_left_values
+ indices = self.center_left_indices
+ for i in range(self.n_center):
+ if not values[i] <= point:
+ break
+ result.append(indices[i])
+ if point <= self.left_node.max_right:
+ self.left_node.query(result, point)
+ elif point > self.pivot:
+ values = self.center_right_values
+ indices = self.center_right_indices
+ for i in range(self.n_center - 1, -1, -1):
+ if not point <= values[i]:
+ break
+ result.append(indices[i])
+ if self.right_node.min_left <= point:
+ self.right_node.query(result, point)
+ else:
+ result.extend(self.center_left_indices)
+
+ def __repr__(self):
+ if self.is_leaf_node:
+ return ('<Int64ClosedBothIntervalNode: '
+ '%s elements (terminal)>' % self.n_elements)
+ else:
+ n_left = self.left_node.n_elements
+ n_right = self.right_node.n_elements
+ n_center = self.n_elements - n_left - n_right
+ return ('<Int64ClosedBothIntervalNode: '
+ 'pivot %s, %s elements (%s left, %s right, %s '
+ 'overlapping)>' % (self.pivot, self.n_elements,
+ n_left, n_right, n_center))
+
+ def counts(self):
+ """
+ Inspect counts on this node
+ useful for debugging purposes
+ """
+ if self.is_leaf_node:
+ return self.n_elements
+ else:
+ m = len(self.center_left_values)
+ l = self.left_node.counts()
+ r = self.right_node.counts()
+ return (m, (l, r))
+
+NODE_CLASSES['int64',
+ 'both'] = Int64ClosedBothIntervalNode
+
+cdef class Int64ClosedNeitherIntervalNode:
+ """Non-terminal node for an IntervalTree
+
+ Categorizes intervals by those that fall to the left, those that fall to
+ the right, and those that overlap with the pivot.
+ """
+ cdef:
+ Int64ClosedNeitherIntervalNode left_node, right_node
+ int64_t[:] center_left_values, center_right_values, left, right
+ int64_t[:] center_left_indices, center_right_indices, indices
+ int64_t min_left, max_right
+ readonly int64_t pivot
+ readonly int64_t n_elements, n_center, leaf_size
+ readonly bint is_leaf_node
+
+ def __init__(self,
+ ndarray[int64_t, ndim=1] left,
+ ndarray[int64_t, ndim=1] right,
+ ndarray[int64_t, ndim=1] indices,
+ int64_t leaf_size):
+
+ self.n_elements = len(left)
+ self.leaf_size = leaf_size
+
+        # min_left and max_right are used to speed up queries by skipping
+        # queries on sub-nodes. If this node has size 0, the query is cheap,
+        # so these values don't matter.
+ if left.size > 0:
+ self.min_left = left.min()
+ self.max_right = right.max()
+ else:
+ self.min_left = 0
+ self.max_right = 0
+
+ if self.n_elements <= leaf_size:
+ # make this a terminal (leaf) node
+ self.is_leaf_node = True
+ self.left = left
+ self.right = right
+ self.indices = indices
+ self.n_center = 0
+ else:
+ # calculate a pivot so we can create child nodes
+ self.is_leaf_node = False
+ self.pivot = np.median(left / 2 + right / 2)
+ left_set, right_set, center_set = self.classify_intervals(
+ left, right)
+
+ self.left_node = self.new_child_node(left, right,
+ indices, left_set)
+ self.right_node = self.new_child_node(left, right,
+ indices, right_set)
+
+ self.center_left_values, self.center_left_indices = \
+ sort_values_and_indices(left, indices, center_set)
+ self.center_right_values, self.center_right_indices = \
+ sort_values_and_indices(right, indices, center_set)
+ self.n_center = len(self.center_left_indices)
+
+ @cython.wraparound(False)
+ @cython.boundscheck(False)
+ cdef classify_intervals(self, int64_t[:] left, int64_t[:] right):
+ """Classify the given intervals based upon whether they fall to the
+ left, right, or overlap with this node's pivot.
+ """
+ cdef:
+ Int64Vector left_ind, right_ind, overlapping_ind
+ Py_ssize_t i
+
+ left_ind = Int64Vector()
+ right_ind = Int64Vector()
+ overlapping_ind = Int64Vector()
+
+ for i in range(self.n_elements):
+ if right[i] <= self.pivot:
+ left_ind.append(i)
+ elif self.pivot <= left[i]:
+ right_ind.append(i)
+ else:
+ overlapping_ind.append(i)
+
+ return (left_ind.to_array(),
+ right_ind.to_array(),
+ overlapping_ind.to_array())
+
+ cdef new_child_node(self,
+ ndarray[int64_t, ndim=1] left,
+ ndarray[int64_t, ndim=1] right,
+ ndarray[int64_t, ndim=1] indices,
+ ndarray[int64_t, ndim=1] subset):
+ """Create a new child node.
+ """
+ left = take(left, subset)
+ right = take(right, subset)
+ indices = take(indices, subset)
+ return Int64ClosedNeitherIntervalNode(
+ left, right, indices, self.leaf_size)
+
+ @cython.wraparound(False)
+ @cython.boundscheck(False)
+ @cython.initializedcheck(False)
+ cpdef query(self, Int64Vector result, scalar_t point):
+ """Recursively query this node and its sub-nodes for intervals that
+ overlap with the query point.
+ """
+ cdef:
+ int64_t[:] indices
+ int64_t[:] values
+ Py_ssize_t i
+
+ if self.is_leaf_node:
+ # Once we get down to a certain size, it doesn't make sense to
+ # continue the binary tree structure. Instead, we use linear
+ # search.
+ for i in range(self.n_elements):
+ if self.left[i] < point < self.right[i]:
+ result.append(self.indices[i])
+ else:
+ # There are child nodes. Based on comparing our query to the pivot,
+ # look at the center values, then go to the relevant child.
+ if point < self.pivot:
+ values = self.center_left_values
+ indices = self.center_left_indices
+ for i in range(self.n_center):
+ if not values[i] < point:
+ break
+ result.append(indices[i])
+ if point < self.left_node.max_right:
+ self.left_node.query(result, point)
+ elif point > self.pivot:
+ values = self.center_right_values
+ indices = self.center_right_indices
+ for i in range(self.n_center - 1, -1, -1):
+ if not point < values[i]:
+ break
+ result.append(indices[i])
+ if self.right_node.min_left < point:
+ self.right_node.query(result, point)
+ else:
+ result.extend(self.center_left_indices)
+
+ def __repr__(self):
+ if self.is_leaf_node:
+ return ('<Int64ClosedNeitherIntervalNode: '
+ '%s elements (terminal)>' % self.n_elements)
+ else:
+ n_left = self.left_node.n_elements
+ n_right = self.right_node.n_elements
+ n_center = self.n_elements - n_left - n_right
+ return ('<Int64ClosedNeitherIntervalNode: '
+ 'pivot %s, %s elements (%s left, %s right, %s '
+ 'overlapping)>' % (self.pivot, self.n_elements,
+ n_left, n_right, n_center))
+
+ def counts(self):
+ """
+ Inspect counts on this node
+ useful for debugging purposes
+ """
+ if self.is_leaf_node:
+ return self.n_elements
+ else:
+ m = len(self.center_left_values)
+ l = self.left_node.counts()
+ r = self.right_node.counts()
+ return (m, (l, r))
+
+NODE_CLASSES['int64',
+ 'neither'] = Int64ClosedNeitherIntervalNode
+
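+# Query pruning, as read off the query() bodies above: for point < pivot the
+# center intervals are scanned in ascending order of left endpoint and the
+# loop breaks at the first interval that can no longer contain the point; the
+# left child is then visited only if the point does not exceed that child's
+# cached max_right. The point > pivot branch mirrors this using the right
+# endpoints and the right child's min_left, and point == pivot collects every
+# center interval, each of which contains the pivot by construction.
+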
+cdef class Uint64ClosedLeftIntervalNode:
+ """Non-terminal node for an IntervalTree
+
+ Categorizes intervals by those that fall to the left, those that fall to
+ the right, and those that overlap with the pivot.
+ """
+ cdef:
+ Uint64ClosedLeftIntervalNode left_node, right_node
+ uint64_t[:] center_left_values, center_right_values, left, right
+ int64_t[:] center_left_indices, center_right_indices, indices
+ uint64_t min_left, max_right
+ readonly uint64_t pivot
+ readonly int64_t n_elements, n_center, leaf_size
+ readonly bint is_leaf_node
+
+ def __init__(self,
+ ndarray[uint64_t, ndim=1] left,
+ ndarray[uint64_t, ndim=1] right,
+ ndarray[int64_t, ndim=1] indices,
+ int64_t leaf_size):
+
+ self.n_elements = len(left)
+ self.leaf_size = leaf_size
+
+        # min_left and max_right are used to speed up queries by skipping
+        # queries on sub-nodes. If this node has size 0, the query is cheap,
+        # so these values don't matter.
+ if left.size > 0:
+ self.min_left = left.min()
+ self.max_right = right.max()
+ else:
+ self.min_left = 0
+ self.max_right = 0
+
+ if self.n_elements <= leaf_size:
+ # make this a terminal (leaf) node
+ self.is_leaf_node = True
+ self.left = left
+ self.right = right
+ self.indices = indices
+ self.n_center = 0
+ else:
+ # calculate a pivot so we can create child nodes
+ self.is_leaf_node = False
+ self.pivot = np.median(left / 2 + right / 2)
+ left_set, right_set, center_set = self.classify_intervals(
+ left, right)
+
+ self.left_node = self.new_child_node(left, right,
+ indices, left_set)
+ self.right_node = self.new_child_node(left, right,
+ indices, right_set)
+
+ self.center_left_values, self.center_left_indices = \
+ sort_values_and_indices(left, indices, center_set)
+ self.center_right_values, self.center_right_indices = \
+ sort_values_and_indices(right, indices, center_set)
+ self.n_center = len(self.center_left_indices)
+
+ @cython.wraparound(False)
+ @cython.boundscheck(False)
+ cdef classify_intervals(self, uint64_t[:] left, uint64_t[:] right):
+ """Classify the given intervals based upon whether they fall to the
+ left, right, or overlap with this node's pivot.
+ """
+ cdef:
+ Int64Vector left_ind, right_ind, overlapping_ind
+ Py_ssize_t i
+
+ left_ind = Int64Vector()
+ right_ind = Int64Vector()
+ overlapping_ind = Int64Vector()
+
+ for i in range(self.n_elements):
+ if right[i] <= self.pivot:
+ left_ind.append(i)
+ elif self.pivot < left[i]:
+ right_ind.append(i)
+ else:
+ overlapping_ind.append(i)
+
+ return (left_ind.to_array(),
+ right_ind.to_array(),
+ overlapping_ind.to_array())
+
+ cdef new_child_node(self,
+ ndarray[uint64_t, ndim=1] left,
+ ndarray[uint64_t, ndim=1] right,
+ ndarray[int64_t, ndim=1] indices,
+ ndarray[int64_t, ndim=1] subset):
+ """Create a new child node.
+ """
+ left = take(left, subset)
+ right = take(right, subset)
+ indices = take(indices, subset)
+ return Uint64ClosedLeftIntervalNode(
+ left, right, indices, self.leaf_size)
+
+ @cython.wraparound(False)
+ @cython.boundscheck(False)
+ @cython.initializedcheck(False)
+ cpdef query(self, Int64Vector result, scalar_t point):
+ """Recursively query this node and its sub-nodes for intervals that
+ overlap with the query point.
+ """
+ cdef:
+ int64_t[:] indices
+ uint64_t[:] values
+ Py_ssize_t i
+
+ if self.is_leaf_node:
+ # Once we get down to a certain size, it doesn't make sense to
+ # continue the binary tree structure. Instead, we use linear
+ # search.
+ for i in range(self.n_elements):
+ if self.left[i] <= point < self.right[i]:
+ result.append(self.indices[i])
+ else:
+ # There are child nodes. Based on comparing our query to the pivot,
+ # look at the center values, then go to the relevant child.
+ if point < self.pivot:
+ values = self.center_left_values
+ indices = self.center_left_indices
+ for i in range(self.n_center):
+ if not values[i] <= point:
+ break
+ result.append(indices[i])
+ if point < self.left_node.max_right:
+ self.left_node.query(result, point)
+ elif point > self.pivot:
+ values = self.center_right_values
+ indices = self.center_right_indices
+ for i in range(self.n_center - 1, -1, -1):
+ if not point < values[i]:
+ break
+ result.append(indices[i])
+ if self.right_node.min_left <= point:
+ self.right_node.query(result, point)
+ else:
+ result.extend(self.center_left_indices)
+
+ def __repr__(self):
+ if self.is_leaf_node:
+ return ('<Uint64ClosedLeftIntervalNode: '
+ '%s elements (terminal)>' % self.n_elements)
+ else:
+ n_left = self.left_node.n_elements
+ n_right = self.right_node.n_elements
+ n_center = self.n_elements - n_left - n_right
+ return ('<Uint64ClosedLeftIntervalNode: '
+ 'pivot %s, %s elements (%s left, %s right, %s '
+ 'overlapping)>' % (self.pivot, self.n_elements,
+ n_left, n_right, n_center))
+
+ def counts(self):
+ """
+ Inspect counts on this node
+ useful for debugging purposes
+ """
+ if self.is_leaf_node:
+ return self.n_elements
+ else:
+ m = len(self.center_left_values)
+ l = self.left_node.counts()
+ r = self.right_node.counts()
+ return (m, (l, r))
+
+NODE_CLASSES['uint64',
+ 'left'] = Uint64ClosedLeftIntervalNode
+
+cdef class Uint64ClosedRightIntervalNode:
+ """Non-terminal node for an IntervalTree
+
+ Categorizes intervals by those that fall to the left, those that fall to
+ the right, and those that overlap with the pivot.
+ """
+ cdef:
+ Uint64ClosedRightIntervalNode left_node, right_node
+ uint64_t[:] center_left_values, center_right_values, left, right
+ int64_t[:] center_left_indices, center_right_indices, indices
+ uint64_t min_left, max_right
+ readonly uint64_t pivot
+ readonly int64_t n_elements, n_center, leaf_size
+ readonly bint is_leaf_node
+
+ def __init__(self,
+ ndarray[uint64_t, ndim=1] left,
+ ndarray[uint64_t, ndim=1] right,
+ ndarray[int64_t, ndim=1] indices,
+ int64_t leaf_size):
+
+ self.n_elements = len(left)
+ self.leaf_size = leaf_size
+
+        # min_left and max_right are used to speed up queries by skipping
+        # queries on sub-nodes. If this node has size 0, the query is cheap,
+        # so these values don't matter.
+ if left.size > 0:
+ self.min_left = left.min()
+ self.max_right = right.max()
+ else:
+ self.min_left = 0
+ self.max_right = 0
+
+ if self.n_elements <= leaf_size:
+ # make this a terminal (leaf) node
+ self.is_leaf_node = True
+ self.left = left
+ self.right = right
+ self.indices = indices
+ self.n_center = 0
+ else:
+ # calculate a pivot so we can create child nodes
+ self.is_leaf_node = False
+ self.pivot = np.median(left / 2 + right / 2)
+ left_set, right_set, center_set = self.classify_intervals(
+ left, right)
+
+ self.left_node = self.new_child_node(left, right,
+ indices, left_set)
+ self.right_node = self.new_child_node(left, right,
+ indices, right_set)
+
+ self.center_left_values, self.center_left_indices = \
+ sort_values_and_indices(left, indices, center_set)
+ self.center_right_values, self.center_right_indices = \
+ sort_values_and_indices(right, indices, center_set)
+ self.n_center = len(self.center_left_indices)
+
+ @cython.wraparound(False)
+ @cython.boundscheck(False)
+ cdef classify_intervals(self, uint64_t[:] left, uint64_t[:] right):
+ """Classify the given intervals based upon whether they fall to the
+ left, right, or overlap with this node's pivot.
+ """
+ cdef:
+ Int64Vector left_ind, right_ind, overlapping_ind
+ Py_ssize_t i
+
+ left_ind = Int64Vector()
+ right_ind = Int64Vector()
+ overlapping_ind = Int64Vector()
+
+ for i in range(self.n_elements):
+ if right[i] < self.pivot:
+ left_ind.append(i)
+ elif self.pivot <= left[i]:
+ right_ind.append(i)
+ else:
+ overlapping_ind.append(i)
+
+ return (left_ind.to_array(),
+ right_ind.to_array(),
+ overlapping_ind.to_array())
+
+ cdef new_child_node(self,
+ ndarray[uint64_t, ndim=1] left,
+ ndarray[uint64_t, ndim=1] right,
+ ndarray[int64_t, ndim=1] indices,
+ ndarray[int64_t, ndim=1] subset):
+ """Create a new child node.
+ """
+ left = take(left, subset)
+ right = take(right, subset)
+ indices = take(indices, subset)
+ return Uint64ClosedRightIntervalNode(
+ left, right, indices, self.leaf_size)
+
+ @cython.wraparound(False)
+ @cython.boundscheck(False)
+ @cython.initializedcheck(False)
+ cpdef query(self, Int64Vector result, scalar_t point):
+ """Recursively query this node and its sub-nodes for intervals that
+ overlap with the query point.
+ """
+ cdef:
+ int64_t[:] indices
+ uint64_t[:] values
+ Py_ssize_t i
+
+ if self.is_leaf_node:
+ # Once we get down to a certain size, it doesn't make sense to
+ # continue the binary tree structure. Instead, we use linear
+ # search.
+ for i in range(self.n_elements):
+ if self.left[i] < point <= self.right[i]:
+ result.append(self.indices[i])
+ else:
+ # There are child nodes. Based on comparing our query to the pivot,
+ # look at the center values, then go to the relevant child.
+ if point < self.pivot:
+ values = self.center_left_values
+ indices = self.center_left_indices
+ for i in range(self.n_center):
+ if not values[i] < point:
+ break
+ result.append(indices[i])
+ if point <= self.left_node.max_right:
+ self.left_node.query(result, point)
+ elif point > self.pivot:
+ values = self.center_right_values
+ indices = self.center_right_indices
+ for i in range(self.n_center - 1, -1, -1):
+ if not point <= values[i]:
+ break
+ result.append(indices[i])
+ if self.right_node.min_left < point:
+ self.right_node.query(result, point)
+ else:
+ result.extend(self.center_left_indices)
+
+ def __repr__(self):
+ if self.is_leaf_node:
+ return ('<Uint64ClosedRightIntervalNode: '
+ '%s elements (terminal)>' % self.n_elements)
+ else:
+ n_left = self.left_node.n_elements
+ n_right = self.right_node.n_elements
+ n_center = self.n_elements - n_left - n_right
+ return ('<Uint64ClosedRightIntervalNode: '
+ 'pivot %s, %s elements (%s left, %s right, %s '
+ 'overlapping)>' % (self.pivot, self.n_elements,
+ n_left, n_right, n_center))
+
+ def counts(self):
+ """
+        Inspect counts on this node;
+        useful for debugging purposes.
+ """
+ if self.is_leaf_node:
+ return self.n_elements
+ else:
+ m = len(self.center_left_values)
+ l = self.left_node.counts()
+ r = self.right_node.counts()
+ return (m, (l, r))
+
+NODE_CLASSES['uint64',
+ 'right'] = Uint64ClosedRightIntervalNode
+
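+# A rough, hand-worked usage sketch (IntervalTree normally builds these nodes
+# itself; Int64Vector comes from pandas._libs.hashtable). With closed='right'
+# intervals (0, 3], (2, 4], (5, 9] and a leaf_size large enough that the node
+# stays a leaf, querying the point 3 collects the first two interval indices:
+#
+#     >>> left = np.array([0, 2, 5], dtype=np.uint64)
+#     >>> right = np.array([3, 4, 9], dtype=np.uint64)
+#     >>> indices = np.arange(3, dtype=np.int64)
+#     >>> node = Uint64ClosedRightIntervalNode(left, right, indices, 100)
+#     >>> result = Int64Vector()
+#     >>> node.query(result, np.uint64(3))
+#     >>> result.to_array()
+#     array([0, 1])
+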
+cdef class Uint64ClosedBothIntervalNode:
+ """Non-terminal node for an IntervalTree
+
+ Categorizes intervals by those that fall to the left, those that fall to
+ the right, and those that overlap with the pivot.
+ """
+ cdef:
+ Uint64ClosedBothIntervalNode left_node, right_node
+ uint64_t[:] center_left_values, center_right_values, left, right
+ int64_t[:] center_left_indices, center_right_indices, indices
+ uint64_t min_left, max_right
+ readonly uint64_t pivot
+ readonly int64_t n_elements, n_center, leaf_size
+ readonly bint is_leaf_node
+
+ def __init__(self,
+ ndarray[uint64_t, ndim=1] left,
+ ndarray[uint64_t, ndim=1] right,
+ ndarray[int64_t, ndim=1] indices,
+ int64_t leaf_size):
+
+ self.n_elements = len(left)
+ self.leaf_size = leaf_size
+
+        # min_left and max_right are used to speed up queries by skipping
+        # queries on sub-nodes. If this node has size 0, a query is cheap,
+        # so these values don't matter.
+ if left.size > 0:
+ self.min_left = left.min()
+ self.max_right = right.max()
+ else:
+ self.min_left = 0
+ self.max_right = 0
+
+ if self.n_elements <= leaf_size:
+ # make this a terminal (leaf) node
+ self.is_leaf_node = True
+ self.left = left
+ self.right = right
+ self.indices = indices
+ self.n_center = 0
+ else:
+ # calculate a pivot so we can create child nodes
+ self.is_leaf_node = False
+ self.pivot = np.median(left / 2 + right / 2)
+ left_set, right_set, center_set = self.classify_intervals(
+ left, right)
+
+ self.left_node = self.new_child_node(left, right,
+ indices, left_set)
+ self.right_node = self.new_child_node(left, right,
+ indices, right_set)
+
+ self.center_left_values, self.center_left_indices = \
+ sort_values_and_indices(left, indices, center_set)
+ self.center_right_values, self.center_right_indices = \
+ sort_values_and_indices(right, indices, center_set)
+ self.n_center = len(self.center_left_indices)
+
+ @cython.wraparound(False)
+ @cython.boundscheck(False)
+ cdef classify_intervals(self, uint64_t[:] left, uint64_t[:] right):
+ """Classify the given intervals based upon whether they fall to the
+ left, right, or overlap with this node's pivot.
+ """
+ cdef:
+ Int64Vector left_ind, right_ind, overlapping_ind
+ Py_ssize_t i
+
+ left_ind = Int64Vector()
+ right_ind = Int64Vector()
+ overlapping_ind = Int64Vector()
+
+ for i in range(self.n_elements):
+ if right[i] < self.pivot:
+ left_ind.append(i)
+ elif self.pivot < left[i]:
+ right_ind.append(i)
+ else:
+ overlapping_ind.append(i)
+
+ return (left_ind.to_array(),
+ right_ind.to_array(),
+ overlapping_ind.to_array())
+
+ cdef new_child_node(self,
+ ndarray[uint64_t, ndim=1] left,
+ ndarray[uint64_t, ndim=1] right,
+ ndarray[int64_t, ndim=1] indices,
+ ndarray[int64_t, ndim=1] subset):
+ """Create a new child node.
+ """
+ left = take(left, subset)
+ right = take(right, subset)
+ indices = take(indices, subset)
+ return Uint64ClosedBothIntervalNode(
+ left, right, indices, self.leaf_size)
+
+ @cython.wraparound(False)
+ @cython.boundscheck(False)
+ @cython.initializedcheck(False)
+ cpdef query(self, Int64Vector result, scalar_t point):
+ """Recursively query this node and its sub-nodes for intervals that
+ overlap with the query point.
+ """
+ cdef:
+ int64_t[:] indices
+ uint64_t[:] values
+ Py_ssize_t i
+
+ if self.is_leaf_node:
+ # Once we get down to a certain size, it doesn't make sense to
+ # continue the binary tree structure. Instead, we use linear
+ # search.
+ for i in range(self.n_elements):
+ if self.left[i] <= point <= self.right[i]:
+ result.append(self.indices[i])
+ else:
+ # There are child nodes. Based on comparing our query to the pivot,
+ # look at the center values, then go to the relevant child.
+ if point < self.pivot:
+ values = self.center_left_values
+ indices = self.center_left_indices
+ for i in range(self.n_center):
+ if not values[i] <= point:
+ break
+ result.append(indices[i])
+ if point <= self.left_node.max_right:
+ self.left_node.query(result, point)
+ elif point > self.pivot:
+ values = self.center_right_values
+ indices = self.center_right_indices
+ for i in range(self.n_center - 1, -1, -1):
+ if not point <= values[i]:
+ break
+ result.append(indices[i])
+ if self.right_node.min_left <= point:
+ self.right_node.query(result, point)
+ else:
+ result.extend(self.center_left_indices)
+
+ def __repr__(self):
+ if self.is_leaf_node:
+ return ('<Uint64ClosedBothIntervalNode: '
+ '%s elements (terminal)>' % self.n_elements)
+ else:
+ n_left = self.left_node.n_elements
+ n_right = self.right_node.n_elements
+ n_center = self.n_elements - n_left - n_right
+ return ('<Uint64ClosedBothIntervalNode: '
+ 'pivot %s, %s elements (%s left, %s right, %s '
+ 'overlapping)>' % (self.pivot, self.n_elements,
+ n_left, n_right, n_center))
+
+ def counts(self):
+ """
+        Inspect counts on this node;
+        useful for debugging purposes.
+ """
+ if self.is_leaf_node:
+ return self.n_elements
+ else:
+ m = len(self.center_left_values)
+ l = self.left_node.counts()
+ r = self.right_node.counts()
+ return (m, (l, r))
+
+NODE_CLASSES['uint64',
+ 'both'] = Uint64ClosedBothIntervalNode
+
+cdef class Uint64ClosedNeitherIntervalNode:
+ """Non-terminal node for an IntervalTree
+
+ Categorizes intervals by those that fall to the left, those that fall to
+ the right, and those that overlap with the pivot.
+ """
+ cdef:
+ Uint64ClosedNeitherIntervalNode left_node, right_node
+ uint64_t[:] center_left_values, center_right_values, left, right
+ int64_t[:] center_left_indices, center_right_indices, indices
+ uint64_t min_left, max_right
+ readonly uint64_t pivot
+ readonly int64_t n_elements, n_center, leaf_size
+ readonly bint is_leaf_node
+
+ def __init__(self,
+ ndarray[uint64_t, ndim=1] left,
+ ndarray[uint64_t, ndim=1] right,
+ ndarray[int64_t, ndim=1] indices,
+ int64_t leaf_size):
+
+ self.n_elements = len(left)
+ self.leaf_size = leaf_size
+
+        # min_left and max_right are used to speed up queries by skipping
+        # queries on sub-nodes. If this node has size 0, a query is cheap,
+        # so these values don't matter.
+ if left.size > 0:
+ self.min_left = left.min()
+ self.max_right = right.max()
+ else:
+ self.min_left = 0
+ self.max_right = 0
+
+ if self.n_elements <= leaf_size:
+ # make this a terminal (leaf) node
+ self.is_leaf_node = True
+ self.left = left
+ self.right = right
+ self.indices = indices
+ self.n_center = 0
+ else:
+ # calculate a pivot so we can create child nodes
+ self.is_leaf_node = False
+ self.pivot = np.median(left / 2 + right / 2)
+ left_set, right_set, center_set = self.classify_intervals(
+ left, right)
+
+ self.left_node = self.new_child_node(left, right,
+ indices, left_set)
+ self.right_node = self.new_child_node(left, right,
+ indices, right_set)
+
+ self.center_left_values, self.center_left_indices = \
+ sort_values_and_indices(left, indices, center_set)
+ self.center_right_values, self.center_right_indices = \
+ sort_values_and_indices(right, indices, center_set)
+ self.n_center = len(self.center_left_indices)
+
+ @cython.wraparound(False)
+ @cython.boundscheck(False)
+ cdef classify_intervals(self, uint64_t[:] left, uint64_t[:] right):
+ """Classify the given intervals based upon whether they fall to the
+ left, right, or overlap with this node's pivot.
+ """
+ cdef:
+ Int64Vector left_ind, right_ind, overlapping_ind
+ Py_ssize_t i
+
+ left_ind = Int64Vector()
+ right_ind = Int64Vector()
+ overlapping_ind = Int64Vector()
+
+ for i in range(self.n_elements):
+ if right[i] <= self.pivot:
+ left_ind.append(i)
+ elif self.pivot <= left[i]:
+ right_ind.append(i)
+ else:
+ overlapping_ind.append(i)
+
+ return (left_ind.to_array(),
+ right_ind.to_array(),
+ overlapping_ind.to_array())
+
+ cdef new_child_node(self,
+ ndarray[uint64_t, ndim=1] left,
+ ndarray[uint64_t, ndim=1] right,
+ ndarray[int64_t, ndim=1] indices,
+ ndarray[int64_t, ndim=1] subset):
+ """Create a new child node.
+ """
+ left = take(left, subset)
+ right = take(right, subset)
+ indices = take(indices, subset)
+ return Uint64ClosedNeitherIntervalNode(
+ left, right, indices, self.leaf_size)
+
+ @cython.wraparound(False)
+ @cython.boundscheck(False)
+ @cython.initializedcheck(False)
+ cpdef query(self, Int64Vector result, scalar_t point):
+ """Recursively query this node and its sub-nodes for intervals that
+ overlap with the query point.
+ """
+ cdef:
+ int64_t[:] indices
+ uint64_t[:] values
+ Py_ssize_t i
+
+ if self.is_leaf_node:
+ # Once we get down to a certain size, it doesn't make sense to
+ # continue the binary tree structure. Instead, we use linear
+ # search.
+ for i in range(self.n_elements):
+ if self.left[i] < point < self.right[i]:
+ result.append(self.indices[i])
+ else:
+ # There are child nodes. Based on comparing our query to the pivot,
+ # look at the center values, then go to the relevant child.
+ if point < self.pivot:
+ values = self.center_left_values
+ indices = self.center_left_indices
+ for i in range(self.n_center):
+ if not values[i] < point:
+ break
+ result.append(indices[i])
+ if point < self.left_node.max_right:
+ self.left_node.query(result, point)
+ elif point > self.pivot:
+ values = self.center_right_values
+ indices = self.center_right_indices
+ for i in range(self.n_center - 1, -1, -1):
+ if not point < values[i]:
+ break
+ result.append(indices[i])
+ if self.right_node.min_left < point:
+ self.right_node.query(result, point)
+ else:
+ result.extend(self.center_left_indices)
+
+ def __repr__(self):
+ if self.is_leaf_node:
+ return ('<Uint64ClosedNeitherIntervalNode: '
+ '%s elements (terminal)>' % self.n_elements)
+ else:
+ n_left = self.left_node.n_elements
+ n_right = self.right_node.n_elements
+ n_center = self.n_elements - n_left - n_right
+ return ('<Uint64ClosedNeitherIntervalNode: '
+ 'pivot %s, %s elements (%s left, %s right, %s '
+ 'overlapping)>' % (self.pivot, self.n_elements,
+ n_left, n_right, n_center))
+
+ def counts(self):
+ """
+        Inspect counts on this node;
+        useful for debugging purposes.
+ """
+ if self.is_leaf_node:
+ return self.n_elements
+ else:
+ m = len(self.center_left_values)
+ l = self.left_node.counts()
+ r = self.right_node.counts()
+ return (m, (l, r))
+
+NODE_CLASSES['uint64',
+ 'neither'] = Uint64ClosedNeitherIntervalNode
diff --git a/contrib/python/pandas/py2/pandas/_libs/join.pyx b/contrib/python/pandas/py2/pandas/_libs/join.pyx
new file mode 100644
index 00000000000..e4440ac3d9f
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/_libs/join.pyx
@@ -0,0 +1,1006 @@
+# -*- coding: utf-8 -*-
+
+import cython
+from cython import Py_ssize_t
+
+import numpy as np
+cimport numpy as cnp
+from numpy cimport (ndarray,
+ int8_t, int16_t, int32_t, int64_t, uint8_t, uint16_t,
+ uint32_t, uint64_t, float32_t, float64_t)
+cnp.import_array()
+
+from pandas._libs.algos import groupsort_indexer, ensure_platform_int
+from pandas.core.algorithms import take_nd
+
+
+def inner_join(ndarray[int64_t] left, ndarray[int64_t] right,
+ Py_ssize_t max_groups):
+ cdef:
+ Py_ssize_t i, j, k, count = 0
+ ndarray[int64_t] left_count, right_count, left_sorter, right_sorter
+ ndarray[int64_t] left_indexer, right_indexer
+ int64_t lc, rc
+
+ # NA group in location 0
+
+ left_sorter, left_count = groupsort_indexer(left, max_groups)
+ right_sorter, right_count = groupsort_indexer(right, max_groups)
+
+ # First pass, determine size of result set, do not use the NA group
+ for i in range(1, max_groups + 1):
+ lc = left_count[i]
+ rc = right_count[i]
+
+ if rc > 0 and lc > 0:
+ count += lc * rc
+
+ # group 0 is the NA group
+ cdef:
+ Py_ssize_t loc, left_pos = 0, right_pos = 0, position = 0
+ Py_ssize_t offset
+
+ # exclude the NA group
+ left_pos = left_count[0]
+ right_pos = right_count[0]
+
+ left_indexer = np.empty(count, dtype=np.int64)
+ right_indexer = np.empty(count, dtype=np.int64)
+
+ for i in range(1, max_groups + 1):
+ lc = left_count[i]
+ rc = right_count[i]
+
+ if rc > 0 and lc > 0:
+ for j in range(lc):
+ offset = position + j * rc
+ for k in range(rc):
+ left_indexer[offset + k] = left_pos + j
+ right_indexer[offset + k] = right_pos + k
+ position += lc * rc
+ left_pos += lc
+ right_pos += rc
+
+ return (_get_result_indexer(left_sorter, left_indexer),
+ _get_result_indexer(right_sorter, right_indexer))
+
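+# A rough, hand-worked sketch of inner_join on factorized group labels
+# (labels 0..max_groups-1 identify groups; -1 would mark the NA group):
+#
+#     >>> left = np.array([0, 1, 1], dtype=np.int64)
+#     >>> right = np.array([1, 0, 0], dtype=np.int64)
+#     >>> inner_join(left, right, 2)
+#     (array([0, 0, 1, 2]), array([1, 2, 0, 0]))
+#
+# i.e. left row 0 (group 0) pairs with right rows 1 and 2, and left rows
+# 1 and 2 (group 1) each pair with right row 0.
+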
+
+def left_outer_join(ndarray[int64_t] left, ndarray[int64_t] right,
+ Py_ssize_t max_groups, sort=True):
+ cdef:
+ Py_ssize_t i, j, k, count = 0
+ ndarray[int64_t] left_count, right_count
+ ndarray left_sorter, right_sorter, rev
+ ndarray[int64_t] left_indexer, right_indexer
+ int64_t lc, rc
+
+ # NA group in location 0
+
+ left_sorter, left_count = groupsort_indexer(left, max_groups)
+ right_sorter, right_count = groupsort_indexer(right, max_groups)
+
+ # First pass, determine size of result set, do not use the NA group
+ for i in range(1, max_groups + 1):
+ if right_count[i] > 0:
+ count += left_count[i] * right_count[i]
+ else:
+ count += left_count[i]
+
+ # group 0 is the NA group
+ cdef:
+ Py_ssize_t loc, left_pos = 0, right_pos = 0, position = 0
+ Py_ssize_t offset
+
+ # exclude the NA group
+ left_pos = left_count[0]
+ right_pos = right_count[0]
+
+ left_indexer = np.empty(count, dtype=np.int64)
+ right_indexer = np.empty(count, dtype=np.int64)
+
+ for i in range(1, max_groups + 1):
+ lc = left_count[i]
+ rc = right_count[i]
+
+ if rc == 0:
+ for j in range(lc):
+ left_indexer[position + j] = left_pos + j
+ right_indexer[position + j] = -1
+ position += lc
+ else:
+ for j in range(lc):
+ offset = position + j * rc
+ for k in range(rc):
+ left_indexer[offset + k] = left_pos + j
+ right_indexer[offset + k] = right_pos + k
+ position += lc * rc
+ left_pos += lc
+ right_pos += rc
+
+ left_indexer = _get_result_indexer(left_sorter, left_indexer)
+ right_indexer = _get_result_indexer(right_sorter, right_indexer)
+
+ if not sort: # if not asked to sort, revert to original order
+ if len(left) == len(left_indexer):
+ # no multiple matches for any row on the left
+ # this is a short-cut to avoid groupsort_indexer
+ # otherwise, the `else` path also works in this case
+ left_sorter = ensure_platform_int(left_sorter)
+
+ rev = np.empty(len(left), dtype=np.intp)
+ rev.put(left_sorter, np.arange(len(left)))
+ else:
+ rev, _ = groupsort_indexer(left_indexer, len(left))
+
+ rev = ensure_platform_int(rev)
+ right_indexer = right_indexer.take(rev)
+ left_indexer = left_indexer.take(rev)
+
+ return left_indexer, right_indexer
+
+
+def full_outer_join(ndarray[int64_t] left, ndarray[int64_t] right,
+ Py_ssize_t max_groups):
+ cdef:
+ Py_ssize_t i, j, k, count = 0
+ ndarray[int64_t] left_count, right_count, left_sorter, right_sorter
+ ndarray[int64_t] left_indexer, right_indexer
+ int64_t lc, rc
+
+ # NA group in location 0
+
+ left_sorter, left_count = groupsort_indexer(left, max_groups)
+ right_sorter, right_count = groupsort_indexer(right, max_groups)
+
+ # First pass, determine size of result set, do not use the NA group
+ for i in range(1, max_groups + 1):
+ lc = left_count[i]
+ rc = right_count[i]
+
+ if rc > 0 and lc > 0:
+ count += lc * rc
+ else:
+ count += lc + rc
+
+ # group 0 is the NA group
+ cdef:
+ int64_t left_pos = 0, right_pos = 0
+ Py_ssize_t offset, position = 0
+
+ # exclude the NA group
+ left_pos = left_count[0]
+ right_pos = right_count[0]
+
+ left_indexer = np.empty(count, dtype=np.int64)
+ right_indexer = np.empty(count, dtype=np.int64)
+
+ for i in range(1, max_groups + 1):
+ lc = left_count[i]
+ rc = right_count[i]
+
+ if rc == 0:
+ for j in range(lc):
+ left_indexer[position + j] = left_pos + j
+ right_indexer[position + j] = -1
+ position += lc
+ elif lc == 0:
+ for j in range(rc):
+ left_indexer[position + j] = -1
+ right_indexer[position + j] = right_pos + j
+ position += rc
+ else:
+ for j in range(lc):
+ offset = position + j * rc
+ for k in range(rc):
+ left_indexer[offset + k] = left_pos + j
+ right_indexer[offset + k] = right_pos + k
+ position += lc * rc
+ left_pos += lc
+ right_pos += rc
+
+ return (_get_result_indexer(left_sorter, left_indexer),
+ _get_result_indexer(right_sorter, right_indexer))
+
+
+def _get_result_indexer(sorter, indexer):
+ if len(sorter) > 0:
+ res = take_nd(sorter, indexer, fill_value=-1)
+ else:
+ # length-0 case
+ res = np.empty(len(indexer), dtype=np.int64)
+ res[:] = -1
+
+ return res
+
+
+def ffill_indexer(ndarray[int64_t] indexer):
+ cdef:
+ Py_ssize_t i, n = len(indexer)
+ ndarray[int64_t] result
+ int64_t val, last_obs
+
+ result = np.empty(n, dtype=np.int64)
+ last_obs = -1
+
+ for i in range(n):
+ val = indexer[i]
+ if val == -1:
+ result[i] = last_obs
+ else:
+ result[i] = val
+ last_obs = val
+
+ return result
+
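+# A small hand-worked example: ffill_indexer propagates the last observed
+# position over -1 markers:
+#
+#     >>> ffill_indexer(np.array([0, -1, 2, -1, -1], dtype=np.int64))
+#     array([0, 0, 2, 2, 2])
+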
+
+# ----------------------------------------------------------------------
+# left_join_indexer, inner_join_indexer, outer_join_indexer
+# ----------------------------------------------------------------------
+
+ctypedef fused join_t:
+ float64_t
+ float32_t
+ object
+ int32_t
+ int64_t
+ uint64_t
+
+
+# Joins on ordered, unique indices
+
+# right might contain non-unique values
+
+def left_join_indexer_unique(ndarray[join_t] left, ndarray[join_t] right):
+ cdef:
+ Py_ssize_t i, j, nleft, nright
+ ndarray[int64_t] indexer
+ join_t lval, rval
+
+ i = 0
+ j = 0
+ nleft = len(left)
+ nright = len(right)
+
+ indexer = np.empty(nleft, dtype=np.int64)
+ while True:
+ if i == nleft:
+ break
+
+ if j == nright:
+ indexer[i] = -1
+ i += 1
+ continue
+
+ rval = right[j]
+
+ while i < nleft - 1 and left[i] == rval:
+ indexer[i] = j
+ i += 1
+
+ if left[i] == right[j]:
+ indexer[i] = j
+ i += 1
+ while i < nleft - 1 and left[i] == rval:
+ indexer[i] = j
+ i += 1
+ j += 1
+ elif left[i] > rval:
+ indexer[i] = -1
+ j += 1
+ else:
+ indexer[i] = -1
+ i += 1
+ return indexer
+
+
+left_join_indexer_unique_float64 = left_join_indexer_unique["float64_t"]
+left_join_indexer_unique_float32 = left_join_indexer_unique["float32_t"]
+left_join_indexer_unique_object = left_join_indexer_unique["object"]
+left_join_indexer_unique_int32 = left_join_indexer_unique["int32_t"]
+left_join_indexer_unique_int64 = left_join_indexer_unique["int64_t"]
+left_join_indexer_unique_uint64 = left_join_indexer_unique["uint64_t"]
+
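+# A hand-worked sketch with sorted values on both sides: each left position
+# receives the matching right position, or -1 where there is no match:
+#
+#     >>> left = np.array([1, 2, 3, 4], dtype=np.int64)
+#     >>> right = np.array([2, 3], dtype=np.int64)
+#     >>> left_join_indexer_unique_int64(left, right)
+#     array([-1,  0,  1, -1])
+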
+
+def left_join_indexer(ndarray[join_t] left, ndarray[join_t] right):
+ """
+ Two-pass algorithm for monotonic indexes. Handles many-to-one merges
+ """
+ cdef:
+ Py_ssize_t i, j, k, nright, nleft, count
+ join_t lval, rval
+ ndarray[int64_t] lindexer, rindexer
+ ndarray[join_t] result
+
+ nleft = len(left)
+ nright = len(right)
+
+ i = 0
+ j = 0
+ count = 0
+ if nleft > 0:
+ while i < nleft:
+ if j == nright:
+ count += nleft - i
+ break
+
+ lval = left[i]
+ rval = right[j]
+
+ if lval == rval:
+ count += 1
+ if i < nleft - 1:
+ if j < nright - 1 and right[j + 1] == rval:
+ j += 1
+ else:
+ i += 1
+ if left[i] != rval:
+ j += 1
+ elif j < nright - 1:
+ j += 1
+ if lval != right[j]:
+ i += 1
+ else:
+ # end of the road
+ break
+ elif lval < rval:
+ count += 1
+ i += 1
+ else:
+ j += 1
+
+ # do it again now that result size is known
+
+ lindexer = np.empty(count, dtype=np.int64)
+ rindexer = np.empty(count, dtype=np.int64)
+ result = np.empty(count, dtype=left.dtype)
+
+ i = 0
+ j = 0
+ count = 0
+ if nleft > 0:
+ while i < nleft:
+ if j == nright:
+ while i < nleft:
+ lindexer[count] = i
+ rindexer[count] = -1
+ result[count] = left[i]
+ i += 1
+ count += 1
+ break
+
+ lval = left[i]
+ rval = right[j]
+
+ if lval == rval:
+ lindexer[count] = i
+ rindexer[count] = j
+ result[count] = lval
+ count += 1
+ if i < nleft - 1:
+ if j < nright - 1 and right[j + 1] == rval:
+ j += 1
+ else:
+ i += 1
+ if left[i] != rval:
+ j += 1
+ elif j < nright - 1:
+ j += 1
+ if lval != right[j]:
+ i += 1
+ else:
+ # end of the road
+ break
+ elif lval < rval:
+ lindexer[count] = i
+ rindexer[count] = -1
+ result[count] = left[i]
+ count += 1
+ i += 1
+ else:
+ j += 1
+
+ return result, lindexer, rindexer
+
+
+left_join_indexer_float64 = left_join_indexer["float64_t"]
+left_join_indexer_float32 = left_join_indexer["float32_t"]
+left_join_indexer_object = left_join_indexer["object"]
+left_join_indexer_int32 = left_join_indexer["int32_t"]
+left_join_indexer_int64 = left_join_indexer["int64_t"]
+left_join_indexer_uint64 = left_join_indexer["uint64_t"]
+
+
+def inner_join_indexer(ndarray[join_t] left, ndarray[join_t] right):
+ """
+ Two-pass algorithm for monotonic indexes. Handles many-to-one merges
+ """
+ cdef:
+ Py_ssize_t i, j, k, nright, nleft, count
+ join_t lval, rval
+ ndarray[int64_t] lindexer, rindexer
+ ndarray[join_t] result
+
+ nleft = len(left)
+ nright = len(right)
+
+ i = 0
+ j = 0
+ count = 0
+ if nleft > 0 and nright > 0:
+ while True:
+ if i == nleft:
+ break
+ if j == nright:
+ break
+
+ lval = left[i]
+ rval = right[j]
+ if lval == rval:
+ count += 1
+ if i < nleft - 1:
+ if j < nright - 1 and right[j + 1] == rval:
+ j += 1
+ else:
+ i += 1
+ if left[i] != rval:
+ j += 1
+ elif j < nright - 1:
+ j += 1
+ if lval != right[j]:
+ i += 1
+ else:
+ # end of the road
+ break
+ elif lval < rval:
+ i += 1
+ else:
+ j += 1
+
+ # do it again now that result size is known
+
+ lindexer = np.empty(count, dtype=np.int64)
+ rindexer = np.empty(count, dtype=np.int64)
+ result = np.empty(count, dtype=left.dtype)
+
+ i = 0
+ j = 0
+ count = 0
+ if nleft > 0 and nright > 0:
+ while True:
+ if i == nleft:
+ break
+ if j == nright:
+ break
+
+ lval = left[i]
+ rval = right[j]
+ if lval == rval:
+ lindexer[count] = i
+ rindexer[count] = j
+ result[count] = rval
+ count += 1
+ if i < nleft - 1:
+ if j < nright - 1 and right[j + 1] == rval:
+ j += 1
+ else:
+ i += 1
+ if left[i] != rval:
+ j += 1
+ elif j < nright - 1:
+ j += 1
+ if lval != right[j]:
+ i += 1
+ else:
+ # end of the road
+ break
+ elif lval < rval:
+ i += 1
+ else:
+ j += 1
+
+ return result, lindexer, rindexer
+
+
+inner_join_indexer_float64 = inner_join_indexer["float64_t"]
+inner_join_indexer_float32 = inner_join_indexer["float32_t"]
+inner_join_indexer_object = inner_join_indexer["object"]
+inner_join_indexer_int32 = inner_join_indexer["int32_t"]
+inner_join_indexer_int64 = inner_join_indexer["int64_t"]
+inner_join_indexer_uint64 = inner_join_indexer["uint64_t"]
+
+
+def outer_join_indexer(ndarray[join_t] left, ndarray[join_t] right):
+ cdef:
+ Py_ssize_t i, j, nright, nleft, count
+ join_t lval, rval
+ ndarray[int64_t] lindexer, rindexer
+ ndarray[join_t] result
+
+ nleft = len(left)
+ nright = len(right)
+
+ i = 0
+ j = 0
+ count = 0
+ if nleft == 0:
+ count = nright
+ elif nright == 0:
+ count = nleft
+ else:
+ while True:
+ if i == nleft:
+ count += nright - j
+ break
+ if j == nright:
+ count += nleft - i
+ break
+
+ lval = left[i]
+ rval = right[j]
+ if lval == rval:
+ count += 1
+ if i < nleft - 1:
+ if j < nright - 1 and right[j + 1] == rval:
+ j += 1
+ else:
+ i += 1
+ if left[i] != rval:
+ j += 1
+ elif j < nright - 1:
+ j += 1
+ if lval != right[j]:
+ i += 1
+ else:
+ # end of the road
+ break
+ elif lval < rval:
+ count += 1
+ i += 1
+ else:
+ count += 1
+ j += 1
+
+ lindexer = np.empty(count, dtype=np.int64)
+ rindexer = np.empty(count, dtype=np.int64)
+ result = np.empty(count, dtype=left.dtype)
+
+ # do it again, but populate the indexers / result
+
+ i = 0
+ j = 0
+ count = 0
+ if nleft == 0:
+ for j in range(nright):
+ lindexer[j] = -1
+ rindexer[j] = j
+ result[j] = right[j]
+ elif nright == 0:
+ for i in range(nleft):
+ lindexer[i] = i
+ rindexer[i] = -1
+ result[i] = left[i]
+ else:
+ while True:
+ if i == nleft:
+ while j < nright:
+ lindexer[count] = -1
+ rindexer[count] = j
+ result[count] = right[j]
+ count += 1
+ j += 1
+ break
+ if j == nright:
+ while i < nleft:
+ lindexer[count] = i
+ rindexer[count] = -1
+ result[count] = left[i]
+ count += 1
+ i += 1
+ break
+
+ lval = left[i]
+ rval = right[j]
+
+ if lval == rval:
+ lindexer[count] = i
+ rindexer[count] = j
+ result[count] = lval
+ count += 1
+ if i < nleft - 1:
+ if j < nright - 1 and right[j + 1] == rval:
+ j += 1
+ else:
+ i += 1
+ if left[i] != rval:
+ j += 1
+ elif j < nright - 1:
+ j += 1
+ if lval != right[j]:
+ i += 1
+ else:
+ # end of the road
+ break
+ elif lval < rval:
+ lindexer[count] = i
+ rindexer[count] = -1
+ result[count] = lval
+ count += 1
+ i += 1
+ else:
+ lindexer[count] = -1
+ rindexer[count] = j
+ result[count] = rval
+ count += 1
+ j += 1
+
+ return result, lindexer, rindexer
+
+
+outer_join_indexer_float64 = outer_join_indexer["float64_t"]
+outer_join_indexer_float32 = outer_join_indexer["float32_t"]
+outer_join_indexer_object = outer_join_indexer["object"]
+outer_join_indexer_int32 = outer_join_indexer["int32_t"]
+outer_join_indexer_int64 = outer_join_indexer["int64_t"]
+outer_join_indexer_uint64 = outer_join_indexer["uint64_t"]
+
+
+# ----------------------------------------------------------------------
+# asof_join_by
+# ----------------------------------------------------------------------
+
+from pandas._libs.hashtable cimport (
+ HashTable, PyObjectHashTable, UInt64HashTable, Int64HashTable)
+
+ctypedef fused asof_t:
+ uint8_t
+ uint16_t
+ uint32_t
+ uint64_t
+ int8_t
+ int16_t
+ int32_t
+ int64_t
+ float
+ float64_t
+
+ctypedef fused by_t:
+ object
+ int64_t
+ uint64_t
+
+
+def asof_join_backward_on_X_by_Y(ndarray[asof_t] left_values,
+ ndarray[asof_t] right_values,
+ ndarray[by_t] left_by_values,
+ ndarray[by_t] right_by_values,
+ bint allow_exact_matches=1,
+ tolerance=None):
+
+ cdef:
+ Py_ssize_t left_pos, right_pos, left_size, right_size, found_right_pos
+ ndarray[int64_t] left_indexer, right_indexer
+ bint has_tolerance = 0
+ asof_t tolerance_ = 0
+ asof_t diff = 0
+ HashTable hash_table
+ by_t by_value
+
+ # if we are using tolerance, set our objects
+ if tolerance is not None:
+ has_tolerance = 1
+ tolerance_ = tolerance
+
+ left_size = len(left_values)
+ right_size = len(right_values)
+
+ left_indexer = np.empty(left_size, dtype=np.int64)
+ right_indexer = np.empty(left_size, dtype=np.int64)
+
+ if by_t is object:
+ hash_table = PyObjectHashTable(right_size)
+ elif by_t is int64_t:
+ hash_table = Int64HashTable(right_size)
+ elif by_t is uint64_t:
+ hash_table = UInt64HashTable(right_size)
+
+ right_pos = 0
+ for left_pos in range(left_size):
+ # restart right_pos if it went negative in a previous iteration
+ if right_pos < 0:
+ right_pos = 0
+
+ # find last position in right whose value is less than left's
+ if allow_exact_matches:
+ while (right_pos < right_size and
+ right_values[right_pos] <= left_values[left_pos]):
+ hash_table.set_item(right_by_values[right_pos], right_pos)
+ right_pos += 1
+ else:
+ while (right_pos < right_size and
+ right_values[right_pos] < left_values[left_pos]):
+ hash_table.set_item(right_by_values[right_pos], right_pos)
+ right_pos += 1
+ right_pos -= 1
+
+ # save positions as the desired index
+ by_value = left_by_values[left_pos]
+ found_right_pos = (hash_table.get_item(by_value)
+ if by_value in hash_table else -1)
+ left_indexer[left_pos] = left_pos
+ right_indexer[left_pos] = found_right_pos
+
+ # if needed, verify that tolerance is met
+ if has_tolerance and found_right_pos != -1:
+ diff = left_values[left_pos] - right_values[found_right_pos]
+ if diff > tolerance_:
+ right_indexer[left_pos] = -1
+
+ return left_indexer, right_indexer
+
+
+def asof_join_forward_on_X_by_Y(ndarray[asof_t] left_values,
+ ndarray[asof_t] right_values,
+ ndarray[by_t] left_by_values,
+ ndarray[by_t] right_by_values,
+ bint allow_exact_matches=1,
+ tolerance=None):
+
+ cdef:
+ Py_ssize_t left_pos, right_pos, left_size, right_size, found_right_pos
+ ndarray[int64_t] left_indexer, right_indexer
+ bint has_tolerance = 0
+ asof_t tolerance_ = 0
+ asof_t diff = 0
+ HashTable hash_table
+ by_t by_value
+
+ # if we are using tolerance, set our objects
+ if tolerance is not None:
+ has_tolerance = 1
+ tolerance_ = tolerance
+
+ left_size = len(left_values)
+ right_size = len(right_values)
+
+ left_indexer = np.empty(left_size, dtype=np.int64)
+ right_indexer = np.empty(left_size, dtype=np.int64)
+
+ if by_t is object:
+ hash_table = PyObjectHashTable(right_size)
+ elif by_t is int64_t:
+ hash_table = Int64HashTable(right_size)
+ elif by_t is uint64_t:
+ hash_table = UInt64HashTable(right_size)
+
+ right_pos = right_size - 1
+ for left_pos in range(left_size - 1, -1, -1):
+ # restart right_pos if it went over in a previous iteration
+ if right_pos == right_size:
+ right_pos = right_size - 1
+
+ # find first position in right whose value is greater than left's
+ if allow_exact_matches:
+ while (right_pos >= 0 and
+ right_values[right_pos] >= left_values[left_pos]):
+ hash_table.set_item(right_by_values[right_pos], right_pos)
+ right_pos -= 1
+ else:
+ while (right_pos >= 0 and
+ right_values[right_pos] > left_values[left_pos]):
+ hash_table.set_item(right_by_values[right_pos], right_pos)
+ right_pos -= 1
+ right_pos += 1
+
+ # save positions as the desired index
+ by_value = left_by_values[left_pos]
+ found_right_pos = (hash_table.get_item(by_value)
+ if by_value in hash_table else -1)
+ left_indexer[left_pos] = left_pos
+ right_indexer[left_pos] = found_right_pos
+
+ # if needed, verify that tolerance is met
+ if has_tolerance and found_right_pos != -1:
+ diff = right_values[found_right_pos] - left_values[left_pos]
+ if diff > tolerance_:
+ right_indexer[left_pos] = -1
+
+ return left_indexer, right_indexer
+
+
+def asof_join_nearest_on_X_by_Y(ndarray[asof_t] left_values,
+ ndarray[asof_t] right_values,
+ ndarray[by_t] left_by_values,
+ ndarray[by_t] right_by_values,
+ bint allow_exact_matches=1,
+ tolerance=None):
+
+ cdef:
+ Py_ssize_t left_size, right_size, i
+ ndarray[int64_t] left_indexer, right_indexer, bli, bri, fli, fri
+ asof_t bdiff, fdiff
+
+ left_size = len(left_values)
+ right_size = len(right_values)
+
+ left_indexer = np.empty(left_size, dtype=np.int64)
+ right_indexer = np.empty(left_size, dtype=np.int64)
+
+ # search both forward and backward
+ bli, bri = asof_join_backward_on_X_by_Y(left_values,
+ right_values,
+ left_by_values,
+ right_by_values,
+ allow_exact_matches,
+ tolerance)
+ fli, fri = asof_join_forward_on_X_by_Y(left_values,
+ right_values,
+ left_by_values,
+ right_by_values,
+ allow_exact_matches,
+ tolerance)
+
+ for i in range(len(bri)):
+ # choose timestamp from right with smaller difference
+ if bri[i] != -1 and fri[i] != -1:
+ bdiff = left_values[bli[i]] - right_values[bri[i]]
+ fdiff = right_values[fri[i]] - left_values[fli[i]]
+ right_indexer[i] = bri[i] if bdiff <= fdiff else fri[i]
+ else:
+ right_indexer[i] = bri[i] if bri[i] != -1 else fri[i]
+ left_indexer[i] = bli[i]
+
+ return left_indexer, right_indexer
+
+
+# ----------------------------------------------------------------------
+# asof_join
+# ----------------------------------------------------------------------
+
+def asof_join_backward(ndarray[asof_t] left_values,
+ ndarray[asof_t] right_values,
+ bint allow_exact_matches=1,
+ tolerance=None):
+
+ cdef:
+ Py_ssize_t left_pos, right_pos, left_size, right_size
+ ndarray[int64_t] left_indexer, right_indexer
+ bint has_tolerance = 0
+ asof_t tolerance_ = 0
+ asof_t diff = 0
+
+ # if we are using tolerance, set our objects
+ if tolerance is not None:
+ has_tolerance = 1
+ tolerance_ = tolerance
+
+ left_size = len(left_values)
+ right_size = len(right_values)
+
+ left_indexer = np.empty(left_size, dtype=np.int64)
+ right_indexer = np.empty(left_size, dtype=np.int64)
+
+ right_pos = 0
+ for left_pos in range(left_size):
+ # restart right_pos if it went negative in a previous iteration
+ if right_pos < 0:
+ right_pos = 0
+
+ # find last position in right whose value is less than left's
+ if allow_exact_matches:
+ while (right_pos < right_size and
+ right_values[right_pos] <= left_values[left_pos]):
+ right_pos += 1
+ else:
+ while (right_pos < right_size and
+ right_values[right_pos] < left_values[left_pos]):
+ right_pos += 1
+ right_pos -= 1
+
+ # save positions as the desired index
+ left_indexer[left_pos] = left_pos
+ right_indexer[left_pos] = right_pos
+
+ # if needed, verify that tolerance is met
+ if has_tolerance and right_pos != -1:
+ diff = left_values[left_pos] - right_values[right_pos]
+ if diff > tolerance_:
+ right_indexer[left_pos] = -1
+
+ return left_indexer, right_indexer
+
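+# A hand-worked sketch: for each left value, take the last right position
+# whose value does not exceed it, subject to the optional tolerance:
+#
+#     >>> left = np.array([1, 5, 10], dtype=np.int64)
+#     >>> right = np.array([1, 2, 3, 7], dtype=np.int64)
+#     >>> asof_join_backward(left, right)
+#     (array([0, 1, 2]), array([0, 2, 3]))
+#     >>> asof_join_backward(left, right, tolerance=2)
+#     (array([0, 1, 2]), array([ 0,  2, -1]))
+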
+
+def asof_join_forward(ndarray[asof_t] left_values,
+ ndarray[asof_t] right_values,
+ bint allow_exact_matches=1,
+ tolerance=None):
+
+ cdef:
+ Py_ssize_t left_pos, right_pos, left_size, right_size
+ ndarray[int64_t] left_indexer, right_indexer
+ bint has_tolerance = 0
+ asof_t tolerance_ = 0
+ asof_t diff = 0
+
+ # if we are using tolerance, set our objects
+ if tolerance is not None:
+ has_tolerance = 1
+ tolerance_ = tolerance
+
+ left_size = len(left_values)
+ right_size = len(right_values)
+
+ left_indexer = np.empty(left_size, dtype=np.int64)
+ right_indexer = np.empty(left_size, dtype=np.int64)
+
+ right_pos = right_size - 1
+ for left_pos in range(left_size - 1, -1, -1):
+ # restart right_pos if it went over in a previous iteration
+ if right_pos == right_size:
+ right_pos = right_size - 1
+
+ # find first position in right whose value is greater than left's
+ if allow_exact_matches:
+ while (right_pos >= 0 and
+ right_values[right_pos] >= left_values[left_pos]):
+ right_pos -= 1
+ else:
+ while (right_pos >= 0 and
+ right_values[right_pos] > left_values[left_pos]):
+ right_pos -= 1
+ right_pos += 1
+
+ # save positions as the desired index
+ left_indexer[left_pos] = left_pos
+ right_indexer[left_pos] = (right_pos
+ if right_pos != right_size else -1)
+
+ # if needed, verify that tolerance is met
+ if has_tolerance and right_pos != right_size:
+ diff = right_values[right_pos] - left_values[left_pos]
+ if diff > tolerance_:
+ right_indexer[left_pos] = -1
+
+ return left_indexer, right_indexer
+
+
+def asof_join_nearest(ndarray[asof_t] left_values,
+ ndarray[asof_t] right_values,
+ bint allow_exact_matches=1,
+ tolerance=None):
+
+ cdef:
+ Py_ssize_t left_size, right_size, i
+ ndarray[int64_t] left_indexer, right_indexer, bli, bri, fli, fri
+ asof_t bdiff, fdiff
+
+ left_size = len(left_values)
+ right_size = len(right_values)
+
+ left_indexer = np.empty(left_size, dtype=np.int64)
+ right_indexer = np.empty(left_size, dtype=np.int64)
+
+ # search both forward and backward
+ bli, bri = asof_join_backward(left_values, right_values,
+ allow_exact_matches, tolerance)
+ fli, fri = asof_join_forward(left_values, right_values,
+ allow_exact_matches, tolerance)
+
+ for i in range(len(bri)):
+ # choose timestamp from right with smaller difference
+ if bri[i] != -1 and fri[i] != -1:
+ bdiff = left_values[bli[i]] - right_values[bri[i]]
+ fdiff = right_values[fri[i]] - left_values[fli[i]]
+ right_indexer[i] = bri[i] if bdiff <= fdiff else fri[i]
+ else:
+ right_indexer[i] = bri[i] if bri[i] != -1 else fri[i]
+ left_indexer[i] = bli[i]
+
+ return left_indexer, right_indexer
diff --git a/contrib/python/pandas/py2/pandas/_libs/khash.pxd b/contrib/python/pandas/py2/pandas/_libs/khash.pxd
new file mode 100644
index 00000000000..971a45e3655
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/_libs/khash.pxd
@@ -0,0 +1,141 @@
+# -*- coding: utf-8 -*-
+from cpython cimport PyObject
+from numpy cimport int64_t, uint64_t, int32_t, uint32_t, float64_t
+
+cdef extern from "khash_python.h":
+ ctypedef uint32_t khint_t
+ ctypedef khint_t khiter_t
+
+ ctypedef struct kh_pymap_t:
+ khint_t n_buckets, size, n_occupied, upper_bound
+ uint32_t *flags
+ PyObject **keys
+ size_t *vals
+
+ kh_pymap_t* kh_init_pymap()
+ void kh_destroy_pymap(kh_pymap_t*)
+ void kh_clear_pymap(kh_pymap_t*)
+ khint_t kh_get_pymap(kh_pymap_t*, PyObject*)
+ void kh_resize_pymap(kh_pymap_t*, khint_t)
+ khint_t kh_put_pymap(kh_pymap_t*, PyObject*, int*)
+ void kh_del_pymap(kh_pymap_t*, khint_t)
+
+ bint kh_exist_pymap(kh_pymap_t*, khiter_t)
+
+ ctypedef struct kh_pyset_t:
+ khint_t n_buckets, size, n_occupied, upper_bound
+ uint32_t *flags
+ PyObject **keys
+ size_t *vals
+
+ kh_pyset_t* kh_init_pyset()
+ void kh_destroy_pyset(kh_pyset_t*)
+ void kh_clear_pyset(kh_pyset_t*)
+ khint_t kh_get_pyset(kh_pyset_t*, PyObject*)
+ void kh_resize_pyset(kh_pyset_t*, khint_t)
+ khint_t kh_put_pyset(kh_pyset_t*, PyObject*, int*)
+ void kh_del_pyset(kh_pyset_t*, khint_t)
+
+ bint kh_exist_pyset(kh_pyset_t*, khiter_t)
+
+ ctypedef char* kh_cstr_t
+
+ ctypedef struct kh_str_t:
+ khint_t n_buckets, size, n_occupied, upper_bound
+ uint32_t *flags
+ kh_cstr_t *keys
+ size_t *vals
+
+ kh_str_t* kh_init_str() nogil
+ void kh_destroy_str(kh_str_t*) nogil
+ void kh_clear_str(kh_str_t*) nogil
+ khint_t kh_get_str(kh_str_t*, kh_cstr_t) nogil
+ void kh_resize_str(kh_str_t*, khint_t) nogil
+ khint_t kh_put_str(kh_str_t*, kh_cstr_t, int*) nogil
+ void kh_del_str(kh_str_t*, khint_t) nogil
+
+ bint kh_exist_str(kh_str_t*, khiter_t) nogil
+
+ ctypedef struct kh_int64_t:
+ khint_t n_buckets, size, n_occupied, upper_bound
+ uint32_t *flags
+ int64_t *keys
+ size_t *vals
+
+ kh_int64_t* kh_init_int64() nogil
+ void kh_destroy_int64(kh_int64_t*) nogil
+ void kh_clear_int64(kh_int64_t*) nogil
+ khint_t kh_get_int64(kh_int64_t*, int64_t) nogil
+ void kh_resize_int64(kh_int64_t*, khint_t) nogil
+ khint_t kh_put_int64(kh_int64_t*, int64_t, int*) nogil
+ void kh_del_int64(kh_int64_t*, khint_t) nogil
+
+ bint kh_exist_int64(kh_int64_t*, khiter_t) nogil
+
+ ctypedef uint64_t khuint64_t
+
+ ctypedef struct kh_uint64_t:
+ khint_t n_buckets, size, n_occupied, upper_bound
+ uint32_t *flags
+ khuint64_t *keys
+ size_t *vals
+
+ kh_uint64_t* kh_init_uint64() nogil
+ void kh_destroy_uint64(kh_uint64_t*) nogil
+ void kh_clear_uint64(kh_uint64_t*) nogil
+ khint_t kh_get_uint64(kh_uint64_t*, uint64_t) nogil
+ void kh_resize_uint64(kh_uint64_t*, khint_t) nogil
+ khint_t kh_put_uint64(kh_uint64_t*, uint64_t, int*) nogil
+ void kh_del_uint64(kh_uint64_t*, khint_t) nogil
+
+ bint kh_exist_uint64(kh_uint64_t*, khiter_t) nogil
+
+ ctypedef struct kh_float64_t:
+ khint_t n_buckets, size, n_occupied, upper_bound
+ uint32_t *flags
+ float64_t *keys
+ size_t *vals
+
+ kh_float64_t* kh_init_float64() nogil
+ void kh_destroy_float64(kh_float64_t*) nogil
+ void kh_clear_float64(kh_float64_t*) nogil
+ khint_t kh_get_float64(kh_float64_t*, float64_t) nogil
+ void kh_resize_float64(kh_float64_t*, khint_t) nogil
+ khint_t kh_put_float64(kh_float64_t*, float64_t, int*) nogil
+ void kh_del_float64(kh_float64_t*, khint_t) nogil
+
+ bint kh_exist_float64(kh_float64_t*, khiter_t) nogil
+
+ ctypedef struct kh_int32_t:
+ khint_t n_buckets, size, n_occupied, upper_bound
+ uint32_t *flags
+ int32_t *keys
+ size_t *vals
+
+ kh_int32_t* kh_init_int32() nogil
+ void kh_destroy_int32(kh_int32_t*) nogil
+ void kh_clear_int32(kh_int32_t*) nogil
+ khint_t kh_get_int32(kh_int32_t*, int32_t) nogil
+ void kh_resize_int32(kh_int32_t*, khint_t) nogil
+ khint_t kh_put_int32(kh_int32_t*, int32_t, int*) nogil
+ void kh_del_int32(kh_int32_t*, khint_t) nogil
+
+ bint kh_exist_int32(kh_int32_t*, khiter_t) nogil
+
+ # sweep factorize
+
+ ctypedef struct kh_strbox_t:
+ khint_t n_buckets, size, n_occupied, upper_bound
+ uint32_t *flags
+ kh_cstr_t *keys
+ PyObject **vals
+
+ kh_strbox_t* kh_init_strbox() nogil
+ void kh_destroy_strbox(kh_strbox_t*) nogil
+ void kh_clear_strbox(kh_strbox_t*) nogil
+ khint_t kh_get_strbox(kh_strbox_t*, kh_cstr_t) nogil
+ void kh_resize_strbox(kh_strbox_t*, khint_t) nogil
+ khint_t kh_put_strbox(kh_strbox_t*, kh_cstr_t, int*) nogil
+ void kh_del_strbox(kh_strbox_t*, khint_t) nogil
+
+ bint kh_exist_strbox(kh_strbox_t*, khiter_t) nogil
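+
+# Rough usage sketch from a consuming .pyx module (only names declared above
+# are used; a missing key is signalled by kh_get_* returning table.n_buckets):
+#
+#     from pandas._libs.khash cimport (
+#         kh_int64_t, khint_t, kh_init_int64, kh_put_int64, kh_get_int64,
+#         kh_destroy_int64)
+#
+#     cdef kh_int64_t *table = kh_init_int64()
+#     cdef int ret = 0
+#     cdef khint_t k = kh_put_int64(table, 42, &ret)   # insert key 42
+#     table.vals[k] = 7                                # attach a value
+#     k = kh_get_int64(table, 42)
+#     if k != table.n_buckets:                         # key was found
+#         value = table.vals[k]
+#     kh_destroy_int64(table)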
diff --git a/contrib/python/pandas/py2/pandas/_libs/lib.pyx b/contrib/python/pandas/py2/pandas/_libs/lib.pyx
new file mode 100644
index 00000000000..9f1f4d3f1df
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/_libs/lib.pyx
@@ -0,0 +1,2349 @@
+# -*- coding: utf-8 -*-
+from decimal import Decimal
+from fractions import Fraction
+from numbers import Number
+
+import sys
+import warnings
+
+import cython
+from cython import Py_ssize_t
+
+from cpython cimport (Py_INCREF, PyTuple_SET_ITEM,
+ PyTuple_New,
+ Py_EQ,
+ PyObject_RichCompareBool)
+
+from cpython.datetime cimport (PyDateTime_Check, PyDate_Check,
+ PyTime_Check, PyDelta_Check,
+ PyDateTime_IMPORT)
+PyDateTime_IMPORT
+
+import numpy as np
+cimport numpy as cnp
+from numpy cimport (ndarray, PyArray_GETITEM,
+ PyArray_ITER_DATA, PyArray_ITER_NEXT, PyArray_IterNew,
+ flatiter, NPY_OBJECT,
+ int64_t,
+ float32_t, float64_t,
+ uint8_t, uint64_t,
+ complex128_t)
+cnp.import_array()
+
+cdef extern from "numpy/arrayobject.h":
+ # cython's numpy.dtype specification is incorrect, which leads to
+ # errors in issubclass(self.dtype.type, np.bool_), so we directly
+ # include the correct version
+ # https://github.com/cython/cython/issues/2022
+
+ ctypedef class numpy.dtype [object PyArray_Descr]:
+ # Use PyDataType_* macros when possible, however there are no macros
+ # for accessing some of the fields, so some are defined. Please
+ # ask on cython-dev if you need more.
+ cdef int type_num
+ cdef int itemsize "elsize"
+ cdef char byteorder
+ cdef object fields
+ cdef tuple names
+
+
+cdef extern from "src/parse_helper.h":
+ int floatify(object, float64_t *result, int *maybe_int) except -1
+
+cimport pandas._libs.util as util
+from pandas._libs.util cimport is_nan, UINT64_MAX, INT64_MAX, INT64_MIN
+
+from pandas._libs.tslib import array_to_datetime
+from pandas._libs.tslibs.nattype cimport NPY_NAT
+from pandas._libs.tslibs.nattype import NaT
+from pandas._libs.tslibs.conversion cimport convert_to_tsobject
+from pandas._libs.tslibs.timedeltas cimport convert_to_timedelta64
+from pandas._libs.tslibs.timezones cimport get_timezone, tz_compare
+
+from pandas._libs.missing cimport (
+ checknull, isnaobj, is_null_datetime64, is_null_timedelta64, is_null_period
+)
+
+
+# constants that will be compared to potentially arbitrarily large
+# python int
+cdef object oINT64_MAX = <int64_t>INT64_MAX
+cdef object oINT64_MIN = <int64_t>INT64_MIN
+cdef object oUINT64_MAX = <uint64_t>UINT64_MAX
+
+cdef bint PY2 = sys.version_info[0] == 2
+cdef float64_t NaN = <float64_t>np.NaN
+
+
+def values_from_object(obj: object):
+ """ return my values or the object if we are say an ndarray """
+ func: object
+
+ func = getattr(obj, 'get_values', None)
+ if func is not None:
+ obj = func()
+
+ return obj
+
+
+def memory_usage_of_objects(arr: object[:]) -> int64_t:
+ """ return the memory usage of an object array in bytes,
+ does not include the actual bytes of the pointers """
+ i: Py_ssize_t
+ n: Py_ssize_t
+ size: int64_t
+
+ size = 0
+ n = len(arr)
+ for i in range(n):
+ size += arr[i].__sizeof__()
+ return size
+
+
+# ----------------------------------------------------------------------
+
+
+def is_scalar(val: object) -> bool:
+ """
+ Return True if given value is scalar.
+
+ Parameters
+ ----------
+ val : object
+ This includes:
+
+ - numpy array scalar (e.g. np.int64)
+ - Python builtin numerics
+ - Python builtin byte arrays and strings
+ - None
+ - datetime.datetime
+ - datetime.timedelta
+ - Period
+ - decimal.Decimal
+ - Interval
+ - DateOffset
+ - Fraction
+ - Number
+
+ Returns
+ -------
+ bool
+ Return True if given object is scalar, False otherwise
+
+ Examples
+ --------
+ >>> dt = pd.datetime.datetime(2018, 10, 3)
+ >>> pd.is_scalar(dt)
+ True
+
+ >>> pd.api.types.is_scalar([2, 3])
+ False
+
+ >>> pd.api.types.is_scalar({0: 1, 2: 3})
+ False
+
+ >>> pd.api.types.is_scalar((0, 2))
+ False
+
+ pandas supports PEP 3141 numbers:
+
+ >>> from fractions import Fraction
+ >>> pd.api.types.is_scalar(Fraction(3, 5))
+ True
+ """
+
+ return (cnp.PyArray_IsAnyScalar(val)
+ # PyArray_IsAnyScalar is always False for bytearrays on Py3
+ or isinstance(val, (Fraction, Number))
+ # We differ from numpy, which claims that None is not scalar;
+ # see np.isscalar
+ or val is None
+ or PyDate_Check(val)
+ or PyDelta_Check(val)
+ or PyTime_Check(val)
+ or util.is_period_object(val)
+ or is_decimal(val)
+ or is_interval(val)
+ or util.is_offset_object(val))
+
+
+def item_from_zerodim(val: object) -> object:
+ """
+ If the value is a zerodim array, return the item it contains.
+
+ Parameters
+ ----------
+ val : object
+
+ Returns
+ -------
+ result : object
+
+ Examples
+ --------
+ >>> item_from_zerodim(1)
+ 1
+ >>> item_from_zerodim('foobar')
+ 'foobar'
+ >>> item_from_zerodim(np.array(1))
+ 1
+ >>> item_from_zerodim(np.array([1]))
+ array([1])
+
+ """
+ if cnp.PyArray_IsZeroDim(val):
+ return cnp.PyArray_ToScalar(cnp.PyArray_DATA(val), val)
+ return val
+
+
+def fast_unique_multiple(list arrays, sort: bool=True):
+ """
+ Generate a list of unique values from a list of arrays.
+
+ Parameters
+ ----------
+ list : array-like
+ A list of array-like objects
+ sort : boolean
+ Whether or not to sort the resulting unique list
+
+ Returns
+ -------
+ unique_list : list of unique values
+ """
+ cdef:
+ ndarray[object] buf
+ Py_ssize_t k = len(arrays)
+ Py_ssize_t i, j, n
+ list uniques = []
+ dict table = {}
+ object val, stub = 0
+
+ for i in range(k):
+ buf = arrays[i]
+ n = len(buf)
+ for j in range(n):
+ val = buf[j]
+ if val not in table:
+ table[val] = stub
+ uniques.append(val)
+    if sort:
+ try:
+ uniques.sort()
+ except Exception:
+ # TODO: RuntimeWarning?
+ pass
+
+ return uniques
+
+
+def fast_unique_multiple_list(lists: list, sort: bool=True) -> list:
+ cdef:
+ list buf
+ Py_ssize_t k = len(lists)
+ Py_ssize_t i, j, n
+ list uniques = []
+ dict table = {}
+ object val, stub = 0
+
+ for i in range(k):
+ buf = lists[i]
+ n = len(buf)
+ for j in range(n):
+ val = buf[j]
+ if val not in table:
+ table[val] = stub
+ uniques.append(val)
+ if sort:
+ try:
+ uniques.sort()
+ except Exception:
+ pass
+
+ return uniques
+
+
+def fast_unique_multiple_list_gen(object gen, bint sort=True):
+ """
+ Generate a list of unique values from a generator of lists.
+
+ Parameters
+ ----------
+ gen : generator object
+ A generator of lists from which the unique list is created
+ sort : boolean
+ Whether or not to sort the resulting unique list
+
+ Returns
+ -------
+ unique_list : list of unique values
+ """
+ cdef:
+ list buf
+ Py_ssize_t j, n
+ list uniques = []
+ dict table = {}
+ object val, stub = 0
+
+ for buf in gen:
+ n = len(buf)
+ for j in range(n):
+ val = buf[j]
+ if val not in table:
+ table[val] = stub
+ uniques.append(val)
+ if sort:
+ try:
+ uniques.sort()
+ except Exception:
+ pass
+
+ return uniques
+
+
+def dicts_to_array(dicts: list, columns: list):
+ cdef:
+ Py_ssize_t i, j, k, n
+ ndarray[object, ndim=2] result
+ dict row
+ object col, onan = np.nan
+
+ k = len(columns)
+ n = len(dicts)
+
+ result = np.empty((n, k), dtype='O')
+
+ for i in range(n):
+ row = dicts[i]
+ for j in range(k):
+ col = columns[j]
+ if col in row:
+ result[i, j] = row[col]
+ else:
+ result[i, j] = onan
+
+ return result
+
+
+def fast_zip(list ndarrays):
+ """
+ For zipping multiple ndarrays into an ndarray of tuples
+ """
+ cdef:
+ Py_ssize_t i, j, k, n
+ ndarray[object] result
+ flatiter it
+ object val, tup
+
+ k = len(ndarrays)
+ n = len(ndarrays[0])
+
+ result = np.empty(n, dtype=object)
+
+ # initialize tuples on first pass
+ arr = ndarrays[0]
+ it = <flatiter>PyArray_IterNew(arr)
+ for i in range(n):
+ val = PyArray_GETITEM(arr, PyArray_ITER_DATA(it))
+ tup = PyTuple_New(k)
+
+ PyTuple_SET_ITEM(tup, 0, val)
+ Py_INCREF(val)
+ result[i] = tup
+ PyArray_ITER_NEXT(it)
+
+ for j in range(1, k):
+ arr = ndarrays[j]
+ it = <flatiter>PyArray_IterNew(arr)
+ if len(arr) != n:
+ raise ValueError('all arrays must be same length')
+
+ for i in range(n):
+ val = PyArray_GETITEM(arr, PyArray_ITER_DATA(it))
+ PyTuple_SET_ITEM(result[i], j, val)
+ Py_INCREF(val)
+ PyArray_ITER_NEXT(it)
+
+ return result
+
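+# e.g. zipping two equal-length ndarrays element-wise into an object array
+# of tuples (a hand-worked sketch):
+#
+#     >>> fast_zip([np.array([1, 2, 3]), np.array(['a', 'b', 'c'], dtype=object)])
+#     array([(1, 'a'), (2, 'b'), (3, 'c')], dtype=object)
+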
+
+def get_reverse_indexer(ndarray[int64_t] indexer, Py_ssize_t length):
+ """
+ Reverse indexing operation.
+
+ Given `indexer`, make `indexer_inv` of it, such that::
+
+ indexer_inv[indexer[x]] = x
+
+    .. note:: If indexer is not unique, only the first occurrence is accounted for.
+
+ """
+
+ cdef:
+ Py_ssize_t i, n = len(indexer)
+ ndarray[int64_t] rev_indexer
+ int64_t idx
+
+ rev_indexer = np.empty(length, dtype=np.int64)
+ rev_indexer[:] = -1
+ for i in range(n):
+ idx = indexer[i]
+ if idx != -1:
+ rev_indexer[idx] = i
+
+ return rev_indexer
+
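+# e.g. inverting an indexer of length 3 into a target of length 5;
+# positions that are never hit stay -1 (hand-worked):
+#
+#     >>> get_reverse_indexer(np.array([1, 3, 0], dtype=np.int64), 5)
+#     array([ 2,  0, -1,  1, -1])
+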
+
+def has_infs_f4(ndarray[float32_t] arr) -> bool:
+ cdef:
+ Py_ssize_t i, n = len(arr)
+ float32_t inf, neginf, val
+
+ inf = np.inf
+ neginf = -inf
+
+ for i in range(n):
+ val = arr[i]
+ if val == inf or val == neginf:
+ return True
+ return False
+
+
+def has_infs_f8(ndarray[float64_t] arr) -> bool:
+ cdef:
+ Py_ssize_t i, n = len(arr)
+ float64_t inf, neginf, val
+
+ inf = np.inf
+ neginf = -inf
+
+ for i in range(n):
+ val = arr[i]
+ if val == inf or val == neginf:
+ return True
+ return False
+
+
+def maybe_indices_to_slice(ndarray[int64_t] indices, int max_len):
+ cdef:
+ Py_ssize_t i, n = len(indices)
+ int k, vstart, vlast, v
+
+ if n == 0:
+ return slice(0, 0)
+
+ vstart = indices[0]
+ if vstart < 0 or max_len <= vstart:
+ return indices
+
+ if n == 1:
+ return slice(vstart, vstart + 1)
+
+ vlast = indices[n - 1]
+ if vlast < 0 or max_len <= vlast:
+ return indices
+
+ k = indices[1] - indices[0]
+ if k == 0:
+ return indices
+ else:
+ for i in range(2, n):
+ v = indices[i]
+ if v - indices[i - 1] != k:
+ return indices
+
+ if k > 0:
+ return slice(vstart, vlast + 1, k)
+ else:
+ if vlast == 0:
+ return slice(vstart, None, k)
+ else:
+ return slice(vstart, vlast - 1, k)
+
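+# A hand-worked sketch: evenly spaced, in-bounds indices collapse to a slice,
+# anything else is returned unchanged:
+#
+#     >>> maybe_indices_to_slice(np.array([2, 3, 4], dtype=np.int64), 10)
+#     slice(2, 5, 1)
+#     >>> maybe_indices_to_slice(np.array([2, 4, 7], dtype=np.int64), 10)
+#     array([2, 4, 7])
+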
+
+def maybe_booleans_to_slice(ndarray[uint8_t] mask):
+ cdef:
+ Py_ssize_t i, n = len(mask)
+ Py_ssize_t start, end
+ bint started = 0, finished = 0
+
+ for i in range(n):
+ if mask[i]:
+ if finished:
+ return mask.view(np.bool_)
+ if not started:
+ started = 1
+ start = i
+ else:
+ if finished:
+ continue
+
+ if started:
+ end = i
+ finished = 1
+
+ if not started:
+ return slice(0, 0)
+ if not finished:
+ return slice(start, None)
+ else:
+ return slice(start, end)
+
+
+def array_equivalent_object(left: object[:], right: object[:]) -> bool:
+ """ perform an element by element comparion on 1-d object arrays
+ taking into account nan positions """
+ cdef:
+ Py_ssize_t i, n = left.shape[0]
+ object x, y
+
+ for i in range(n):
+ x = left[i]
+ y = right[i]
+
+ # we are either not equal or both nan
+ # I think None == None will be true here
+ if not (PyObject_RichCompareBool(x, y, Py_EQ) or
+ (x is None or is_nan(x)) and (y is None or is_nan(y))):
+ return False
+ return True
+
+
+def astype_intsafe(ndarray[object] arr, new_dtype):
+ cdef:
+ Py_ssize_t i, n = len(arr)
+ object val
+ bint is_datelike
+ ndarray result
+
+ is_datelike = new_dtype == 'm8[ns]'
+ result = np.empty(n, dtype=new_dtype)
+ for i in range(n):
+ val = arr[i]
+ if is_datelike and checknull(val):
+ result[i] = NPY_NAT
+ else:
+ result[i] = val
+
+ return result
+
+
+def astype_unicode(arr: ndarray, skipna: bool=False) -> ndarray[object]:
+ """
+ Convert all elements in an array to unicode.
+
+ Parameters
+ ----------
+ arr : ndarray
+ The array whose elements we are casting.
+ skipna : bool, default False
+ Whether or not to coerce nulls to their stringified form
+ (e.g. NaN becomes 'nan').
+
+ Returns
+ -------
+ casted_arr : ndarray
+ A new array with the input array's elements casted.
+ """
+ cdef:
+ object arr_i
+ Py_ssize_t i, n = arr.size
+ ndarray[object] result = np.empty(n, dtype=object)
+
+ for i in range(n):
+ arr_i = arr[i]
+
+ if not (skipna and checknull(arr_i)):
+ arr_i = unicode(arr_i)
+
+ result[i] = arr_i
+
+ return result
+
+
+def astype_str(arr: ndarray, skipna: bool=False) -> ndarray[object]:
+ """
+ Convert all elements in an array to string.
+
+ Parameters
+ ----------
+ arr : ndarray
+ The array whose elements we are casting.
+ skipna : bool, default False
+ Whether or not to coerce nulls to their stringified form
+ (e.g. NaN becomes 'nan').
+
+ Returns
+ -------
+ casted_arr : ndarray
+ A new array with the input array's elements casted.
+ """
+ cdef:
+ object arr_i
+ Py_ssize_t i, n = arr.size
+ ndarray[object] result = np.empty(n, dtype=object)
+
+ for i in range(n):
+ arr_i = arr[i]
+
+ if not (skipna and checknull(arr_i)):
+ arr_i = str(arr_i)
+
+ result[i] = arr_i
+
+ return result
+
+
+def clean_index_list(obj: list):
+ """
+ Utility used in pandas.core.index.ensure_index
+ """
+ cdef:
+ Py_ssize_t i, n = len(obj)
+ object val
+ bint all_arrays = 1
+
+ for i in range(n):
+ val = obj[i]
+ if not (isinstance(val, list) or
+ util.is_array(val) or hasattr(val, '_data')):
+ all_arrays = 0
+ break
+
+ if all_arrays:
+ return obj, all_arrays
+
+ # don't force numpy coerce with nan's
+ inferred = infer_dtype(obj, skipna=False)
+ if inferred in ['string', 'bytes', 'unicode', 'mixed', 'mixed-integer']:
+ return np.asarray(obj, dtype=object), 0
+ elif inferred in ['integer']:
+ # TODO: we infer an integer but it *could* be a uint64
+ try:
+ return np.asarray(obj, dtype='int64'), 0
+ except OverflowError:
+ return np.asarray(obj, dtype='object'), 0
+
+ return np.asarray(obj), 0
+
+
+# ------------------------------------------------------------------------------
+# Groupby-related functions
+
+# TODO: could do even better if we know something about the data. eg, index has
+# 1-min data, binner has 5-min data, then bins are just strides in index. This
+# is a general, O(max(len(values), len(binner))) method.
+def generate_bins_dt64(ndarray[int64_t] values, ndarray[int64_t] binner,
+ object closed='left', bint hasnans=0):
+ """
+ Int64 (datetime64) version of generic python version in groupby.py
+ """
+ cdef:
+ Py_ssize_t lenidx, lenbin, i, j, bc, vc
+ ndarray[int64_t] bins
+ int64_t l_bin, r_bin, nat_count
+ bint right_closed = closed == 'right'
+
+ nat_count = 0
+ if hasnans:
+ mask = values == NPY_NAT
+ nat_count = np.sum(mask)
+ values = values[~mask]
+
+ lenidx = len(values)
+ lenbin = len(binner)
+
+ if lenidx <= 0 or lenbin <= 0:
+ raise ValueError("Invalid length for values or for binner")
+
+ # check binner fits data
+ if values[0] < binner[0]:
+ raise ValueError("Values falls before first bin")
+
+ if values[lenidx - 1] > binner[lenbin - 1]:
+ raise ValueError("Values falls after last bin")
+
+ bins = np.empty(lenbin - 1, dtype=np.int64)
+
+ j = 0 # index into values
+ bc = 0 # bin count
+
+ # linear scan
+ if right_closed:
+ for i in range(0, lenbin - 1):
+ r_bin = binner[i + 1]
+ # count values in current bin, advance to next bin
+ while j < lenidx and values[j] <= r_bin:
+ j += 1
+ bins[bc] = j
+ bc += 1
+ else:
+ for i in range(0, lenbin - 1):
+ r_bin = binner[i + 1]
+ # count values in current bin, advance to next bin
+ while j < lenidx and values[j] < r_bin:
+ j += 1
+ bins[bc] = j
+ bc += 1
+
+ if nat_count > 0:
+ # shift bins by the number of NaT
+ bins = bins + nat_count
+ bins = np.insert(bins, 0, nat_count)
+
+ return bins
+
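+# A hand-worked sketch: with values [1, 2, 3, 4, 5, 6] and bin edges
+# [0, 3, 6], the returned array gives, per bin, how many values have been
+# consumed so far:
+#
+#     >>> values = np.array([1, 2, 3, 4, 5, 6], dtype=np.int64)
+#     >>> binner = np.array([0, 3, 6], dtype=np.int64)
+#     >>> generate_bins_dt64(values, binner, closed='left')
+#     array([2, 5])
+#     >>> generate_bins_dt64(values, binner, closed='right')
+#     array([3, 6])
+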
+
+def row_bool_subset(ndarray[float64_t, ndim=2] values,
+ ndarray[uint8_t, cast=True] mask):
+ cdef:
+ Py_ssize_t i, j, n, k, pos = 0
+ ndarray[float64_t, ndim=2] out
+
+ n, k = (<object>values).shape
+ assert (n == len(mask))
+
+ out = np.empty((mask.sum(), k), dtype=np.float64)
+
+ for i in range(n):
+ if mask[i]:
+ for j in range(k):
+ out[pos, j] = values[i, j]
+ pos += 1
+
+ return out
+
+
+def row_bool_subset_object(ndarray[object, ndim=2] values,
+ ndarray[uint8_t, cast=True] mask):
+ cdef:
+ Py_ssize_t i, j, n, k, pos = 0
+ ndarray[object, ndim=2] out
+
+ n, k = (<object>values).shape
+ assert (n == len(mask))
+
+ out = np.empty((mask.sum(), k), dtype=object)
+
+ for i in range(n):
+ if mask[i]:
+ for j in range(k):
+ out[pos, j] = values[i, j]
+ pos += 1
+
+ return out
+
+
+def get_level_sorter(ndarray[int64_t, ndim=1] label,
+ ndarray[int64_t, ndim=1] starts):
+ """
+ argsort for a single level of a multi-index, keeping the order of higher
+ levels unchanged. `starts` points to the starts of same-key indices with
+ respect to the leading levels; equivalent to:
+ np.hstack([label[starts[i]:starts[i+1]].argsort(kind='mergesort')
+ + starts[i] for i in range(len(starts) - 1)])
+ """
+ cdef:
+ int64_t l, r
+ Py_ssize_t i
+ ndarray[int64_t, ndim=1] out = np.empty(len(label), dtype=np.int64)
+
+ for i in range(len(starts) - 1):
+ l, r = starts[i], starts[i + 1]
+ out[l:r] = l + label[l:r].argsort(kind='mergesort')
+
+ return out
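+
+# A small worked example for get_level_sorter (illustration only, not from the
+# upstream sources; assumes `np` is numpy and the module is importable as
+# `lib`). Each [starts[i], starts[i+1]) block of `label` is argsorted
+# independently, so groups defined by the higher levels keep their order:
+#
+#     label = np.array([1, 0, 2, 0], dtype=np.int64)
+#     starts = np.array([0, 2, 4], dtype=np.int64)
+#     lib.get_level_sorter(label, starts)
+#     # -> array([1, 0, 3, 2])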
+
+
+def count_level_2d(ndarray[uint8_t, ndim=2, cast=True] mask,
+ ndarray[int64_t, ndim=1] labels,
+ Py_ssize_t max_bin,
+ int axis):
+ cdef:
+ Py_ssize_t i, j, k, n
+ ndarray[int64_t, ndim=2] counts
+
+ assert (axis == 0 or axis == 1)
+ n, k = (<object>mask).shape
+
+ if axis == 0:
+ counts = np.zeros((max_bin, k), dtype='i8')
+ with nogil:
+ for i in range(n):
+ for j in range(k):
+ counts[labels[i], j] += mask[i, j]
+
+ else: # axis == 1
+ counts = np.zeros((n, max_bin), dtype='i8')
+ with nogil:
+ for i in range(n):
+ for j in range(k):
+ counts[i, labels[j]] += mask[i, j]
+
+ return counts
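+
+# A minimal sketch for count_level_2d with axis=0 (illustration only, not from
+# the upstream sources; assumes `np` is numpy and the module is importable as
+# `lib`). Row i of `mask` is accumulated into row labels[i] of the output:
+#
+#     mask = np.array([[True, False], [True, True], [False, True]])
+#     labels = np.array([0, 0, 1], dtype=np.int64)
+#     lib.count_level_2d(mask, labels, max_bin=2, axis=0)
+#     # -> array([[2, 1],
+#     #           [0, 1]])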
+
+
+def generate_slices(ndarray[int64_t] labels, Py_ssize_t ngroups):
+ cdef:
+ Py_ssize_t i, group_size, n, start
+ int64_t lab
+ object slobj
+ ndarray[int64_t] starts, ends
+
+ n = len(labels)
+
+ starts = np.zeros(ngroups, dtype=np.int64)
+ ends = np.zeros(ngroups, dtype=np.int64)
+
+ start = 0
+ group_size = 0
+ for i in range(n):
+ lab = labels[i]
+ if lab < 0:
+ start += 1
+ else:
+ group_size += 1
+ if i == n - 1 or lab != labels[i + 1]:
+ starts[lab] = start
+ ends[lab] = start + group_size
+ start += group_size
+ group_size = 0
+
+ return starts, ends
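+
+# A minimal sketch for generate_slices (illustration only, not from the
+# upstream sources; assumes `np` is numpy and the module is importable as
+# `lib`). For sorted group labels it returns the start and end positions of
+# each group:
+#
+#     labels = np.array([0, 0, 1, 1, 1], dtype=np.int64)
+#     lib.generate_slices(labels, 2)
+#     # -> (array([0, 2]), array([2, 5]))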
+
+
+def indices_fast(object index, ndarray[int64_t] labels, list keys,
+ list sorted_labels):
+ cdef:
+ Py_ssize_t i, j, k, lab, cur, start, n = len(labels)
+ dict result = {}
+ object tup
+
+ k = len(keys)
+
+ if n == 0:
+ return result
+
+ start = 0
+ cur = labels[0]
+ for i in range(1, n):
+ lab = labels[i]
+
+ if lab != cur:
+ if lab != -1:
+ tup = PyTuple_New(k)
+ for j in range(k):
+ val = util.get_value_at(keys[j],
+ sorted_labels[j][i - 1])
+ PyTuple_SET_ITEM(tup, j, val)
+ Py_INCREF(val)
+
+ result[tup] = index[start:i]
+ start = i
+ cur = lab
+
+ tup = PyTuple_New(k)
+ for j in range(k):
+ val = util.get_value_at(keys[j],
+ sorted_labels[j][n - 1])
+ PyTuple_SET_ITEM(tup, j, val)
+ Py_INCREF(val)
+ result[tup] = index[start:]
+
+ return result
+
+
+# core.common imports these for fast inference checks
+
+def is_float(obj: object) -> bool:
+ return util.is_float_object(obj)
+
+
+def is_integer(obj: object) -> bool:
+ return util.is_integer_object(obj)
+
+
+def is_bool(obj: object) -> bool:
+ return util.is_bool_object(obj)
+
+
+def is_complex(obj: object) -> bool:
+ return util.is_complex_object(obj)
+
+
+cpdef bint is_decimal(object obj):
+ return isinstance(obj, Decimal)
+
+
+cpdef bint is_interval(object obj):
+ return getattr(obj, '_typ', '_typ') == 'interval'
+
+
+def is_period(val: object) -> bool:
+ """ Return a boolean if this is a Period object """
+ return util.is_period_object(val)
+
+
+_TYPE_MAP = {
+ 'categorical': 'categorical',
+ 'category': 'categorical',
+ 'int8': 'integer',
+ 'int16': 'integer',
+ 'int32': 'integer',
+ 'int64': 'integer',
+ 'i': 'integer',
+ 'uint8': 'integer',
+ 'uint16': 'integer',
+ 'uint32': 'integer',
+ 'uint64': 'integer',
+ 'u': 'integer',
+ 'float32': 'floating',
+ 'float64': 'floating',
+ 'f': 'floating',
+ 'complex128': 'complex',
+ 'c': 'complex',
+ 'string': 'string' if PY2 else 'bytes',
+ 'S': 'string' if PY2 else 'bytes',
+ 'unicode': 'unicode' if PY2 else 'string',
+ 'U': 'unicode' if PY2 else 'string',
+ 'bool': 'boolean',
+ 'b': 'boolean',
+ 'datetime64[ns]': 'datetime64',
+ 'M': 'datetime64',
+ 'timedelta64[ns]': 'timedelta64',
+ 'm': 'timedelta64',
+}
+
+# these types only exist on certain platforms
+try:
+ np.float128
+ _TYPE_MAP['float128'] = 'floating'
+except AttributeError:
+ pass
+try:
+ np.complex256
+ _TYPE_MAP['complex256'] = 'complex'
+except AttributeError:
+ pass
+try:
+ np.float16
+ _TYPE_MAP['float16'] = 'floating'
+except AttributeError:
+ pass
+
+
+cdef class Seen(object):
+ """
+ Class for keeping track of the types of elements
+ encountered when trying to perform type conversions.
+ """
+
+ cdef:
+ bint int_ # seen_int
+ bint bool_ # seen_bool
+ bint null_ # seen_null
+ bint uint_ # seen_uint (unsigned integer)
+ bint sint_ # seen_sint (signed integer)
+ bint float_ # seen_float
+ bint object_ # seen_object
+ bint complex_ # seen_complex
+ bint datetime_ # seen_datetime
+ bint coerce_numeric # coerce data to numeric
+ bint timedelta_ # seen_timedelta
+ bint datetimetz_ # seen_datetimetz
+
+ def __cinit__(self, bint coerce_numeric=0):
+ """
+ Initialize a Seen instance.
+
+ Parameters
+ ----------
+ coerce_numeric : bint, default 0
+ Whether or not to force conversion to a numeric data type if
+ initial methods to convert to numeric fail.
+ """
+ self.int_ = 0
+ self.bool_ = 0
+ self.null_ = 0
+ self.uint_ = 0
+ self.sint_ = 0
+ self.float_ = 0
+ self.object_ = 0
+ self.complex_ = 0
+ self.datetime_ = 0
+ self.timedelta_ = 0
+ self.datetimetz_ = 0
+ self.coerce_numeric = coerce_numeric
+
+ cdef inline bint check_uint64_conflict(self) except -1:
+ """
+ Check whether we can safely convert a uint64 array to a numeric dtype.
+
+ There are two cases when conversion to numeric dtype with a uint64
+ array is not safe (and will therefore not be performed):
+
+ 1) A NaN element is encountered.
+
+ uint64 cannot be safely cast to float64 due to truncation issues
+ at the extreme ends of the range.
+
+ 2) A negative number is encountered.
+
+ There is no numerical dtype that can hold both negative numbers
+ and numbers greater than INT64_MAX. Hence, at least one number
+ will be improperly cast if we convert to a numeric dtype.
+
+ Returns
+ -------
+ return_values : bool
+ Whether or not we should return the original input array to avoid
+ data truncation.
+
+ Raises
+ ------
+ ValueError : uint64 elements were detected, and at least one of the
+ two conflict cases was also detected. However, we are
+ trying to force conversion to a numeric dtype.
+ """
+ return (self.uint_ and (self.null_ or self.sint_)
+ and not self.coerce_numeric)
+
+ cdef inline saw_null(self):
+ """
+ Set flags indicating that a null value was encountered.
+ """
+ self.null_ = 1
+ self.float_ = 1
+
+ cdef saw_int(self, object val):
+ """
+ Set flags indicating that an integer value was encountered.
+
+ In addition to setting a flag that an integer was seen, we
+ also set two flags depending on the type of integer seen:
+
+ 1) sint_ : a negative (signed) number in the
+ range of [-2**63, 0) was encountered
+ 2) uint_ : a positive number in the range of
+ [2**63, 2**64) was encountered
+
+ Parameters
+ ----------
+ val : Python int
+ Value with which to set the flags.
+ """
+ self.int_ = 1
+ self.sint_ = self.sint_ or (oINT64_MIN <= val < 0)
+ self.uint_ = self.uint_ or (oINT64_MAX < val <= oUINT64_MAX)
+
+ @property
+ def numeric_(self):
+ return self.complex_ or self.float_ or self.int_
+
+ @property
+ def is_bool(self):
+ return not (self.datetime_ or self.numeric_ or self.timedelta_)
+
+ @property
+ def is_float_or_complex(self):
+ return not (self.bool_ or self.datetime_ or self.timedelta_)
+
+
+cdef _try_infer_map(v):
+ """ if its in our map, just return the dtype """
+ cdef:
+ object attr, val
+ for attr in ['name', 'kind', 'base']:
+ val = getattr(v.dtype, attr)
+ if val in _TYPE_MAP:
+ return _TYPE_MAP[val]
+ return None
+
+
+def infer_dtype(value: object, skipna: object=None) -> str:
+ """
+ Efficiently infer the type of a passed val, or list-like
+ array of values. Return a string describing the type.
+
+ Parameters
+ ----------
+ value : scalar, list, ndarray, or pandas type
+ skipna : bool, default False
+ Ignore NaN values when inferring the type.
+
+ .. versionadded:: 0.21.0
+
+ Returns
+ -------
+ string describing the common type of the input data.
+ Results can include:
+
+ - string
+ - unicode
+ - bytes
+ - floating
+ - integer
+ - mixed-integer
+ - mixed-integer-float
+ - decimal
+ - complex
+ - categorical
+ - boolean
+ - datetime64
+ - datetime
+ - date
+ - timedelta64
+ - timedelta
+ - time
+ - period
+ - mixed
+
+ Raises
+ ------
+ ValueError if ndarray-like but cannot infer the dtype
+
+ Notes
+ -----
+ - 'mixed' is the catchall for anything that is not otherwise
+ specialized
+ - 'mixed-integer-float' are floats and integers
+ - 'mixed-integer' are integers mixed with non-integers
+
+ Examples
+ --------
+ >>> infer_dtype(['foo', 'bar'])
+ 'string'
+
+ >>> infer_dtype(['a', np.nan, 'b'], skipna=True)
+ 'string'
+
+ >>> infer_dtype(['a', np.nan, 'b'], skipna=False)
+ 'mixed'
+
+ >>> infer_dtype([b'foo', b'bar'])
+ 'bytes'
+
+ >>> infer_dtype([1, 2, 3])
+ 'integer'
+
+ >>> infer_dtype([1, 2, 3.5])
+ 'mixed-integer-float'
+
+ >>> infer_dtype([1.0, 2.0, 3.5])
+ 'floating'
+
+ >>> infer_dtype(['a', 1])
+ 'mixed-integer'
+
+ >>> infer_dtype([Decimal(1), Decimal(2.0)])
+ 'decimal'
+
+ >>> infer_dtype([True, False])
+ 'boolean'
+
+ >>> infer_dtype([True, False, np.nan])
+ 'mixed'
+
+ >>> infer_dtype([pd.Timestamp('20130101')])
+ 'datetime'
+
+ >>> infer_dtype([datetime.date(2013, 1, 1)])
+ 'date'
+
+ >>> infer_dtype([np.datetime64('2013-01-01')])
+ 'datetime64'
+
+ >>> infer_dtype([datetime.timedelta(0, 1, 1)])
+ 'timedelta'
+
+ >>> infer_dtype(pd.Series(list('aabc')).astype('category'))
+ 'categorical'
+ """
+ cdef:
+ Py_ssize_t i, n
+ object val
+ ndarray values
+ bint seen_pdnat = False
+ bint seen_val = False
+
+ if skipna is None:
+ msg = ('A future version of pandas will default to `skipna=True`. To '
+ 'silence this warning, pass `skipna=True|False` explicitly.')
+ warnings.warn(msg, FutureWarning, stacklevel=2)
+ skipna = False
+
+ if util.is_array(value):
+ values = value
+ elif hasattr(value, 'dtype'):
+ # this will handle ndarray-like
+ # e.g. categoricals
+ try:
+ values = getattr(value, '_values', getattr(value, 'values', value))
+ except:
+ value = _try_infer_map(value)
+ if value is not None:
+ return value
+
+ # it's ndarray-like but we can't handle it
+ raise ValueError("cannot infer type for {typ}"
+ .format(typ=type(value)))
+
+ else:
+ if not isinstance(value, list):
+ value = list(value)
+ from pandas.core.dtypes.cast import (
+ construct_1d_object_array_from_listlike)
+ values = construct_1d_object_array_from_listlike(value)
+
+ values = getattr(values, 'values', values)
+
+ # make contiguous
+ values = values.ravel()
+
+ if skipna:
+ values = values[~isnaobj(values)]
+
+ val = _try_infer_map(values)
+ if val is not None:
+ return val
+
+ if values.dtype != np.object_:
+ values = values.astype('O')
+
+ n = len(values)
+ if n == 0:
+ return 'empty'
+
+ # try to use a valid value
+ for i in range(n):
+ val = values[i]
+
+ # do not use is_null_datetimelike to keep
+ # np.datetime64('nat') and np.timedelta64('nat')
+ if val is None or util.is_nan(val):
+ pass
+ elif val is NaT:
+ seen_pdnat = True
+ else:
+ seen_val = True
+ break
+
+ # if all values are nan/NaT
+ if seen_val is False and seen_pdnat is True:
+ return 'datetime'
+ # float/object nan is handled in later logic
+
+ if util.is_datetime64_object(val):
+ if is_datetime64_array(values):
+ return 'datetime64'
+
+ elif is_timedelta(val):
+ if is_timedelta_or_timedelta64_array(values):
+ return 'timedelta'
+
+ elif util.is_integer_object(val):
+ # ordering matters here; this check must come after the is_timedelta
+ # check otherwise numpy timedelta64 objects would come through here
+
+ if is_integer_array(values):
+ return 'integer'
+ elif is_integer_float_array(values):
+ return 'mixed-integer-float'
+ return 'mixed-integer'
+
+ elif PyDateTime_Check(val):
+ if is_datetime_array(values):
+ return 'datetime'
+
+ elif PyDate_Check(val):
+ if is_date_array(values, skipna=skipna):
+ return 'date'
+
+ elif PyTime_Check(val):
+ if is_time_array(values, skipna=skipna):
+ return 'time'
+
+ elif is_decimal(val):
+ return 'decimal'
+
+ elif util.is_float_object(val):
+ if is_float_array(values):
+ return 'floating'
+ elif is_integer_float_array(values):
+ return 'mixed-integer-float'
+
+ elif util.is_bool_object(val):
+ if is_bool_array(values, skipna=skipna):
+ return 'boolean'
+
+ elif isinstance(val, str):
+ if is_string_array(values, skipna=skipna):
+ return 'string'
+
+ elif isinstance(val, unicode):
+ if is_unicode_array(values, skipna=skipna):
+ return 'unicode'
+
+ elif isinstance(val, bytes):
+ if is_bytes_array(values, skipna=skipna):
+ return 'bytes'
+
+ elif util.is_period_object(val):
+ if is_period_array(values):
+ return 'period'
+
+ elif is_interval(val):
+ if is_interval_array(values):
+ return 'interval'
+
+ for i in range(n):
+ val = values[i]
+ if (util.is_integer_object(val) and
+ not util.is_timedelta64_object(val) and
+ not util.is_datetime64_object(val)):
+ return 'mixed-integer'
+
+ return 'mixed'
+
+
+def infer_datetimelike_array(arr: object) -> object:
+ """
+ infer if we have a datetime or timedelta array
+ - date: we have *only* date and maybe strings, nulls
+ - datetime: we have *only* datetimes and maybe strings, nulls
+ - timedelta: we have *only* timedeltas and maybe strings, nulls
+ - nat: we do not have *any* date, datetimes or timedeltas, but do have
+ at least a NaT
+ - mixed: other objects (strings, a mix of tz-aware and tz-naive, or
+ actual objects)
+
+ Parameters
+ ----------
+ arr : object array
+
+ Returns
+ -------
+ string: {datetime, timedelta, date, nat, mixed}
+ """
+
+ cdef:
+ Py_ssize_t i, n = len(arr)
+ bint seen_timedelta = 0, seen_date = 0, seen_datetime = 0
+ bint seen_tz_aware = 0, seen_tz_naive = 0
+ bint seen_nat = 0
+ list objs = []
+ object v
+
+ for i in range(n):
+ v = arr[i]
+ if util.is_string_object(v):
+ objs.append(v)
+
+ if len(objs) == 3:
+ break
+
+ elif v is None or util.is_nan(v):
+ # nan or None
+ pass
+ elif v is NaT:
+ seen_nat = 1
+ elif PyDateTime_Check(v):
+ # datetime
+ seen_datetime = 1
+
+ # disambiguate between tz-naive and tz-aware
+ if v.tzinfo is None:
+ seen_tz_naive = 1
+ else:
+ seen_tz_aware = 1
+
+ if seen_tz_naive and seen_tz_aware:
+ return 'mixed'
+ elif util.is_datetime64_object(v):
+ # np.datetime64
+ seen_datetime = 1
+ elif PyDate_Check(v):
+ seen_date = 1
+ elif is_timedelta(v):
+ # timedelta, or timedelta64
+ seen_timedelta = 1
+ else:
+ return 'mixed'
+
+ if seen_date and not (seen_datetime or seen_timedelta):
+ return 'date'
+ elif seen_datetime and not seen_timedelta:
+ return 'datetime'
+ elif seen_timedelta and not seen_datetime:
+ return 'timedelta'
+ elif seen_nat:
+ return 'nat'
+
+ # short-circuit by trying to
+ # actually convert these strings
+ # this is for performance, as we don't need to try to
+ # convert *every* string array
+ if len(objs):
+ try:
+ array_to_datetime(objs, errors='raise')
+ return 'datetime'
+ except:
+ pass
+
+ # we are *not* going to infer from strings
+ # for timedelta as too much ambiguity
+
+ return 'mixed'
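+
+# A few illustrative calls for infer_datetimelike_array (sketch only, not from
+# the upstream sources; assumes `np` is numpy, `datetime`/`timedelta` come from
+# the stdlib datetime module, and the module is importable as `lib`):
+#
+#     lib.infer_datetimelike_array([datetime(2013, 1, 1), np.nan])   # 'datetime'
+#     lib.infer_datetimelike_array([timedelta(days=1), None])        # 'timedelta'
+#     lib.infer_datetimelike_array([datetime(2013, 1, 1),
+#                                   timedelta(days=1)])              # 'mixed'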
+
+
+cdef inline bint is_timedelta(object o):
+ return PyDelta_Check(o) or util.is_timedelta64_object(o)
+
+
+cdef class Validator:
+
+ cdef:
+ Py_ssize_t n
+ dtype dtype
+ bint skipna
+
+ def __cinit__(self, Py_ssize_t n, dtype dtype=np.dtype(np.object_),
+ bint skipna=False):
+ self.n = n
+ self.dtype = dtype
+ self.skipna = skipna
+
+ cdef bint validate(self, ndarray values) except -1:
+ if not self.n:
+ return False
+
+ if self.is_array_typed():
+ return True
+ elif self.dtype.type_num == NPY_OBJECT:
+ if self.skipna:
+ return self._validate_skipna(values)
+ else:
+ return self._validate(values)
+ else:
+ return False
+
+ @cython.wraparound(False)
+ @cython.boundscheck(False)
+ cdef bint _validate(self, ndarray values) except -1:
+ cdef:
+ Py_ssize_t i
+ Py_ssize_t n = self.n
+
+ for i in range(n):
+ if not self.is_valid(values[i]):
+ return False
+
+ return self.finalize_validate()
+
+ @cython.wraparound(False)
+ @cython.boundscheck(False)
+ cdef bint _validate_skipna(self, ndarray values) except -1:
+ cdef:
+ Py_ssize_t i
+ Py_ssize_t n = self.n
+
+ for i in range(n):
+ if not self.is_valid_skipna(values[i]):
+ return False
+
+ return self.finalize_validate_skipna()
+
+ cdef bint is_valid(self, object value) except -1:
+ return self.is_value_typed(value)
+
+ cdef bint is_valid_skipna(self, object value) except -1:
+ return self.is_valid(value) or self.is_valid_null(value)
+
+ cdef bint is_value_typed(self, object value) except -1:
+ raise NotImplementedError(
+ '{typ} child class must define is_value_typed'
+ .format(typ=type(self).__name__))
+
+ cdef bint is_valid_null(self, object value) except -1:
+ return value is None or util.is_nan(value)
+
+ cdef bint is_array_typed(self) except -1:
+ return False
+
+ cdef inline bint finalize_validate(self):
+ return True
+
+ cdef bint finalize_validate_skipna(self):
+ # TODO(phillipc): Remove the existing validate methods and replace them
+ # with the skipna versions upon full deprecation of skipna=False
+ return True
+
+
+cdef class BoolValidator(Validator):
+ cdef inline bint is_value_typed(self, object value) except -1:
+ return util.is_bool_object(value)
+
+ cdef inline bint is_array_typed(self) except -1:
+ return issubclass(self.dtype.type, np.bool_)
+
+
+cpdef bint is_bool_array(ndarray values, bint skipna=False):
+ cdef:
+ BoolValidator validator = BoolValidator(len(values),
+ values.dtype,
+ skipna=skipna)
+ return validator.validate(values)
+
+
+cdef class IntegerValidator(Validator):
+ cdef inline bint is_value_typed(self, object value) except -1:
+ return util.is_integer_object(value)
+
+ cdef inline bint is_array_typed(self) except -1:
+ return issubclass(self.dtype.type, np.integer)
+
+
+cpdef bint is_integer_array(ndarray values):
+ cdef:
+ IntegerValidator validator = IntegerValidator(len(values),
+ values.dtype)
+ return validator.validate(values)
+
+
+cdef class IntegerFloatValidator(Validator):
+ cdef inline bint is_value_typed(self, object value) except -1:
+ return util.is_integer_object(value) or util.is_float_object(value)
+
+ cdef inline bint is_array_typed(self) except -1:
+ return issubclass(self.dtype.type, np.integer)
+
+
+cdef bint is_integer_float_array(ndarray values):
+ cdef:
+ IntegerFloatValidator validator = IntegerFloatValidator(len(values),
+ values.dtype)
+ return validator.validate(values)
+
+
+cdef class FloatValidator(Validator):
+ cdef inline bint is_value_typed(self, object value) except -1:
+ return util.is_float_object(value)
+
+ cdef inline bint is_array_typed(self) except -1:
+ return issubclass(self.dtype.type, np.floating)
+
+
+cpdef bint is_float_array(ndarray values):
+ cdef:
+ FloatValidator validator = FloatValidator(len(values), values.dtype)
+ return validator.validate(values)
+
+
+cdef class StringValidator(Validator):
+ cdef inline bint is_value_typed(self, object value) except -1:
+ return isinstance(value, str)
+
+ cdef inline bint is_array_typed(self) except -1:
+ return issubclass(self.dtype.type, np.str_)
+
+
+cpdef bint is_string_array(ndarray values, bint skipna=False):
+ cdef:
+ StringValidator validator = StringValidator(len(values),
+ values.dtype,
+ skipna=skipna)
+ return validator.validate(values)
+
+
+cdef class UnicodeValidator(Validator):
+ cdef inline bint is_value_typed(self, object value) except -1:
+ return isinstance(value, unicode)
+
+ cdef inline bint is_array_typed(self) except -1:
+ return issubclass(self.dtype.type, np.unicode_)
+
+
+cdef bint is_unicode_array(ndarray values, bint skipna=False):
+ cdef:
+ UnicodeValidator validator = UnicodeValidator(len(values),
+ values.dtype,
+ skipna=skipna)
+ return validator.validate(values)
+
+
+cdef class BytesValidator(Validator):
+ cdef inline bint is_value_typed(self, object value) except -1:
+ return isinstance(value, bytes)
+
+ cdef inline bint is_array_typed(self) except -1:
+ return issubclass(self.dtype.type, np.bytes_)
+
+
+cdef bint is_bytes_array(ndarray values, bint skipna=False):
+ cdef:
+ BytesValidator validator = BytesValidator(len(values), values.dtype,
+ skipna=skipna)
+ return validator.validate(values)
+
+
+cdef class TemporalValidator(Validator):
+ cdef:
+ Py_ssize_t generic_null_count
+
+ def __cinit__(self, Py_ssize_t n, dtype dtype=np.dtype(np.object_),
+ bint skipna=False):
+ self.n = n
+ self.dtype = dtype
+ self.skipna = skipna
+ self.generic_null_count = 0
+
+ cdef inline bint is_valid(self, object value) except -1:
+ return self.is_value_typed(value) or self.is_valid_null(value)
+
+ cdef bint is_valid_null(self, object value) except -1:
+ raise NotImplementedError(
+ '{typ} child class must define is_valid_null'
+ .format(typ=type(self).__name__))
+
+ cdef inline bint is_valid_skipna(self, object value) except -1:
+ cdef:
+ bint is_typed_null = self.is_valid_null(value)
+ bint is_generic_null = value is None or util.is_nan(value)
+ self.generic_null_count += is_typed_null and is_generic_null
+ return self.is_value_typed(value) or is_typed_null or is_generic_null
+
+ cdef inline bint finalize_validate_skipna(self):
+ return self.generic_null_count != self.n
+
+
+cdef class DatetimeValidator(TemporalValidator):
+ cdef bint is_value_typed(self, object value) except -1:
+ return PyDateTime_Check(value)
+
+ cdef inline bint is_valid_null(self, object value) except -1:
+ return is_null_datetime64(value)
+
+
+cpdef bint is_datetime_array(ndarray values):
+ cdef:
+ DatetimeValidator validator = DatetimeValidator(len(values),
+ skipna=True)
+ return validator.validate(values)
+
+
+cdef class Datetime64Validator(DatetimeValidator):
+ cdef inline bint is_value_typed(self, object value) except -1:
+ return util.is_datetime64_object(value)
+
+
+cpdef bint is_datetime64_array(ndarray values):
+ cdef:
+ Datetime64Validator validator = Datetime64Validator(len(values),
+ skipna=True)
+ return validator.validate(values)
+
+
+def is_datetime_with_singletz_array(values: ndarray) -> bool:
+ """
+ Check values have the same tzinfo attribute.
+ Doesn't check values are datetime-like types.
+ """
+ cdef:
+ Py_ssize_t i, j, n = len(values)
+ object base_val, base_tz, val, tz
+
+ if n == 0:
+ return False
+ # Get a reference timezone to compare with the rest of the tzs in the array
+ for i in range(n):
+ base_val = values[i]
+ if base_val is not NaT:
+ base_tz = get_timezone(getattr(base_val, 'tzinfo', None))
+ break
+
+ for j in range(i, n):
+ # Compare val's timezone with the reference timezone
+ # NaT can coexist with tz-aware datetimes, so skip if encountered
+ val = values[j]
+ if val is not NaT:
+ tz = getattr(val, 'tzinfo', None)
+ if not tz_compare(base_tz, tz):
+ return False
+
+ return True
+
+
+cdef class TimedeltaValidator(TemporalValidator):
+ cdef bint is_value_typed(self, object value) except -1:
+ return PyDelta_Check(value)
+
+ cdef inline bint is_valid_null(self, object value) except -1:
+ return is_null_timedelta64(value)
+
+
+cdef class AnyTimedeltaValidator(TimedeltaValidator):
+ cdef inline bint is_value_typed(self, object value) except -1:
+ return is_timedelta(value)
+
+
+cpdef bint is_timedelta_or_timedelta64_array(ndarray values):
+ """ infer with timedeltas and/or nat/none """
+ cdef:
+ AnyTimedeltaValidator validator = AnyTimedeltaValidator(len(values),
+ skipna=True)
+ return validator.validate(values)
+
+
+cdef class DateValidator(Validator):
+ cdef inline bint is_value_typed(self, object value) except -1:
+ return PyDate_Check(value)
+
+
+cpdef bint is_date_array(ndarray values, bint skipna=False):
+ cdef:
+ DateValidator validator = DateValidator(len(values), skipna=skipna)
+ return validator.validate(values)
+
+
+cdef class TimeValidator(Validator):
+ cdef inline bint is_value_typed(self, object value) except -1:
+ return PyTime_Check(value)
+
+
+cpdef bint is_time_array(ndarray values, bint skipna=False):
+ cdef:
+ TimeValidator validator = TimeValidator(len(values), skipna=skipna)
+ return validator.validate(values)
+
+
+cdef class PeriodValidator(TemporalValidator):
+ cdef inline bint is_value_typed(self, object value) except -1:
+ return util.is_period_object(value)
+
+ cdef inline bint is_valid_null(self, object value) except -1:
+ return is_null_period(value)
+
+
+cpdef bint is_period_array(ndarray values):
+ cdef:
+ PeriodValidator validator = PeriodValidator(len(values), skipna=True)
+ return validator.validate(values)
+
+
+cdef class IntervalValidator(Validator):
+ cdef inline bint is_value_typed(self, object value) except -1:
+ return is_interval(value)
+
+
+cpdef bint is_interval_array(ndarray values):
+ cdef:
+ IntervalValidator validator = IntervalValidator(len(values),
+ skipna=True)
+ return validator.validate(values)
+
+
+def maybe_convert_numeric(ndarray[object] values, set na_values,
+ bint convert_empty=True, bint coerce_numeric=False):
+ """
+ Convert object array to a numeric array if possible.
+
+ Parameters
+ ----------
+ values : ndarray
+ Array of object elements to convert.
+ na_values : set
+ Set of values that should be interpreted as NaN.
+ convert_empty : bool, default True
+ If an empty array-like object is encountered, whether to interpret
+ that element as NaN or not. If set to False, a ValueError will be
+ raised if such an element is encountered and 'coerce_numeric' is False.
+ coerce_numeric : bool, default False
+ If initial attempts to convert to numeric have failed, whether to
+ force conversion to numeric via alternative methods or by setting the
+ element to NaN. Otherwise, an Exception will be raised when such an
+ element is encountered.
+
+ This boolean also has an impact on how conversion behaves when a
+ numeric array has no suitable numerical dtype to return (i.e. uint64,
+ int32, uint8). If set to False, the original object array will be
+ returned. Otherwise, a ValueError will be raised.
+
+ Returns
+ -------
+ numeric_array : array of object values converted to numeric ones
+ """
+
+ if len(values) == 0:
+ return np.array([], dtype='i8')
+
+ # fastpath for ints - try to convert all based on first value
+ cdef:
+ object val = values[0]
+
+ if util.is_integer_object(val):
+ try:
+ maybe_ints = values.astype('i8')
+ if (maybe_ints == values).all():
+ return maybe_ints
+ except (ValueError, OverflowError, TypeError):
+ pass
+
+ # otherwise, iterate and do full inference
+ cdef:
+ int status, maybe_int
+ Py_ssize_t i, n = values.size
+ Seen seen = Seen(coerce_numeric)
+ ndarray[float64_t] floats = np.empty(n, dtype='f8')
+ ndarray[complex128_t] complexes = np.empty(n, dtype='c16')
+ ndarray[int64_t] ints = np.empty(n, dtype='i8')
+ ndarray[uint64_t] uints = np.empty(n, dtype='u8')
+ ndarray[uint8_t] bools = np.empty(n, dtype='u1')
+ float64_t fval
+
+ for i in range(n):
+ val = values[i]
+
+ if val.__hash__ is not None and val in na_values:
+ seen.saw_null()
+ floats[i] = complexes[i] = NaN
+ elif util.is_float_object(val):
+ fval = val
+ if fval != fval:
+ seen.null_ = True
+
+ floats[i] = complexes[i] = fval
+ seen.float_ = True
+ elif util.is_integer_object(val):
+ floats[i] = complexes[i] = val
+
+ val = int(val)
+ seen.saw_int(val)
+
+ if val >= 0:
+ if val <= oUINT64_MAX:
+ uints[i] = val
+ else:
+ seen.float_ = True
+
+ if val <= oINT64_MAX:
+ ints[i] = val
+
+ if seen.sint_ and seen.uint_:
+ seen.float_ = True
+
+ elif util.is_bool_object(val):
+ floats[i] = uints[i] = ints[i] = bools[i] = val
+ seen.bool_ = True
+ elif val is None:
+ seen.saw_null()
+ floats[i] = complexes[i] = NaN
+ elif hasattr(val, '__len__') and len(val) == 0:
+ if convert_empty or seen.coerce_numeric:
+ seen.saw_null()
+ floats[i] = complexes[i] = NaN
+ else:
+ raise ValueError('Empty string encountered')
+ elif util.is_complex_object(val):
+ complexes[i] = val
+ seen.complex_ = True
+ elif is_decimal(val):
+ floats[i] = complexes[i] = val
+ seen.float_ = True
+ else:
+ try:
+ status = floatify(val, &fval, &maybe_int)
+
+ if fval in na_values:
+ seen.saw_null()
+ floats[i] = complexes[i] = NaN
+ else:
+ if fval != fval:
+ seen.null_ = True
+
+ floats[i] = fval
+
+ if maybe_int:
+ as_int = int(val)
+
+ if as_int in na_values:
+ seen.saw_null()
+ else:
+ seen.saw_int(as_int)
+
+ if not (seen.float_ or as_int in na_values):
+ if as_int < oINT64_MIN or as_int > oUINT64_MAX:
+ raise ValueError('Integer out of range.')
+
+ if as_int >= 0:
+ uints[i] = as_int
+ if as_int <= oINT64_MAX:
+ ints[i] = as_int
+
+ seen.float_ = seen.float_ or (seen.uint_ and seen.sint_)
+ else:
+ seen.float_ = True
+ except (TypeError, ValueError) as e:
+ if not seen.coerce_numeric:
+ raise type(e)(str(e) + ' at position {pos}'.format(pos=i))
+ elif "uint64" in str(e): # Exception from check functions.
+ raise
+ seen.saw_null()
+ floats[i] = NaN
+
+ if seen.check_uint64_conflict():
+ return values
+
+ if seen.complex_:
+ return complexes
+ elif seen.float_:
+ return floats
+ elif seen.int_:
+ if seen.uint_:
+ return uints
+ else:
+ return ints
+ elif seen.bool_:
+ return bools.view(np.bool_)
+ elif seen.uint_:
+ return uints
+ return ints
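+
+# A minimal usage sketch for maybe_convert_numeric (illustration only, not from
+# the upstream sources; assumes `np` is numpy and the module is importable as
+# `lib`). Strings are parsed, entries found in `na_values` become NaN, and the
+# narrowest suitable dtype is chosen:
+#
+#     arr = np.array(['1', '2.5', 'N/A'], dtype=object)
+#     lib.maybe_convert_numeric(arr, {'N/A'})
+#     # -> float64 array [1.0, 2.5, nan]
+#
+#     lib.maybe_convert_numeric(np.array(['1', '2'], dtype=object), set())
+#     # -> int64 array [1, 2]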
+
+
+def maybe_convert_objects(ndarray[object] objects, bint try_float=0,
+ bint safe=0, bint convert_datetime=0,
+ bint convert_timedelta=0):
+ """
+ Type inference function: convert an object array to a proper dtype
+ """
+ cdef:
+ Py_ssize_t i, n
+ ndarray[float64_t] floats
+ ndarray[complex128_t] complexes
+ ndarray[int64_t] ints
+ ndarray[uint64_t] uints
+ ndarray[uint8_t] bools
+ ndarray[int64_t] idatetimes
+ ndarray[int64_t] itimedeltas
+ Seen seen = Seen()
+ object val
+ float64_t fval, fnan
+
+ n = len(objects)
+
+ floats = np.empty(n, dtype='f8')
+ complexes = np.empty(n, dtype='c16')
+ ints = np.empty(n, dtype='i8')
+ uints = np.empty(n, dtype='u8')
+ bools = np.empty(n, dtype=np.uint8)
+
+ if convert_datetime:
+ datetimes = np.empty(n, dtype='M8[ns]')
+ idatetimes = datetimes.view(np.int64)
+
+ if convert_timedelta:
+ timedeltas = np.empty(n, dtype='m8[ns]')
+ itimedeltas = timedeltas.view(np.int64)
+
+ fnan = np.nan
+
+ for i in range(n):
+ val = objects[i]
+
+ if val is None:
+ seen.null_ = 1
+ floats[i] = complexes[i] = fnan
+ elif val is NaT:
+ if convert_datetime:
+ idatetimes[i] = NPY_NAT
+ seen.datetime_ = 1
+ if convert_timedelta:
+ itimedeltas[i] = NPY_NAT
+ seen.timedelta_ = 1
+ if not (convert_datetime or convert_timedelta):
+ seen.object_ = 1
+ elif util.is_bool_object(val):
+ seen.bool_ = 1
+ bools[i] = val
+ elif util.is_float_object(val):
+ floats[i] = complexes[i] = val
+ seen.float_ = 1
+ elif util.is_datetime64_object(val):
+ if convert_datetime:
+ idatetimes[i] = convert_to_tsobject(
+ val, None, None, 0, 0).value
+ seen.datetime_ = 1
+ else:
+ seen.object_ = 1
+ break
+ elif is_timedelta(val):
+ if convert_timedelta:
+ itimedeltas[i] = convert_to_timedelta64(val, 'ns')
+ seen.timedelta_ = 1
+ else:
+ seen.object_ = 1
+ break
+ elif util.is_integer_object(val):
+ seen.int_ = 1
+ floats[i] = <float64_t>val
+ complexes[i] = <double complex>val
+ if not seen.null_:
+ val = int(val)
+ seen.saw_int(val)
+
+ if ((seen.uint_ and seen.sint_) or
+ val > oUINT64_MAX or val < oINT64_MIN):
+ seen.object_ = 1
+ break
+
+ if seen.uint_:
+ uints[i] = val
+ elif seen.sint_:
+ ints[i] = val
+ else:
+ uints[i] = val
+ ints[i] = val
+
+ elif util.is_complex_object(val):
+ complexes[i] = val
+ seen.complex_ = 1
+ elif PyDateTime_Check(val) or util.is_datetime64_object(val):
+
+ # if we have a tz attached then return the objects
+ if convert_datetime:
+ if getattr(val, 'tzinfo', None) is not None:
+ seen.datetimetz_ = 1
+ break
+ else:
+ seen.datetime_ = 1
+ idatetimes[i] = convert_to_tsobject(
+ val, None, None, 0, 0).value
+ else:
+ seen.object_ = 1
+ break
+ elif try_float and not util.is_string_object(val):
+ # this will convert Decimal objects
+ try:
+ floats[i] = float(val)
+ complexes[i] = complex(val)
+ seen.float_ = 1
+ except Exception:
+ seen.object_ = 1
+ break
+ else:
+ seen.object_ = 1
+ break
+
+ # we try to coerce datetimes w/tz, but they must all have the same tz
+ if seen.datetimetz_:
+ if is_datetime_with_singletz_array(objects):
+ from pandas import DatetimeIndex
+ return DatetimeIndex(objects)
+ seen.object_ = 1
+
+ if not seen.object_:
+ if not safe:
+ if seen.null_:
+ if seen.is_float_or_complex:
+ if seen.complex_:
+ return complexes
+ elif seen.float_ or seen.int_:
+ return floats
+ else:
+ if not seen.bool_:
+ if seen.datetime_:
+ if not seen.numeric_:
+ return datetimes
+ elif seen.timedelta_:
+ if not seen.numeric_:
+ return timedeltas
+ else:
+ if seen.complex_:
+ return complexes
+ elif seen.float_:
+ return floats
+ elif seen.int_:
+ if seen.uint_:
+ return uints
+ else:
+ return ints
+ elif seen.is_bool:
+ return bools.view(np.bool_)
+
+ else:
+ # don't cast int to float, etc.
+ if seen.null_:
+ if seen.is_float_or_complex:
+ if seen.complex_:
+ if not seen.int_:
+ return complexes
+ elif seen.float_:
+ if not seen.int_:
+ return floats
+ else:
+ if not seen.bool_:
+ if seen.datetime_:
+ if not seen.numeric_:
+ return datetimes
+ elif seen.timedelta_:
+ if not seen.numeric_:
+ return timedeltas
+ else:
+ if seen.complex_:
+ if not seen.int_:
+ return complexes
+ elif seen.float_:
+ if not seen.int_:
+ return floats
+ elif seen.int_:
+ if seen.uint_:
+ return uints
+ else:
+ return ints
+ elif seen.is_bool:
+ return bools.view(np.bool_)
+
+ return objects
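+
+# A minimal sketch for maybe_convert_objects (illustration only, not from the
+# upstream sources; assumes `np` is numpy and the module is importable as
+# `lib`). Homogeneous object arrays are downcast; anything unrecognised keeps
+# the original object array:
+#
+#     lib.maybe_convert_objects(np.array([1, 2, 3], dtype=object))
+#     # -> int64 array [1, 2, 3]
+#
+#     lib.maybe_convert_objects(np.array([1, 2.5, None], dtype=object))
+#     # -> float64 array [1.0, 2.5, nan]
+#
+#     lib.maybe_convert_objects(np.array(['a', 1], dtype=object))
+#     # -> unchanged object array ['a', 1]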
+
+
+def map_infer_mask(ndarray arr, object f, ndarray[uint8_t] mask,
+ bint convert=1):
+ """
+ Substitute for np.vectorize with pandas-friendly dtype inference
+
+ Parameters
+ ----------
+ arr : ndarray
+ f : function
+ mask : ndarray[uint8_t]
+     Where the mask is True the original value is kept; `f` is only
+     applied to elements whose mask entry is False.
+ convert : bool, default True
+     Whether to run maybe_convert_objects on the result.
+
+ Returns
+ -------
+ mapped : ndarray
+ """
+ cdef:
+ Py_ssize_t i, n
+ ndarray[object] result
+ object val
+
+ n = len(arr)
+ result = np.empty(n, dtype=object)
+ for i in range(n):
+ if mask[i]:
+ val = arr[i]
+ else:
+ val = f(arr[i])
+
+ if cnp.PyArray_IsZeroDim(val):
+ # unbox 0-dim arrays, GH#690
+ # TODO: is there a faster way to unbox?
+ # item_from_zerodim?
+ val = val.item()
+
+ result[i] = val
+
+ if convert:
+ return maybe_convert_objects(result,
+ try_float=0,
+ convert_datetime=0,
+ convert_timedelta=0)
+
+ return result
+
+
+def map_infer(ndarray arr, object f, bint convert=1):
+ """
+ Substitute for np.vectorize with pandas-friendly dtype inference
+
+ Parameters
+ ----------
+ arr : ndarray
+ f : function
+ convert : bool, default True
+     Whether to run maybe_convert_objects on the result.
+
+ Returns
+ -------
+ mapped : ndarray
+ """
+ cdef:
+ Py_ssize_t i, n
+ ndarray[object] result
+ object val
+
+ n = len(arr)
+ result = np.empty(n, dtype=object)
+ for i in range(n):
+ val = f(arr[i])
+
+ if cnp.PyArray_IsZeroDim(val):
+ # unbox 0-dim arrays, GH#690
+ # TODO: is there a faster way to unbox?
+ # item_from_zerodim?
+ val = val.item()
+
+ result[i] = val
+
+ if convert:
+ return maybe_convert_objects(result,
+ try_float=0,
+ convert_datetime=0,
+ convert_timedelta=0)
+
+ return result
+
+
+def to_object_array(rows: object, int min_width=0):
+ """
+ Convert a list of lists into an object array.
+
+ Parameters
+ ----------
+ rows : list of lists
+     The rows to be converted into a 2-d object array of shape (N, K).
+ min_width : int
+     The minimum width (K) of the object array. If a list
+     in `rows` contains fewer than `min_width` elements,
+     the remaining elements in the corresponding row
+     are left as `None`.
+
+ Returns
+ -------
+ obj_array : numpy array of the object dtype
+ """
+ cdef:
+ Py_ssize_t i, j, n, k, tmp
+ ndarray[object, ndim=2] result
+ list input_rows
+ list row
+
+ input_rows = <list>rows
+ n = len(input_rows)
+
+ k = min_width
+ for i in range(n):
+ tmp = len(input_rows[i])
+ if tmp > k:
+ k = tmp
+
+ result = np.empty((n, k), dtype=object)
+
+ for i in range(n):
+ row = list(input_rows[i])
+
+ for j in range(len(row)):
+ result[i, j] = row[j]
+
+ return result
+
+
+def tuples_to_object_array(ndarray[object] tuples):
+ cdef:
+ Py_ssize_t i, j, n, k, tmp
+ ndarray[object, ndim=2] result
+ tuple tup
+
+ n = len(tuples)
+ k = len(tuples[0])
+ result = np.empty((n, k), dtype=object)
+ for i in range(n):
+ tup = tuples[i]
+ for j in range(k):
+ result[i, j] = tup[j]
+
+ return result
+
+
+def to_object_array_tuples(rows: list):
+ cdef:
+ Py_ssize_t i, j, n, k, tmp
+ ndarray[object, ndim=2] result
+ tuple row
+
+ n = len(rows)
+
+ k = 0
+ for i in range(n):
+ tmp = 1 if checknull(rows[i]) else len(rows[i])
+ if tmp > k:
+ k = tmp
+
+ result = np.empty((n, k), dtype=object)
+
+ try:
+ for i in range(n):
+ row = rows[i]
+ for j in range(len(row)):
+ result[i, j] = row[j]
+ except Exception:
+ # upcast any subclasses to tuple
+ for i in range(n):
+ row = (rows[i],) if checknull(rows[i]) else tuple(rows[i])
+ for j in range(len(row)):
+ result[i, j] = row[j]
+
+ return result
+
+
+def fast_multiget(dict mapping, ndarray keys, default=np.nan):
+ cdef:
+ Py_ssize_t i, n = len(keys)
+ object val
+ ndarray[object] output = np.empty(n, dtype='O')
+
+ if n == 0:
+ # kludge, for Series
+ return np.empty(0, dtype='f8')
+
+ keys = getattr(keys, 'values', keys)
+
+ for i in range(n):
+ val = keys[i]
+ if val in mapping:
+ output[i] = mapping[val]
+ else:
+ output[i] = default
+
+ return maybe_convert_objects(output)
diff --git a/contrib/python/pandas/py2/pandas/_libs/missing.pxd b/contrib/python/pandas/py2/pandas/_libs/missing.pxd
new file mode 100644
index 00000000000..d0dd306680a
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/_libs/missing.pxd
@@ -0,0 +1,11 @@
+# -*- coding: utf-8 -*-
+
+from numpy cimport ndarray, uint8_t
+
+cpdef bint checknull(object val)
+cpdef bint checknull_old(object val)
+cpdef ndarray[uint8_t] isnaobj(ndarray arr)
+
+cdef bint is_null_datetime64(v)
+cdef bint is_null_timedelta64(v)
+cdef bint is_null_period(v)
diff --git a/contrib/python/pandas/py2/pandas/_libs/missing.pyx b/contrib/python/pandas/py2/pandas/_libs/missing.pyx
new file mode 100644
index 00000000000..229edbac499
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/_libs/missing.pyx
@@ -0,0 +1,284 @@
+# -*- coding: utf-8 -*-
+
+import cython
+from cython import Py_ssize_t
+
+import numpy as np
+cimport numpy as cnp
+from numpy cimport ndarray, int64_t, uint8_t, float64_t
+cnp.import_array()
+
+cimport pandas._libs.util as util
+
+from pandas._libs.tslibs.np_datetime cimport (
+ get_timedelta64_value, get_datetime64_value)
+from pandas._libs.tslibs.nattype cimport (
+ checknull_with_nat, c_NaT as NaT, is_null_datetimelike)
+
+
+cdef float64_t INF = <float64_t>np.inf
+cdef float64_t NEGINF = -INF
+
+cdef int64_t NPY_NAT = util.get_nat()
+
+
+cpdef bint checknull(object val):
+ """
+ Return a boolean describing whether the input is NA-like, defined here
+ as any of:
+ - None
+ - nan
+ - NaT
+ - np.datetime64 representation of NaT
+ - np.timedelta64 representation of NaT
+
+ Parameters
+ ----------
+ val : object
+
+ Returns
+ -------
+ result : bool
+
+ Notes
+ -----
+ The difference between `checknull` and `checknull_old` is that `checknull`
+ does *not* consider INF or NEGINF to be NA.
+ """
+ return is_null_datetimelike(val, inat_is_null=False)
+
+
+cpdef bint checknull_old(object val):
+ """
+ Return a boolean describing whether the input is NA-like, defined here
+ as any of:
+ - None
+ - nan
+ - INF
+ - NEGINF
+ - NaT
+ - np.datetime64 representation of NaT
+ - np.timedelta64 representation of NaT
+
+ Parameters
+ ----------
+ val : object
+
+ Returns
+ -------
+ result : bool
+
+ Notes
+ -----
+ The difference between `checknull` and `checknull_old` is that `checknull`
+ does *not* consider INF or NEGINF to be NA.
+ """
+ if checknull(val):
+ return True
+ elif util.is_float_object(val) or util.is_complex_object(val):
+ return val == INF or val == NEGINF
+ return False
+
+
+cdef inline bint _check_none_nan_inf_neginf(object val):
+ try:
+ return val is None or (isinstance(val, float) and
+ (val != val or val == INF or val == NEGINF))
+ except ValueError:
+ return False
+
+
+cpdef ndarray[uint8_t] isnaobj(ndarray arr):
+ """
+ Return boolean mask denoting which elements of a 1-D array are na-like,
+ according to the criteria defined in `checknull`:
+ - None
+ - nan
+ - NaT
+ - np.datetime64 representation of NaT
+ - np.timedelta64 representation of NaT
+
+ Parameters
+ ----------
+ arr : ndarray
+
+ Returns
+ -------
+ result : ndarray (dtype=np.bool_)
+ """
+ cdef:
+ Py_ssize_t i, n
+ object val
+ ndarray[uint8_t] result
+
+ assert arr.ndim == 1, "'arr' must be 1-D."
+
+ n = len(arr)
+ result = np.empty(n, dtype=np.uint8)
+ for i in range(n):
+ val = arr[i]
+ result[i] = checknull(val)
+ return result.view(np.bool_)
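+
+# A few illustrative calls (sketch only, not from the upstream sources; assumes
+# `np` is numpy and this module is importable as `missing`):
+#
+#     missing.checknull(np.nan)        # True
+#     missing.checknull(np.inf)        # False
+#     missing.checknull_old(np.inf)    # True (the *_old variants treat INF as NA)
+#     missing.isnaobj(np.array([1.0, None, np.nan], dtype=object))
+#     # -> array([False,  True,  True])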
+
+
+def isnaobj_old(ndarray arr):
+ """
+ Return boolean mask denoting which elements of a 1-D array are na-like,
+ defined as being any of:
+ - None
+ - nan
+ - INF
+ - NEGINF
+ - NaT
+
+ Parameters
+ ----------
+ arr : ndarray
+
+ Returns
+ -------
+ result : ndarray (dtype=np.bool_)
+ """
+ cdef:
+ Py_ssize_t i, n
+ object val
+ ndarray[uint8_t] result
+
+ assert arr.ndim == 1, "'arr' must be 1-D."
+
+ n = len(arr)
+ result = np.zeros(n, dtype=np.uint8)
+ for i in range(n):
+ val = arr[i]
+ result[i] = val is NaT or _check_none_nan_inf_neginf(val)
+ return result.view(np.bool_)
+
+
+def isnaobj2d(ndarray arr):
+ """
+ Return boolean mask denoting which elements of a 2-D array are na-like,
+ according to the criteria defined in `checknull`:
+ - None
+ - nan
+ - NaT
+ - np.datetime64 representation of NaT
+ - np.timedelta64 representation of NaT
+
+ Parameters
+ ----------
+ arr : ndarray
+
+ Returns
+ -------
+ result : ndarray (dtype=np.bool_)
+
+ Notes
+ -----
+ The difference between `isnaobj2d` and `isnaobj2d_old` is that `isnaobj2d`
+ does *not* consider INF or NEGINF to be NA.
+ """
+ cdef:
+ Py_ssize_t i, j, n, m
+ object val
+ ndarray[uint8_t, ndim=2] result
+
+ assert arr.ndim == 2, "'arr' must be 2-D."
+
+ n, m = (<object>arr).shape
+ result = np.zeros((n, m), dtype=np.uint8)
+ for i in range(n):
+ for j in range(m):
+ val = arr[i, j]
+ if checknull(val):
+ result[i, j] = 1
+ return result.view(np.bool_)
+
+
+def isnaobj2d_old(ndarray arr):
+ """
+ Return boolean mask denoting which elements of a 2-D array are na-like,
+ according to the criteria defined in `checknull_old`:
+ - None
+ - nan
+ - INF
+ - NEGINF
+ - NaT
+ - np.datetime64 representation of NaT
+ - np.timedelta64 representation of NaT
+
+ Parameters
+ ----------
+ arr : ndarray
+
+ Returns
+ -------
+ result : ndarray (dtype=np.bool_)
+
+ Notes
+ -----
+ The difference between `isnaobj2d` and `isnaobj2d_old` is that `isnaobj2d`
+ does *not* consider INF or NEGINF to be NA.
+ """
+ cdef:
+ Py_ssize_t i, j, n, m
+ object val
+ ndarray[uint8_t, ndim=2] result
+
+ assert arr.ndim == 2, "'arr' must be 2-D."
+
+ n, m = (<object>arr).shape
+ result = np.zeros((n, m), dtype=np.uint8)
+ for i in range(n):
+ for j in range(m):
+ val = arr[i, j]
+ if checknull_old(val):
+ result[i, j] = 1
+ return result.view(np.bool_)
+
+
+def isposinf_scalar(val: object) -> bool:
+ if util.is_float_object(val) and val == INF:
+ return True
+ else:
+ return False
+
+
+def isneginf_scalar(val: object) -> bool:
+ if util.is_float_object(val) and val == NEGINF:
+ return True
+ else:
+ return False
+
+
+cdef inline bint is_null_datetime64(v):
+ # determine if we have a null for a datetime (or integer versions),
+ # excluding np.timedelta64('nat')
+ if checknull_with_nat(v):
+ return True
+ elif util.is_datetime64_object(v):
+ return get_datetime64_value(v) == NPY_NAT
+ return False
+
+
+cdef inline bint is_null_timedelta64(v):
+ # determine if we have a null for a timedelta (or integer versions),
+ # excluding np.datetime64('nat')
+ if checknull_with_nat(v):
+ return True
+ elif util.is_timedelta64_object(v):
+ return get_timedelta64_value(v) == NPY_NAT
+ return False
+
+
+cdef inline bint is_null_period(v):
+ # determine if we have a null for a Period (or integer versions),
+ # excluding np.datetime64('nat') and np.timedelta64('nat')
+ return checknull_with_nat(v)
diff --git a/contrib/python/pandas/py2/pandas/_libs/ops.pyx b/contrib/python/pandas/py2/pandas/_libs/ops.pyx
new file mode 100644
index 00000000000..fb1d2e37995
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/_libs/ops.pyx
@@ -0,0 +1,295 @@
+# -*- coding: utf-8 -*-
+import operator
+
+from cpython cimport (PyObject_RichCompareBool,
+ Py_EQ, Py_NE, Py_LT, Py_LE, Py_GT, Py_GE)
+
+import cython
+from cython import Py_ssize_t
+
+import numpy as np
+from numpy cimport ndarray, uint8_t, import_array
+import_array()
+
+
+from pandas._libs.util cimport UINT8_MAX, is_nan
+
+from pandas._libs.missing cimport checknull
+
+
+def scalar_compare(object[:] values, object val, object op):
+ """
+ Compare each element of `values` array with the scalar `val`, with
+ the comparison operation described by `op`.
+
+ Parameters
+ ----------
+ values : ndarray[object]
+ val : object
+ op : {operator.eq, operator.ne,
+ operator.le, operator.lt,
+ operator.ge, operator.gt}
+
+ Returns
+ -------
+ result : ndarray[bool]
+ """
+ cdef:
+ Py_ssize_t i, n = len(values)
+ ndarray[uint8_t, cast=True] result
+ bint isnull_val
+ int flag
+ object x
+
+ if op is operator.lt:
+ flag = Py_LT
+ elif op is operator.le:
+ flag = Py_LE
+ elif op is operator.gt:
+ flag = Py_GT
+ elif op is operator.ge:
+ flag = Py_GE
+ elif op is operator.eq:
+ flag = Py_EQ
+ elif op is operator.ne:
+ flag = Py_NE
+ else:
+ raise ValueError('Unrecognized operator')
+
+ result = np.empty(n, dtype=bool).view(np.uint8)
+ isnull_val = checknull(val)
+
+ if flag == Py_NE:
+ for i in range(n):
+ x = values[i]
+ if checknull(x):
+ result[i] = True
+ elif isnull_val:
+ result[i] = True
+ else:
+ try:
+ result[i] = PyObject_RichCompareBool(x, val, flag)
+ except TypeError:
+ result[i] = True
+ elif flag == Py_EQ:
+ for i in range(n):
+ x = values[i]
+ if checknull(x):
+ result[i] = False
+ elif isnull_val:
+ result[i] = False
+ else:
+ try:
+ result[i] = PyObject_RichCompareBool(x, val, flag)
+ except TypeError:
+ result[i] = False
+
+ else:
+ for i in range(n):
+ x = values[i]
+ if checknull(x):
+ result[i] = False
+ elif isnull_val:
+ result[i] = False
+ else:
+ result[i] = PyObject_RichCompareBool(x, val, flag)
+
+ return result.view(bool)
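+
+# A minimal sketch for scalar_compare (illustration only, not from the upstream
+# sources; assumes `np` is numpy, `operator` is the stdlib module, and this
+# module is importable as `ops`). Nulls compare as False for every operator
+# except `!=`, where they compare as True:
+#
+#     arr = np.array(['a', None, 'c'], dtype=object)
+#     ops.scalar_compare(arr, 'b', operator.gt)   # -> array([False, False,  True])
+#     ops.scalar_compare(arr, 'b', operator.ne)   # -> array([ True,  True,  True])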
+
+
+def vec_compare(object[:] left, object[:] right, object op):
+ """
+ Compare the elements of `left` with the elements of `right` pointwise,
+ with the comparison operation described by `op`.
+
+ Parameters
+ ----------
+ left : ndarray[object]
+ right : ndarray[object]
+ op : {operator.eq, operator.ne,
+ operator.le, operator.lt,
+ operator.ge, operator.gt}
+
+ Returns
+ -------
+ result : ndarray[bool]
+ """
+ cdef:
+ Py_ssize_t i, n = len(left)
+ ndarray[uint8_t, cast=True] result
+ int flag
+
+ if n != len(right):
+ raise ValueError('Arrays were different lengths: {n} vs {nright}'
+ .format(n=n, nright=len(right)))
+
+ if op is operator.lt:
+ flag = Py_LT
+ elif op is operator.le:
+ flag = Py_LE
+ elif op is operator.gt:
+ flag = Py_GT
+ elif op is operator.ge:
+ flag = Py_GE
+ elif op is operator.eq:
+ flag = Py_EQ
+ elif op is operator.ne:
+ flag = Py_NE
+ else:
+ raise ValueError('Unrecognized operator')
+
+ result = np.empty(n, dtype=bool).view(np.uint8)
+
+ if flag == Py_NE:
+ for i in range(n):
+ x = left[i]
+ y = right[i]
+
+ if checknull(x) or checknull(y):
+ result[i] = True
+ else:
+ result[i] = PyObject_RichCompareBool(x, y, flag)
+ else:
+ for i in range(n):
+ x = left[i]
+ y = right[i]
+
+ if checknull(x) or checknull(y):
+ result[i] = False
+ else:
+ result[i] = PyObject_RichCompareBool(x, y, flag)
+
+ return result.view(bool)
+
+
+def scalar_binop(object[:] values, object val, object op):
+ """
+ Apply the given binary operator `op` between each element of the array
+ `values` and the scalar `val`.
+
+ Parameters
+ ----------
+ values : ndarray[object]
+ val : object
+ op : binary operator
+
+ Returns
+ -------
+ result : ndarray[object]
+ """
+ cdef:
+ Py_ssize_t i, n = len(values)
+ object[:] result
+ object x
+
+ result = np.empty(n, dtype=object)
+ if val is None or is_nan(val):
+ result[:] = val
+ return result.base # `.base` to access underlying np.ndarray
+
+ for i in range(n):
+ x = values[i]
+ if x is None or is_nan(x):
+ result[i] = x
+ else:
+ result[i] = op(x, val)
+
+ return maybe_convert_bool(result.base)
+
+
+def vec_binop(object[:] left, object[:] right, object op):
+ """
+ Apply the given binary operator `op` pointwise to the elements of
+ arrays `left` and `right`.
+
+ Parameters
+ ----------
+ left : ndarray[object]
+ right : ndarray[object]
+ op : binary operator
+
+ Returns
+ -------
+ result : ndarray[object]
+ """
+ cdef:
+ Py_ssize_t i, n = len(left)
+ object[:] result
+
+ if n != len(right):
+ raise ValueError('Arrays were different lengths: {n} vs {nright}'
+ .format(n=n, nright=len(right)))
+
+ result = np.empty(n, dtype=object)
+
+ for i in range(n):
+ x = left[i]
+ y = right[i]
+ try:
+ result[i] = op(x, y)
+ except TypeError:
+ if x is None or is_nan(x):
+ result[i] = x
+ elif y is None or is_nan(y):
+ result[i] = y
+ else:
+ raise
+
+ return maybe_convert_bool(result.base) # `.base` to access np.ndarray
+
+
+def maybe_convert_bool(ndarray[object] arr,
+ true_values=None, false_values=None):
+ cdef:
+ Py_ssize_t i, n
+ ndarray[uint8_t] result
+ object val
+ set true_vals, false_vals
+ int na_count = 0
+
+ n = len(arr)
+ result = np.empty(n, dtype=np.uint8)
+
+ # the defaults
+ true_vals = {'True', 'TRUE', 'true'}
+ false_vals = {'False', 'FALSE', 'false'}
+
+ if true_values is not None:
+ true_vals = true_vals | set(true_values)
+
+ if false_values is not None:
+ false_vals = false_vals | set(false_values)
+
+ for i in range(n):
+ val = arr[i]
+
+ if isinstance(val, bool):
+ if val is True:
+ result[i] = 1
+ else:
+ result[i] = 0
+ elif val in true_vals:
+ result[i] = 1
+ elif val in false_vals:
+ result[i] = 0
+ elif isinstance(val, float):
+ result[i] = UINT8_MAX
+ na_count += 1
+ else:
+ return arr
+
+ if na_count > 0:
+ mask = result == UINT8_MAX
+ arr = result.view(np.bool_).astype(object)
+ np.putmask(arr, mask, np.nan)
+ return arr
+ else:
+ return result.view(np.bool_)
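+
+# A minimal sketch for maybe_convert_bool (illustration only, not from the
+# upstream sources; assumes `np` is numpy and this module is importable as
+# `ops`). Recognised truthy/falsey strings are mapped to booleans; anything
+# else leaves the array untouched:
+#
+#     ops.maybe_convert_bool(np.array(['True', 'false', 'True'], dtype=object))
+#     # -> array([ True, False,  True])
+#
+#     ops.maybe_convert_bool(np.array(['yes', 'no'], dtype=object),
+#                            true_values=['yes'], false_values=['no'])
+#     # -> array([ True, False])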
diff --git a/contrib/python/pandas/py2/pandas/_libs/parsers.pyx b/contrib/python/pandas/py2/pandas/_libs/parsers.pyx
new file mode 100644
index 00000000000..6cb6ed749f8
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/_libs/parsers.pyx
@@ -0,0 +1,2324 @@
+# Copyright (c) 2012, Lambda Foundry, Inc.
+# See LICENSE for the license
+import os
+import sys
+import time
+import warnings
+
+from csv import QUOTE_MINIMAL, QUOTE_NONNUMERIC, QUOTE_NONE
+from errno import ENOENT
+
+from libc.stdlib cimport free
+from libc.string cimport strncpy, strlen, strcasecmp
+
+import cython
+from cython import Py_ssize_t
+
+from cpython cimport (PyObject, PyBytes_FromString,
+ PyBytes_AsString,
+ PyUnicode_AsUTF8String,
+ PyErr_Occurred, PyErr_Fetch)
+from cpython.ref cimport Py_XDECREF
+
+
+cdef extern from "Python.h":
+ object PyUnicode_FromString(char *v)
+
+ object PyUnicode_Decode(char *v, Py_ssize_t size, char *encoding,
+ char *errors)
+
+
+import numpy as np
+cimport numpy as cnp
+from numpy cimport ndarray, uint8_t, uint64_t, int64_t, float64_t
+cnp.import_array()
+
+from pandas._libs.util cimport UINT64_MAX, INT64_MAX, INT64_MIN
+import pandas._libs.lib as lib
+
+from pandas._libs.khash cimport (
+ khiter_t,
+ kh_str_t, kh_init_str, kh_put_str, kh_exist_str,
+ kh_get_str, kh_destroy_str,
+ kh_float64_t, kh_get_float64, kh_destroy_float64,
+ kh_put_float64, kh_init_float64,
+ kh_strbox_t, kh_put_strbox, kh_get_strbox, kh_init_strbox,
+ kh_destroy_strbox)
+
+import pandas.compat as compat
+from pandas.core.dtypes.common import (
+ is_categorical_dtype,
+ is_integer_dtype, is_float_dtype,
+ is_bool_dtype, is_object_dtype,
+ is_datetime64_dtype,
+ pandas_dtype, is_extension_array_dtype)
+from pandas.core.arrays import Categorical
+from pandas.core.dtypes.concat import union_categoricals
+import pandas.io.common as icom
+
+from pandas.errors import (ParserError, DtypeWarning,
+ EmptyDataError, ParserWarning)
+
+# Import CParserError as alias of ParserError for backwards compatibility.
+# Ultimately, we want to remove this import. See gh-12665 and gh-14479.
+CParserError = ParserError
+
+
+cdef bint PY3 = (sys.version_info[0] >= 3)
+
+cdef float64_t INF = <float64_t>np.inf
+cdef float64_t NEGINF = -INF
+
+
+cdef extern from "errno.h":
+ int errno
+
+cdef extern from "headers/portable.h":
+ # I *think* this is here so that strcasecmp is defined on Windows
+ # so we don't get
+ # `parsers.obj : error LNK2001: unresolved external symbol strcasecmp`
+ # in Appveyor.
+ # In a sane world, the `from libc.string cimport` above would fail
+ # loudly.
+ pass
+
+try:
+ basestring
+except NameError:
+ basestring = str
+
+
+cdef extern from "parser/tokenizer.h":
+
+ ctypedef enum ParserState:
+ START_RECORD
+ START_FIELD
+ ESCAPED_CHAR
+ IN_FIELD
+ IN_QUOTED_FIELD
+ ESCAPE_IN_QUOTED_FIELD
+ QUOTE_IN_QUOTED_FIELD
+ EAT_CRNL
+ EAT_CRNL_NOP
+ EAT_WHITESPACE
+ EAT_COMMENT
+ EAT_LINE_COMMENT
+ WHITESPACE_LINE
+ SKIP_LINE
+ FINISHED
+
+ enum: ERROR_OVERFLOW
+
+ ctypedef void* (*io_callback)(void *src, size_t nbytes, size_t *bytes_read,
+ int *status)
+ ctypedef int (*io_cleanup)(void *src)
+
+ ctypedef struct parser_t:
+ void *source
+ io_callback cb_io
+ io_cleanup cb_cleanup
+
+ int64_t chunksize # Number of bytes to prepare for each chunk
+ char *data # pointer to data to be processed
+ int64_t datalen # amount of data available
+ int64_t datapos
+
+ # where to write out tokenized data
+ char *stream
+ int64_t stream_len
+ int64_t stream_cap
+
+ # Store words in (potentially ragged) matrix for now, hmm
+ char **words
+ int64_t *word_starts # where we are in the stream
+ int64_t words_len
+ int64_t words_cap
+ int64_t max_words_cap # maximum word cap encountered
+
+ char *pword_start # pointer to stream start of current field
+ int64_t word_start # position start of current field
+
+ int64_t *line_start # position in words for start of line
+ int64_t *line_fields # Number of fields in each line
+ int64_t lines # Number of lines observed
+ int64_t file_lines # Number of lines observed (with bad/skipped)
+ int64_t lines_cap # Vector capacity
+
+ # Tokenizing stuff
+ ParserState state
+ int doublequote # is " represented by ""? */
+ char delimiter # field separator */
+ int delim_whitespace # consume tabs / spaces instead
+ char quotechar # quote character */
+ char escapechar # escape character */
+ char lineterminator
+ int skipinitialspace # ignore spaces following delimiter? */
+ int quoting # style of quoting to write */
+
+ # hmm =/
+ # int numeric_field
+
+ char commentchar
+ int allow_embedded_newline
+ int strict # raise exception on bad CSV */
+
+ int usecols
+
+ int expected_fields
+ int error_bad_lines
+ int warn_bad_lines
+
+ # floating point options
+ char decimal
+ char sci
+
+ # thousands separator (comma, period)
+ char thousands
+
+ int header # Boolean: 1: has header, 0: no header
+ int64_t header_start # header row start
+ int64_t header_end # header row end
+
+ void *skipset
+ PyObject *skipfunc
+ int64_t skip_first_N_rows
+ int64_t skipfooter
+ # pick one, depending on whether the converter requires GIL
+ float64_t (*double_converter_nogil)(const char *, char **,
+ char, char, char, int) nogil
+ float64_t (*double_converter_withgil)(const char *, char **,
+ char, char, char, int)
+
+ # error handling
+ char *warn_msg
+ char *error_msg
+
+ int64_t skip_empty_lines
+
+ ctypedef struct coliter_t:
+ char **words
+ int64_t *line_start
+ int64_t col
+
+ ctypedef struct uint_state:
+ int seen_sint
+ int seen_uint
+ int seen_null
+
+ void uint_state_init(uint_state *self)
+ int uint64_conflict(uint_state *self)
+
+ void coliter_setup(coliter_t *it, parser_t *parser,
+ int64_t i, int64_t start) nogil
+ void COLITER_NEXT(coliter_t, const char *) nogil
+
+ parser_t* parser_new()
+
+ int parser_init(parser_t *self) nogil
+ void parser_free(parser_t *self) nogil
+ void parser_del(parser_t *self) nogil
+ int parser_add_skiprow(parser_t *self, int64_t row)
+
+ int parser_set_skipfirstnrows(parser_t *self, int64_t nrows)
+
+ void parser_set_default_options(parser_t *self)
+
+ int parser_consume_rows(parser_t *self, size_t nrows)
+
+ int parser_trim_buffers(parser_t *self)
+
+ int tokenize_all_rows(parser_t *self) nogil
+ int tokenize_nrows(parser_t *self, size_t nrows) nogil
+
+ int64_t str_to_int64(char *p_item, int64_t int_min,
+ int64_t int_max, int *error, char tsep) nogil
+ uint64_t str_to_uint64(uint_state *state, char *p_item, int64_t int_max,
+ uint64_t uint_max, int *error, char tsep) nogil
+
+ float64_t xstrtod(const char *p, char **q, char decimal, char sci,
+ char tsep, int skip_trailing) nogil
+ float64_t precise_xstrtod(const char *p, char **q, char decimal, char sci,
+ char tsep, int skip_trailing) nogil
+ float64_t round_trip(const char *p, char **q, char decimal, char sci,
+ char tsep, int skip_trailing) nogil
+
+ int to_boolean(const char *item, uint8_t *val) nogil
+
+
+cdef extern from "parser/io.h":
+ void *new_mmap(char *fname)
+ int del_mmap(void *src)
+ void* buffer_mmap_bytes(void *source, size_t nbytes,
+ size_t *bytes_read, int *status)
+
+ void *new_file_source(char *fname, size_t buffer_size)
+
+ void *new_rd_source(object obj)
+
+ int del_file_source(void *src)
+ int del_rd_source(void *src)
+
+ void* buffer_file_bytes(void *source, size_t nbytes,
+ size_t *bytes_read, int *status)
+
+ void* buffer_rd_bytes(void *source, size_t nbytes,
+ size_t *bytes_read, int *status)
+
+
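+# Number of bytes requested from the underlying source per tokenize call;
+# stored below as parser.chunksize and also used as the read-buffer size
+# passed to new_file_source() in _setup_parser_source().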
+DEFAULT_CHUNKSIZE = 256 * 1024
+
+
+cdef class TextReader:
+ """
+
+ # source: StringIO or file object
+
+ """
+
+ cdef:
+ parser_t *parser
+ object file_handle, na_fvalues
+ object true_values, false_values
+ object handle
+ bint na_filter, keep_default_na, verbose, has_usecols, has_mi_columns
+ int64_t parser_start
+ list clocks
+ char *c_encoding
+ kh_str_t *false_set
+ kh_str_t *true_set
+
+ cdef public:
+ int64_t leading_cols, table_width, skipfooter, buffer_lines
+ object allow_leading_cols
+ object delimiter, converters, delim_whitespace
+ object na_values
+ object memory_map
+ object header, orig_header, names, header_start, header_end
+ object index_col
+ object low_memory
+ object skiprows
+ object dtype
+ object encoding
+ object compression
+ object mangle_dupe_cols
+ object tupleize_cols
+ object usecols
+ list dtype_cast_order
+ set unnamed_cols
+ set noconvert
+
+ def __cinit__(self, source,
+ delimiter=b',',
+
+ header=0,
+ header_start=0,
+ header_end=0,
+ index_col=None,
+ names=None,
+
+ memory_map=False,
+ tokenize_chunksize=DEFAULT_CHUNKSIZE,
+ delim_whitespace=False,
+
+ compression=None,
+
+ converters=None,
+
+ skipinitialspace=False,
+ escapechar=None,
+ doublequote=True,
+ quotechar=b'"',
+ quoting=0,
+ lineterminator=None,
+
+ encoding=None,
+
+ comment=None,
+ decimal=b'.',
+ thousands=None,
+
+ dtype=None,
+ usecols=None,
+ error_bad_lines=True,
+ warn_bad_lines=True,
+
+ na_filter=True,
+ na_values=None,
+ na_fvalues=None,
+ keep_default_na=True,
+
+ true_values=None,
+ false_values=None,
+ allow_leading_cols=True,
+ low_memory=False,
+ skiprows=None,
+ skipfooter=0,
+ verbose=False,
+ mangle_dupe_cols=True,
+ tupleize_cols=False,
+ float_precision=None,
+ skip_blank_lines=True):
+
+ # set encoding for native Python and C library
+ if encoding is not None:
+ if not isinstance(encoding, bytes):
+ encoding = encoding.encode('utf-8')
+ encoding = encoding.lower()
+ self.c_encoding = <char*>encoding
+ else:
+ self.c_encoding = NULL
+
+ self.encoding = encoding
+
+ self.parser = parser_new()
+ self.parser.chunksize = tokenize_chunksize
+
+ self.mangle_dupe_cols = mangle_dupe_cols
+ self.tupleize_cols = tupleize_cols
+
+ # For timekeeping
+ self.clocks = []
+
+ self.compression = compression
+ self.memory_map = memory_map
+
+ self.parser.usecols = (usecols is not None)
+
+ self._setup_parser_source(source)
+ parser_set_default_options(self.parser)
+
+ parser_init(self.parser)
+
+ if delim_whitespace:
+ self.parser.delim_whitespace = delim_whitespace
+ else:
+ if len(delimiter) > 1:
+ raise ValueError('only length-1 separators supported right now')
+ self.parser.delimiter = ord(delimiter)
+
+ # ----------------------------------------
+ # parser options
+
+ self.parser.doublequote = doublequote
+ self.parser.skipinitialspace = skipinitialspace
+ self.parser.skip_empty_lines = skip_blank_lines
+
+ if lineterminator is not None:
+ if len(lineterminator) != 1:
+ raise ValueError('Only length-1 line terminators supported')
+ self.parser.lineterminator = ord(lineterminator)
+
+ if len(decimal) != 1:
+ raise ValueError('Only length-1 decimal markers supported')
+ self.parser.decimal = ord(decimal)
+
+ if thousands is not None:
+ if len(thousands) != 1:
+ raise ValueError('Only length-1 thousands markers supported')
+ self.parser.thousands = ord(thousands)
+
+ if escapechar is not None:
+ if len(escapechar) != 1:
+ raise ValueError('Only length-1 escapes supported')
+ self.parser.escapechar = ord(escapechar)
+
+ self._set_quoting(quotechar, quoting)
+
+ dtype_order = ['int64', 'float64', 'bool', 'object']
+ if quoting == QUOTE_NONNUMERIC:
+ # consistent with csv module semantics, cast all to float
+ dtype_order = dtype_order[1:]
+ self.dtype_cast_order = [np.dtype(x) for x in dtype_order]
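+ # Inference note: _convert_tokens() below tries each entry of
+ # dtype_cast_order in turn and falls back to object if none fit.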
+
+ if comment is not None:
+ if len(comment) > 1:
+ raise ValueError('Only length-1 comment characters supported')
+ self.parser.commentchar = ord(comment)
+
+ # error handling of bad lines
+ self.parser.error_bad_lines = int(error_bad_lines)
+ self.parser.warn_bad_lines = int(warn_bad_lines)
+
+ self.skiprows = skiprows
+ if skiprows is not None:
+ self._make_skiprow_set()
+
+ self.skipfooter = skipfooter
+
+ # suboptimal
+ if usecols is not None:
+ self.has_usecols = 1
+ # GH-20558, validate usecols at higher level and only pass clean
+ # usecols into TextReader.
+ self.usecols = usecols
+
+ # XXX
+ if skipfooter > 0:
+ self.parser.error_bad_lines = 0
+ self.parser.warn_bad_lines = 0
+
+ self.delimiter = delimiter
+ self.delim_whitespace = delim_whitespace
+
+ self.na_values = na_values
+ if na_fvalues is None:
+ na_fvalues = set()
+ self.na_fvalues = na_fvalues
+
+ self.true_values = _maybe_encode(true_values) + _true_values
+ self.false_values = _maybe_encode(false_values) + _false_values
+
+ self.true_set = kset_from_list(self.true_values)
+ self.false_set = kset_from_list(self.false_values)
+
+ self.keep_default_na = keep_default_na
+ self.converters = converters
+ self.na_filter = na_filter
+
+ self.verbose = verbose
+ self.low_memory = low_memory
+ self.parser.double_converter_nogil = xstrtod
+ self.parser.double_converter_withgil = NULL
+ if float_precision == 'high':
+ self.parser.double_converter_nogil = precise_xstrtod
+ self.parser.double_converter_withgil = NULL
+ elif float_precision == 'round_trip': # avoid gh-15140
+ self.parser.double_converter_nogil = NULL
+ self.parser.double_converter_withgil = round_trip
+
+ if isinstance(dtype, dict):
+ dtype = {k: pandas_dtype(dtype[k])
+ for k in dtype}
+ elif dtype is not None:
+ dtype = pandas_dtype(dtype)
+
+ self.dtype = dtype
+
+ # XXX
+ self.noconvert = set()
+
+ self.index_col = index_col
+
+ # ----------------------------------------
+ # header stuff
+
+ self.allow_leading_cols = allow_leading_cols
+ self.leading_cols = 0
+
+ # TODO: no header vs. header is not the first row
+ self.has_mi_columns = 0
+ self.orig_header = header
+ if header is None:
+ # sentinel value
+ self.parser.header_start = -1
+ self.parser.header_end = -1
+ self.parser.header = -1
+ self.parser_start = 0
+ self.header = []
+ else:
+ if isinstance(header, list):
+ if len(header) > 1:
+ # need to artificially skip the final line
+ # which is still a header line
+ header = list(header)
+ header.append(header[-1] + 1)
+ self.parser.header_end = header[-1]
+ self.has_mi_columns = 1
+ else:
+ self.parser.header_end = header[0]
+
+ self.parser_start = header[-1] + 1
+ self.parser.header_start = header[0]
+ self.parser.header = header[0]
+ self.header = header
+ else:
+ self.parser.header_start = header
+ self.parser.header_end = header
+ self.parser_start = header + 1
+ self.parser.header = header
+ self.header = [ header ]
+
+ self.names = names
+ self.header, self.table_width, self.unnamed_cols = self._get_header()
+
+ if not self.table_width:
+ raise EmptyDataError("No columns to parse from file")
+
+ # Compute buffer_lines as function of table width.
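+ # i.e. the largest power of two such that one buffered chunk holds
+ # roughly 2**20 fields in total.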
+ heuristic = 2**20 // self.table_width
+ self.buffer_lines = 1
+ while self.buffer_lines * 2 < heuristic:
+ self.buffer_lines *= 2
+
+ def __init__(self, *args, **kwargs):
+ pass
+
+ def __dealloc__(self):
+ parser_free(self.parser)
+ if self.true_set:
+ kh_destroy_str(self.true_set)
+ self.true_set = NULL
+ if self.false_set:
+ kh_destroy_str(self.false_set)
+ self.false_set = NULL
+ parser_del(self.parser)
+
+ def close(self):
+ # we need to properly close an open derived
+ # filehandle here, e.g. a UTF8Recoder
+ if self.handle is not None:
+ try:
+ self.handle.close()
+ except Exception:
+ pass
+ # also preemptively free all allocated memory
+ parser_free(self.parser)
+ if self.true_set:
+ kh_destroy_str(self.true_set)
+ self.true_set = NULL
+ if self.false_set:
+ kh_destroy_str(self.false_set)
+ self.false_set = NULL
+
+ def set_error_bad_lines(self, int status):
+ self.parser.error_bad_lines = status
+
+ def _set_quoting(self, quote_char, quoting):
+ if not isinstance(quoting, int):
+ raise TypeError('"quoting" must be an integer')
+
+ if not QUOTE_MINIMAL <= quoting <= QUOTE_NONE:
+ raise TypeError('bad "quoting" value')
+
+ if not isinstance(quote_char, (str, compat.text_type,
+ bytes)) and quote_char is not None:
+ dtype = type(quote_char).__name__
+ raise TypeError('"quotechar" must be string, '
+ 'not {dtype}'.format(dtype=dtype))
+
+ if quote_char is None or quote_char == '':
+ if quoting != QUOTE_NONE:
+ raise TypeError("quotechar must be set if quoting enabled")
+ self.parser.quoting = quoting
+ self.parser.quotechar = -1
+ elif len(quote_char) > 1: # 0-len case handled earlier
+ raise TypeError('"quotechar" must be a 1-character string')
+ else:
+ self.parser.quoting = quoting
+ self.parser.quotechar = ord(quote_char)
+
+ cdef _make_skiprow_set(self):
+ if isinstance(self.skiprows, (int, np.integer)):
+ parser_set_skipfirstnrows(self.parser, self.skiprows)
+ elif not callable(self.skiprows):
+ for i in self.skiprows:
+ parser_add_skiprow(self.parser, i)
+ else:
+ self.parser.skipfunc = <PyObject *>self.skiprows
+
+ cdef _setup_parser_source(self, source):
+ cdef:
+ int status
+ void *ptr
+
+ self.parser.cb_io = NULL
+ self.parser.cb_cleanup = NULL
+
+ if self.compression:
+ if self.compression == 'gzip':
+ import gzip
+ if isinstance(source, basestring):
+ source = gzip.GzipFile(source, 'rb')
+ else:
+ source = gzip.GzipFile(fileobj=source)
+ elif self.compression == 'bz2':
+ import bz2
+ if isinstance(source, basestring) or PY3:
+ source = bz2.BZ2File(source, 'rb')
+ else:
+ content = source.read()
+ source.close()
+ source = compat.StringIO(bz2.decompress(content))
+ elif self.compression == 'zip':
+ import zipfile
+ zip_file = zipfile.ZipFile(source)
+ zip_names = zip_file.namelist()
+
+ if len(zip_names) == 1:
+ file_name = zip_names.pop()
+ source = zip_file.open(file_name)
+
+ elif len(zip_names) == 0:
+ raise ValueError('Zero files found in compressed '
+ 'zip file %s' % source)
+ else:
+ raise ValueError('Multiple files found in compressed '
+ 'zip file %s' % str(zip_names))
+ elif self.compression == 'xz':
+ lzma = compat.import_lzma()
+
+ if isinstance(source, basestring):
+ source = lzma.LZMAFile(source, 'rb')
+ else:
+ source = lzma.LZMAFile(filename=source)
+ else:
+ raise ValueError('Unrecognized compression type: %s' %
+ self.compression)
+
+ if b'utf-16' in (self.encoding or b''):
+ # we need to read utf-16 through UTF8Recoder.
+ # if source is utf-16, convert source to utf-8 by UTF8Recoder.
+ source = icom.UTF8Recoder(source,
+ self.encoding.decode('utf-8'))
+ self.encoding = b'utf-8'
+ self.c_encoding = <char*>self.encoding
+
+ self.handle = source
+
+ if isinstance(source, basestring):
+ if not isinstance(source, bytes):
+ if compat.PY36 and compat.is_platform_windows():
+ # see gh-15086.
+ encoding = "mbcs"
+ else:
+ encoding = sys.getfilesystemencoding() or "utf-8"
+
+ source = source.encode(encoding)
+
+ if self.memory_map:
+ ptr = new_mmap(source)
+ if ptr == NULL:
+ # fall back
+ ptr = new_file_source(source, self.parser.chunksize)
+ self.parser.cb_io = &buffer_file_bytes
+ self.parser.cb_cleanup = &del_file_source
+ else:
+ self.parser.cb_io = &buffer_mmap_bytes
+ self.parser.cb_cleanup = &del_mmap
+ else:
+ ptr = new_file_source(source, self.parser.chunksize)
+ self.parser.cb_io = &buffer_file_bytes
+ self.parser.cb_cleanup = &del_file_source
+
+ if ptr == NULL:
+ if not os.path.exists(source):
+ raise compat.FileNotFoundError(
+ ENOENT,
+ 'File {source} does not exist'.format(source=source),
+ source)
+ raise IOError('Initializing from file failed')
+
+ self.parser.source = ptr
+
+ elif hasattr(source, 'read'):
+ # e.g., StringIO
+
+ ptr = new_rd_source(source)
+ if ptr == NULL:
+ raise IOError('Initializing parser from file-like '
+ 'object failed')
+
+ self.parser.source = ptr
+ self.parser.cb_io = &buffer_rd_bytes
+ self.parser.cb_cleanup = &del_rd_source
+ else:
+ raise IOError('Expected file path name or file-like object,'
+ ' got %s type' % type(source))
+
+ cdef _get_header(self):
+ # header is now a list of lists, so field_count should use header[0]
+
+ cdef:
+ Py_ssize_t i, start, field_count, passed_count, unnamed_count # noqa
+ char *word
+ object name, old_name
+ int status
+ int64_t hr, data_line
+ char *errors = "strict"
+ StringPath path = _string_path(self.c_encoding)
+
+ header = []
+ unnamed_cols = set()
+
+ if self.parser.header_start >= 0:
+
+ # Header is in the file
+ for level, hr in enumerate(self.header):
+
+ this_header = []
+
+ if self.parser.lines < hr + 1:
+ self._tokenize_rows(hr + 2)
+
+ if self.parser.lines == 0:
+ field_count = 0
+ start = self.parser.line_start[0]
+
+ # e.g., if header=3 and file only has 2 lines
+ elif (self.parser.lines < hr + 1
+ and not isinstance(self.orig_header, list)) or (
+ self.parser.lines < hr):
+ msg = self.orig_header
+ if isinstance(msg, list):
+ msg = "[%s], len of %d," % (
+ ','.join(str(m) for m in msg), len(msg))
+ raise ParserError(
+ 'Passed header=%s but only %d lines in file'
+ % (msg, self.parser.lines))
+
+ else:
+ field_count = self.parser.line_fields[hr]
+ start = self.parser.line_start[hr]
+
+ counts = {}
+ unnamed_count = 0
+
+ for i in range(field_count):
+ word = self.parser.words[start + i]
+
+ if path == CSTRING:
+ name = PyBytes_FromString(word)
+ elif path == UTF8:
+ name = PyUnicode_FromString(word)
+ elif path == ENCODED:
+ name = PyUnicode_Decode(word, strlen(word),
+ self.c_encoding, errors)
+
+ # We use this later when collecting placeholder names.
+ old_name = name
+
+ if name == '':
+ if self.has_mi_columns:
+ name = ('Unnamed: {i}_level_{lvl}'
+ .format(i=i, lvl=level))
+ else:
+ name = 'Unnamed: {i}'.format(i=i)
+ unnamed_count += 1
+
+ count = counts.get(name, 0)
+
+ if not self.has_mi_columns and self.mangle_dupe_cols:
+ while count > 0:
+ counts[name] = count + 1
+ name = '%s.%d' % (name, count)
+ count = counts.get(name, 0)
+
+ if old_name == '':
+ unnamed_cols.add(name)
+
+ this_header.append(name)
+ counts[name] = count + 1
+
+ if self.has_mi_columns:
+
+ # If we have grabbed an extra line, but it's not in our
+ # format, save in the buffer, and create a blank extra
+ # line for the rest of the parsing code.
+ if hr == self.header[-1]:
+ lc = len(this_header)
+ ic = (len(self.index_col) if self.index_col
+ is not None else 0)
+
+ if lc != unnamed_count and lc - ic > unnamed_count:
+ hr -= 1
+ self.parser_start -= 1
+ this_header = [None] * lc
+
+ data_line = hr + 1
+ header.append(this_header)
+
+ if self.names is not None:
+ header = [ self.names ]
+
+ elif self.names is not None:
+ # Enforce this unless usecols
+ if not self.has_usecols:
+ self.parser.expected_fields = len(self.names)
+
+ # Names passed
+ if self.parser.lines < 1:
+ self._tokenize_rows(1)
+
+ header = [ self.names ]
+ data_line = 0
+
+ if self.parser.lines < 1:
+ field_count = len(header[0])
+ else:
+ field_count = self.parser.line_fields[data_line]
+ else:
+ # No header passed nor to be found in the file
+ if self.parser.lines < 1:
+ self._tokenize_rows(1)
+
+ return None, self.parser.line_fields[0], unnamed_cols
+
+ # Corner case, not enough lines in the file
+ if self.parser.lines < data_line + 1:
+ field_count = len(header[0])
+ else: # not self.has_usecols:
+
+ field_count = self.parser.line_fields[data_line]
+
+ # #2981
+ if self.names is not None:
+ field_count = max(field_count, len(self.names))
+
+ passed_count = len(header[0])
+
+ # if passed_count > field_count:
+ # raise ParserError('Column names have %d fields, '
+ # 'data has %d fields'
+ # % (passed_count, field_count))
+
+ if (self.has_usecols and self.allow_leading_cols and
+ not callable(self.usecols)):
+ nuse = len(self.usecols)
+ if nuse == passed_count:
+ self.leading_cols = 0
+ elif self.names is None and nuse < passed_count:
+ self.leading_cols = field_count - passed_count
+ elif passed_count != field_count:
+ raise ValueError('Passed header names '
+ 'mismatches usecols')
+ # see GH-2442 and GH-2981
+ elif self.allow_leading_cols and passed_count < field_count:
+ self.leading_cols = field_count - passed_count
+
+ return header, field_count, unnamed_cols
+
+ def read(self, rows=None):
+ """
+ rows=None --> read all rows
+ """
+ cdef:
+ int status
+
+ if self.low_memory:
+ # Conserve intermediate space
+ columns = self._read_low_memory(rows)
+ else:
+ # Don't care about memory usage
+ columns = self._read_rows(rows, 1)
+
+ return columns
+
+ cdef _read_low_memory(self, rows):
+ cdef:
+ size_t rows_read = 0
+ chunks = []
+
+ if rows is None:
+ while True:
+ try:
+ chunk = self._read_rows(self.buffer_lines, 0)
+ if len(chunk) == 0:
+ break
+ except StopIteration:
+ break
+ else:
+ chunks.append(chunk)
+ else:
+ while rows_read < rows:
+ try:
+ crows = min(self.buffer_lines, rows - rows_read)
+
+ chunk = self._read_rows(crows, 0)
+ if len(chunk) == 0:
+ break
+
+ rows_read += len(list(chunk.values())[0])
+ except StopIteration:
+ break
+ else:
+ chunks.append(chunk)
+
+ parser_trim_buffers(self.parser)
+
+ if len(chunks) == 0:
+ raise StopIteration
+
+ # destructive to chunks
+ return _concatenate_chunks(chunks)
+
+ cdef _tokenize_rows(self, size_t nrows):
+ cdef int status
+ with nogil:
+ status = tokenize_nrows(self.parser, nrows)
+
+ if self.parser.warn_msg != NULL:
+ print >> sys.stderr, self.parser.warn_msg
+ free(self.parser.warn_msg)
+ self.parser.warn_msg = NULL
+
+ if status < 0:
+ raise_parser_error('Error tokenizing data', self.parser)
+
+ cdef _read_rows(self, rows, bint trim):
+ cdef:
+ int64_t buffered_lines
+ int64_t irows, footer = 0
+
+ self._start_clock()
+
+ if rows is not None:
+ irows = rows
+ buffered_lines = self.parser.lines - self.parser_start
+ if buffered_lines < irows:
+ self._tokenize_rows(irows - buffered_lines)
+
+ if self.skipfooter > 0:
+ raise ValueError('skipfooter can only be used to read '
+ 'the whole file')
+ else:
+ with nogil:
+ status = tokenize_all_rows(self.parser)
+
+ if self.parser.warn_msg != NULL:
+ print >> sys.stderr, self.parser.warn_msg
+ free(self.parser.warn_msg)
+ self.parser.warn_msg = NULL
+
+ if status < 0:
+ raise_parser_error('Error tokenizing data', self.parser)
+ footer = self.skipfooter
+
+ if self.parser_start >= self.parser.lines:
+ raise StopIteration
+ self._end_clock('Tokenization')
+
+ self._start_clock()
+ columns = self._convert_column_data(rows=rows,
+ footer=footer,
+ upcast_na=True)
+ self._end_clock('Type conversion')
+ self._start_clock()
+ if len(columns) > 0:
+ rows_read = len(list(columns.values())[0])
+ # trim
+ parser_consume_rows(self.parser, rows_read)
+ if trim:
+ parser_trim_buffers(self.parser)
+ self.parser_start -= rows_read
+
+ self._end_clock('Parser memory cleanup')
+
+ return columns
+
+ cdef _start_clock(self):
+ self.clocks.append(time.time())
+
+ cdef _end_clock(self, what):
+ if self.verbose:
+ elapsed = time.time() - self.clocks.pop(-1)
+ print('%s took: %.2f ms' % (what, elapsed * 1000))
+
+ def set_noconvert(self, i):
+ self.noconvert.add(i)
+
+ def remove_noconvert(self, i):
+ self.noconvert.remove(i)
+
+ def _convert_column_data(self, rows=None, upcast_na=False, footer=0):
+ cdef:
+ int64_t i
+ int nused
+ kh_str_t *na_hashset = NULL
+ int64_t start, end
+ object name, na_flist, col_dtype = None
+ bint na_filter = 0
+ int64_t num_cols
+
+ start = self.parser_start
+
+ if rows is None:
+ end = self.parser.lines
+ else:
+ end = min(start + rows, self.parser.lines)
+
+ # # skip footer
+ # if footer > 0:
+ # end -= footer
+
+ num_cols = -1
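+ # The loop below is a branchless maximum: afterwards num_cols is
+ # max(self.parser.line_fields[i]) over all tokenized lines.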
+ for i in range(self.parser.lines):
+ num_cols = (num_cols < self.parser.line_fields[i]) * \
+ self.parser.line_fields[i] + \
+ (num_cols >= self.parser.line_fields[i]) * num_cols
+
+ if self.table_width - self.leading_cols > num_cols:
+ raise ParserError(
+ "Too many columns specified: expected {expected} and "
+ "found {found}"
+ .format(expected=self.table_width - self.leading_cols,
+ found=num_cols))
+
+ results = {}
+ nused = 0
+ for i in range(self.table_width):
+ if i < self.leading_cols:
+ # Pass through leading columns always
+ name = i
+ elif (self.usecols and not callable(self.usecols) and
+ nused == len(self.usecols)):
+ # Once we've gathered all requested columns, stop. GH5766
+ break
+ else:
+ name = self._get_column_name(i, nused)
+ usecols = set()
+ if callable(self.usecols):
+ if self.usecols(name):
+ usecols = {i}
+ else:
+ usecols = self.usecols
+ if self.has_usecols and not (i in usecols or
+ name in usecols):
+ continue
+ nused += 1
+
+ conv = self._get_converter(i, name)
+
+ col_dtype = None
+ if self.dtype is not None:
+ if isinstance(self.dtype, dict):
+ if name in self.dtype:
+ col_dtype = self.dtype[name]
+ elif i in self.dtype:
+ col_dtype = self.dtype[i]
+ else:
+ if self.dtype.names:
+ # structured array
+ col_dtype = np.dtype(self.dtype.descr[i][1])
+ else:
+ col_dtype = self.dtype
+
+ if conv:
+ if col_dtype is not None:
+ warnings.warn(("Both a converter and dtype were specified "
+ "for column {0} - only the converter will "
+ "be used").format(name), ParserWarning,
+ stacklevel=5)
+ results[i] = _apply_converter(conv, self.parser, i, start, end,
+ self.c_encoding)
+ continue
+
+ # Collect the list of NaN values associated with the column.
+ # If we aren't supposed to do that, or none are collected,
+ # we set `na_filter` to `0` (`1` otherwise).
+ na_flist = set()
+
+ if self.na_filter:
+ na_list, na_flist = self._get_na_list(i, name)
+ if na_list is None:
+ na_filter = 0
+ else:
+ na_filter = 1
+ na_hashset = kset_from_list(na_list)
+ else:
+ na_filter = 0
+
+ # Attempt to parse tokens and infer dtype of the column.
+ # Should return as the desired dtype (inferred or specified).
+ try:
+ col_res, na_count = self._convert_tokens(
+ i, start, end, name, na_filter, na_hashset,
+ na_flist, col_dtype)
+ finally:
+ # gh-21353
+ #
+ # Cleanup the NaN hash that we generated
+ # to avoid memory leaks.
+ if na_filter:
+ self._free_na_set(na_hashset)
+
+ # don't try to upcast EAs
+ try_upcast = upcast_na and na_count > 0
+ if try_upcast and not is_extension_array_dtype(col_dtype):
+ col_res = _maybe_upcast(col_res)
+
+ if col_res is None:
+ raise ParserError('Unable to parse column {i}'.format(i=i))
+
+ results[i] = col_res
+
+ self.parser_start += end - start
+
+ return results
+
+ cdef inline _convert_tokens(self, Py_ssize_t i, int start, int end,
+ object name, bint na_filter,
+ kh_str_t *na_hashset,
+ object na_flist, object col_dtype):
+
+ if col_dtype is not None:
+ col_res, na_count = self._convert_with_dtype(
+ col_dtype, i, start, end, na_filter,
+ 1, na_hashset, na_flist)
+
+ # Fallback on the parse (e.g. we requested int dtype,
+ # but it's actually a float).
+ if col_res is not None:
+ return col_res, na_count
+
+ if i in self.noconvert:
+ return self._string_convert(i, start, end, na_filter, na_hashset)
+ else:
+ col_res = None
+ for dt in self.dtype_cast_order:
+ try:
+ col_res, na_count = self._convert_with_dtype(
+ dt, i, start, end, na_filter, 0, na_hashset, na_flist)
+ except ValueError:
+ # This error is raised from trying to convert to uint64,
+ # and we discover that we cannot convert to any numerical
+ # dtype successfully. As a result, we leave the data
+ # column AS IS with object dtype.
+ col_res, na_count = self._convert_with_dtype(
+ np.dtype('object'), i, start, end, 0,
+ 0, na_hashset, na_flist)
+ except OverflowError:
+ col_res, na_count = self._convert_with_dtype(
+ np.dtype('object'), i, start, end, na_filter,
+ 0, na_hashset, na_flist)
+
+ if col_res is not None:
+ break
+
+ # we had a fallback parse on the dtype, so now try to cast;
+ # only allow safe casts, e.g. with a NaN you cannot safely cast to int
+ if col_res is not None and col_dtype is not None:
+ try:
+ col_res = col_res.astype(col_dtype, casting='safe')
+ except TypeError:
+
+ # float -> int conversions can fail the above
+ # even with no nans
+ col_res_orig = col_res
+ col_res = col_res.astype(col_dtype)
+ if (col_res != col_res_orig).any():
+ raise ValueError(
+ "cannot safely convert passed user dtype of "
+ "{col_dtype} for {col_res} dtyped data in "
+ "column {column}".format(
+ col_dtype=col_dtype,
+ col_res=col_res_orig.dtype.name,
+ column=i))
+
+ return col_res, na_count
+
+ cdef _convert_with_dtype(self, object dtype, Py_ssize_t i,
+ int64_t start, int64_t end,
+ bint na_filter,
+ bint user_dtype,
+ kh_str_t *na_hashset,
+ object na_flist):
+ if is_categorical_dtype(dtype):
+ # TODO: I suspect that _categorical_convert could be
+ # optimized when dtype is an instance of CategoricalDtype
+ codes, cats, na_count = _categorical_convert(
+ self.parser, i, start, end, na_filter,
+ na_hashset, self.c_encoding)
+
+ # Method accepts list of strings, not encoded ones.
+ true_values = [x.decode() for x in self.true_values]
+ cat = Categorical._from_inferred_categories(
+ cats, codes, dtype, true_values=true_values)
+ return cat, na_count
+
+ elif is_extension_array_dtype(dtype):
+ result, na_count = self._string_convert(i, start, end, na_filter,
+ na_hashset)
+ array_type = dtype.construct_array_type()
+ try:
+ # use _from_sequence_of_strings if the class defines it
+ result = array_type._from_sequence_of_strings(result,
+ dtype=dtype)
+ except NotImplementedError:
+ raise NotImplementedError(
+ "Extension Array: {ea} must implement "
+ "_from_sequence_of_strings in order "
+ "to be used in parser methods".format(ea=array_type))
+
+ return result, na_count
+
+ elif is_integer_dtype(dtype):
+ try:
+ result, na_count = _try_int64(self.parser, i, start,
+ end, na_filter, na_hashset)
+ if user_dtype and na_count is not None:
+ if na_count > 0:
+ raise ValueError("Integer column has NA values in "
+ "column {column}".format(column=i))
+ except OverflowError:
+ result = _try_uint64(self.parser, i, start, end,
+ na_filter, na_hashset)
+ na_count = 0
+
+ if result is not None and dtype != 'int64':
+ result = result.astype(dtype)
+
+ return result, na_count
+
+ elif is_float_dtype(dtype):
+ result, na_count = _try_double(self.parser, i, start, end,
+ na_filter, na_hashset, na_flist)
+
+ if result is not None and dtype != 'float64':
+ result = result.astype(dtype)
+ return result, na_count
+ elif is_bool_dtype(dtype):
+ result, na_count = _try_bool_flex(self.parser, i, start, end,
+ na_filter, na_hashset,
+ self.true_set, self.false_set)
+ if user_dtype and na_count is not None:
+ if na_count > 0:
+ raise ValueError("Bool column has NA values in "
+ "column {column}".format(column=i))
+ return result, na_count
+
+ elif dtype.kind == 'S':
+ # TODO: na handling
+ width = dtype.itemsize
+ if width > 0:
+ result = _to_fw_string(self.parser, i, start, end, width)
+ return result, 0
+
+ # treat as a regular string parsing
+ return self._string_convert(i, start, end, na_filter,
+ na_hashset)
+ elif dtype.kind == 'U':
+ width = dtype.itemsize
+ if width > 0:
+ raise TypeError("the dtype {dtype} is not "
+ "supported for parsing".format(dtype=dtype))
+
+ # unicode variable width
+ return self._string_convert(i, start, end, na_filter,
+ na_hashset)
+ elif is_object_dtype(dtype):
+ return self._string_convert(i, start, end, na_filter,
+ na_hashset)
+ elif is_datetime64_dtype(dtype):
+ raise TypeError("the dtype {dtype} is not supported "
+ "for parsing, pass this column "
+ "using parse_dates instead".format(dtype=dtype))
+ else:
+ raise TypeError("the dtype {dtype} is not "
+ "supported for parsing".format(dtype=dtype))
+
+ cdef _string_convert(self, Py_ssize_t i, int64_t start, int64_t end,
+ bint na_filter, kh_str_t *na_hashset):
+
+ cdef StringPath path = _string_path(self.c_encoding)
+
+ if path == UTF8:
+ return _string_box_utf8(self.parser, i, start, end, na_filter,
+ na_hashset)
+ elif path == ENCODED:
+ return _string_box_decode(self.parser, i, start, end,
+ na_filter, na_hashset, self.c_encoding)
+ elif path == CSTRING:
+ return _string_box_factorize(self.parser, i, start, end,
+ na_filter, na_hashset)
+
+ def _get_converter(self, i, name):
+ if self.converters is None:
+ return None
+
+ if name is not None and name in self.converters:
+ return self.converters[name]
+
+ # Converter for position, if any
+ return self.converters.get(i)
+
+ cdef _get_na_list(self, i, name):
+ if self.na_values is None:
+ return None, set()
+
+ if isinstance(self.na_values, dict):
+ key = None
+ values = None
+
+ if name is not None and name in self.na_values:
+ key = name
+ elif i in self.na_values:
+ key = i
+ else: # No na_values provided for this column.
+ if self.keep_default_na:
+ return _NA_VALUES, set()
+
+ return list(), set()
+
+ values = self.na_values[key]
+ if values is not None and not isinstance(values, list):
+ values = list(values)
+
+ fvalues = self.na_fvalues[key]
+ if fvalues is not None and not isinstance(fvalues, set):
+ fvalues = set(fvalues)
+
+ return _ensure_encoded(values), fvalues
+ else:
+ if not isinstance(self.na_values, list):
+ self.na_values = list(self.na_values)
+ if not isinstance(self.na_fvalues, set):
+ self.na_fvalues = set(self.na_fvalues)
+
+ return _ensure_encoded(self.na_values), self.na_fvalues
+
+ cdef _free_na_set(self, kh_str_t *table):
+ kh_destroy_str(table)
+
+ cdef _get_column_name(self, Py_ssize_t i, Py_ssize_t nused):
+ cdef int64_t j
+ if self.has_usecols and self.names is not None:
+ if (not callable(self.usecols) and
+ len(self.names) == len(self.usecols)):
+ return self.names[nused]
+ else:
+ return self.names[i - self.leading_cols]
+ else:
+ if self.header is not None:
+ j = i - self.leading_cols
+ # hack for #2442
+ if j == len(self.header[0]):
+ return j
+ else:
+ return self.header[0][j]
+ else:
+ return None
+
+
+cdef object _true_values = [b'True', b'TRUE', b'true']
+cdef object _false_values = [b'False', b'FALSE', b'false']
+
+
+def _ensure_encoded(list lst):
+ cdef list result = []
+ for x in lst:
+ if isinstance(x, unicode):
+ x = PyUnicode_AsUTF8String(x)
+ elif not isinstance(x, bytes):
+ x = asbytes(x)
+
+ result.append(x)
+ return result
+
+
+cdef asbytes(object o):
+ if PY3:
+ return str(o).encode('utf-8')
+ else:
+ return str(o)
+
+
+# common NA values
+# no longer excluding inf representations
+# '1.#INF','-1.#INF', '1.#INF000000',
+_NA_VALUES = _ensure_encoded(list(icom._NA_VALUES))
+
+
+def _maybe_upcast(arr):
+ """
+
+ """
+ if issubclass(arr.dtype.type, np.integer):
+ na_value = na_values[arr.dtype]
+ arr = arr.astype(float)
+ np.putmask(arr, arr == na_value, np.nan)
+ elif arr.dtype == np.bool_:
+ mask = arr.view(np.uint8) == na_values[np.uint8]
+ arr = arr.astype(object)
+ np.putmask(arr, mask, np.nan)
+
+ return arr
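+
+# For example (sketch): an int64 column that contained NA markers carries the
+# sentinel np.iinfo(np.int64).min in those slots; _maybe_upcast converts the
+# array to float64 and replaces that sentinel with np.nan.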
+
+
+cdef enum StringPath:
+ CSTRING
+ UTF8
+ ENCODED
+
+
+# factored out logic to pick string converter
+cdef inline StringPath _string_path(char *encoding):
+ if encoding != NULL and encoding != b"utf-8":
+ return ENCODED
+ elif PY3 or encoding != NULL:
+ return UTF8
+ else:
+ return CSTRING
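+
+# For example: a non-NULL encoding other than b"utf-8" (say b"latin-1") takes
+# the ENCODED path; b"utf-8", or NULL under Python 3, takes UTF8; NULL under
+# Python 2 takes CSTRING.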
+
+
+# ----------------------------------------------------------------------
+# Type conversions / inference support code
+
+
+cdef _string_box_factorize(parser_t *parser, int64_t col,
+ int64_t line_start, int64_t line_end,
+ bint na_filter, kh_str_t *na_hashset):
+ cdef:
+ int error, na_count = 0
+ Py_ssize_t i, lines
+ coliter_t it
+ const char *word = NULL
+ ndarray[object] result
+
+ int ret = 0
+ kh_strbox_t *table
+
+ object pyval
+
+ object NA = na_values[np.object_]
+ khiter_t k
+
+ table = kh_init_strbox()
+ lines = line_end - line_start
+ result = np.empty(lines, dtype=np.object_)
+ coliter_setup(&it, parser, col, line_start)
+
+ for i in range(lines):
+ COLITER_NEXT(it, word)
+
+ if na_filter:
+ k = kh_get_str(na_hashset, word)
+ # in the hash table
+ if k != na_hashset.n_buckets:
+ na_count += 1
+ result[i] = NA
+ continue
+
+ k = kh_get_strbox(table, word)
+
+ # in the hash table
+ if k != table.n_buckets:
+ # this increments the refcount, but need to test
+ pyval = <object>table.vals[k]
+ else:
+ # box it. new ref?
+ pyval = PyBytes_FromString(word)
+
+ k = kh_put_strbox(table, word, &ret)
+ table.vals[k] = <PyObject*>pyval
+
+ result[i] = pyval
+
+ kh_destroy_strbox(table)
+
+ return result, na_count
+
+
+cdef _string_box_utf8(parser_t *parser, int64_t col,
+ int64_t line_start, int64_t line_end,
+ bint na_filter, kh_str_t *na_hashset):
+ cdef:
+ int error, na_count = 0
+ Py_ssize_t i, lines
+ coliter_t it
+ const char *word = NULL
+ ndarray[object] result
+
+ int ret = 0
+ kh_strbox_t *table
+
+ object pyval
+
+ object NA = na_values[np.object_]
+ khiter_t k
+
+ table = kh_init_strbox()
+ lines = line_end - line_start
+ result = np.empty(lines, dtype=np.object_)
+ coliter_setup(&it, parser, col, line_start)
+
+ for i in range(lines):
+ COLITER_NEXT(it, word)
+
+ if na_filter:
+ k = kh_get_str(na_hashset, word)
+ # in the hash table
+ if k != na_hashset.n_buckets:
+ na_count += 1
+ result[i] = NA
+ continue
+
+ k = kh_get_strbox(table, word)
+
+ # in the hash table
+ if k != table.n_buckets:
+ # this increments the refcount, but need to test
+ pyval = <object>table.vals[k]
+ else:
+ # box it. new ref?
+ pyval = PyUnicode_FromString(word)
+
+ k = kh_put_strbox(table, word, &ret)
+ table.vals[k] = <PyObject *>pyval
+
+ result[i] = pyval
+
+ kh_destroy_strbox(table)
+
+ return result, na_count
+
+
+cdef _string_box_decode(parser_t *parser, int64_t col,
+ int64_t line_start, int64_t line_end,
+ bint na_filter, kh_str_t *na_hashset,
+ char *encoding):
+ cdef:
+ int error, na_count = 0
+ Py_ssize_t i, size, lines
+ coliter_t it
+ const char *word = NULL
+ ndarray[object] result
+
+ int ret = 0
+ kh_strbox_t *table
+
+ char *errors = "strict"
+
+ object pyval
+
+ object NA = na_values[np.object_]
+ khiter_t k
+
+ table = kh_init_strbox()
+ lines = line_end - line_start
+ result = np.empty(lines, dtype=np.object_)
+ coliter_setup(&it, parser, col, line_start)
+
+ for i in range(lines):
+ COLITER_NEXT(it, word)
+
+ if na_filter:
+ k = kh_get_str(na_hashset, word)
+ # in the hash table
+ if k != na_hashset.n_buckets:
+ na_count += 1
+ result[i] = NA
+ continue
+
+ k = kh_get_strbox(table, word)
+
+ # in the hash table
+ if k != table.n_buckets:
+ # this increments the refcount, but need to test
+ pyval = <object>table.vals[k]
+ else:
+ # box it. new ref?
+ size = strlen(word)
+ pyval = PyUnicode_Decode(word, size, encoding, errors)
+
+ k = kh_put_strbox(table, word, &ret)
+ table.vals[k] = <PyObject *>pyval
+
+ result[i] = pyval
+
+ kh_destroy_strbox(table)
+
+ return result, na_count
+
+
+cdef _categorical_convert(parser_t *parser, int64_t col,
+ int64_t line_start, int64_t line_end,
+ bint na_filter, kh_str_t *na_hashset,
+ char *encoding):
+ "Convert column data into codes, categories"
+ cdef:
+ int error, na_count = 0
+ Py_ssize_t i, size, lines
+ coliter_t it
+ const char *word = NULL
+
+ int64_t NA = -1
+ int64_t[:] codes
+ int64_t current_category = 0
+
+ char *errors = "strict"
+ StringPath path = _string_path(encoding)
+
+ int ret = 0
+ kh_str_t *table
+ khiter_t k
+
+ lines = line_end - line_start
+ codes = np.empty(lines, dtype=np.int64)
+
+ # factorize parsed values, creating a hash table
+ # bytes -> category code
+ with nogil:
+ table = kh_init_str()
+ coliter_setup(&it, parser, col, line_start)
+
+ for i in range(lines):
+ COLITER_NEXT(it, word)
+
+ if na_filter:
+ k = kh_get_str(na_hashset, word)
+ # is in NA values
+ if k != na_hashset.n_buckets:
+ na_count += 1
+ codes[i] = NA
+ continue
+
+ k = kh_get_str(table, word)
+ # not in the hash table
+ if k == table.n_buckets:
+ k = kh_put_str(table, word, &ret)
+ table.vals[k] = current_category
+ current_category += 1
+
+ codes[i] = table.vals[k]
+
+ # parse and box categories to python strings
+ result = np.empty(table.n_occupied, dtype=np.object_)
+ if path == ENCODED:
+ for k in range(table.n_buckets):
+ if kh_exist_str(table, k):
+ size = strlen(table.keys[k])
+ result[table.vals[k]] = PyUnicode_Decode(
+ table.keys[k], size, encoding, errors)
+ elif path == UTF8:
+ for k in range(table.n_buckets):
+ if kh_exist_str(table, k):
+ result[table.vals[k]] = PyUnicode_FromString(table.keys[k])
+ elif path == CSTRING:
+ for k in range(table.n_buckets):
+ if kh_exist_str(table, k):
+ result[table.vals[k]] = PyBytes_FromString(table.keys[k])
+
+ kh_destroy_str(table)
+ return np.asarray(codes), result, na_count
+
+
+cdef _to_fw_string(parser_t *parser, int64_t col, int64_t line_start,
+ int64_t line_end, int64_t width):
+ cdef:
+ Py_ssize_t i
+ coliter_t it
+ const char *word = NULL
+ char *data
+ ndarray result
+
+ result = np.empty(line_end - line_start, dtype='|S%d' % width)
+ data = <char*>result.data
+
+ with nogil:
+ _to_fw_string_nogil(parser, col, line_start, line_end, width, data)
+
+ return result
+
+
+cdef inline void _to_fw_string_nogil(parser_t *parser, int64_t col,
+ int64_t line_start, int64_t line_end,
+ size_t width, char *data) nogil:
+ cdef:
+ int64_t i
+ coliter_t it
+ const char *word = NULL
+
+ coliter_setup(&it, parser, col, line_start)
+
+ for i in range(line_end - line_start):
+ COLITER_NEXT(it, word)
+ strncpy(data, word, width)
+ data += width
+
+
+cdef char* cinf = b'inf'
+cdef char* cposinf = b'+inf'
+cdef char* cneginf = b'-inf'
+
+
+cdef _try_double(parser_t *parser, int64_t col,
+ int64_t line_start, int64_t line_end,
+ bint na_filter, kh_str_t *na_hashset, object na_flist):
+ cdef:
+ int error, na_count = 0
+ Py_ssize_t i, lines
+ coliter_t it
+ const char *word = NULL
+ char *p_end
+ float64_t *data
+ float64_t NA = na_values[np.float64]
+ kh_float64_t *na_fset
+ ndarray result
+ khiter_t k
+ bint use_na_flist = len(na_flist) > 0
+
+ lines = line_end - line_start
+ result = np.empty(lines, dtype=np.float64)
+ data = <float64_t *>result.data
+ na_fset = kset_float64_from_list(na_flist)
+ if parser.double_converter_nogil != NULL: # if it can run without the GIL
+ with nogil:
+ error = _try_double_nogil(parser, parser.double_converter_nogil,
+ col, line_start, line_end,
+ na_filter, na_hashset, use_na_flist,
+ na_fset, NA, data, &na_count)
+ else:
+ assert parser.double_converter_withgil != NULL
+ error = _try_double_nogil(parser,
+ <float64_t (*)(const char *, char **,
+ char, char, char, int)
+ nogil>parser.double_converter_withgil,
+ col, line_start, line_end,
+ na_filter, na_hashset, use_na_flist,
+ na_fset, NA, data, &na_count)
+ kh_destroy_float64(na_fset)
+ if error != 0:
+ return None, None
+ return result, na_count
+
+
+cdef inline int _try_double_nogil(parser_t *parser,
+ float64_t (*double_converter)(
+ const char *, char **, char,
+ char, char, int) nogil,
+ int col, int line_start, int line_end,
+ bint na_filter, kh_str_t *na_hashset,
+ bint use_na_flist,
+ const kh_float64_t *na_flist,
+ float64_t NA, float64_t *data,
+ int *na_count) nogil:
+ cdef:
+ int error
+ Py_ssize_t i, lines = line_end - line_start
+ coliter_t it
+ const char *word = NULL
+ char *p_end
+ khiter_t k, k64
+
+ global errno
+
+ na_count[0] = 0
+ coliter_setup(&it, parser, col, line_start)
+
+ if na_filter:
+ for i in range(lines):
+ COLITER_NEXT(it, word)
+
+ k = kh_get_str(na_hashset, word)
+ # in the hash table
+ if k != na_hashset.n_buckets:
+ na_count[0] += 1
+ data[0] = NA
+ else:
+ data[0] = double_converter(word, &p_end, parser.decimal,
+ parser.sci, parser.thousands, 1)
+ if errno != 0 or p_end[0] or p_end == word:
+ if (strcasecmp(word, cinf) == 0 or
+ strcasecmp(word, cposinf) == 0):
+ data[0] = INF
+ elif strcasecmp(word, cneginf) == 0:
+ data[0] = NEGINF
+ else:
+ # Just return a non-zero value since
+ # the errno is never consumed.
+ return 1
+ if use_na_flist:
+ k64 = kh_get_float64(na_flist, data[0])
+ if k64 != na_flist.n_buckets:
+ na_count[0] += 1
+ data[0] = NA
+ data += 1
+ else:
+ for i in range(lines):
+ COLITER_NEXT(it, word)
+ data[0] = double_converter(word, &p_end, parser.decimal,
+ parser.sci, parser.thousands, 1)
+ if errno != 0 or p_end[0] or p_end == word:
+ if (strcasecmp(word, cinf) == 0 or
+ strcasecmp(word, cposinf) == 0):
+ data[0] = INF
+ elif strcasecmp(word, cneginf) == 0:
+ data[0] = NEGINF
+ else:
+ # Just return a non-zero value since
+ # the errno is never consumed.
+ return 1
+ data += 1
+
+ return 0
+
+
+cdef _try_uint64(parser_t *parser, int64_t col,
+ int64_t line_start, int64_t line_end,
+ bint na_filter, kh_str_t *na_hashset):
+ cdef:
+ int error
+ Py_ssize_t i, lines
+ coliter_t it
+ uint64_t *data
+ ndarray result
+ khiter_t k
+ uint_state state
+
+ lines = line_end - line_start
+ result = np.empty(lines, dtype=np.uint64)
+ data = <uint64_t *>result.data
+
+ uint_state_init(&state)
+ coliter_setup(&it, parser, col, line_start)
+ with nogil:
+ error = _try_uint64_nogil(parser, col, line_start, line_end,
+ na_filter, na_hashset, data, &state)
+ if error != 0:
+ if error == ERROR_OVERFLOW:
+ # Can't get the word variable
+ raise OverflowError('Overflow')
+ return None
+
+ if uint64_conflict(&state):
+ raise ValueError('Cannot convert to numerical dtype')
+
+ if state.seen_sint:
+ raise OverflowError('Overflow')
+
+ return result
+
+
+cdef inline int _try_uint64_nogil(parser_t *parser, int64_t col,
+ int64_t line_start,
+ int64_t line_end, bint na_filter,
+ const kh_str_t *na_hashset,
+ uint64_t *data, uint_state *state) nogil:
+ cdef:
+ int error
+ Py_ssize_t i, lines = line_end - line_start
+ coliter_t it
+ const char *word = NULL
+ khiter_t k
+
+ coliter_setup(&it, parser, col, line_start)
+
+ if na_filter:
+ for i in range(lines):
+ COLITER_NEXT(it, word)
+ k = kh_get_str(na_hashset, word)
+ # in the hash table
+ if k != na_hashset.n_buckets:
+ state.seen_null = 1
+ data[i] = 0
+ continue
+
+ data[i] = str_to_uint64(state, word, INT64_MAX, UINT64_MAX,
+ &error, parser.thousands)
+ if error != 0:
+ return error
+ else:
+ for i in range(lines):
+ COLITER_NEXT(it, word)
+ data[i] = str_to_uint64(state, word, INT64_MAX, UINT64_MAX,
+ &error, parser.thousands)
+ if error != 0:
+ return error
+
+ return 0
+
+
+cdef _try_int64(parser_t *parser, int64_t col,
+ int64_t line_start, int64_t line_end,
+ bint na_filter, kh_str_t *na_hashset):
+ cdef:
+ int error, na_count = 0
+ Py_ssize_t i, lines
+ coliter_t it
+ int64_t *data
+ ndarray result
+
+ int64_t NA = na_values[np.int64]
+ khiter_t k
+
+ lines = line_end - line_start
+ result = np.empty(lines, dtype=np.int64)
+ data = <int64_t *>result.data
+ coliter_setup(&it, parser, col, line_start)
+ with nogil:
+ error = _try_int64_nogil(parser, col, line_start, line_end,
+ na_filter, na_hashset, NA, data, &na_count)
+ if error != 0:
+ if error == ERROR_OVERFLOW:
+ # Can't get the word variable
+ raise OverflowError('Overflow')
+ return None, None
+
+ return result, na_count
+
+
+cdef inline int _try_int64_nogil(parser_t *parser, int64_t col,
+ int64_t line_start,
+ int64_t line_end, bint na_filter,
+ const kh_str_t *na_hashset, int64_t NA,
+ int64_t *data, int *na_count) nogil:
+ cdef:
+ int error
+ Py_ssize_t i, lines = line_end - line_start
+ coliter_t it
+ const char *word = NULL
+ khiter_t k
+
+ na_count[0] = 0
+ coliter_setup(&it, parser, col, line_start)
+
+ if na_filter:
+ for i in range(lines):
+ COLITER_NEXT(it, word)
+ k = kh_get_str(na_hashset, word)
+ # in the hash table
+ if k != na_hashset.n_buckets:
+ na_count[0] += 1
+ data[i] = NA
+ continue
+
+ data[i] = str_to_int64(word, INT64_MIN, INT64_MAX,
+ &error, parser.thousands)
+ if error != 0:
+ return error
+ else:
+ for i in range(lines):
+ COLITER_NEXT(it, word)
+ data[i] = str_to_int64(word, INT64_MIN, INT64_MAX,
+ &error, parser.thousands)
+ if error != 0:
+ return error
+
+ return 0
+
+
+cdef _try_bool_flex(parser_t *parser, int64_t col,
+ int64_t line_start, int64_t line_end,
+ bint na_filter, const kh_str_t *na_hashset,
+ const kh_str_t *true_hashset,
+ const kh_str_t *false_hashset):
+ cdef:
+ int error, na_count = 0
+ Py_ssize_t i, lines
+ coliter_t it
+ const char *word = NULL
+ uint8_t *data
+ ndarray result
+
+ uint8_t NA = na_values[np.bool_]
+ khiter_t k
+
+ lines = line_end - line_start
+ result = np.empty(lines, dtype=np.uint8)
+ data = <uint8_t *>result.data
+ with nogil:
+ error = _try_bool_flex_nogil(parser, col, line_start, line_end,
+ na_filter, na_hashset, true_hashset,
+ false_hashset, NA, data, &na_count)
+ if error != 0:
+ return None, None
+ return result.view(np.bool_), na_count
+
+
+cdef inline int _try_bool_flex_nogil(parser_t *parser, int64_t col,
+ int64_t line_start,
+ int64_t line_end, bint na_filter,
+ const kh_str_t *na_hashset,
+ const kh_str_t *true_hashset,
+ const kh_str_t *false_hashset,
+ uint8_t NA, uint8_t *data,
+ int *na_count) nogil:
+ cdef:
+ int error = 0
+ Py_ssize_t i, lines = line_end - line_start
+ coliter_t it
+ const char *word = NULL
+ khiter_t k
+
+ na_count[0] = 0
+ coliter_setup(&it, parser, col, line_start)
+
+ if na_filter:
+ for i in range(lines):
+ COLITER_NEXT(it, word)
+
+ k = kh_get_str(na_hashset, word)
+ # in the hash table
+ if k != na_hashset.n_buckets:
+ na_count[0] += 1
+ data[0] = NA
+ data += 1
+ continue
+
+ k = kh_get_str(true_hashset, word)
+ if k != true_hashset.n_buckets:
+ data[0] = 1
+ data += 1
+ continue
+ k = kh_get_str(false_hashset, word)
+ if k != false_hashset.n_buckets:
+ data[0] = 0
+ data += 1
+ continue
+
+ error = to_boolean(word, data)
+ if error != 0:
+ return error
+ data += 1
+ else:
+ for i in range(lines):
+ COLITER_NEXT(it, word)
+
+ k = kh_get_str(true_hashset, word)
+ if k != true_hashset.n_buckets:
+ data[0] = 1
+ data += 1
+ continue
+
+ k = kh_get_str(false_hashset, word)
+ if k != false_hashset.n_buckets:
+ data[0] = 0
+ data += 1
+ continue
+
+ error = to_boolean(word, data)
+ if error != 0:
+ return error
+ data += 1
+
+ return 0
+
+
+cdef kh_str_t* kset_from_list(list values) except NULL:
+ # caller takes responsibility for freeing the hash table
+ cdef:
+ Py_ssize_t i
+ khiter_t k
+ kh_str_t *table
+ int ret = 0
+
+ object val
+
+ table = kh_init_str()
+
+ for i in range(len(values)):
+ val = values[i]
+
+ # None creeps in sometimes; only encoded bytes are accepted here
+ if not isinstance(val, bytes):
+ kh_destroy_str(table)
+ raise ValueError('Must be all encoded bytes')
+
+ k = kh_put_str(table, PyBytes_AsString(val), &ret)
+
+ return table
+
+
+cdef kh_float64_t* kset_float64_from_list(values) except NULL:
+ # caller takes responsibility for freeing the hash table
+ cdef:
+ Py_ssize_t i
+ khiter_t k
+ kh_float64_t *table
+ int ret = 0
+ float64_t val
+ object value
+
+ table = kh_init_float64()
+
+ for value in values:
+ val = float(value)
+
+ k = kh_put_float64(table, val, &ret)
+
+ return table
+
+
+cdef raise_parser_error(object base, parser_t *parser):
+ cdef:
+ object old_exc
+ object exc_type
+ PyObject *type
+ PyObject *value
+ PyObject *traceback
+
+ if PyErr_Occurred():
+ PyErr_Fetch(&type, &value, &traceback)
+ Py_XDECREF(traceback)
+
+ if value != NULL:
+ old_exc = <object>value
+ Py_XDECREF(value)
+
+ # PyErr_Fetch only returned the error message in *value,
+ # so the Exception class must be extracted from *type.
+ if isinstance(old_exc, compat.string_types):
+ if type != NULL:
+ exc_type = <object>type
+ else:
+ exc_type = ParserError
+
+ Py_XDECREF(type)
+ raise exc_type(old_exc)
+ else:
+ Py_XDECREF(type)
+ raise old_exc
+
+ message = '{base}. C error: '.format(base=base)
+ if parser.error_msg != NULL:
+ if PY3:
+ message += parser.error_msg.decode('utf-8')
+ else:
+ message += parser.error_msg
+ else:
+ message += 'no error message set'
+
+ raise ParserError(message)
+
+
+def _concatenate_chunks(list chunks):
+ cdef:
+ list names = list(chunks[0].keys())
+ object name
+ list warning_columns
+ object warning_names
+ object common_type
+
+ result = {}
+ warning_columns = list()
+ for name in names:
+ arrs = [chunk.pop(name) for chunk in chunks]
+ # Check each arr for consistent types.
+ dtypes = {a.dtype for a in arrs}
+ numpy_dtypes = {x for x in dtypes if not is_categorical_dtype(x)}
+ if len(numpy_dtypes) > 1:
+ common_type = np.find_common_type(numpy_dtypes, [])
+ if common_type == np.object:
+ warning_columns.append(str(name))
+
+ dtype = dtypes.pop()
+ if is_categorical_dtype(dtype):
+ sort_categories = isinstance(dtype, str)
+ result[name] = union_categoricals(arrs,
+ sort_categories=sort_categories)
+ else:
+ if is_extension_array_dtype(dtype):
+ array_type = dtype.construct_array_type()
+ result[name] = array_type._concat_same_type(arrs)
+ else:
+ result[name] = np.concatenate(arrs)
+
+ if warning_columns:
+ warning_names = ','.join(warning_columns)
+ warning_message = " ".join([
+ "Columns (%s) have mixed types." % warning_names,
+ "Specify dtype option on import or set low_memory=False."
+ ])
+ warnings.warn(warning_message, DtypeWarning, stacklevel=8)
+ return result
+
+
+# ----------------------------------------------------------------------
+# NA values
+def _compute_na_values():
+ int64info = np.iinfo(np.int64)
+ int32info = np.iinfo(np.int32)
+ int16info = np.iinfo(np.int16)
+ int8info = np.iinfo(np.int8)
+ uint64info = np.iinfo(np.uint64)
+ uint32info = np.iinfo(np.uint32)
+ uint16info = np.iinfo(np.uint16)
+ uint8info = np.iinfo(np.uint8)
+ na_values = {
+ np.float64: np.nan,
+ np.int64: int64info.min,
+ np.int32: int32info.min,
+ np.int16: int16info.min,
+ np.int8: int8info.min,
+ np.uint64: uint64info.max,
+ np.uint32: uint32info.max,
+ np.uint16: uint16info.max,
+ np.uint8: uint8info.max,
+ np.bool_: uint8info.max,
+ np.object_: np.nan # object columns use np.nan directly
+ }
+ return na_values
+
+
+na_values = _compute_na_values()
+
+for k in list(na_values):
+ na_values[np.dtype(k)] = na_values[k]
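+
+# For example, na_values[np.int64] is np.iinfo(np.int64).min, and the loop
+# above makes na_values[np.dtype('int64')] map to the same sentinel.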
+
+
+cdef _apply_converter(object f, parser_t *parser, int64_t col,
+ int64_t line_start, int64_t line_end,
+ char* c_encoding):
+ cdef:
+ int error
+ Py_ssize_t i, lines
+ coliter_t it
+ const char *word = NULL
+ char *errors = "strict"
+ ndarray[object] result
+ object val
+
+ lines = line_end - line_start
+ result = np.empty(lines, dtype=np.object_)
+
+ coliter_setup(&it, parser, col, line_start)
+
+ if not PY3 and c_encoding == NULL:
+ for i in range(lines):
+ COLITER_NEXT(it, word)
+ val = PyBytes_FromString(word)
+ result[i] = f(val)
+ elif ((PY3 and c_encoding == NULL) or c_encoding == b'utf-8'):
+ for i in range(lines):
+ COLITER_NEXT(it, word)
+ val = PyUnicode_FromString(word)
+ result[i] = f(val)
+ else:
+ for i in range(lines):
+ COLITER_NEXT(it, word)
+ val = PyUnicode_Decode(word, strlen(word),
+ c_encoding, errors)
+ result[i] = f(val)
+
+ return lib.maybe_convert_objects(result)
+
+
+def _maybe_encode(values):
+ if values is None:
+ return []
+ return [x.encode('utf-8') if isinstance(x, unicode) else x for x in values]
+
+
+def sanitize_objects(ndarray[object] values, set na_values,
+ convert_empty=True):
+ """
+ Convert specified values, including the given set na_values and empty
+ strings if convert_empty is True, to np.nan.
+
+ Parameters
+ ----------
+ values : ndarray[object]
+ na_values : set
+ convert_empty : bool (default True)
+ """
+ cdef:
+ Py_ssize_t i, n
+ object val, onan
+ Py_ssize_t na_count = 0
+ dict memo = {}
+
+ n = len(values)
+ onan = np.nan
+
+ for i in range(n):
+ val = values[i]
+ if (convert_empty and val == '') or (val in na_values):
+ values[i] = onan
+ na_count += 1
+ elif val in memo:
+ values[i] = memo[val]
+ else:
+ memo[val] = val
+
+ return na_count
diff --git a/contrib/python/pandas/py2/pandas/_libs/properties.pyx b/contrib/python/pandas/py2/pandas/_libs/properties.pyx
new file mode 100644
index 00000000000..d2fbf5aa66f
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/_libs/properties.pyx
@@ -0,0 +1,69 @@
+# -*- coding: utf-8 -*-
+
+from cython import Py_ssize_t
+
+from cpython cimport (
+ PyDict_Contains, PyDict_GetItem, PyDict_SetItem)
+
+
+cdef class CachedProperty(object):
+
+ cdef readonly:
+ object func, name, __doc__
+
+ def __init__(self, func):
+ self.func = func
+ self.name = func.__name__
+ self.__doc__ = getattr(func, '__doc__', None)
+
+ def __get__(self, obj, typ):
+ if obj is None:
+ # accessed on the class, not the instance
+ return self
+
+ # Get the cache or set a default one if needed
+ cache = getattr(obj, '_cache', None)
+ if cache is None:
+ try:
+ cache = obj._cache = {}
+ except AttributeError:
+ return self
+
+ if PyDict_Contains(cache, self.name):
+ # not necessary to Py_INCREF
+ val = <object>PyDict_GetItem(cache, self.name)
+ else:
+ val = self.func(obj)
+ PyDict_SetItem(cache, self.name, val)
+ return val
+
+ def __set__(self, obj, value):
+ raise AttributeError("Can't set attribute")
+
+
+cache_readonly = CachedProperty
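+
+# Illustrative use of the descriptor (class and method names here are
+# hypothetical, not from pandas):
+#
+#     class Thing(object):
+#         @cache_readonly
+#         def value(self):
+#             return expensive_compute(self)   # result cached in self._cache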
+
+
+cdef class AxisProperty(object):
+
+ cdef readonly:
+ Py_ssize_t axis
+ object __doc__
+
+ def __init__(self, axis=0, doc=""):
+ self.axis = axis
+ self.__doc__ = doc
+
+ def __get__(self, obj, type):
+ cdef:
+ list axes
+
+ if obj is None:
+ # Only instances have _data, not classes
+ return self
+ else:
+ axes = obj._data.axes
+ return axes[self.axis]
+
+ def __set__(self, obj, value):
+ obj._set_axis(self.axis, value)
diff --git a/contrib/python/pandas/py2/pandas/_libs/reduction.pyx b/contrib/python/pandas/py2/pandas/_libs/reduction.pyx
new file mode 100644
index 00000000000..ca39c4de4d3
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/_libs/reduction.pyx
@@ -0,0 +1,641 @@
+# -*- coding: utf-8 -*-
+from distutils.version import LooseVersion
+
+from cython import Py_ssize_t
+from cpython cimport Py_INCREF
+
+from libc.stdlib cimport malloc, free
+
+import numpy as np
+cimport numpy as cnp
+from numpy cimport (ndarray,
+ int64_t,
+ PyArray_SETITEM,
+ PyArray_ITER_NEXT, PyArray_ITER_DATA, PyArray_IterNew,
+ flatiter)
+cnp.import_array()
+
+cimport pandas._libs.util as util
+from pandas._libs.lib import maybe_convert_objects
+
+
+cdef _get_result_array(object obj, Py_ssize_t size, Py_ssize_t cnt):
+
+ if (util.is_array(obj) or
+ (isinstance(obj, list) and len(obj) == cnt) or
+ getattr(obj, 'shape', None) == (cnt,)):
+ raise ValueError('function does not reduce')
+
+ return np.empty(size, dtype='O')
+
+
+cdef class Reducer:
+ """
+ Performs generic reduction operation on a C or Fortran-contiguous ndarray
+ while avoiding ndarray construction overhead
+ """
+ cdef:
+ Py_ssize_t increment, chunksize, nresults
+ object arr, dummy, f, labels, typ, ityp, index
+
+ def __init__(self, object arr, object f, axis=1, dummy=None,
+ labels=None):
+ n, k = arr.shape
+
+ if axis == 0:
+ if not arr.flags.f_contiguous:
+ arr = arr.copy('F')
+
+ self.nresults = k
+ self.chunksize = n
+ self.increment = n * arr.dtype.itemsize
+ else:
+ if not arr.flags.c_contiguous:
+ arr = arr.copy('C')
+
+ self.nresults = n
+ self.chunksize = k
+ self.increment = k * arr.dtype.itemsize
+
+ self.f = f
+ self.arr = arr
+ self.labels = labels
+ self.dummy, self.typ, self.index, self.ityp = self._check_dummy(
+ dummy=dummy)
+
+ def _check_dummy(self, dummy=None):
+ cdef object index=None, typ=None, ityp=None
+
+ if dummy is None:
+ dummy = np.empty(self.chunksize, dtype=self.arr.dtype)
+
+ # our ref is stolen later since we are creating this array
+ # in cython, so increment first
+ Py_INCREF(dummy)
+
+ else:
+
+ # we passed a series-like
+ if hasattr(dummy, 'values'):
+
+ typ = type(dummy)
+ index = getattr(dummy, 'index', None)
+ dummy = dummy.values
+
+ if dummy.dtype != self.arr.dtype:
+ raise ValueError('Dummy array must be same dtype')
+ if len(dummy) != self.chunksize:
+ raise ValueError('Dummy array must be length %d' %
+ self.chunksize)
+
+ return dummy, typ, index, ityp
+
+ def get_result(self):
+ cdef:
+ char* dummy_buf
+ ndarray arr, result, chunk
+ Py_ssize_t i, incr
+ flatiter it
+ bint has_labels
+ object res, name, labels, index
+ object cached_typ=None
+
+ arr = self.arr
+ chunk = self.dummy
+ dummy_buf = chunk.data
+ chunk.data = arr.data
+ labels = self.labels
+ has_labels = labels is not None
+ has_index = self.index is not None
+ incr = self.increment
+
+ try:
+ for i in range(self.nresults):
+
+ if has_labels:
+ name = util.get_value_at(labels, i)
+ else:
+ name = None
+
+ # create the cached type
+ # each time just reassign the data
+ if i == 0:
+
+ if self.typ is not None:
+
+ # recreate with the index if supplied
+ if has_index:
+
+ cached_typ = self.typ(
+ chunk, index=self.index, name=name)
+
+ else:
+
+ # use the passed typ, sans index
+ cached_typ = self.typ(chunk, name=name)
+
+ # use the cached_typ if possible
+ if cached_typ is not None:
+
+ if has_index:
+ object.__setattr__(cached_typ, 'index', self.index)
+
+ object.__setattr__(
+ cached_typ._data._block, 'values', chunk)
+ object.__setattr__(cached_typ, 'name', name)
+ res = self.f(cached_typ)
+ else:
+ res = self.f(chunk)
+
+ if hasattr(res, 'values') and util.is_array(res.values):
+ res = res.values
+ if i == 0:
+ result = _get_result_array(res,
+ self.nresults,
+ len(self.dummy))
+ it = <flatiter>PyArray_IterNew(result)
+
+ PyArray_SETITEM(result, PyArray_ITER_DATA(it), res)
+ chunk.data = chunk.data + self.increment
+ PyArray_ITER_NEXT(it)
+ except Exception as e:
+ if hasattr(e, 'args'):
+ e.args = e.args + (i,)
+ raise
+ finally:
+ # so we don't free the wrong memory
+ chunk.data = dummy_buf
+
+ if result.dtype == np.object_:
+ result = maybe_convert_objects(result)
+
+ return result
+
+
+cdef class SeriesBinGrouper:
+ """
+ Performs grouping operation according to bin edges, rather than labels
+ """
+ cdef:
+ Py_ssize_t nresults, ngroups
+ bint passed_dummy
+
+ cdef public:
+ object arr, index, dummy_arr, dummy_index
+ object values, f, bins, typ, ityp, name
+
+ def __init__(self, object series, object f, object bins, object dummy):
+ n = len(series)
+
+ self.bins = bins
+ self.f = f
+
+ values = series.values
+ if not values.flags.c_contiguous:
+ values = values.copy('C')
+ self.arr = values
+ self.typ = series._constructor
+ self.ityp = series.index._constructor
+ self.index = series.index.values
+ self.name = getattr(series, 'name', None)
+
+ self.dummy_arr, self.dummy_index = self._check_dummy(dummy)
+ self.passed_dummy = dummy is not None
+
+ # kludge for #1688
+ if len(bins) > 0 and bins[-1] == len(series):
+ self.ngroups = len(bins)
+ else:
+ self.ngroups = len(bins) + 1
+
+ def _check_dummy(self, dummy=None):
+ # both values and index must be an ndarray!
+
+ if dummy is None:
+ values = np.empty(0, dtype=self.arr.dtype)
+ index = None
+ else:
+ values = dummy.values
+ if values.dtype != self.arr.dtype:
+ raise ValueError('Dummy array must be same dtype')
+ if not values.flags.contiguous:
+ values = values.copy()
+ index = dummy.index.values
+ if not index.flags.contiguous:
+ index = index.copy()
+
+ return values, index
+
+ def get_result(self):
+ cdef:
+ ndarray arr, result
+ ndarray[int64_t] counts
+ Py_ssize_t i, n, group_size
+ object res
+ bint initialized = 0
+ Slider vslider, islider
+ object name, cached_typ=None, cached_ityp=None
+
+ counts = np.zeros(self.ngroups, dtype=np.int64)
+
+ if self.ngroups > 0:
+ counts[0] = self.bins[0]
+ for i in range(1, self.ngroups):
+ if i == self.ngroups - 1:
+ counts[i] = len(self.arr) - self.bins[i - 1]
+ else:
+ counts[i] = self.bins[i] - self.bins[i - 1]
+
+ group_size = 0
+ n = len(self.arr)
+ name = self.name
+
+ vslider = Slider(self.arr, self.dummy_arr)
+ islider = Slider(self.index, self.dummy_index)
+
+ try:
+ for i in range(self.ngroups):
+ group_size = counts[i]
+
+ islider.set_length(group_size)
+ vslider.set_length(group_size)
+
+ if cached_typ is None:
+ cached_ityp = self.ityp(islider.buf)
+ cached_typ = self.typ(vslider.buf, index=cached_ityp,
+ name=name)
+ else:
+ # See the comment in indexes/base.py about _index_data.
+ # We need this for EA-backed indexes that have a reference
+ # to a 1-d ndarray like datetime / timedelta / period.
+ object.__setattr__(cached_ityp, '_index_data', islider.buf)
+ cached_ityp._engine.clear_mapping()
+ object.__setattr__(
+ cached_typ._data._block, 'values', vslider.buf)
+ object.__setattr__(cached_typ, '_index', cached_ityp)
+ object.__setattr__(cached_typ, 'name', name)
+
+ cached_ityp._engine.clear_mapping()
+ res = self.f(cached_typ)
+ res = _extract_result(res)
+ if not initialized:
+ initialized = 1
+ result = _get_result_array(res,
+ self.ngroups,
+ len(self.dummy_arr))
+ result[i] = res
+
+ islider.advance(group_size)
+ vslider.advance(group_size)
+
+ except:
+ raise
+ finally:
+ # so we don't free the wrong memory
+ islider.reset()
+ vslider.reset()
+
+ if result.dtype == np.object_:
+ result = maybe_convert_objects(result)
+
+ return result, counts
+
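+# Worked sketch (illustrative only; the numbers are hypothetical): how the
+# group sizes in get_result above fall out of the bin edges.
+#
+# >>> import numpy as np
+# >>> bins = np.array([2, 5], dtype=np.int64)    # edges into a length-7 series
+# >>> ngroups = len(bins) + 1                    # bins[-1] != 7, so 3 groups
+# >>> [bins[0], bins[1] - bins[0], 7 - bins[1]]  # counts per group
+# [2, 3, 2]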
+
+cdef class SeriesGrouper:
+ """
+ Performs generic grouping operation while avoiding ndarray construction
+ overhead
+ """
+ cdef:
+ Py_ssize_t nresults, ngroups
+ bint passed_dummy
+
+ cdef public:
+ object arr, index, dummy_arr, dummy_index
+ object f, labels, values, typ, ityp, name
+
+ def __init__(self, object series, object f, object labels,
+ Py_ssize_t ngroups, object dummy):
+ n = len(series)
+
+ self.labels = labels
+ self.f = f
+
+ values = series.values
+ if not values.flags.c_contiguous:
+ values = values.copy('C')
+ self.arr = values
+ self.typ = series._constructor
+ self.ityp = series.index._constructor
+ self.index = series.index.values
+ self.name = getattr(series, 'name', None)
+
+ self.dummy_arr, self.dummy_index = self._check_dummy(dummy)
+ self.passed_dummy = dummy is not None
+ self.ngroups = ngroups
+
+ def _check_dummy(self, dummy=None):
+ # both values and index must be an ndarray!
+
+ if dummy is None:
+ values = np.empty(0, dtype=self.arr.dtype)
+ index = None
+ else:
+ values = dummy.values
+ if dummy.dtype != self.arr.dtype:
+ raise ValueError('Dummy array must be same dtype')
+ if not values.flags.contiguous:
+ values = values.copy()
+ index = dummy.index.values
+ if not index.flags.contiguous:
+ index = index.copy()
+
+ return values, index
+
+ def get_result(self):
+ cdef:
+ ndarray arr, result
+ ndarray[int64_t] labels, counts
+ Py_ssize_t i, n, group_size, lab
+ object res
+ bint initialized = 0
+ Slider vslider, islider
+ object name, cached_typ=None, cached_ityp=None
+
+ labels = self.labels
+ counts = np.zeros(self.ngroups, dtype=np.int64)
+ group_size = 0
+ n = len(self.arr)
+ name = self.name
+
+ vslider = Slider(self.arr, self.dummy_arr)
+ islider = Slider(self.index, self.dummy_index)
+
+ try:
+ for i in range(n):
+ group_size += 1
+
+ lab = labels[i]
+
+ if i == n - 1 or lab != labels[i + 1]:
+ if lab == -1:
+ islider.advance(group_size)
+ vslider.advance(group_size)
+ group_size = 0
+ continue
+
+ islider.set_length(group_size)
+ vslider.set_length(group_size)
+
+ if cached_typ is None:
+ cached_ityp = self.ityp(islider.buf)
+ cached_typ = self.typ(vslider.buf, index=cached_ityp,
+ name=name)
+ else:
+ object.__setattr__(cached_ityp, '_data', islider.buf)
+ cached_ityp._engine.clear_mapping()
+ object.__setattr__(
+ cached_typ._data._block, 'values', vslider.buf)
+ object.__setattr__(cached_typ, '_index', cached_ityp)
+ object.__setattr__(cached_typ, 'name', name)
+
+ cached_ityp._engine.clear_mapping()
+ res = self.f(cached_typ)
+ res = _extract_result(res)
+ if not initialized:
+ initialized = 1
+ result = _get_result_array(res,
+ self.ngroups,
+ len(self.dummy_arr))
+
+ result[lab] = res
+ counts[lab] = group_size
+ islider.advance(group_size)
+ vslider.advance(group_size)
+
+ group_size = 0
+
+ except:
+ raise
+ finally:
+ # so we don't free the wrong memory
+ islider.reset()
+ vslider.reset()
+
+ if result.dtype == np.object_:
+ result = maybe_convert_objects(result)
+
+ return result, counts
+
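+# Worked sketch (illustrative only): get_result above expects the labels to be
+# pre-sorted so that every group is a contiguous run; a label of -1 marks rows
+# that are skipped.  With labels == [0, 0, 1, -1, 2, 2] and ngroups == 3, the
+# runs end at i == 1, 2, 3 and 5, the -1 run is dropped, and the result is
+# counts == [2, 1, 2].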
+
+cdef inline _extract_result(object res):
+ """ Extract the result object; it might be a 0-dim ndarray,
+ a length-1 1-dim ndarray, or a scalar """
+ if hasattr(res, 'values') and util.is_array(res.values):
+ res = res.values
+ if not np.isscalar(res):
+ if util.is_array(res):
+ if res.ndim == 0:
+ res = res.item()
+ elif res.ndim == 1 and len(res) == 1:
+ res = res[0]
+ return res
+
+
+cdef class Slider:
+ """
+ Only handles contiguous data for now
+ """
+ cdef:
+ ndarray values, buf
+ Py_ssize_t stride, orig_len, orig_stride
+ char *orig_data
+
+ def __init__(self, object values, object buf):
+ assert(values.ndim == 1)
+
+ if not values.flags.contiguous:
+ values = values.copy()
+
+ assert(values.dtype == buf.dtype)
+ self.values = values
+ self.buf = buf
+ self.stride = values.strides[0]
+
+ self.orig_data = self.buf.data
+ self.orig_len = self.buf.shape[0]
+ self.orig_stride = self.buf.strides[0]
+
+ self.buf.data = self.values.data
+ self.buf.strides[0] = self.stride
+
+ cpdef advance(self, Py_ssize_t k):
+ self.buf.data = <char*>self.buf.data + self.stride * k
+
+ cdef move(self, int start, int end):
+ """
+ For slicing
+ """
+ self.buf.data = self.values.data + self.stride * start
+ self.buf.shape[0] = end - start
+
+ cpdef set_length(self, Py_ssize_t length):
+ self.buf.shape[0] = length
+
+ cpdef reset(self):
+
+ self.buf.shape[0] = self.orig_len
+ self.buf.data = self.orig_data
+ self.buf.strides[0] = self.orig_stride
+
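+# Usage sketch (illustrative only), assuming the extension builds as
+# pandas._libs.reduction: a Slider repoints `buf` so that it becomes a
+# zero-copy window into `values`; reset() restores the original buffer state.
+#
+# >>> import numpy as np
+# >>> from pandas._libs.reduction import Slider
+# >>> values = np.arange(6, dtype=np.float64)
+# >>> buf = np.empty(0, dtype=np.float64)
+# >>> s = Slider(values, buf)
+# >>> s.set_length(2)            # buf now views values[0:2] -> [0., 1.]
+# >>> s.advance(2)               # buf now views values[2:4] -> [2., 3.]
+# >>> s.reset()                  # buf's original data/shape/strides restored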
+
+class InvalidApply(Exception):
+ pass
+
+
+def apply_frame_axis0(object frame, object f, object names,
+ ndarray[int64_t] starts, ndarray[int64_t] ends):
+ cdef:
+ BlockSlider slider
+ Py_ssize_t i, n = len(starts)
+ list results
+ object piece
+ dict item_cache
+
+ if frame.index._has_complex_internals:
+ raise InvalidApply('Cannot modify frame index internals')
+
+ results = []
+
+ # Need to infer if our low-level mucking is going to cause a segfault
+ if n > 0:
+ chunk = frame.iloc[starts[0]:ends[0]]
+ object.__setattr__(chunk, 'name', names[0])
+ try:
+ result = f(chunk)
+ if result is chunk:
+ raise InvalidApply('Function unsafe for fast apply')
+ except:
+ raise InvalidApply('Let this error raise above us')
+
+ slider = BlockSlider(frame)
+
+ mutated = False
+ item_cache = slider.dummy._item_cache
+ try:
+ for i in range(n):
+ slider.move(starts[i], ends[i])
+
+ item_cache.clear() # ugh
+
+ object.__setattr__(slider.dummy, 'name', names[i])
+ piece = f(slider.dummy)
+
+ # I'm paying the price for index-sharing, ugh
+ try:
+ if piece.index is slider.dummy.index:
+ piece = piece.copy(deep='all')
+ else:
+ mutated = True
+ except AttributeError:
+ pass
+
+ results.append(piece)
+ finally:
+ slider.reset()
+
+ return results, mutated
+
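+# Usage sketch (illustrative only; the frame, groups and function below are
+# hypothetical): apply_frame_axis0 takes pre-computed group boundaries as
+# int64 positions plus one name per group, and returns the list of pieces.
+#
+# >>> import numpy as np
+# >>> import pandas as pd
+# >>> from pandas._libs.reduction import apply_frame_axis0
+# >>> df = pd.DataFrame({'a': [1., 2., 3., 4.]}, index=np.arange(4))
+# >>> starts = np.array([0, 2], dtype=np.int64)
+# >>> ends = np.array([2, 4], dtype=np.int64)
+# >>> pieces, mutated = apply_frame_axis0(df, lambda x: x.sum(), ['g1', 'g2'],
+# ...                                     starts, ends)
+# >>> len(pieces)                # one reduced piece per (start, end) slice
+# 2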
+
+cdef class BlockSlider:
+ """
+ Only capable of sliding on axis=0
+ """
+
+ cdef public:
+ object frame, dummy, index
+ int nblocks
+ Slider idx_slider
+ list blocks
+
+ cdef:
+ char **base_ptrs
+
+ def __init__(self, frame):
+ self.frame = frame
+ self.dummy = frame[:0]
+ self.index = self.dummy.index
+
+ self.blocks = [b.values for b in self.dummy._data.blocks]
+
+ for x in self.blocks:
+ util.set_array_not_contiguous(x)
+
+ self.nblocks = len(self.blocks)
+ # See the comment in indexes/base.py about _index_data.
+ # We need this for EA-backed indexes that have a reference to a 1-d
+ # ndarray like datetime / timedelta / period.
+ self.idx_slider = Slider(
+ self.frame.index._index_data, self.dummy.index._index_data)
+
+ self.base_ptrs = <char**>malloc(sizeof(char*) * len(self.blocks))
+ for i, block in enumerate(self.blocks):
+ self.base_ptrs[i] = (<ndarray>block).data
+
+ def __dealloc__(self):
+ free(self.base_ptrs)
+
+ cpdef move(self, int start, int end):
+ cdef:
+ ndarray arr
+ object index
+
+ # move blocks
+ for i in range(self.nblocks):
+ arr = self.blocks[i]
+
+ # axis=1 is the frame's axis=0
+ arr.data = self.base_ptrs[i] + arr.strides[1] * start
+ arr.shape[1] = end - start
+
+ # move and set the index
+ self.idx_slider.move(start, end)
+
+ object.__setattr__(self.index, '_index_data', self.idx_slider.buf)
+ self.index._engine.clear_mapping()
+
+ cdef reset(self):
+ cdef:
+ ndarray arr
+
+ # reset blocks
+ for i in range(self.nblocks):
+ arr = self.blocks[i]
+
+ # axis=1 is the frame's axis=0
+ arr.data = self.base_ptrs[i]
+ arr.shape[1] = 0
+
+
+def reduce(arr, f, axis=0, dummy=None, labels=None):
+ """
+ Apply the reduction function f to each slice of arr along the given axis.
+
+ Parameters
+ ----------
+ arr : ndarray (two-dimensional)
+ f : function
+ axis : integer axis
+ dummy : type of reduced output (series)
+ labels : Index or None
+ """
+
+ if labels is not None:
+ if labels._has_complex_internals:
+ raise Exception('Cannot use shortcut')
+
+ # pass as an ndarray
+ if hasattr(labels, 'values'):
+ labels = labels.values
+
+ reducer = Reducer(arr, f, axis=axis, dummy=dummy, labels=labels)
+ return reducer.get_result()
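+
+# Usage sketch (illustrative only), assuming the extension builds as
+# pandas._libs.reduction: reducing each row of a 2-D ndarray.
+#
+# >>> import numpy as np
+# >>> from pandas._libs.reduction import reduce
+# >>> arr = np.arange(6, dtype=np.float64).reshape(2, 3)
+# >>> reduce(arr, np.sum, axis=1)          # row sums -> array([3., 12.])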
diff --git a/contrib/python/pandas/py2/pandas/_libs/reshape.pyx b/contrib/python/pandas/py2/pandas/_libs/reshape.pyx
new file mode 100644
index 00000000000..9f4e67ca4e2
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/_libs/reshape.pyx
@@ -0,0 +1,95 @@
+# -*- coding: utf-8 -*-
+
+import cython
+from cython import Py_ssize_t
+
+from numpy cimport (int8_t, int16_t, int32_t, int64_t, uint8_t, uint16_t,
+ uint32_t, uint64_t, float32_t, float64_t)
+
+
+ctypedef fused reshape_t:
+ uint8_t
+ uint16_t
+ uint32_t
+ uint64_t
+ int8_t
+ int16_t
+ int32_t
+ int64_t
+ float32_t
+ float64_t
+ object
+
+
+def unstack(reshape_t[:, :] values, uint8_t[:] mask,
+ Py_ssize_t stride, Py_ssize_t length, Py_ssize_t width,
+ reshape_t[:, :] new_values, uint8_t[:, :] new_mask):
+ """
+ transform long sorted_values to wide new_values
+
+ Parameters
+ ----------
+ values : typed ndarray
+ mask : boolean ndarray
+ stride : int
+ length : int
+ width : int
+ new_values : typed ndarray
+ result array
+ new_mask : boolean ndarray
+ result mask
+ """
+ cdef:
+ Py_ssize_t i, j, w, nulls, s, offset
+
+ if reshape_t is not object:
+ # evaluated at compile-time
+ with nogil:
+ for i in range(stride):
+
+ nulls = 0
+ for j in range(length):
+
+ for w in range(width):
+
+ offset = j * width + w
+
+ if mask[offset]:
+ s = i * width + w
+ new_values[j, s] = values[offset - nulls, i]
+ new_mask[j, s] = 1
+ else:
+ nulls += 1
+
+ else:
+ # object-dtype, identical to above but we cannot use nogil
+ for i in range(stride):
+
+ nulls = 0
+ for j in range(length):
+
+ for w in range(width):
+
+ offset = j * width + w
+
+ if mask[offset]:
+ s = i * width + w
+ new_values[j, s] = values[offset - nulls, i]
+ new_mask[j, s] = 1
+ else:
+ nulls += 1
+
+
+unstack_uint8 = unstack["uint8_t"]
+unstack_uint16 = unstack["uint16_t"]
+unstack_uint32 = unstack["uint32_t"]
+unstack_uint64 = unstack["uint64_t"]
+unstack_int8 = unstack["int8_t"]
+unstack_int16 = unstack["int16_t"]
+unstack_int32 = unstack["int32_t"]
+unstack_int64 = unstack["int64_t"]
+unstack_float32 = unstack["float32_t"]
+unstack_float64 = unstack["float64_t"]
+unstack_object = unstack["object"]
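+
+# Worked sketch (illustrative only) of the long-to-wide mapping performed by
+# the specializations above, assuming the module builds as
+# pandas._libs.reshape:
+#
+# >>> import numpy as np
+# >>> from pandas._libs.reshape import unstack_float64
+# >>> values = np.array([[10.], [20.], [30.]])       # non-null values, stride=1
+# >>> mask = np.array([1, 0, 1, 1], dtype=np.uint8)  # length=2, width=2
+# >>> new_values = np.full((2, 2), np.nan)
+# >>> new_mask = np.zeros((2, 2), dtype=np.uint8)
+# >>> unstack_float64(values, mask, 1, 2, 2, new_values, new_mask)
+# >>> new_values        # -> [[10., nan], [20., 30.]], new_mask [[1, 0], [1, 1]]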
diff --git a/contrib/python/pandas/py2/pandas/_libs/skiplist.pxd b/contrib/python/pandas/py2/pandas/_libs/skiplist.pxd
new file mode 100644
index 00000000000..a273d2c445d
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/_libs/skiplist.pxd
@@ -0,0 +1,45 @@
+# -*- coding: utf-8 -*-
+
+from cython cimport Py_ssize_t
+
+
+cdef extern from "src/skiplist.h":
+ ctypedef struct node_t:
+ node_t **next
+ int *width
+ double value
+ int is_nil
+ int levels
+ int ref_count
+
+ ctypedef struct skiplist_t:
+ node_t *head
+ node_t **tmp_chain
+ int *tmp_steps
+ int size
+ int maxlevels
+
+ skiplist_t* skiplist_init(int) nogil
+ void skiplist_destroy(skiplist_t*) nogil
+ double skiplist_get(skiplist_t*, int, int*) nogil
+ int skiplist_insert(skiplist_t*, double) nogil
+ int skiplist_remove(skiplist_t*, double) nogil
+
+
+# Note: Node is declared here so that IndexableSkiplist can be exposed;
+# Node itself not intended to be exposed.
+cdef class Node:
+ cdef public:
+ double value
+ list next
+ list width
+
+
+cdef class IndexableSkiplist:
+ cdef:
+ Py_ssize_t size, maxlevels
+ Node head
+
+ cpdef get(self, Py_ssize_t i)
+ cpdef insert(self, double value)
+ cpdef remove(self, double value)
diff --git a/contrib/python/pandas/py2/pandas/_libs/skiplist.pyx b/contrib/python/pandas/py2/pandas/_libs/skiplist.pyx
new file mode 100644
index 00000000000..6698fcb767d
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/_libs/skiplist.pyx
@@ -0,0 +1,145 @@
+# Cython version of IndexableSkiplist, for implementing moving median
+# with O(log n) updates
+# Original author: Raymond Hettinger
+# Original license: MIT
+# Link: http://code.activestate.com/recipes/576930/
+
+# Cython version: Wes McKinney
+from random import random
+
+from libc.math cimport log
+
+import numpy as np
+
+
+# MSVC does not have log2!
+
+cdef double Log2(double x):
+ return log(x) / log(2.)
+
+
+# TODO: optimize this, make less messy
+
+cdef class Node:
+ # cdef public:
+ # double value
+ # list next
+ # list width
+
+ def __init__(self, double value, list next, list width):
+ self.value = value
+ self.next = next
+ self.width = width
+
+
+# Singleton terminator node
+NIL = Node(np.inf, [], [])
+
+
+cdef class IndexableSkiplist:
+ """
+ Sorted collection supporting O(lg n) insertion, removal, and
+ lookup by rank.
+ """
+ # cdef:
+ # Py_ssize_t size, maxlevels
+ # Node head
+
+ def __init__(self, expected_size=100):
+ self.size = 0
+ self.maxlevels = int(1 + Log2(expected_size))
+ self.head = Node(np.NaN, [NIL] * self.maxlevels, [1] * self.maxlevels)
+
+ def __len__(self):
+ return self.size
+
+ def __getitem__(self, i):
+ return self.get(i)
+
+ cpdef get(self, Py_ssize_t i):
+ cdef Py_ssize_t level
+ cdef Node node
+
+ node = self.head
+ i += 1
+
+ for level in range(self.maxlevels - 1, -1, -1):
+ while node.width[level] <= i:
+ i -= node.width[level]
+ node = node.next[level]
+
+ return node.value
+
+ cpdef insert(self, double value):
+ cdef Py_ssize_t level, steps, d
+ cdef Node node, prevnode, newnode, next_at_level, tmp
+ cdef list chain, steps_at_level
+
+ # find first node on each level where node.next[levels].value > value
+ chain = [None] * self.maxlevels
+ steps_at_level = [0] * self.maxlevels
+ node = self.head
+
+ for level in range(self.maxlevels - 1, -1, -1):
+ next_at_level = node.next[level]
+
+ while next_at_level.value <= value:
+ steps_at_level[level] = (steps_at_level[level] +
+ node.width[level])
+ node = next_at_level
+ next_at_level = node.next[level]
+
+ chain[level] = node
+
+ # insert a link to the newnode at each level
+ d = min(self.maxlevels, 1 - int(Log2(random())))
+ newnode = Node(value, [None] * d, [None] * d)
+ steps = 0
+
+ for level in range(d):
+ prevnode = chain[level]
+ newnode.next[level] = prevnode.next[level]
+ prevnode.next[level] = newnode
+ newnode.width[level] = (prevnode.width[level] - steps)
+ prevnode.width[level] = steps + 1
+ steps += steps_at_level[level]
+
+ for level in range(d, self.maxlevels):
+ (<Node>chain[level]).width[level] += 1
+
+ self.size += 1
+
+ cpdef remove(self, double value):
+ cdef Py_ssize_t level, d
+ cdef Node node, prevnode, tmpnode, next_at_level
+ cdef list chain
+
+ # find first node on each level where node.next[levels].value >= value
+ chain = [None] * self.maxlevels
+ node = self.head
+
+ for level in range(self.maxlevels - 1, -1, -1):
+ next_at_level = node.next[level]
+ while next_at_level.value < value:
+ node = next_at_level
+ next_at_level = node.next[level]
+
+ chain[level] = node
+
+ if value != (<Node>(<Node>(<Node>chain[0]).next)[0]).value:
+ raise KeyError('Not Found')
+
+ # remove one link at each level
+ d = len((<Node>(<Node>(<Node>chain[0]).next)[0]).next)
+
+ for level in range(d):
+ prevnode = chain[level]
+ tmpnode = prevnode.next[level]
+ prevnode.width[level] += tmpnode.width[level] - 1
+ prevnode.next[level] = tmpnode.next[level]
+
+ for level in range(d, self.maxlevels):
+ tmpnode = chain[level]
+ tmpnode.width[level] -= 1
+
+ self.size -= 1
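+
+# Usage sketch (illustrative only), assuming the extension builds as
+# pandas._libs.skiplist: a sorted collection with O(log n) insert/remove and
+# lookup by rank, used by the rolling-median code.
+#
+# >>> from pandas._libs.skiplist import IndexableSkiplist
+# >>> sl = IndexableSkiplist(expected_size=8)
+# >>> for v in (5.0, 1.0, 3.0):
+# ...     sl.insert(v)
+# >>> sl[0], sl[1], sl[2]        # values come back in sorted order
+# (1.0, 3.0, 5.0)
+# >>> sl.remove(3.0); len(sl)
+# 2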
diff --git a/contrib/python/pandas/py2/pandas/_libs/sparse.pyx b/contrib/python/pandas/py2/pandas/_libs/sparse.pyx
new file mode 100644
index 00000000000..f5980998f6d
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/_libs/sparse.pyx
@@ -0,0 +1,807 @@
+# -*- coding: utf-8 -*-
+import cython
+
+import numpy as np
+cimport numpy as cnp
+from numpy cimport (ndarray, uint8_t, int64_t, int32_t, int16_t, int8_t,
+ float64_t, float32_t)
+cnp.import_array()
+
+
+# -----------------------------------------------------------------------------
+# Preamble stuff
+
+cdef float64_t NaN = <float64_t>np.NaN
+cdef float64_t INF = <float64_t>np.inf
+
+# -----------------------------------------------------------------------------
+
+
+cdef class SparseIndex:
+ """
+ Abstract superclass for sparse index types.
+ """
+
+ def __init__(self):
+ raise NotImplementedError
+
+
+cdef class IntIndex(SparseIndex):
+ """
+ Object for holding exact integer sparse indexing information
+
+ Parameters
+ ----------
+ length : integer
+ indices : array-like
+ Contains integers corresponding to the indices.
+ """
+
+ cdef readonly:
+ Py_ssize_t length, npoints
+ ndarray indices
+
+ def __init__(self, Py_ssize_t length, indices):
+ self.length = length
+ self.indices = np.ascontiguousarray(indices, dtype=np.int32)
+ self.npoints = len(self.indices)
+
+ self.check_integrity()
+
+ def __reduce__(self):
+ args = (self.length, self.indices)
+ return IntIndex, args
+
+ def __repr__(self):
+ output = 'IntIndex\n'
+ output += 'Indices: %s\n' % repr(self.indices)
+ return output
+
+ @property
+ def nbytes(self):
+ return self.indices.nbytes
+
+ def check_integrity(self):
+ """
+ Checks the following:
+
+ - Indices are strictly ascending
+ - Number of indices is at most self.length
+ - Indices are at least 0 and at most the total length less one
+
+ A ValueError is raised if any of these conditions is violated.
+ """
+
+ cdef:
+ int32_t index, prev = -1
+
+ if self.npoints > self.length:
+ msg = ("Too many indices. Expected "
+ "{exp} but found {act}").format(
+ exp=self.length, act=self.npoints)
+ raise ValueError(msg)
+
+ # Indices are vacuously ordered and non-negative
+ # if the sequence of indices is empty.
+ if self.npoints == 0:
+ return
+
+ if min(self.indices) < 0:
+ raise ValueError("No index can be less than zero")
+
+ if max(self.indices) >= self.length:
+ raise ValueError("All indices must be less than the length")
+
+ for index in self.indices:
+ if prev != -1 and index <= prev:
+ raise ValueError("Indices must be strictly increasing")
+
+ prev = index
+
+ def equals(self, other):
+ if not isinstance(other, IntIndex):
+ return False
+
+ if self is other:
+ return True
+
+ same_length = self.length == other.length
+ same_indices = np.array_equal(self.indices, other.indices)
+ return same_length and same_indices
+
+ @property
+ def ngaps(self):
+ return self.length - self.npoints
+
+ def to_int_index(self):
+ return self
+
+ def to_block_index(self):
+ locs, lens = get_blocks(self.indices)
+ return BlockIndex(self.length, locs, lens)
+
+ cpdef IntIndex intersect(self, SparseIndex y_):
+ cdef:
+ Py_ssize_t out_length, xi, yi = 0, result_indexer = 0
+ int32_t xind
+ ndarray[int32_t, ndim=1] xindices, yindices, new_indices
+ IntIndex y
+
+ # if is one already, returns self
+ y = y_.to_int_index()
+
+ if self.length != y.length:
+ raise Exception('Indices must reference same underlying length')
+
+ xindices = self.indices
+ yindices = y.indices
+ new_indices = np.empty(min(
+ len(xindices), len(yindices)), dtype=np.int32)
+
+ for xi in range(self.npoints):
+ xind = xindices[xi]
+
+ while yi < y.npoints and yindices[yi] < xind:
+ yi += 1
+
+ if yi >= y.npoints:
+ break
+
+ # TODO: would a two-pass algorithm be faster?
+ if yindices[yi] == xind:
+ new_indices[result_indexer] = xind
+ result_indexer += 1
+
+ new_indices = new_indices[:result_indexer]
+ return IntIndex(self.length, new_indices)
+
+ cpdef IntIndex make_union(self, SparseIndex y_):
+
+ cdef:
+ ndarray[int32_t, ndim=1] new_indices
+ IntIndex y
+
+ # if is one already, returns self
+ y = y_.to_int_index()
+
+ if self.length != y.length:
+ raise ValueError('Indices must reference same underlying length')
+
+ new_indices = np.union1d(self.indices, y.indices)
+ return IntIndex(self.length, new_indices)
+
+ @cython.wraparound(False)
+ cpdef int32_t lookup(self, Py_ssize_t index):
+ """
+ Return the internal location if value exists on given index.
+ Return -1 otherwise.
+ """
+ cdef:
+ int32_t res
+ ndarray[int32_t, ndim=1] inds
+
+ inds = self.indices
+ if self.npoints == 0:
+ return -1
+ elif index < 0 or self.length <= index:
+ return -1
+
+ res = inds.searchsorted(index)
+ if res == self.npoints:
+ return -1
+ elif inds[res] == index:
+ return res
+ else:
+ return -1
+
+ @cython.wraparound(False)
+ cpdef ndarray[int32_t] lookup_array(self, ndarray[
+ int32_t, ndim=1] indexer):
+ """
+ Vectorized lookup, returns ndarray[int32_t]
+ """
+ cdef:
+ Py_ssize_t n, i, ind_val
+ ndarray[int32_t, ndim=1] inds
+ ndarray[uint8_t, ndim=1, cast=True] mask
+ ndarray[int32_t, ndim=1] masked
+ ndarray[int32_t, ndim=1] res
+ ndarray[int32_t, ndim=1] results
+
+ n = len(indexer)
+ results = np.empty(n, dtype=np.int32)
+ results[:] = -1
+
+ if self.npoints == 0:
+ return results
+
+ inds = self.indices
+ mask = (inds[0] <= indexer) & (indexer <= inds[len(inds) - 1])
+
+ masked = indexer[mask]
+ res = inds.searchsorted(masked).astype(np.int32)
+
+ res[inds[res] != masked] = -1
+ results[mask] = res
+ return results
+
+ cpdef ndarray reindex(self, ndarray[float64_t, ndim=1] values,
+ float64_t fill_value, SparseIndex other_):
+ cdef:
+ Py_ssize_t i = 0, j = 0
+ IntIndex other
+ ndarray[float64_t, ndim=1] result
+ ndarray[int32_t, ndim=1] sinds, oinds
+
+ other = other_.to_int_index()
+
+ oinds = other.indices
+ sinds = self.indices
+
+ result = np.empty(other.npoints, dtype=np.float64)
+ result[:] = fill_value
+
+ for i in range(other.npoints):
+ while oinds[i] > sinds[j] and j < self.npoints:
+ j += 1
+
+ if j == self.npoints:
+ break
+
+ if oinds[i] < sinds[j]:
+ continue
+ elif oinds[i] == sinds[j]:
+ result[i] = values[j]
+ j += 1
+
+ return result
+
+ cpdef put(self, ndarray[float64_t, ndim=1] values,
+ ndarray[int32_t, ndim=1] indices, object to_put):
+ pass
+
+ cpdef take(self, ndarray[float64_t, ndim=1] values,
+ ndarray[int32_t, ndim=1] indices):
+ pass
+
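+# Usage sketch (illustrative only), assuming the extension builds as
+# pandas._libs.sparse:
+#
+# >>> from pandas._libs.sparse import IntIndex
+# >>> idx = IntIndex(10, [2, 5, 7])      # dense length 10, three stored points
+# >>> idx.lookup(5)                      # internal position of dense index 5
+# 1
+# >>> idx.lookup(3)                      # not stored
+# -1
+# >>> idx.intersect(IntIndex(10, [5, 6])).indices
+# array([5], dtype=int32)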
+
+cpdef get_blocks(ndarray[int32_t, ndim=1] indices):
+ cdef:
+ Py_ssize_t init_len, i, npoints, result_indexer = 0
+ int32_t block, length = 1, cur, prev
+ ndarray[int32_t, ndim=1] locs, lens
+
+ npoints = len(indices)
+
+ # just handle the special empty case separately
+ if npoints == 0:
+ return np.array([], dtype=np.int32), np.array([], dtype=np.int32)
+
+ # block size can't be longer than npoints
+ locs = np.empty(npoints, dtype=np.int32)
+ lens = np.empty(npoints, dtype=np.int32)
+
+ # TODO: two-pass algorithm faster?
+ prev = block = indices[0]
+ for i in range(1, npoints):
+ cur = indices[i]
+ if cur - prev > 1:
+ # new block
+ locs[result_indexer] = block
+ lens[result_indexer] = length
+ block = cur
+ length = 1
+ result_indexer += 1
+ else:
+ # same block, increment length
+ length += 1
+
+ prev = cur
+
+ locs[result_indexer] = block
+ lens[result_indexer] = length
+ result_indexer += 1
+ locs = locs[:result_indexer]
+ lens = lens[:result_indexer]
+ return locs, lens
+
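+# Worked sketch (illustrative only): get_blocks collapses a sorted run of
+# indices into (block location, block length) pairs.
+#
+# >>> import numpy as np
+# >>> from pandas._libs.sparse import get_blocks
+# >>> locs, lens = get_blocks(np.array([0, 1, 2, 7, 8], dtype=np.int32))
+# >>> locs, lens                 # two blocks: [0, 3) and [7, 9)
+# (array([0, 7], dtype=int32), array([3, 2], dtype=int32))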
+
+# -----------------------------------------------------------------------------
+# BlockIndex
+
+cdef class BlockIndex(SparseIndex):
+ """
+ Object for holding block-based sparse indexing information
+
+ Parameters
+ ----------
+ length : integer
+ blocs : array-like
+ Starting locations (offsets) of the blocks.
+ blengths : array-like
+ Lengths of the blocks.
+ """
+ cdef readonly:
+ int32_t nblocks, npoints, length
+ ndarray blocs, blengths
+
+ cdef:
+ object __weakref__ # need to be picklable
+ int32_t *locbuf
+ int32_t *lenbuf
+
+ def __init__(self, length, blocs, blengths):
+
+ self.blocs = np.ascontiguousarray(blocs, dtype=np.int32)
+ self.blengths = np.ascontiguousarray(blengths, dtype=np.int32)
+
+ # in case we need
+ self.locbuf = <int32_t*>self.blocs.data
+ self.lenbuf = <int32_t*>self.blengths.data
+
+ self.length = length
+ self.nblocks = np.int32(len(self.blocs))
+ self.npoints = self.blengths.sum()
+
+ # self.block_start = blocs
+ # self.block_end = blocs + blengths
+
+ self.check_integrity()
+
+ def __reduce__(self):
+ args = (self.length, self.blocs, self.blengths)
+ return BlockIndex, args
+
+ def __repr__(self):
+ output = 'BlockIndex\n'
+ output += 'Block locations: %s\n' % repr(self.blocs)
+ output += 'Block lengths: %s' % repr(self.blengths)
+
+ return output
+
+ @property
+ def nbytes(self):
+ return self.blocs.nbytes + self.blengths.nbytes
+
+ @property
+ def ngaps(self):
+ return self.length - self.npoints
+
+ cpdef check_integrity(self):
+ """
+ Check:
+ - Locations are in ascending order
+ - No overlapping blocks
+ - Blocks do not start after the end of the index, nor extend beyond it
+ """
+ cdef:
+ Py_ssize_t i
+ ndarray[int32_t, ndim=1] blocs, blengths
+
+ blocs = self.blocs
+ blengths = self.blengths
+
+ if len(blocs) != len(blengths):
+ raise ValueError('block bound arrays must be same length')
+
+ for i in range(self.nblocks):
+ if i > 0:
+ if blocs[i] <= blocs[i - 1]:
+ raise ValueError('Locations not in ascending order')
+
+ if i < self.nblocks - 1:
+ if blocs[i] + blengths[i] > blocs[i + 1]:
+ raise ValueError('Block {idx} overlaps'.format(idx=i))
+ else:
+ if blocs[i] + blengths[i] > self.length:
+ raise ValueError('Block {idx} extends beyond end'
+ .format(idx=i))
+
+ # no zero-length blocks
+ if blengths[i] == 0:
+ raise ValueError('Zero-length block {idx}'.format(idx=i))
+
+ def equals(self, other):
+ if not isinstance(other, BlockIndex):
+ return False
+
+ if self is other:
+ return True
+
+ same_length = self.length == other.length
+ same_blocks = (np.array_equal(self.blocs, other.blocs) and
+ np.array_equal(self.blengths, other.blengths))
+ return same_length and same_blocks
+
+ def to_block_index(self):
+ return self
+
+ def to_int_index(self):
+ cdef:
+ int32_t i = 0, j, b
+ int32_t offset
+ ndarray[int32_t, ndim=1] indices
+
+ indices = np.empty(self.npoints, dtype=np.int32)
+
+ for b in range(self.nblocks):
+ offset = self.locbuf[b]
+
+ for j in range(self.lenbuf[b]):
+ indices[i] = offset + j
+ i += 1
+
+ return IntIndex(self.length, indices)
+
+ cpdef BlockIndex intersect(self, SparseIndex other):
+ """
+ Intersect two BlockIndex objects
+
+ Parameters
+ ----------
+ other : SparseIndex
+
+ Returns
+ -------
+ intersection : BlockIndex
+ """
+ cdef:
+ BlockIndex y
+ ndarray[int32_t, ndim=1] xloc, xlen, yloc, ylen, out_bloc, out_blen
+ Py_ssize_t xi = 0, yi = 0, max_len, result_indexer = 0
+ int32_t cur_loc, cur_length, diff
+
+ y = other.to_block_index()
+
+ if self.length != y.length:
+ raise Exception('Indices must reference same underlying length')
+
+ xloc = self.blocs
+ xlen = self.blengths
+ yloc = y.blocs
+ ylen = y.blengths
+
+ # block may be split, but can't exceed original len / 2 + 1
+ max_len = int(min(self.length, y.length) / 2) + 1
+ out_bloc = np.empty(max_len, dtype=np.int32)
+ out_blen = np.empty(max_len, dtype=np.int32)
+
+ while True:
+ # we are done (or possibly never began)
+ if xi >= self.nblocks or yi >= y.nblocks:
+ break
+
+ # completely symmetric...would like to avoid code dup but oh well
+ if xloc[xi] >= yloc[yi]:
+ cur_loc = xloc[xi]
+ diff = xloc[xi] - yloc[yi]
+
+ if ylen[yi] <= diff:
+ # have to skip this block
+ yi += 1
+ continue
+
+ if ylen[yi] - diff < xlen[xi]:
+ # take end of y block, move onward
+ cur_length = ylen[yi] - diff
+ yi += 1
+ else:
+ # take end of x block
+ cur_length = xlen[xi]
+ xi += 1
+
+ else: # xloc[xi] < yloc[yi]
+ cur_loc = yloc[yi]
+ diff = yloc[yi] - xloc[xi]
+
+ if xlen[xi] <= diff:
+ # have to skip this block
+ xi += 1
+ continue
+
+ if xlen[xi] - diff < ylen[yi]:
+ # take end of x block, move onward
+ cur_length = xlen[xi] - diff
+ xi += 1
+ else:
+ # take end of y block
+ cur_length = ylen[yi]
+ yi += 1
+
+ out_bloc[result_indexer] = cur_loc
+ out_blen[result_indexer] = cur_length
+ result_indexer += 1
+
+ out_bloc = out_bloc[:result_indexer]
+ out_blen = out_blen[:result_indexer]
+
+ return BlockIndex(self.length, out_bloc, out_blen)
+
+ cpdef BlockIndex make_union(self, SparseIndex y):
+ """
+ Combine two BlockIndex objects, accepting indices contained in either one
+
+ Parameters
+ ----------
+ y : SparseIndex
+
+ Notes
+ -----
+ union is a protected keyword in Cython, hence make_union
+
+ Returns
+ -------
+ union : BlockIndex
+ """
+ return BlockUnion(self, y.to_block_index()).result
+
+ cpdef Py_ssize_t lookup(self, Py_ssize_t index):
+ """
+ Return the internal location if value exists on given index.
+ Return -1 otherwise.
+ """
+ cdef:
+ Py_ssize_t i, cum_len
+ ndarray[int32_t, ndim=1] locs, lens
+
+ locs = self.blocs
+ lens = self.blengths
+
+ if self.nblocks == 0:
+ return -1
+ elif index < locs[0]:
+ return -1
+
+ cum_len = 0
+ for i in range(self.nblocks):
+ if index >= locs[i] and index < locs[i] + lens[i]:
+ return cum_len + index - locs[i]
+ cum_len += lens[i]
+
+ return -1
+
+ @cython.wraparound(False)
+ cpdef ndarray[int32_t] lookup_array(self, ndarray[
+ int32_t, ndim=1] indexer):
+ """
+ Vectorized lookup, returns ndarray[int32_t]
+ """
+ cdef:
+ Py_ssize_t n, i, j, ind_val
+ ndarray[int32_t, ndim=1] locs, lens
+ ndarray[int32_t, ndim=1] results
+
+ locs = self.blocs
+ lens = self.blengths
+
+ n = len(indexer)
+ results = np.empty(n, dtype=np.int32)
+ results[:] = -1
+
+ if self.npoints == 0:
+ return results
+
+ for i in range(n):
+ ind_val = indexer[i]
+ if not (ind_val < 0 or self.length <= ind_val):
+ cum_len = 0
+ for j in range(self.nblocks):
+ if ind_val >= locs[j] and ind_val < locs[j] + lens[j]:
+ results[i] = cum_len + ind_val - locs[j]
+ cum_len += lens[j]
+ return results
+
+ cpdef ndarray reindex(self, ndarray[float64_t, ndim=1] values,
+ float64_t fill_value, SparseIndex other_):
+ cdef:
+ Py_ssize_t i = 0, j = 0, ocur, ocurlen
+ BlockIndex other
+ ndarray[float64_t, ndim=1] result
+ ndarray[int32_t, ndim=1] slocs, slens, olocs, olens
+
+ other = other_.to_block_index()
+
+ olocs = other.blocs
+ olens = other.blengths
+ slocs = self.blocs
+ slens = self.blengths
+
+ result = np.empty(other.npoints, dtype=np.float64)
+
+ for i in range(other.nblocks):
+ ocur = olocs[i]
+ ocurlen = olens[i]
+
+ while slocs[j] + slens[j] < ocur:
+ j += 1
+
+ cpdef put(self, ndarray[float64_t, ndim=1] values,
+ ndarray[int32_t, ndim=1] indices, object to_put):
+ pass
+
+ cpdef take(self, ndarray[float64_t, ndim=1] values,
+ ndarray[int32_t, ndim=1] indices):
+ pass
+
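+# Usage sketch (illustrative only): the block-compressed form of the same
+# information an IntIndex holds.
+#
+# >>> from pandas._libs.sparse import BlockIndex
+# >>> bidx = BlockIndex(10, [0, 5], [2, 3])    # blocks [0, 2) and [5, 8)
+# >>> bidx.npoints
+# 5
+# >>> bidx.to_int_index().indices
+# array([0, 1, 5, 6, 7], dtype=int32)
+# >>> bidx.lookup(6)             # fourth stored point (zero-based position 3)
+# 3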
+
+cdef class BlockMerge(object):
+ """
+ Object-oriented approach makes sharing state between recursive functions a
+ lot easier and reduces code duplication
+ """
+ cdef:
+ BlockIndex x, y, result
+ ndarray xstart, xlen, xend, ystart, ylen, yend
+ int32_t xi, yi # block indices
+
+ def __init__(self, BlockIndex x, BlockIndex y):
+ self.x = x
+ self.y = y
+
+ if x.length != y.length:
+ raise Exception('Indices must reference same underlying length')
+
+ self.xstart = self.x.blocs
+ self.ystart = self.y.blocs
+
+ self.xend = self.x.blocs + self.x.blengths
+ self.yend = self.y.blocs + self.y.blengths
+
+ # self.xlen = self.x.blengths
+ # self.ylen = self.y.blengths
+
+ self.xi = 0
+ self.yi = 0
+
+ self.result = self._make_merged_blocks()
+
+ cdef _make_merged_blocks(self):
+ raise NotImplementedError
+
+ cdef _set_current_indices(self, int32_t xi, int32_t yi, bint mode):
+ if mode == 0:
+ self.xi = xi
+ self.yi = yi
+ else:
+ self.xi = yi
+ self.yi = xi
+
+
+cdef class BlockUnion(BlockMerge):
+ """
+ Object-oriented approach makes sharing state between recursive functions a
+ lot easier and reduces code duplication
+ """
+
+ cdef _make_merged_blocks(self):
+ cdef:
+ ndarray[int32_t, ndim=1] xstart, xend, ystart
+ ndarray[int32_t, ndim=1] yend, out_bloc, out_blen
+ int32_t nstart, nend, diff
+ Py_ssize_t max_len, result_indexer = 0
+
+ xstart = self.xstart
+ xend = self.xend
+ ystart = self.ystart
+ yend = self.yend
+
+ max_len = int(min(self.x.length, self.y.length) / 2) + 1
+ out_bloc = np.empty(max_len, dtype=np.int32)
+ out_blen = np.empty(max_len, dtype=np.int32)
+
+ while True:
+ # we are done (or possibly never began)
+ if self.xi >= self.x.nblocks and self.yi >= self.y.nblocks:
+ break
+ elif self.yi >= self.y.nblocks:
+ # through with y, just pass through x blocks
+ nstart = xstart[self.xi]
+ nend = xend[self.xi]
+ self.xi += 1
+ elif self.xi >= self.x.nblocks:
+ # through with x, just pass through y blocks
+ nstart = ystart[self.yi]
+ nend = yend[self.yi]
+ self.yi += 1
+ else:
+ # find end of new block
+ if xstart[self.xi] < ystart[self.yi]:
+ nstart = xstart[self.xi]
+ nend = self._find_next_block_end(0)
+ else:
+ nstart = ystart[self.yi]
+ nend = self._find_next_block_end(1)
+
+ out_bloc[result_indexer] = nstart
+ out_blen[result_indexer] = nend - nstart
+ result_indexer += 1
+
+ out_bloc = out_bloc[:result_indexer]
+ out_blen = out_blen[:result_indexer]
+
+ return BlockIndex(self.x.length, out_bloc, out_blen)
+
+ cdef int32_t _find_next_block_end(self, bint mode) except -1:
+ """
+ Wow, this got complicated in a hurry
+
+ mode 0: block started in index x
+ mode 1: block started in index y
+ """
+ cdef:
+ ndarray[int32_t, ndim=1] xstart, xend, ystart, yend
+ int32_t xi, yi, xnblocks, ynblocks, nend
+
+ if mode != 0 and mode != 1:
+ raise Exception('Mode must be 0 or 1')
+
+ # so symmetric code will work
+ if mode == 0:
+ xstart = self.xstart
+ xend = self.xend
+ xi = self.xi
+
+ ystart = self.ystart
+ yend = self.yend
+ yi = self.yi
+ ynblocks = self.y.nblocks
+ else:
+ xstart = self.ystart
+ xend = self.yend
+ xi = self.yi
+
+ ystart = self.xstart
+ yend = self.xend
+ yi = self.xi
+ ynblocks = self.x.nblocks
+
+ nend = xend[xi]
+
+ # print 'here xi=%d, yi=%d, mode=%d, nend=%d' % (self.xi, self.yi,
+ # mode, nend)
+
+ # done with y?
+ if yi == ynblocks:
+ self._set_current_indices(xi + 1, yi, mode)
+ return nend
+ elif nend < ystart[yi]:
+ # block ends before y block
+ self._set_current_indices(xi + 1, yi, mode)
+ return nend
+ else:
+ while yi < ynblocks and nend > yend[yi]:
+ yi += 1
+
+ self._set_current_indices(xi + 1, yi, mode)
+
+ if yi == ynblocks:
+ return nend
+
+ if nend < ystart[yi]:
+ # we're done, return the block end
+ return nend
+ else:
+ # merge blocks, continue searching
+ # this also catches the case where the blocks overlap or are adjacent
+ return self._find_next_block_end(1 - mode)
+
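+# Worked sketch (illustrative only) of the union logic above:
+#
+# >>> from pandas._libs.sparse import BlockIndex
+# >>> x = BlockIndex(10, [0, 5], [2, 3])       # [0, 2) and [5, 8)
+# >>> y = BlockIndex(10, [1], [2])             # [1, 3)
+# >>> u = x.make_union(y)
+# >>> list(u.blocs), list(u.blengths)          # merged into [0, 3) and [5, 8)
+# ([0, 5], [3, 3])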
+
+# -----------------------------------------------------------------------------
+# Sparse arithmetic
+
+include "sparse_op_helper.pxi"
+
+
+# -----------------------------------------------------------------------------
+# SparseArray mask create operations
+
+def make_mask_object_ndarray(ndarray[object, ndim=1] arr, object fill_value):
+ cdef:
+ object value
+ Py_ssize_t i
+ Py_ssize_t new_length = len(arr)
+ ndarray[int8_t, ndim=1] mask
+
+ mask = np.ones(new_length, dtype=np.int8)
+
+ for i in range(new_length):
+ value = arr[i]
+ if value == fill_value and type(value) == type(fill_value):
+ mask[i] = 0
+
+ return mask.view(dtype=np.bool)
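+
+# Usage sketch (illustrative only): entries equal to the fill value (and of
+# the same type) are masked out.
+#
+# >>> import numpy as np
+# >>> from pandas._libs.sparse import make_mask_object_ndarray
+# >>> arr = np.array(['a', 'b', 1], dtype=object)
+# >>> make_mask_object_ndarray(arr, 'a')       # -> [False, True, True]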
diff --git a/contrib/python/pandas/py2/pandas/_libs/sparse_op_helper.pxi b/contrib/python/pandas/py2/pandas/_libs/sparse_op_helper.pxi
new file mode 100644
index 00000000000..43cf662c801
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/_libs/sparse_op_helper.pxi
@@ -0,0 +1,5846 @@
+"""
+Template for each `dtype` helper function for sparse ops
+
+WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in
+"""
+
+# ----------------------------------------------------------------------
+# Sparse op
+# ----------------------------------------------------------------------
+
+ctypedef fused sparse_t:
+ float64_t
+ int64_t
+
+
+cdef inline float64_t __div__(sparse_t a, sparse_t b):
+ if b == 0:
+ if a > 0:
+ return INF
+ elif a < 0:
+ return -INF
+ else:
+ return NaN
+ else:
+ return float(a) / b
+
+
+cdef inline float64_t __truediv__(sparse_t a, sparse_t b):
+ return __div__(a, b)
+
+
+cdef inline sparse_t __mod__(sparse_t a, sparse_t b):
+ if b == 0:
+ if sparse_t is float64_t:
+ return NaN
+ else:
+ return 0
+ else:
+ return a % b
+
+
+cdef inline sparse_t __floordiv__(sparse_t a, sparse_t b):
+ if b == 0:
+ if sparse_t is float64_t:
+ return NaN
+ else:
+ return 0
+ else:
+ return a // b
+
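+# The helpers above pin down division-by-zero semantics for the sparse fill
+# values; summarizing what the code does:
+#
+#   __div__(1.0, 0.0)   -> INF        __div__(-1.0, 0.0)  -> -INF
+#   __div__(0.0, 0.0)   -> NaN
+#   __mod__(a, 0)       -> NaN for float64, 0 for int64
+#   __floordiv__(a, 0)  -> NaN for float64, 0 for int64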
+
+# ----------------------------------------------------------------------
+# sparse array op
+# ----------------------------------------------------------------------
+
+
+cdef inline tuple block_op_add_float64(ndarray x_,
+ BlockIndex xindex,
+ float64_t xfill,
+ ndarray y_,
+ BlockIndex yindex,
+ float64_t yfill):
+ '''
+ Binary operator on BlockIndex objects with fill values
+ '''
+
+ cdef:
+ BlockIndex out_index
+ Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices
+ int32_t xbp = 0, ybp = 0 # block positions
+ int32_t xloc, yloc
+ Py_ssize_t xblock = 0, yblock = 0 # block numbers
+
+ ndarray[float64_t, ndim=1] x, y
+ ndarray[float64_t, ndim=1] out
+
+ # to suppress Cython warning
+ x = x_
+ y = y_
+
+ out_index = xindex.make_union(yindex)
+ out = np.empty(out_index.npoints, dtype=np.float64)
+
+ # Wow, what a hack job. Need to do something about this
+
+ # walk the two SparseVectors, adding matched locations...
+ for out_i in range(out_index.npoints):
+ if yblock == yindex.nblocks:
+ # use y fill value
+ out[out_i] = x[xi] + yfill
+ xi += 1
+
+ # advance x location
+ xbp += 1
+ if xbp == xindex.lenbuf[xblock]:
+ xblock += 1
+ xbp = 0
+ continue
+
+ if xblock == xindex.nblocks:
+ # use x fill value
+ out[out_i] = xfill + y[yi]
+ yi += 1
+
+ # advance y location
+ ybp += 1
+ if ybp == yindex.lenbuf[yblock]:
+ yblock += 1
+ ybp = 0
+ continue
+
+ yloc = yindex.locbuf[yblock] + ybp
+ xloc = xindex.locbuf[xblock] + xbp
+
+ # each index in the out_index had to come from either x, y, or both
+ if xloc == yloc:
+ out[out_i] = x[xi] + y[yi]
+ xi += 1
+ yi += 1
+
+ # advance both locations
+ xbp += 1
+ if xbp == xindex.lenbuf[xblock]:
+ xblock += 1
+ xbp = 0
+
+ ybp += 1
+ if ybp == yindex.lenbuf[yblock]:
+ yblock += 1
+ ybp = 0
+
+ elif xloc < yloc:
+ # use y fill value
+ out[out_i] = x[xi] + yfill
+ xi += 1
+
+ # advance x location
+ xbp += 1
+ if xbp == xindex.lenbuf[xblock]:
+ xblock += 1
+ xbp = 0
+ else:
+ # use x fill value
+ out[out_i] = xfill + y[yi]
+ yi += 1
+
+ # advance y location
+ ybp += 1
+ if ybp == yindex.lenbuf[yblock]:
+ yblock += 1
+ ybp = 0
+
+ return out, out_index, xfill + yfill
+
+
+cdef inline tuple int_op_add_float64(ndarray x_, IntIndex xindex,
+ float64_t xfill,
+ ndarray y_, IntIndex yindex,
+ float64_t yfill):
+ cdef:
+ IntIndex out_index
+ Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices
+ int32_t xloc, yloc
+ ndarray[int32_t, ndim=1] xindices, yindices, out_indices
+ ndarray[float64_t, ndim=1] x, y
+ ndarray[float64_t, ndim=1] out
+
+ # suppress Cython compiler warnings due to inlining
+ x = x_
+ y = y_
+
+ # need to do this first to know size of result array
+ out_index = xindex.make_union(yindex)
+ out = np.empty(out_index.npoints, dtype=np.float64)
+
+ xindices = xindex.indices
+ yindices = yindex.indices
+ out_indices = out_index.indices
+
+ # walk the two SparseVectors, adding matched locations...
+ for out_i in range(out_index.npoints):
+ if xi == xindex.npoints:
+ # use x fill value
+ out[out_i] = xfill + y[yi]
+ yi += 1
+ continue
+
+ if yi == yindex.npoints:
+ # use y fill value
+ out[out_i] = x[xi] + yfill
+ xi += 1
+ continue
+
+ xloc = xindices[xi]
+ yloc = yindices[yi]
+
+ # each index in the out_index had to come from either x, y, or both
+ if xloc == yloc:
+ out[out_i] = x[xi] + y[yi]
+ xi += 1
+ yi += 1
+ elif xloc < yloc:
+ # use y fill value
+ out[out_i] = x[xi] + yfill
+ xi += 1
+ else:
+ # use x fill value
+ out[out_i] = xfill + y[yi]
+ yi += 1
+
+ return out, out_index, xfill + yfill
+
+
+cpdef sparse_add_float64(ndarray[float64_t, ndim=1] x,
+ SparseIndex xindex, float64_t xfill,
+ ndarray[float64_t, ndim=1] y,
+ SparseIndex yindex, float64_t yfill):
+
+ if isinstance(xindex, BlockIndex):
+ return block_op_add_float64(x, xindex.to_block_index(), xfill,
+ y, yindex.to_block_index(), yfill)
+ elif isinstance(xindex, IntIndex):
+ return int_op_add_float64(x, xindex.to_int_index(), xfill,
+ y, yindex.to_int_index(), yfill)
+ else:
+ raise NotImplementedError
+
+
+cpdef sparse_fill_add_float64(float64_t xfill,
+ float64_t yfill):
+ return xfill + yfill
+
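+# Usage sketch (illustrative only): adding two sparse float64 vectors over a
+# length-4 dense space, both with fill value 0.
+#
+# >>> import numpy as np
+# >>> from pandas._libs.sparse import IntIndex, sparse_add_float64
+# >>> x = np.array([1., 2.])                   # dense: [1, 0, 2, 0]
+# >>> y = np.array([3.])                       # dense: [0, 3, 0, 0]
+# >>> out, oidx, ofill = sparse_add_float64(x, IntIndex(4, [0, 2]), 0.,
+# ...                                       y, IntIndex(4, [1]), 0.)
+# >>> list(out), ofill                         # values at union positions [0, 1, 2]
+# ([1.0, 3.0, 2.0], 0.0)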
+
+cdef inline tuple block_op_add_int64(ndarray x_,
+ BlockIndex xindex,
+ int64_t xfill,
+ ndarray y_,
+ BlockIndex yindex,
+ int64_t yfill):
+ '''
+ Binary operator on BlockIndex objects with fill values
+ '''
+
+ cdef:
+ BlockIndex out_index
+ Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices
+ int32_t xbp = 0, ybp = 0 # block positions
+ int32_t xloc, yloc
+ Py_ssize_t xblock = 0, yblock = 0 # block numbers
+
+ ndarray[int64_t, ndim=1] x, y
+ ndarray[int64_t, ndim=1] out
+
+ # to suppress Cython warning
+ x = x_
+ y = y_
+
+ out_index = xindex.make_union(yindex)
+ out = np.empty(out_index.npoints, dtype=np.int64)
+
+ # Wow, what a hack job. Need to do something about this
+
+ # walk the two SparseVectors, adding matched locations...
+ for out_i in range(out_index.npoints):
+ if yblock == yindex.nblocks:
+ # use y fill value
+ out[out_i] = x[xi] + yfill
+ xi += 1
+
+ # advance x location
+ xbp += 1
+ if xbp == xindex.lenbuf[xblock]:
+ xblock += 1
+ xbp = 0
+ continue
+
+ if xblock == xindex.nblocks:
+ # use x fill value
+ out[out_i] = xfill + y[yi]
+ yi += 1
+
+ # advance y location
+ ybp += 1
+ if ybp == yindex.lenbuf[yblock]:
+ yblock += 1
+ ybp = 0
+ continue
+
+ yloc = yindex.locbuf[yblock] + ybp
+ xloc = xindex.locbuf[xblock] + xbp
+
+ # each index in the out_index had to come from either x, y, or both
+ if xloc == yloc:
+ out[out_i] = x[xi] + y[yi]
+ xi += 1
+ yi += 1
+
+ # advance both locations
+ xbp += 1
+ if xbp == xindex.lenbuf[xblock]:
+ xblock += 1
+ xbp = 0
+
+ ybp += 1
+ if ybp == yindex.lenbuf[yblock]:
+ yblock += 1
+ ybp = 0
+
+ elif xloc < yloc:
+ # use y fill value
+ out[out_i] = x[xi] + yfill
+ xi += 1
+
+ # advance x location
+ xbp += 1
+ if xbp == xindex.lenbuf[xblock]:
+ xblock += 1
+ xbp = 0
+ else:
+ # use x fill value
+ out[out_i] = xfill + y[yi]
+ yi += 1
+
+ # advance y location
+ ybp += 1
+ if ybp == yindex.lenbuf[yblock]:
+ yblock += 1
+ ybp = 0
+
+ return out, out_index, xfill + yfill
+
+
+cdef inline tuple int_op_add_int64(ndarray x_, IntIndex xindex,
+ int64_t xfill,
+ ndarray y_, IntIndex yindex,
+ int64_t yfill):
+ cdef:
+ IntIndex out_index
+ Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices
+ int32_t xloc, yloc
+ ndarray[int32_t, ndim=1] xindices, yindices, out_indices
+ ndarray[int64_t, ndim=1] x, y
+ ndarray[int64_t, ndim=1] out
+
+ # suppress Cython compiler warnings due to inlining
+ x = x_
+ y = y_
+
+ # need to do this first to know size of result array
+ out_index = xindex.make_union(yindex)
+ out = np.empty(out_index.npoints, dtype=np.int64)
+
+ xindices = xindex.indices
+ yindices = yindex.indices
+ out_indices = out_index.indices
+
+ # walk the two SparseVectors, adding matched locations...
+ for out_i in range(out_index.npoints):
+ if xi == xindex.npoints:
+ # use x fill value
+ out[out_i] = xfill + y[yi]
+ yi += 1
+ continue
+
+ if yi == yindex.npoints:
+ # use y fill value
+ out[out_i] = x[xi] + yfill
+ xi += 1
+ continue
+
+ xloc = xindices[xi]
+ yloc = yindices[yi]
+
+ # each index in the out_index had to come from either x, y, or both
+ if xloc == yloc:
+ out[out_i] = x[xi] + y[yi]
+ xi += 1
+ yi += 1
+ elif xloc < yloc:
+ # use y fill value
+ out[out_i] = x[xi] + yfill
+ xi += 1
+ else:
+ # use x fill value
+ out[out_i] = xfill + y[yi]
+ yi += 1
+
+ return out, out_index, xfill + yfill
+
+
+cpdef sparse_add_int64(ndarray[int64_t, ndim=1] x,
+ SparseIndex xindex, int64_t xfill,
+ ndarray[int64_t, ndim=1] y,
+ SparseIndex yindex, int64_t yfill):
+
+ if isinstance(xindex, BlockIndex):
+ return block_op_add_int64(x, xindex.to_block_index(), xfill,
+ y, yindex.to_block_index(), yfill)
+ elif isinstance(xindex, IntIndex):
+ return int_op_add_int64(x, xindex.to_int_index(), xfill,
+ y, yindex.to_int_index(), yfill)
+ else:
+ raise NotImplementedError
+
+
+cpdef sparse_fill_add_int64(int64_t xfill,
+ int64_t yfill):
+ return xfill + yfill
+
+
+cdef inline tuple block_op_sub_float64(ndarray x_,
+ BlockIndex xindex,
+ float64_t xfill,
+ ndarray y_,
+ BlockIndex yindex,
+ float64_t yfill):
+ '''
+ Binary operator on BlockIndex objects with fill values
+ '''
+
+ cdef:
+ BlockIndex out_index
+ Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices
+ int32_t xbp = 0, ybp = 0 # block positions
+ int32_t xloc, yloc
+ Py_ssize_t xblock = 0, yblock = 0 # block numbers
+
+ ndarray[float64_t, ndim=1] x, y
+ ndarray[float64_t, ndim=1] out
+
+ # to suppress Cython warning
+ x = x_
+ y = y_
+
+ out_index = xindex.make_union(yindex)
+ out = np.empty(out_index.npoints, dtype=np.float64)
+
+ # Wow, what a hack job. Need to do something about this
+
+ # walk the two SparseVectors, adding matched locations...
+ for out_i in range(out_index.npoints):
+ if yblock == yindex.nblocks:
+ # use y fill value
+ out[out_i] = x[xi] - yfill
+ xi += 1
+
+ # advance x location
+ xbp += 1
+ if xbp == xindex.lenbuf[xblock]:
+ xblock += 1
+ xbp = 0
+ continue
+
+ if xblock == xindex.nblocks:
+ # use x fill value
+ out[out_i] = xfill - y[yi]
+ yi += 1
+
+ # advance y location
+ ybp += 1
+ if ybp == yindex.lenbuf[yblock]:
+ yblock += 1
+ ybp = 0
+ continue
+
+ yloc = yindex.locbuf[yblock] + ybp
+ xloc = xindex.locbuf[xblock] + xbp
+
+ # each index in the out_index had to come from either x, y, or both
+ if xloc == yloc:
+ out[out_i] = x[xi] - y[yi]
+ xi += 1
+ yi += 1
+
+ # advance both locations
+ xbp += 1
+ if xbp == xindex.lenbuf[xblock]:
+ xblock += 1
+ xbp = 0
+
+ ybp += 1
+ if ybp == yindex.lenbuf[yblock]:
+ yblock += 1
+ ybp = 0
+
+ elif xloc < yloc:
+ # use y fill value
+ out[out_i] = x[xi] - yfill
+ xi += 1
+
+ # advance x location
+ xbp += 1
+ if xbp == xindex.lenbuf[xblock]:
+ xblock += 1
+ xbp = 0
+ else:
+ # use x fill value
+ out[out_i] = xfill - y[yi]
+ yi += 1
+
+ # advance y location
+ ybp += 1
+ if ybp == yindex.lenbuf[yblock]:
+ yblock += 1
+ ybp = 0
+
+ return out, out_index, xfill - yfill
+
+
+cdef inline tuple int_op_sub_float64(ndarray x_, IntIndex xindex,
+ float64_t xfill,
+ ndarray y_, IntIndex yindex,
+ float64_t yfill):
+ cdef:
+ IntIndex out_index
+ Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices
+ int32_t xloc, yloc
+ ndarray[int32_t, ndim=1] xindices, yindices, out_indices
+ ndarray[float64_t, ndim=1] x, y
+ ndarray[float64_t, ndim=1] out
+
+ # suppress Cython compiler warnings due to inlining
+ x = x_
+ y = y_
+
+ # need to do this first to know size of result array
+ out_index = xindex.make_union(yindex)
+ out = np.empty(out_index.npoints, dtype=np.float64)
+
+ xindices = xindex.indices
+ yindices = yindex.indices
+ out_indices = out_index.indices
+
+ # walk the two SparseVectors, adding matched locations...
+ for out_i in range(out_index.npoints):
+ if xi == xindex.npoints:
+ # use x fill value
+ out[out_i] = xfill - y[yi]
+ yi += 1
+ continue
+
+ if yi == yindex.npoints:
+ # use y fill value
+ out[out_i] = x[xi] - yfill
+ xi += 1
+ continue
+
+ xloc = xindices[xi]
+ yloc = yindices[yi]
+
+ # each index in the out_index had to come from either x, y, or both
+ if xloc == yloc:
+ out[out_i] = x[xi] - y[yi]
+ xi += 1
+ yi += 1
+ elif xloc < yloc:
+ # use y fill value
+ out[out_i] = x[xi] - yfill
+ xi += 1
+ else:
+ # use x fill value
+ out[out_i] = xfill - y[yi]
+ yi += 1
+
+ return out, out_index, xfill - yfill
+
+
+cpdef sparse_sub_float64(ndarray[float64_t, ndim=1] x,
+ SparseIndex xindex, float64_t xfill,
+ ndarray[float64_t, ndim=1] y,
+ SparseIndex yindex, float64_t yfill):
+
+ if isinstance(xindex, BlockIndex):
+ return block_op_sub_float64(x, xindex.to_block_index(), xfill,
+ y, yindex.to_block_index(), yfill)
+ elif isinstance(xindex, IntIndex):
+ return int_op_sub_float64(x, xindex.to_int_index(), xfill,
+ y, yindex.to_int_index(), yfill)
+ else:
+ raise NotImplementedError
+
+
+cpdef sparse_fill_sub_float64(float64_t xfill,
+ float64_t yfill):
+ return xfill - yfill
+
+
+cdef inline tuple block_op_sub_int64(ndarray x_,
+ BlockIndex xindex,
+ int64_t xfill,
+ ndarray y_,
+ BlockIndex yindex,
+ int64_t yfill):
+ '''
+ Binary operator on BlockIndex objects with fill values
+ '''
+
+ cdef:
+ BlockIndex out_index
+ Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices
+ int32_t xbp = 0, ybp = 0 # block positions
+ int32_t xloc, yloc
+ Py_ssize_t xblock = 0, yblock = 0 # block numbers
+
+ ndarray[int64_t, ndim=1] x, y
+ ndarray[int64_t, ndim=1] out
+
+ # to suppress Cython warning
+ x = x_
+ y = y_
+
+ out_index = xindex.make_union(yindex)
+ out = np.empty(out_index.npoints, dtype=np.int64)
+
+ # Wow, what a hack job. Need to do something about this
+
+ # walk the two SparseVectors, adding matched locations...
+ for out_i in range(out_index.npoints):
+ if yblock == yindex.nblocks:
+ # use y fill value
+ out[out_i] = x[xi] - yfill
+ xi += 1
+
+ # advance x location
+ xbp += 1
+ if xbp == xindex.lenbuf[xblock]:
+ xblock += 1
+ xbp = 0
+ continue
+
+ if xblock == xindex.nblocks:
+ # use x fill value
+ out[out_i] = xfill - y[yi]
+ yi += 1
+
+ # advance y location
+ ybp += 1
+ if ybp == yindex.lenbuf[yblock]:
+ yblock += 1
+ ybp = 0
+ continue
+
+ yloc = yindex.locbuf[yblock] + ybp
+ xloc = xindex.locbuf[xblock] + xbp
+
+ # each index in the out_index had to come from either x, y, or both
+ if xloc == yloc:
+ out[out_i] = x[xi] - y[yi]
+ xi += 1
+ yi += 1
+
+ # advance both locations
+ xbp += 1
+ if xbp == xindex.lenbuf[xblock]:
+ xblock += 1
+ xbp = 0
+
+ ybp += 1
+ if ybp == yindex.lenbuf[yblock]:
+ yblock += 1
+ ybp = 0
+
+ elif xloc < yloc:
+ # use y fill value
+ out[out_i] = x[xi] - yfill
+ xi += 1
+
+ # advance x location
+ xbp += 1
+ if xbp == xindex.lenbuf[xblock]:
+ xblock += 1
+ xbp = 0
+ else:
+ # use x fill value
+ out[out_i] = xfill - y[yi]
+ yi += 1
+
+ # advance y location
+ ybp += 1
+ if ybp == yindex.lenbuf[yblock]:
+ yblock += 1
+ ybp = 0
+
+ return out, out_index, xfill - yfill
+
+
+cdef inline tuple int_op_sub_int64(ndarray x_, IntIndex xindex,
+ int64_t xfill,
+ ndarray y_, IntIndex yindex,
+ int64_t yfill):
+ cdef:
+ IntIndex out_index
+ Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices
+ int32_t xloc, yloc
+ ndarray[int32_t, ndim=1] xindices, yindices, out_indices
+ ndarray[int64_t, ndim=1] x, y
+ ndarray[int64_t, ndim=1] out
+
+ # suppress Cython compiler warnings due to inlining
+ x = x_
+ y = y_
+
+ # need to do this first to know size of result array
+ out_index = xindex.make_union(yindex)
+ out = np.empty(out_index.npoints, dtype=np.int64)
+
+ xindices = xindex.indices
+ yindices = yindex.indices
+ out_indices = out_index.indices
+
+ # walk the two SparseVectors, adding matched locations...
+ for out_i in range(out_index.npoints):
+ if xi == xindex.npoints:
+ # use x fill value
+ out[out_i] = xfill - y[yi]
+ yi += 1
+ continue
+
+ if yi == yindex.npoints:
+ # use y fill value
+ out[out_i] = x[xi] - yfill
+ xi += 1
+ continue
+
+ xloc = xindices[xi]
+ yloc = yindices[yi]
+
+ # each index in the out_index had to come from either x, y, or both
+ if xloc == yloc:
+ out[out_i] = x[xi] - y[yi]
+ xi += 1
+ yi += 1
+ elif xloc < yloc:
+ # use y fill value
+ out[out_i] = x[xi] - yfill
+ xi += 1
+ else:
+ # use x fill value
+ out[out_i] = xfill - y[yi]
+ yi += 1
+
+ return out, out_index, xfill - yfill
+
+
+cpdef sparse_sub_int64(ndarray[int64_t, ndim=1] x,
+ SparseIndex xindex, int64_t xfill,
+ ndarray[int64_t, ndim=1] y,
+ SparseIndex yindex, int64_t yfill):
+
+ if isinstance(xindex, BlockIndex):
+ return block_op_sub_int64(x, xindex.to_block_index(), xfill,
+ y, yindex.to_block_index(), yfill)
+ elif isinstance(xindex, IntIndex):
+ return int_op_sub_int64(x, xindex.to_int_index(), xfill,
+ y, yindex.to_int_index(), yfill)
+ else:
+ raise NotImplementedError
+
+
+cpdef sparse_fill_sub_int64(int64_t xfill,
+ int64_t yfill):
+ return xfill - yfill
+
+
+cdef inline tuple block_op_mul_float64(ndarray x_,
+ BlockIndex xindex,
+ float64_t xfill,
+ ndarray y_,
+ BlockIndex yindex,
+ float64_t yfill):
+ '''
+ Binary operator on BlockIndex objects with fill values
+ '''
+
+ cdef:
+ BlockIndex out_index
+ Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices
+ int32_t xbp = 0, ybp = 0 # block positions
+ int32_t xloc, yloc
+ Py_ssize_t xblock = 0, yblock = 0 # block numbers
+
+ ndarray[float64_t, ndim=1] x, y
+ ndarray[float64_t, ndim=1] out
+
+ # to suppress Cython warning
+ x = x_
+ y = y_
+
+ out_index = xindex.make_union(yindex)
+ out = np.empty(out_index.npoints, dtype=np.float64)
+
+ # Wow, what a hack job. Need to do something about this
+
+ # walk the two SparseVectors, adding matched locations...
+ for out_i in range(out_index.npoints):
+ if yblock == yindex.nblocks:
+ # use y fill value
+ out[out_i] = x[xi] * yfill
+ xi += 1
+
+ # advance x location
+ xbp += 1
+ if xbp == xindex.lenbuf[xblock]:
+ xblock += 1
+ xbp = 0
+ continue
+
+ if xblock == xindex.nblocks:
+ # use x fill value
+ out[out_i] = xfill * y[yi]
+ yi += 1
+
+ # advance y location
+ ybp += 1
+ if ybp == yindex.lenbuf[yblock]:
+ yblock += 1
+ ybp = 0
+ continue
+
+ yloc = yindex.locbuf[yblock] + ybp
+ xloc = xindex.locbuf[xblock] + xbp
+
+ # each index in the out_index had to come from either x, y, or both
+ if xloc == yloc:
+ out[out_i] = x[xi] * y[yi]
+ xi += 1
+ yi += 1
+
+ # advance both locations
+ xbp += 1
+ if xbp == xindex.lenbuf[xblock]:
+ xblock += 1
+ xbp = 0
+
+ ybp += 1
+ if ybp == yindex.lenbuf[yblock]:
+ yblock += 1
+ ybp = 0
+
+ elif xloc < yloc:
+ # use y fill value
+ out[out_i] = x[xi] * yfill
+ xi += 1
+
+ # advance x location
+ xbp += 1
+ if xbp == xindex.lenbuf[xblock]:
+ xblock += 1
+ xbp = 0
+ else:
+ # use x fill value
+ out[out_i] = xfill * y[yi]
+ yi += 1
+
+ # advance y location
+ ybp += 1
+ if ybp == yindex.lenbuf[yblock]:
+ yblock += 1
+ ybp = 0
+
+ return out, out_index, xfill * yfill
+
+
+cdef inline tuple int_op_mul_float64(ndarray x_, IntIndex xindex,
+ float64_t xfill,
+ ndarray y_, IntIndex yindex,
+ float64_t yfill):
+ cdef:
+ IntIndex out_index
+ Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices
+ int32_t xloc, yloc
+ ndarray[int32_t, ndim=1] xindices, yindices, out_indices
+ ndarray[float64_t, ndim=1] x, y
+ ndarray[float64_t, ndim=1] out
+
+ # suppress Cython compiler warnings due to inlining
+ x = x_
+ y = y_
+
+ # need to do this first to know size of result array
+ out_index = xindex.make_union(yindex)
+ out = np.empty(out_index.npoints, dtype=np.float64)
+
+ xindices = xindex.indices
+ yindices = yindex.indices
+ out_indices = out_index.indices
+
+ # walk the two SparseVectors, adding matched locations...
+ for out_i in range(out_index.npoints):
+ if xi == xindex.npoints:
+ # use x fill value
+ out[out_i] = xfill * y[yi]
+ yi += 1
+ continue
+
+ if yi == yindex.npoints:
+ # use y fill value
+ out[out_i] = x[xi] * yfill
+ xi += 1
+ continue
+
+ xloc = xindices[xi]
+ yloc = yindices[yi]
+
+ # each index in the out_index had to come from either x, y, or both
+ if xloc == yloc:
+ out[out_i] = x[xi] * y[yi]
+ xi += 1
+ yi += 1
+ elif xloc < yloc:
+ # use y fill value
+ out[out_i] = x[xi] * yfill
+ xi += 1
+ else:
+ # use x fill value
+ out[out_i] = xfill * y[yi]
+ yi += 1
+
+ return out, out_index, xfill * yfill
+
+
+cpdef sparse_mul_float64(ndarray[float64_t, ndim=1] x,
+ SparseIndex xindex, float64_t xfill,
+ ndarray[float64_t, ndim=1] y,
+ SparseIndex yindex, float64_t yfill):
+
+ if isinstance(xindex, BlockIndex):
+ return block_op_mul_float64(x, xindex.to_block_index(), xfill,
+ y, yindex.to_block_index(), yfill)
+ elif isinstance(xindex, IntIndex):
+ return int_op_mul_float64(x, xindex.to_int_index(), xfill,
+ y, yindex.to_int_index(), yfill)
+ else:
+ raise NotImplementedError
+
+
+cpdef sparse_fill_mul_float64(float64_t xfill,
+ float64_t yfill):
+ return xfill * yfill
+
+
+cdef inline tuple block_op_mul_int64(ndarray x_,
+ BlockIndex xindex,
+ int64_t xfill,
+ ndarray y_,
+ BlockIndex yindex,
+ int64_t yfill):
+ '''
+ Binary operator on BlockIndex objects with fill values
+ '''
+
+ cdef:
+ BlockIndex out_index
+ Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices
+ int32_t xbp = 0, ybp = 0 # block positions
+ int32_t xloc, yloc
+ Py_ssize_t xblock = 0, yblock = 0 # block numbers
+
+ ndarray[int64_t, ndim=1] x, y
+ ndarray[int64_t, ndim=1] out
+
+ # to suppress Cython warning
+ x = x_
+ y = y_
+
+ out_index = xindex.make_union(yindex)
+ out = np.empty(out_index.npoints, dtype=np.int64)
+
+ # Wow, what a hack job. Need to do something about this
+
+ # walk the two SparseVectors, adding matched locations...
+ for out_i in range(out_index.npoints):
+ if yblock == yindex.nblocks:
+ # use y fill value
+ out[out_i] = x[xi] * yfill
+ xi += 1
+
+ # advance x location
+ xbp += 1
+ if xbp == xindex.lenbuf[xblock]:
+ xblock += 1
+ xbp = 0
+ continue
+
+ if xblock == xindex.nblocks:
+ # use x fill value
+ out[out_i] = xfill * y[yi]
+ yi += 1
+
+ # advance y location
+ ybp += 1
+ if ybp == yindex.lenbuf[yblock]:
+ yblock += 1
+ ybp = 0
+ continue
+
+ yloc = yindex.locbuf[yblock] + ybp
+ xloc = xindex.locbuf[xblock] + xbp
+
+ # each index in the out_index had to come from either x, y, or both
+ if xloc == yloc:
+ out[out_i] = x[xi] * y[yi]
+ xi += 1
+ yi += 1
+
+ # advance both locations
+ xbp += 1
+ if xbp == xindex.lenbuf[xblock]:
+ xblock += 1
+ xbp = 0
+
+ ybp += 1
+ if ybp == yindex.lenbuf[yblock]:
+ yblock += 1
+ ybp = 0
+
+ elif xloc < yloc:
+ # use y fill value
+ out[out_i] = x[xi] * yfill
+ xi += 1
+
+ # advance x location
+ xbp += 1
+ if xbp == xindex.lenbuf[xblock]:
+ xblock += 1
+ xbp = 0
+ else:
+ # use x fill value
+ out[out_i] = xfill * y[yi]
+ yi += 1
+
+ # advance y location
+ ybp += 1
+ if ybp == yindex.lenbuf[yblock]:
+ yblock += 1
+ ybp = 0
+
+ return out, out_index, xfill * yfill
+
+
+cdef inline tuple int_op_mul_int64(ndarray x_, IntIndex xindex,
+ int64_t xfill,
+ ndarray y_, IntIndex yindex,
+ int64_t yfill):
+ cdef:
+ IntIndex out_index
+ Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices
+ int32_t xloc, yloc
+ ndarray[int32_t, ndim=1] xindices, yindices, out_indices
+ ndarray[int64_t, ndim=1] x, y
+ ndarray[int64_t, ndim=1] out
+
+ # suppress Cython compiler warnings due to inlining
+ x = x_
+ y = y_
+
+ # need to do this first to know size of result array
+ out_index = xindex.make_union(yindex)
+ out = np.empty(out_index.npoints, dtype=np.int64)
+
+ xindices = xindex.indices
+ yindices = yindex.indices
+ out_indices = out_index.indices
+
+ # walk the two SparseVectors, adding matched locations...
+ for out_i in range(out_index.npoints):
+ if xi == xindex.npoints:
+ # use x fill value
+ out[out_i] = xfill * y[yi]
+ yi += 1
+ continue
+
+ if yi == yindex.npoints:
+ # use y fill value
+ out[out_i] = x[xi] * yfill
+ xi += 1
+ continue
+
+ xloc = xindices[xi]
+ yloc = yindices[yi]
+
+ # each index in the out_index had to come from either x, y, or both
+ if xloc == yloc:
+ out[out_i] = x[xi] * y[yi]
+ xi += 1
+ yi += 1
+ elif xloc < yloc:
+ # use y fill value
+ out[out_i] = x[xi] * yfill
+ xi += 1
+ else:
+ # use x fill value
+ out[out_i] = xfill * y[yi]
+ yi += 1
+
+ return out, out_index, xfill * yfill
+
+
+cpdef sparse_mul_int64(ndarray[int64_t, ndim=1] x,
+ SparseIndex xindex, int64_t xfill,
+ ndarray[int64_t, ndim=1] y,
+ SparseIndex yindex, int64_t yfill):
+
+ if isinstance(xindex, BlockIndex):
+ return block_op_mul_int64(x, xindex.to_block_index(), xfill,
+ y, yindex.to_block_index(), yfill)
+ elif isinstance(xindex, IntIndex):
+ return int_op_mul_int64(x, xindex.to_int_index(), xfill,
+ y, yindex.to_int_index(), yfill)
+ else:
+ raise NotImplementedError
+
+
+cpdef sparse_fill_mul_int64(int64_t xfill,
+ int64_t yfill):
+ return xfill * yfill
+
+
+cdef inline tuple block_op_div_float64(ndarray x_,
+ BlockIndex xindex,
+ float64_t xfill,
+ ndarray y_,
+ BlockIndex yindex,
+ float64_t yfill):
+ '''
+ Binary operator on BlockIndex objects with fill values
+ '''
+
+ cdef:
+ BlockIndex out_index
+ Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices
+ int32_t xbp = 0, ybp = 0 # block positions
+ int32_t xloc, yloc
+ Py_ssize_t xblock = 0, yblock = 0 # block numbers
+
+ ndarray[float64_t, ndim=1] x, y
+ ndarray[float64_t, ndim=1] out
+
+ # to suppress Cython warning
+ x = x_
+ y = y_
+
+ out_index = xindex.make_union(yindex)
+ out = np.empty(out_index.npoints, dtype=np.float64)
+
+ # Wow, what a hack job. Need to do something about this
+
+ # walk the two SparseVectors, adding matched locations...
+ for out_i in range(out_index.npoints):
+ if yblock == yindex.nblocks:
+ # use y fill value
+ out[out_i] = __div__(x[xi], yfill)
+ xi += 1
+
+ # advance x location
+ xbp += 1
+ if xbp == xindex.lenbuf[xblock]:
+ xblock += 1
+ xbp = 0
+ continue
+
+ if xblock == xindex.nblocks:
+ # use x fill value
+ out[out_i] = __div__(xfill, y[yi])
+ yi += 1
+
+ # advance y location
+ ybp += 1
+ if ybp == yindex.lenbuf[yblock]:
+ yblock += 1
+ ybp = 0
+ continue
+
+ yloc = yindex.locbuf[yblock] + ybp
+ xloc = xindex.locbuf[xblock] + xbp
+
+ # each index in the out_index had to come from either x, y, or both
+ if xloc == yloc:
+ out[out_i] = __div__(x[xi], y[yi])
+ xi += 1
+ yi += 1
+
+ # advance both locations
+ xbp += 1
+ if xbp == xindex.lenbuf[xblock]:
+ xblock += 1
+ xbp = 0
+
+ ybp += 1
+ if ybp == yindex.lenbuf[yblock]:
+ yblock += 1
+ ybp = 0
+
+ elif xloc < yloc:
+ # use y fill value
+ out[out_i] = __div__(x[xi], yfill)
+ xi += 1
+
+ # advance x location
+ xbp += 1
+ if xbp == xindex.lenbuf[xblock]:
+ xblock += 1
+ xbp = 0
+ else:
+ # use x fill value
+ out[out_i] = __div__(xfill, y[yi])
+ yi += 1
+
+ # advance y location
+ ybp += 1
+ if ybp == yindex.lenbuf[yblock]:
+ yblock += 1
+ ybp = 0
+
+ return out, out_index, __div__(xfill, yfill)
+
+
+cdef inline tuple int_op_div_float64(ndarray x_, IntIndex xindex,
+ float64_t xfill,
+ ndarray y_, IntIndex yindex,
+ float64_t yfill):
+ cdef:
+ IntIndex out_index
+ Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices
+ int32_t xloc, yloc
+ ndarray[int32_t, ndim=1] xindices, yindices, out_indices
+ ndarray[float64_t, ndim=1] x, y
+ ndarray[float64_t, ndim=1] out
+
+ # suppress Cython compiler warnings due to inlining
+ x = x_
+ y = y_
+
+ # need to do this first to know size of result array
+ out_index = xindex.make_union(yindex)
+ out = np.empty(out_index.npoints, dtype=np.float64)
+
+ xindices = xindex.indices
+ yindices = yindex.indices
+ out_indices = out_index.indices
+
+ # walk the two SparseVectors, adding matched locations...
+ for out_i in range(out_index.npoints):
+ if xi == xindex.npoints:
+ # use x fill value
+ out[out_i] = __div__(xfill, y[yi])
+ yi += 1
+ continue
+
+ if yi == yindex.npoints:
+ # use y fill value
+ out[out_i] = __div__(x[xi], yfill)
+ xi += 1
+ continue
+
+ xloc = xindices[xi]
+ yloc = yindices[yi]
+
+ # each index in the out_index had to come from either x, y, or both
+ if xloc == yloc:
+ out[out_i] = __div__(x[xi], y[yi])
+ xi += 1
+ yi += 1
+ elif xloc < yloc:
+ # use y fill value
+ out[out_i] = __div__(x[xi], yfill)
+ xi += 1
+ else:
+ # use x fill value
+ out[out_i] = __div__(xfill, y[yi])
+ yi += 1
+
+ return out, out_index, __div__(xfill, yfill)
+
+
+cpdef sparse_div_float64(ndarray[float64_t, ndim=1] x,
+ SparseIndex xindex, float64_t xfill,
+ ndarray[float64_t, ndim=1] y,
+ SparseIndex yindex, float64_t yfill):
+
+ if isinstance(xindex, BlockIndex):
+ return block_op_div_float64(x, xindex.to_block_index(), xfill,
+ y, yindex.to_block_index(), yfill)
+ elif isinstance(xindex, IntIndex):
+ return int_op_div_float64(x, xindex.to_int_index(), xfill,
+ y, yindex.to_int_index(), yfill)
+ else:
+ raise NotImplementedError
+
+
+cpdef sparse_fill_div_float64(float64_t xfill,
+ float64_t yfill):
+ return __div__(xfill, yfill)
+
+
+cdef inline tuple block_op_div_int64(ndarray x_,
+ BlockIndex xindex,
+ int64_t xfill,
+ ndarray y_,
+ BlockIndex yindex,
+ int64_t yfill):
+ '''
+ Binary operator on BlockIndex objects with fill values
+ '''
+
+ cdef:
+ BlockIndex out_index
+ Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices
+ int32_t xbp = 0, ybp = 0 # block positions
+ int32_t xloc, yloc
+ Py_ssize_t xblock = 0, yblock = 0 # block numbers
+
+ ndarray[int64_t, ndim=1] x, y
+ ndarray[float64_t, ndim=1] out
+
+ # to suppress Cython warning
+ x = x_
+ y = y_
+
+ out_index = xindex.make_union(yindex)
+ out = np.empty(out_index.npoints, dtype=np.float64)
+
+ # Wow, what a hack job. Need to do something about this
+
+ # walk the two SparseVectors, adding matched locations...
+ for out_i in range(out_index.npoints):
+ if yblock == yindex.nblocks:
+ # use y fill value
+ out[out_i] = __div__(x[xi], yfill)
+ xi += 1
+
+ # advance x location
+ xbp += 1
+ if xbp == xindex.lenbuf[xblock]:
+ xblock += 1
+ xbp = 0
+ continue
+
+ if xblock == xindex.nblocks:
+ # use x fill value
+ out[out_i] = __div__(xfill, y[yi])
+ yi += 1
+
+ # advance y location
+ ybp += 1
+ if ybp == yindex.lenbuf[yblock]:
+ yblock += 1
+ ybp = 0
+ continue
+
+ yloc = yindex.locbuf[yblock] + ybp
+ xloc = xindex.locbuf[xblock] + xbp
+
+ # each index in the out_index had to come from either x, y, or both
+ if xloc == yloc:
+ out[out_i] = __div__(x[xi], y[yi])
+ xi += 1
+ yi += 1
+
+ # advance both locations
+ xbp += 1
+ if xbp == xindex.lenbuf[xblock]:
+ xblock += 1
+ xbp = 0
+
+ ybp += 1
+ if ybp == yindex.lenbuf[yblock]:
+ yblock += 1
+ ybp = 0
+
+ elif xloc < yloc:
+ # use y fill value
+ out[out_i] = __div__(x[xi], yfill)
+ xi += 1
+
+ # advance x location
+ xbp += 1
+ if xbp == xindex.lenbuf[xblock]:
+ xblock += 1
+ xbp = 0
+ else:
+ # use x fill value
+ out[out_i] = __div__(xfill, y[yi])
+ yi += 1
+
+ # advance y location
+ ybp += 1
+ if ybp == yindex.lenbuf[yblock]:
+ yblock += 1
+ ybp = 0
+
+ return out, out_index, __div__(xfill, yfill)
+
+
+cdef inline tuple int_op_div_int64(ndarray x_, IntIndex xindex,
+ int64_t xfill,
+ ndarray y_, IntIndex yindex,
+ int64_t yfill):
+ cdef:
+ IntIndex out_index
+ Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices
+ int32_t xloc, yloc
+ ndarray[int32_t, ndim=1] xindices, yindices, out_indices
+ ndarray[int64_t, ndim=1] x, y
+ ndarray[float64_t, ndim=1] out
+
+ # suppress Cython compiler warnings due to inlining
+ x = x_
+ y = y_
+
+ # need to do this first to know size of result array
+ out_index = xindex.make_union(yindex)
+ out = np.empty(out_index.npoints, dtype=np.float64)
+
+ xindices = xindex.indices
+ yindices = yindex.indices
+ out_indices = out_index.indices
+
+ # walk the two SparseVectors, adding matched locations...
+ for out_i in range(out_index.npoints):
+ if xi == xindex.npoints:
+ # use x fill value
+ out[out_i] = __div__(xfill, y[yi])
+ yi += 1
+ continue
+
+ if yi == yindex.npoints:
+ # use y fill value
+ out[out_i] = __div__(x[xi], yfill)
+ xi += 1
+ continue
+
+ xloc = xindices[xi]
+ yloc = yindices[yi]
+
+ # each index in the out_index had to come from either x, y, or both
+ if xloc == yloc:
+ out[out_i] = __div__(x[xi], y[yi])
+ xi += 1
+ yi += 1
+ elif xloc < yloc:
+ # use y fill value
+ out[out_i] = __div__(x[xi], yfill)
+ xi += 1
+ else:
+ # use x fill value
+ out[out_i] = __div__(xfill, y[yi])
+ yi += 1
+
+ return out, out_index, __div__(xfill, yfill)
+
+
+cpdef sparse_div_int64(ndarray[int64_t, ndim=1] x,
+ SparseIndex xindex, int64_t xfill,
+ ndarray[int64_t, ndim=1] y,
+ SparseIndex yindex, int64_t yfill):
+
+ if isinstance(xindex, BlockIndex):
+ return block_op_div_int64(x, xindex.to_block_index(), xfill,
+ y, yindex.to_block_index(), yfill)
+ elif isinstance(xindex, IntIndex):
+ return int_op_div_int64(x, xindex.to_int_index(), xfill,
+ y, yindex.to_int_index(), yfill)
+ else:
+ raise NotImplementedError
+
+
+cpdef sparse_fill_div_int64(int64_t xfill,
+ int64_t yfill):
+ return __div__(xfill, yfill)
+
+
+cdef inline tuple block_op_mod_float64(ndarray x_,
+ BlockIndex xindex,
+ float64_t xfill,
+ ndarray y_,
+ BlockIndex yindex,
+ float64_t yfill):
+ '''
+ Binary operator on BlockIndex objects with fill values
+ '''
+
+ cdef:
+ BlockIndex out_index
+ Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices
+ int32_t xbp = 0, ybp = 0 # block positions
+ int32_t xloc, yloc
+ Py_ssize_t xblock = 0, yblock = 0 # block numbers
+
+ ndarray[float64_t, ndim=1] x, y
+ ndarray[float64_t, ndim=1] out
+
+ # to suppress Cython warning
+ x = x_
+ y = y_
+
+ out_index = xindex.make_union(yindex)
+ out = np.empty(out_index.npoints, dtype=np.float64)
+
+ # Wow, what a hack job. Need to do something about this
+
+ # walk the two SparseVectors, adding matched locations...
+ for out_i in range(out_index.npoints):
+ if yblock == yindex.nblocks:
+ # use y fill value
+ out[out_i] = __mod__(x[xi], yfill)
+ xi += 1
+
+ # advance x location
+ xbp += 1
+ if xbp == xindex.lenbuf[xblock]:
+ xblock += 1
+ xbp = 0
+ continue
+
+ if xblock == xindex.nblocks:
+ # use x fill value
+ out[out_i] = __mod__(xfill, y[yi])
+ yi += 1
+
+ # advance y location
+ ybp += 1
+ if ybp == yindex.lenbuf[yblock]:
+ yblock += 1
+ ybp = 0
+ continue
+
+ yloc = yindex.locbuf[yblock] + ybp
+ xloc = xindex.locbuf[xblock] + xbp
+
+ # each index in the out_index had to come from either x, y, or both
+ if xloc == yloc:
+ out[out_i] = __mod__(x[xi], y[yi])
+ xi += 1
+ yi += 1
+
+ # advance both locations
+ xbp += 1
+ if xbp == xindex.lenbuf[xblock]:
+ xblock += 1
+ xbp = 0
+
+ ybp += 1
+ if ybp == yindex.lenbuf[yblock]:
+ yblock += 1
+ ybp = 0
+
+ elif xloc < yloc:
+ # use y fill value
+ out[out_i] = __mod__(x[xi], yfill)
+ xi += 1
+
+ # advance x location
+ xbp += 1
+ if xbp == xindex.lenbuf[xblock]:
+ xblock += 1
+ xbp = 0
+ else:
+ # use x fill value
+ out[out_i] = __mod__(xfill, y[yi])
+ yi += 1
+
+ # advance y location
+ ybp += 1
+ if ybp == yindex.lenbuf[yblock]:
+ yblock += 1
+ ybp = 0
+
+ return out, out_index, __mod__(xfill, yfill)
+
+
+cdef inline tuple int_op_mod_float64(ndarray x_, IntIndex xindex,
+ float64_t xfill,
+ ndarray y_, IntIndex yindex,
+ float64_t yfill):
+ cdef:
+ IntIndex out_index
+ Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices
+ int32_t xloc, yloc
+ ndarray[int32_t, ndim=1] xindices, yindices, out_indices
+ ndarray[float64_t, ndim=1] x, y
+ ndarray[float64_t, ndim=1] out
+
+ # suppress Cython compiler warnings due to inlining
+ x = x_
+ y = y_
+
+ # need to do this first to know size of result array
+ out_index = xindex.make_union(yindex)
+ out = np.empty(out_index.npoints, dtype=np.float64)
+
+ xindices = xindex.indices
+ yindices = yindex.indices
+ out_indices = out_index.indices
+
+ # walk the two SparseVectors, adding matched locations...
+ for out_i in range(out_index.npoints):
+ if xi == xindex.npoints:
+ # use x fill value
+ out[out_i] = __mod__(xfill, y[yi])
+ yi += 1
+ continue
+
+ if yi == yindex.npoints:
+ # use y fill value
+ out[out_i] = __mod__(x[xi], yfill)
+ xi += 1
+ continue
+
+ xloc = xindices[xi]
+ yloc = yindices[yi]
+
+ # each index in the out_index had to come from either x, y, or both
+ if xloc == yloc:
+ out[out_i] = __mod__(x[xi], y[yi])
+ xi += 1
+ yi += 1
+ elif xloc < yloc:
+ # use y fill value
+ out[out_i] = __mod__(x[xi], yfill)
+ xi += 1
+ else:
+ # use x fill value
+ out[out_i] = __mod__(xfill, y[yi])
+ yi += 1
+
+ return out, out_index, __mod__(xfill, yfill)
+
+
+cpdef sparse_mod_float64(ndarray[float64_t, ndim=1] x,
+ SparseIndex xindex, float64_t xfill,
+ ndarray[float64_t, ndim=1] y,
+ SparseIndex yindex, float64_t yfill):
+
+ if isinstance(xindex, BlockIndex):
+ return block_op_mod_float64(x, xindex.to_block_index(), xfill,
+ y, yindex.to_block_index(), yfill)
+ elif isinstance(xindex, IntIndex):
+ return int_op_mod_float64(x, xindex.to_int_index(), xfill,
+ y, yindex.to_int_index(), yfill)
+ else:
+ raise NotImplementedError
+
+
+cpdef sparse_fill_mod_float64(float64_t xfill,
+ float64_t yfill):
+ return __mod__(xfill, yfill)
+
+
+cdef inline tuple block_op_mod_int64(ndarray x_,
+ BlockIndex xindex,
+ int64_t xfill,
+ ndarray y_,
+ BlockIndex yindex,
+ int64_t yfill):
+ '''
+ Binary operator on BlockIndex objects with fill values
+ '''
+
+ cdef:
+ BlockIndex out_index
+ Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices
+ int32_t xbp = 0, ybp = 0 # block positions
+ int32_t xloc, yloc
+ Py_ssize_t xblock = 0, yblock = 0 # block numbers
+
+ ndarray[int64_t, ndim=1] x, y
+ ndarray[int64_t, ndim=1] out
+
+ # to suppress Cython warning
+ x = x_
+ y = y_
+
+ out_index = xindex.make_union(yindex)
+ out = np.empty(out_index.npoints, dtype=np.int64)
+
+ # Wow, what a hack job. Need to do something about this
+
+ # walk the two SparseVectors, adding matched locations...
+ for out_i in range(out_index.npoints):
+ if yblock == yindex.nblocks:
+ # use y fill value
+ out[out_i] = __mod__(x[xi], yfill)
+ xi += 1
+
+ # advance x location
+ xbp += 1
+ if xbp == xindex.lenbuf[xblock]:
+ xblock += 1
+ xbp = 0
+ continue
+
+ if xblock == xindex.nblocks:
+ # use x fill value
+ out[out_i] = __mod__(xfill, y[yi])
+ yi += 1
+
+ # advance y location
+ ybp += 1
+ if ybp == yindex.lenbuf[yblock]:
+ yblock += 1
+ ybp = 0
+ continue
+
+ yloc = yindex.locbuf[yblock] + ybp
+ xloc = xindex.locbuf[xblock] + xbp
+
+ # each index in the out_index had to come from either x, y, or both
+ if xloc == yloc:
+ out[out_i] = __mod__(x[xi], y[yi])
+ xi += 1
+ yi += 1
+
+ # advance both locations
+ xbp += 1
+ if xbp == xindex.lenbuf[xblock]:
+ xblock += 1
+ xbp = 0
+
+ ybp += 1
+ if ybp == yindex.lenbuf[yblock]:
+ yblock += 1
+ ybp = 0
+
+ elif xloc < yloc:
+ # use y fill value
+ out[out_i] = __mod__(x[xi], yfill)
+ xi += 1
+
+ # advance x location
+ xbp += 1
+ if xbp == xindex.lenbuf[xblock]:
+ xblock += 1
+ xbp = 0
+ else:
+ # use x fill value
+ out[out_i] = __mod__(xfill, y[yi])
+ yi += 1
+
+ # advance y location
+ ybp += 1
+ if ybp == yindex.lenbuf[yblock]:
+ yblock += 1
+ ybp = 0
+
+ return out, out_index, __mod__(xfill, yfill)
+
+
+cdef inline tuple int_op_mod_int64(ndarray x_, IntIndex xindex,
+ int64_t xfill,
+ ndarray y_, IntIndex yindex,
+ int64_t yfill):
+ cdef:
+ IntIndex out_index
+ Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices
+ int32_t xloc, yloc
+ ndarray[int32_t, ndim=1] xindices, yindices, out_indices
+ ndarray[int64_t, ndim=1] x, y
+ ndarray[int64_t, ndim=1] out
+
+ # suppress Cython compiler warnings due to inlining
+ x = x_
+ y = y_
+
+ # need to do this first to know size of result array
+ out_index = xindex.make_union(yindex)
+ out = np.empty(out_index.npoints, dtype=np.int64)
+
+ xindices = xindex.indices
+ yindices = yindex.indices
+ out_indices = out_index.indices
+
+ # walk the two SparseVectors, adding matched locations...
+ for out_i in range(out_index.npoints):
+ if xi == xindex.npoints:
+ # use x fill value
+ out[out_i] = __mod__(xfill, y[yi])
+ yi += 1
+ continue
+
+ if yi == yindex.npoints:
+ # use y fill value
+ out[out_i] = __mod__(x[xi], yfill)
+ xi += 1
+ continue
+
+ xloc = xindices[xi]
+ yloc = yindices[yi]
+
+ # each index in the out_index had to come from either x, y, or both
+ if xloc == yloc:
+ out[out_i] = __mod__(x[xi], y[yi])
+ xi += 1
+ yi += 1
+ elif xloc < yloc:
+ # use y fill value
+ out[out_i] = __mod__(x[xi], yfill)
+ xi += 1
+ else:
+ # use x fill value
+ out[out_i] = __mod__(xfill, y[yi])
+ yi += 1
+
+ return out, out_index, __mod__(xfill, yfill)
+
+
+cpdef sparse_mod_int64(ndarray[int64_t, ndim=1] x,
+ SparseIndex xindex, int64_t xfill,
+ ndarray[int64_t, ndim=1] y,
+ SparseIndex yindex, int64_t yfill):
+
+ if isinstance(xindex, BlockIndex):
+ return block_op_mod_int64(x, xindex.to_block_index(), xfill,
+ y, yindex.to_block_index(), yfill)
+ elif isinstance(xindex, IntIndex):
+ return int_op_mod_int64(x, xindex.to_int_index(), xfill,
+ y, yindex.to_int_index(), yfill)
+ else:
+ raise NotImplementedError
+
+
+cpdef sparse_fill_mod_int64(int64_t xfill,
+ int64_t yfill):
+ return __mod__(xfill, yfill)
+
+
+cdef inline tuple block_op_truediv_float64(ndarray x_,
+ BlockIndex xindex,
+ float64_t xfill,
+ ndarray y_,
+ BlockIndex yindex,
+ float64_t yfill):
+ '''
+ Binary operator on BlockIndex objects with fill values
+ '''
+
+ cdef:
+ BlockIndex out_index
+ Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices
+ int32_t xbp = 0, ybp = 0 # block positions
+ int32_t xloc, yloc
+ Py_ssize_t xblock = 0, yblock = 0 # block numbers
+
+ ndarray[float64_t, ndim=1] x, y
+ ndarray[float64_t, ndim=1] out
+
+ # to suppress Cython warning
+ x = x_
+ y = y_
+
+ out_index = xindex.make_union(yindex)
+ out = np.empty(out_index.npoints, dtype=np.float64)
+
+ # Wow, what a hack job. Need to do something about this
+
+ # walk the two SparseVectors, adding matched locations...
+ for out_i in range(out_index.npoints):
+ if yblock == yindex.nblocks:
+ # use y fill value
+ out[out_i] = __truediv__(x[xi], yfill)
+ xi += 1
+
+ # advance x location
+ xbp += 1
+ if xbp == xindex.lenbuf[xblock]:
+ xblock += 1
+ xbp = 0
+ continue
+
+ if xblock == xindex.nblocks:
+ # use x fill value
+ out[out_i] = __truediv__(xfill, y[yi])
+ yi += 1
+
+ # advance y location
+ ybp += 1
+ if ybp == yindex.lenbuf[yblock]:
+ yblock += 1
+ ybp = 0
+ continue
+
+ yloc = yindex.locbuf[yblock] + ybp
+ xloc = xindex.locbuf[xblock] + xbp
+
+ # each index in the out_index had to come from either x, y, or both
+ if xloc == yloc:
+ out[out_i] = __truediv__(x[xi], y[yi])
+ xi += 1
+ yi += 1
+
+ # advance both locations
+ xbp += 1
+ if xbp == xindex.lenbuf[xblock]:
+ xblock += 1
+ xbp = 0
+
+ ybp += 1
+ if ybp == yindex.lenbuf[yblock]:
+ yblock += 1
+ ybp = 0
+
+ elif xloc < yloc:
+ # use y fill value
+ out[out_i] = __truediv__(x[xi], yfill)
+ xi += 1
+
+ # advance x location
+ xbp += 1
+ if xbp == xindex.lenbuf[xblock]:
+ xblock += 1
+ xbp = 0
+ else:
+ # use x fill value
+ out[out_i] = __truediv__(xfill, y[yi])
+ yi += 1
+
+ # advance y location
+ ybp += 1
+ if ybp == yindex.lenbuf[yblock]:
+ yblock += 1
+ ybp = 0
+
+ return out, out_index, __truediv__(xfill, yfill)
+
+
+cdef inline tuple int_op_truediv_float64(ndarray x_, IntIndex xindex,
+ float64_t xfill,
+ ndarray y_, IntIndex yindex,
+ float64_t yfill):
+ cdef:
+ IntIndex out_index
+ Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices
+ int32_t xloc, yloc
+ ndarray[int32_t, ndim=1] xindices, yindices, out_indices
+ ndarray[float64_t, ndim=1] x, y
+ ndarray[float64_t, ndim=1] out
+
+ # suppress Cython compiler warnings due to inlining
+ x = x_
+ y = y_
+
+ # need to do this first to know size of result array
+ out_index = xindex.make_union(yindex)
+ out = np.empty(out_index.npoints, dtype=np.float64)
+
+ xindices = xindex.indices
+ yindices = yindex.indices
+ out_indices = out_index.indices
+
+ # walk the two SparseVectors, adding matched locations...
+ for out_i in range(out_index.npoints):
+ if xi == xindex.npoints:
+ # use x fill value
+ out[out_i] = __truediv__(xfill, y[yi])
+ yi += 1
+ continue
+
+ if yi == yindex.npoints:
+ # use y fill value
+ out[out_i] = __truediv__(x[xi], yfill)
+ xi += 1
+ continue
+
+ xloc = xindices[xi]
+ yloc = yindices[yi]
+
+ # each index in the out_index had to come from either x, y, or both
+ if xloc == yloc:
+ out[out_i] = __truediv__(x[xi], y[yi])
+ xi += 1
+ yi += 1
+ elif xloc < yloc:
+ # use y fill value
+ out[out_i] = __truediv__(x[xi], yfill)
+ xi += 1
+ else:
+ # use x fill value
+ out[out_i] = __truediv__(xfill, y[yi])
+ yi += 1
+
+ return out, out_index, __truediv__(xfill, yfill)
+
+
+cpdef sparse_truediv_float64(ndarray[float64_t, ndim=1] x,
+ SparseIndex xindex, float64_t xfill,
+ ndarray[float64_t, ndim=1] y,
+ SparseIndex yindex, float64_t yfill):
+
+ if isinstance(xindex, BlockIndex):
+ return block_op_truediv_float64(x, xindex.to_block_index(), xfill,
+ y, yindex.to_block_index(), yfill)
+ elif isinstance(xindex, IntIndex):
+ return int_op_truediv_float64(x, xindex.to_int_index(), xfill,
+ y, yindex.to_int_index(), yfill)
+ else:
+ raise NotImplementedError
+
+
+cpdef sparse_fill_truediv_float64(float64_t xfill,
+ float64_t yfill):
+ return __truediv__(xfill, yfill)
+
+
+cdef inline tuple block_op_truediv_int64(ndarray x_,
+ BlockIndex xindex,
+ int64_t xfill,
+ ndarray y_,
+ BlockIndex yindex,
+ int64_t yfill):
+ '''
+ Binary operator on BlockIndex objects with fill values
+ '''
+
+ cdef:
+ BlockIndex out_index
+ Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices
+ int32_t xbp = 0, ybp = 0 # block positions
+ int32_t xloc, yloc
+ Py_ssize_t xblock = 0, yblock = 0 # block numbers
+
+ ndarray[int64_t, ndim=1] x, y
+ ndarray[float64_t, ndim=1] out
+
+ # to suppress Cython warning
+ x = x_
+ y = y_
+
+ out_index = xindex.make_union(yindex)
+ out = np.empty(out_index.npoints, dtype=np.float64)
+
+ # Wow, what a hack job. Need to do something about this
+
+ # walk the two SparseVectors, adding matched locations...
+ for out_i in range(out_index.npoints):
+ if yblock == yindex.nblocks:
+ # use y fill value
+ out[out_i] = __truediv__(x[xi], yfill)
+ xi += 1
+
+ # advance x location
+ xbp += 1
+ if xbp == xindex.lenbuf[xblock]:
+ xblock += 1
+ xbp = 0
+ continue
+
+ if xblock == xindex.nblocks:
+ # use x fill value
+ out[out_i] = __truediv__(xfill, y[yi])
+ yi += 1
+
+ # advance y location
+ ybp += 1
+ if ybp == yindex.lenbuf[yblock]:
+ yblock += 1
+ ybp = 0
+ continue
+
+ yloc = yindex.locbuf[yblock] + ybp
+ xloc = xindex.locbuf[xblock] + xbp
+
+ # each index in the out_index had to come from either x, y, or both
+ if xloc == yloc:
+ out[out_i] = __truediv__(x[xi], y[yi])
+ xi += 1
+ yi += 1
+
+ # advance both locations
+ xbp += 1
+ if xbp == xindex.lenbuf[xblock]:
+ xblock += 1
+ xbp = 0
+
+ ybp += 1
+ if ybp == yindex.lenbuf[yblock]:
+ yblock += 1
+ ybp = 0
+
+ elif xloc < yloc:
+ # use y fill value
+ out[out_i] = __truediv__(x[xi], yfill)
+ xi += 1
+
+ # advance x location
+ xbp += 1
+ if xbp == xindex.lenbuf[xblock]:
+ xblock += 1
+ xbp = 0
+ else:
+ # use x fill value
+ out[out_i] = __truediv__(xfill, y[yi])
+ yi += 1
+
+ # advance y location
+ ybp += 1
+ if ybp == yindex.lenbuf[yblock]:
+ yblock += 1
+ ybp = 0
+
+ return out, out_index, __truediv__(xfill, yfill)
+
+
+cdef inline tuple int_op_truediv_int64(ndarray x_, IntIndex xindex,
+ int64_t xfill,
+ ndarray y_, IntIndex yindex,
+ int64_t yfill):
+ cdef:
+ IntIndex out_index
+ Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices
+ int32_t xloc, yloc
+ ndarray[int32_t, ndim=1] xindices, yindices, out_indices
+ ndarray[int64_t, ndim=1] x, y
+ ndarray[float64_t, ndim=1] out
+
+ # suppress Cython compiler warnings due to inlining
+ x = x_
+ y = y_
+
+ # need to do this first to know size of result array
+ out_index = xindex.make_union(yindex)
+ out = np.empty(out_index.npoints, dtype=np.float64)
+
+ xindices = xindex.indices
+ yindices = yindex.indices
+ out_indices = out_index.indices
+
+ # walk the two SparseVectors, adding matched locations...
+ for out_i in range(out_index.npoints):
+ if xi == xindex.npoints:
+ # use x fill value
+ out[out_i] = __truediv__(xfill, y[yi])
+ yi += 1
+ continue
+
+ if yi == yindex.npoints:
+ # use y fill value
+ out[out_i] = __truediv__(x[xi], yfill)
+ xi += 1
+ continue
+
+ xloc = xindices[xi]
+ yloc = yindices[yi]
+
+ # each index in the out_index had to come from either x, y, or both
+ if xloc == yloc:
+ out[out_i] = __truediv__(x[xi], y[yi])
+ xi += 1
+ yi += 1
+ elif xloc < yloc:
+ # use y fill value
+ out[out_i] = __truediv__(x[xi], yfill)
+ xi += 1
+ else:
+ # use x fill value
+ out[out_i] = __truediv__(xfill, y[yi])
+ yi += 1
+
+ return out, out_index, __truediv__(xfill, yfill)
+
+
+cpdef sparse_truediv_int64(ndarray[int64_t, ndim=1] x,
+ SparseIndex xindex, int64_t xfill,
+ ndarray[int64_t, ndim=1] y,
+ SparseIndex yindex, int64_t yfill):
+
+ if isinstance(xindex, BlockIndex):
+ return block_op_truediv_int64(x, xindex.to_block_index(), xfill,
+ y, yindex.to_block_index(), yfill)
+ elif isinstance(xindex, IntIndex):
+ return int_op_truediv_int64(x, xindex.to_int_index(), xfill,
+ y, yindex.to_int_index(), yfill)
+ else:
+ raise NotImplementedError
+
+
+cpdef sparse_fill_truediv_int64(int64_t xfill,
+ int64_t yfill):
+ return __truediv__(xfill, yfill)
+
+
+cdef inline tuple block_op_floordiv_float64(ndarray x_,
+ BlockIndex xindex,
+ float64_t xfill,
+ ndarray y_,
+ BlockIndex yindex,
+ float64_t yfill):
+ '''
+ Binary operator on BlockIndex objects with fill values
+ '''
+
+ cdef:
+ BlockIndex out_index
+ Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices
+ int32_t xbp = 0, ybp = 0 # block positions
+ int32_t xloc, yloc
+ Py_ssize_t xblock = 0, yblock = 0 # block numbers
+
+ ndarray[float64_t, ndim=1] x, y
+ ndarray[float64_t, ndim=1] out
+
+ # to suppress Cython warning
+ x = x_
+ y = y_
+
+ out_index = xindex.make_union(yindex)
+ out = np.empty(out_index.npoints, dtype=np.float64)
+
+ # Wow, what a hack job. Need to do something about this
+
+ # walk the two SparseVectors, adding matched locations...
+ for out_i in range(out_index.npoints):
+ if yblock == yindex.nblocks:
+ # use y fill value
+ out[out_i] = __floordiv__(x[xi], yfill)
+ xi += 1
+
+ # advance x location
+ xbp += 1
+ if xbp == xindex.lenbuf[xblock]:
+ xblock += 1
+ xbp = 0
+ continue
+
+ if xblock == xindex.nblocks:
+ # use x fill value
+ out[out_i] = __floordiv__(xfill, y[yi])
+ yi += 1
+
+ # advance y location
+ ybp += 1
+ if ybp == yindex.lenbuf[yblock]:
+ yblock += 1
+ ybp = 0
+ continue
+
+ yloc = yindex.locbuf[yblock] + ybp
+ xloc = xindex.locbuf[xblock] + xbp
+
+ # each index in the out_index had to come from either x, y, or both
+ if xloc == yloc:
+ out[out_i] = __floordiv__(x[xi], y[yi])
+ xi += 1
+ yi += 1
+
+ # advance both locations
+ xbp += 1
+ if xbp == xindex.lenbuf[xblock]:
+ xblock += 1
+ xbp = 0
+
+ ybp += 1
+ if ybp == yindex.lenbuf[yblock]:
+ yblock += 1
+ ybp = 0
+
+ elif xloc < yloc:
+ # use y fill value
+ out[out_i] = __floordiv__(x[xi], yfill)
+ xi += 1
+
+ # advance x location
+ xbp += 1
+ if xbp == xindex.lenbuf[xblock]:
+ xblock += 1
+ xbp = 0
+ else:
+ # use x fill value
+ out[out_i] = __floordiv__(xfill, y[yi])
+ yi += 1
+
+ # advance y location
+ ybp += 1
+ if ybp == yindex.lenbuf[yblock]:
+ yblock += 1
+ ybp = 0
+
+ return out, out_index, __floordiv__(xfill, yfill)
+
+
+cdef inline tuple int_op_floordiv_float64(ndarray x_, IntIndex xindex,
+ float64_t xfill,
+ ndarray y_, IntIndex yindex,
+ float64_t yfill):
+ cdef:
+ IntIndex out_index
+ Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices
+ int32_t xloc, yloc
+ ndarray[int32_t, ndim=1] xindices, yindices, out_indices
+ ndarray[float64_t, ndim=1] x, y
+ ndarray[float64_t, ndim=1] out
+
+ # suppress Cython compiler warnings due to inlining
+ x = x_
+ y = y_
+
+ # need to do this first to know size of result array
+ out_index = xindex.make_union(yindex)
+ out = np.empty(out_index.npoints, dtype=np.float64)
+
+ xindices = xindex.indices
+ yindices = yindex.indices
+ out_indices = out_index.indices
+
+ # walk the two SparseVectors, adding matched locations...
+ for out_i in range(out_index.npoints):
+ if xi == xindex.npoints:
+ # use x fill value
+ out[out_i] = __floordiv__(xfill, y[yi])
+ yi += 1
+ continue
+
+ if yi == yindex.npoints:
+ # use y fill value
+ out[out_i] = __floordiv__(x[xi], yfill)
+ xi += 1
+ continue
+
+ xloc = xindices[xi]
+ yloc = yindices[yi]
+
+ # each index in the out_index had to come from either x, y, or both
+ if xloc == yloc:
+ out[out_i] = __floordiv__(x[xi], y[yi])
+ xi += 1
+ yi += 1
+ elif xloc < yloc:
+ # use y fill value
+ out[out_i] = __floordiv__(x[xi], yfill)
+ xi += 1
+ else:
+ # use x fill value
+ out[out_i] = __floordiv__(xfill, y[yi])
+ yi += 1
+
+ return out, out_index, __floordiv__(xfill, yfill)
+
+
+cpdef sparse_floordiv_float64(ndarray[float64_t, ndim=1] x,
+ SparseIndex xindex, float64_t xfill,
+ ndarray[float64_t, ndim=1] y,
+ SparseIndex yindex, float64_t yfill):
+
+ if isinstance(xindex, BlockIndex):
+ return block_op_floordiv_float64(x, xindex.to_block_index(), xfill,
+ y, yindex.to_block_index(), yfill)
+ elif isinstance(xindex, IntIndex):
+ return int_op_floordiv_float64(x, xindex.to_int_index(), xfill,
+ y, yindex.to_int_index(), yfill)
+ else:
+ raise NotImplementedError
+
+
+cpdef sparse_fill_floordiv_float64(float64_t xfill,
+ float64_t yfill):
+ return __floordiv__(xfill, yfill)
+
+
+cdef inline tuple block_op_floordiv_int64(ndarray x_,
+ BlockIndex xindex,
+ int64_t xfill,
+ ndarray y_,
+ BlockIndex yindex,
+ int64_t yfill):
+ '''
+ Binary operator on BlockIndex objects with fill values
+ '''
+
+ cdef:
+ BlockIndex out_index
+ Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices
+ int32_t xbp = 0, ybp = 0 # block positions
+ int32_t xloc, yloc
+ Py_ssize_t xblock = 0, yblock = 0 # block numbers
+
+ ndarray[int64_t, ndim=1] x, y
+ ndarray[int64_t, ndim=1] out
+
+ # to suppress Cython warning
+ x = x_
+ y = y_
+
+ out_index = xindex.make_union(yindex)
+ out = np.empty(out_index.npoints, dtype=np.int64)
+
+ # Wow, what a hack job. Need to do something about this
+
+ # walk the two SparseVectors, adding matched locations...
+ for out_i in range(out_index.npoints):
+ if yblock == yindex.nblocks:
+ # use y fill value
+ out[out_i] = __floordiv__(x[xi], yfill)
+ xi += 1
+
+ # advance x location
+ xbp += 1
+ if xbp == xindex.lenbuf[xblock]:
+ xblock += 1
+ xbp = 0
+ continue
+
+ if xblock == xindex.nblocks:
+ # use x fill value
+ out[out_i] = __floordiv__(xfill, y[yi])
+ yi += 1
+
+ # advance y location
+ ybp += 1
+ if ybp == yindex.lenbuf[yblock]:
+ yblock += 1
+ ybp = 0
+ continue
+
+ yloc = yindex.locbuf[yblock] + ybp
+ xloc = xindex.locbuf[xblock] + xbp
+
+ # each index in the out_index had to come from either x, y, or both
+ if xloc == yloc:
+ out[out_i] = __floordiv__(x[xi], y[yi])
+ xi += 1
+ yi += 1
+
+ # advance both locations
+ xbp += 1
+ if xbp == xindex.lenbuf[xblock]:
+ xblock += 1
+ xbp = 0
+
+ ybp += 1
+ if ybp == yindex.lenbuf[yblock]:
+ yblock += 1
+ ybp = 0
+
+ elif xloc < yloc:
+ # use y fill value
+ out[out_i] = __floordiv__(x[xi], yfill)
+ xi += 1
+
+ # advance x location
+ xbp += 1
+ if xbp == xindex.lenbuf[xblock]:
+ xblock += 1
+ xbp = 0
+ else:
+ # use x fill value
+ out[out_i] = __floordiv__(xfill, y[yi])
+ yi += 1
+
+ # advance y location
+ ybp += 1
+ if ybp == yindex.lenbuf[yblock]:
+ yblock += 1
+ ybp = 0
+
+ return out, out_index, __floordiv__(xfill, yfill)
+
+
+cdef inline tuple int_op_floordiv_int64(ndarray x_, IntIndex xindex,
+ int64_t xfill,
+ ndarray y_, IntIndex yindex,
+ int64_t yfill):
+ cdef:
+ IntIndex out_index
+ Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices
+ int32_t xloc, yloc
+ ndarray[int32_t, ndim=1] xindices, yindices, out_indices
+ ndarray[int64_t, ndim=1] x, y
+ ndarray[int64_t, ndim=1] out
+
+ # suppress Cython compiler warnings due to inlining
+ x = x_
+ y = y_
+
+ # need to do this first to know size of result array
+ out_index = xindex.make_union(yindex)
+ out = np.empty(out_index.npoints, dtype=np.int64)
+
+ xindices = xindex.indices
+ yindices = yindex.indices
+ out_indices = out_index.indices
+
+ # walk the two SparseVectors, adding matched locations...
+ for out_i in range(out_index.npoints):
+ if xi == xindex.npoints:
+ # use x fill value
+ out[out_i] = __floordiv__(xfill, y[yi])
+ yi += 1
+ continue
+
+ if yi == yindex.npoints:
+ # use y fill value
+ out[out_i] = __floordiv__(x[xi], yfill)
+ xi += 1
+ continue
+
+ xloc = xindices[xi]
+ yloc = yindices[yi]
+
+ # each index in the out_index had to come from either x, y, or both
+ if xloc == yloc:
+ out[out_i] = __floordiv__(x[xi], y[yi])
+ xi += 1
+ yi += 1
+ elif xloc < yloc:
+ # use y fill value
+ out[out_i] = __floordiv__(x[xi], yfill)
+ xi += 1
+ else:
+ # use x fill value
+ out[out_i] = __floordiv__(xfill, y[yi])
+ yi += 1
+
+ return out, out_index, __floordiv__(xfill, yfill)
+
+
+cpdef sparse_floordiv_int64(ndarray[int64_t, ndim=1] x,
+ SparseIndex xindex, int64_t xfill,
+ ndarray[int64_t, ndim=1] y,
+ SparseIndex yindex, int64_t yfill):
+
+ if isinstance(xindex, BlockIndex):
+ return block_op_floordiv_int64(x, xindex.to_block_index(), xfill,
+ y, yindex.to_block_index(), yfill)
+ elif isinstance(xindex, IntIndex):
+ return int_op_floordiv_int64(x, xindex.to_int_index(), xfill,
+ y, yindex.to_int_index(), yfill)
+ else:
+ raise NotImplementedError
+
+
+cpdef sparse_fill_floordiv_int64(int64_t xfill,
+ int64_t yfill):
+ return __floordiv__(xfill, yfill)
+
+
+cdef inline tuple block_op_pow_float64(ndarray x_,
+ BlockIndex xindex,
+ float64_t xfill,
+ ndarray y_,
+ BlockIndex yindex,
+ float64_t yfill):
+ '''
+ Binary operator on BlockIndex objects with fill values
+ '''
+
+ cdef:
+ BlockIndex out_index
+ Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices
+ int32_t xbp = 0, ybp = 0 # block positions
+ int32_t xloc, yloc
+ Py_ssize_t xblock = 0, yblock = 0 # block numbers
+
+ ndarray[float64_t, ndim=1] x, y
+ ndarray[float64_t, ndim=1] out
+
+ # to suppress Cython warning
+ x = x_
+ y = y_
+
+ out_index = xindex.make_union(yindex)
+ out = np.empty(out_index.npoints, dtype=np.float64)
+
+ # Wow, what a hack job. Need to do something about this
+
+ # walk the two SparseVectors, adding matched locations...
+ for out_i in range(out_index.npoints):
+ if yblock == yindex.nblocks:
+ # use y fill value
+ out[out_i] = x[xi] ** yfill
+ xi += 1
+
+ # advance x location
+ xbp += 1
+ if xbp == xindex.lenbuf[xblock]:
+ xblock += 1
+ xbp = 0
+ continue
+
+ if xblock == xindex.nblocks:
+ # use x fill value
+ out[out_i] = xfill ** y[yi]
+ yi += 1
+
+ # advance y location
+ ybp += 1
+ if ybp == yindex.lenbuf[yblock]:
+ yblock += 1
+ ybp = 0
+ continue
+
+ yloc = yindex.locbuf[yblock] + ybp
+ xloc = xindex.locbuf[xblock] + xbp
+
+ # each index in the out_index had to come from either x, y, or both
+ if xloc == yloc:
+ out[out_i] = x[xi] ** y[yi]
+ xi += 1
+ yi += 1
+
+ # advance both locations
+ xbp += 1
+ if xbp == xindex.lenbuf[xblock]:
+ xblock += 1
+ xbp = 0
+
+ ybp += 1
+ if ybp == yindex.lenbuf[yblock]:
+ yblock += 1
+ ybp = 0
+
+ elif xloc < yloc:
+ # use y fill value
+ out[out_i] = x[xi] ** yfill
+ xi += 1
+
+ # advance x location
+ xbp += 1
+ if xbp == xindex.lenbuf[xblock]:
+ xblock += 1
+ xbp = 0
+ else:
+ # use x fill value
+ out[out_i] = xfill ** y[yi]
+ yi += 1
+
+ # advance y location
+ ybp += 1
+ if ybp == yindex.lenbuf[yblock]:
+ yblock += 1
+ ybp = 0
+
+ return out, out_index, xfill ** yfill
+
+
+cdef inline tuple int_op_pow_float64(ndarray x_, IntIndex xindex,
+ float64_t xfill,
+ ndarray y_, IntIndex yindex,
+ float64_t yfill):
+ cdef:
+ IntIndex out_index
+ Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices
+ int32_t xloc, yloc
+ ndarray[int32_t, ndim=1] xindices, yindices, out_indices
+ ndarray[float64_t, ndim=1] x, y
+ ndarray[float64_t, ndim=1] out
+
+ # suppress Cython compiler warnings due to inlining
+ x = x_
+ y = y_
+
+ # need to do this first to know size of result array
+ out_index = xindex.make_union(yindex)
+ out = np.empty(out_index.npoints, dtype=np.float64)
+
+ xindices = xindex.indices
+ yindices = yindex.indices
+ out_indices = out_index.indices
+
+ # walk the two SparseVectors, adding matched locations...
+ for out_i in range(out_index.npoints):
+ if xi == xindex.npoints:
+ # use x fill value
+ out[out_i] = xfill ** y[yi]
+ yi += 1
+ continue
+
+ if yi == yindex.npoints:
+ # use y fill value
+ out[out_i] = x[xi] ** yfill
+ xi += 1
+ continue
+
+ xloc = xindices[xi]
+ yloc = yindices[yi]
+
+ # each index in the out_index had to come from either x, y, or both
+ if xloc == yloc:
+ out[out_i] = x[xi] ** y[yi]
+ xi += 1
+ yi += 1
+ elif xloc < yloc:
+ # use y fill value
+ out[out_i] = x[xi] ** yfill
+ xi += 1
+ else:
+ # use x fill value
+ out[out_i] = xfill ** y[yi]
+ yi += 1
+
+ return out, out_index, xfill ** yfill
+
+
+cpdef sparse_pow_float64(ndarray[float64_t, ndim=1] x,
+ SparseIndex xindex, float64_t xfill,
+ ndarray[float64_t, ndim=1] y,
+ SparseIndex yindex, float64_t yfill):
+
+ if isinstance(xindex, BlockIndex):
+ return block_op_pow_float64(x, xindex.to_block_index(), xfill,
+ y, yindex.to_block_index(), yfill)
+ elif isinstance(xindex, IntIndex):
+ return int_op_pow_float64(x, xindex.to_int_index(), xfill,
+ y, yindex.to_int_index(), yfill)
+ else:
+ raise NotImplementedError
+
+
+cpdef sparse_fill_pow_float64(float64_t xfill,
+ float64_t yfill):
+ return xfill ** yfill
+
+
+cdef inline tuple block_op_pow_int64(ndarray x_,
+ BlockIndex xindex,
+ int64_t xfill,
+ ndarray y_,
+ BlockIndex yindex,
+ int64_t yfill):
+ '''
+ Binary operator on BlockIndex objects with fill values
+ '''
+
+ cdef:
+ BlockIndex out_index
+ Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices
+ int32_t xbp = 0, ybp = 0 # block positions
+ int32_t xloc, yloc
+ Py_ssize_t xblock = 0, yblock = 0 # block numbers
+
+ ndarray[int64_t, ndim=1] x, y
+ ndarray[int64_t, ndim=1] out
+
+ # to suppress Cython warning
+ x = x_
+ y = y_
+
+ out_index = xindex.make_union(yindex)
+ out = np.empty(out_index.npoints, dtype=np.int64)
+
+ # Wow, what a hack job. Need to do something about this
+
+ # walk the two SparseVectors, adding matched locations...
+ for out_i in range(out_index.npoints):
+ if yblock == yindex.nblocks:
+ # use y fill value
+ out[out_i] = x[xi] ** yfill
+ xi += 1
+
+ # advance x location
+ xbp += 1
+ if xbp == xindex.lenbuf[xblock]:
+ xblock += 1
+ xbp = 0
+ continue
+
+ if xblock == xindex.nblocks:
+ # use x fill value
+ out[out_i] = xfill ** y[yi]
+ yi += 1
+
+ # advance y location
+ ybp += 1
+ if ybp == yindex.lenbuf[yblock]:
+ yblock += 1
+ ybp = 0
+ continue
+
+ yloc = yindex.locbuf[yblock] + ybp
+ xloc = xindex.locbuf[xblock] + xbp
+
+ # each index in the out_index had to come from either x, y, or both
+ if xloc == yloc:
+ out[out_i] = x[xi] ** y[yi]
+ xi += 1
+ yi += 1
+
+ # advance both locations
+ xbp += 1
+ if xbp == xindex.lenbuf[xblock]:
+ xblock += 1
+ xbp = 0
+
+ ybp += 1
+ if ybp == yindex.lenbuf[yblock]:
+ yblock += 1
+ ybp = 0
+
+ elif xloc < yloc:
+ # use y fill value
+ out[out_i] = x[xi] ** yfill
+ xi += 1
+
+ # advance x location
+ xbp += 1
+ if xbp == xindex.lenbuf[xblock]:
+ xblock += 1
+ xbp = 0
+ else:
+ # use x fill value
+ out[out_i] = xfill ** y[yi]
+ yi += 1
+
+ # advance y location
+ ybp += 1
+ if ybp == yindex.lenbuf[yblock]:
+ yblock += 1
+ ybp = 0
+
+ return out, out_index, xfill ** yfill
+
+
+cdef inline tuple int_op_pow_int64(ndarray x_, IntIndex xindex,
+ int64_t xfill,
+ ndarray y_, IntIndex yindex,
+ int64_t yfill):
+ cdef:
+ IntIndex out_index
+ Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices
+ int32_t xloc, yloc
+ ndarray[int32_t, ndim=1] xindices, yindices, out_indices
+ ndarray[int64_t, ndim=1] x, y
+ ndarray[int64_t, ndim=1] out
+
+ # suppress Cython compiler warnings due to inlining
+ x = x_
+ y = y_
+
+ # need to do this first to know size of result array
+ out_index = xindex.make_union(yindex)
+ out = np.empty(out_index.npoints, dtype=np.int64)
+
+ xindices = xindex.indices
+ yindices = yindex.indices
+ out_indices = out_index.indices
+
+ # walk the two SparseVectors, adding matched locations...
+ for out_i in range(out_index.npoints):
+ if xi == xindex.npoints:
+ # use x fill value
+ out[out_i] = xfill ** y[yi]
+ yi += 1
+ continue
+
+ if yi == yindex.npoints:
+ # use y fill value
+ out[out_i] = x[xi] ** yfill
+ xi += 1
+ continue
+
+ xloc = xindices[xi]
+ yloc = yindices[yi]
+
+ # each index in the out_index had to come from either x, y, or both
+ if xloc == yloc:
+ out[out_i] = x[xi] ** y[yi]
+ xi += 1
+ yi += 1
+ elif xloc < yloc:
+ # use y fill value
+ out[out_i] = x[xi] ** yfill
+ xi += 1
+ else:
+ # use x fill value
+ out[out_i] = xfill ** y[yi]
+ yi += 1
+
+ return out, out_index, xfill ** yfill
+
+
+cpdef sparse_pow_int64(ndarray[int64_t, ndim=1] x,
+ SparseIndex xindex, int64_t xfill,
+ ndarray[int64_t, ndim=1] y,
+ SparseIndex yindex, int64_t yfill):
+
+ if isinstance(xindex, BlockIndex):
+ return block_op_pow_int64(x, xindex.to_block_index(), xfill,
+ y, yindex.to_block_index(), yfill)
+ elif isinstance(xindex, IntIndex):
+ return int_op_pow_int64(x, xindex.to_int_index(), xfill,
+ y, yindex.to_int_index(), yfill)
+ else:
+ raise NotImplementedError
+
+
+cpdef sparse_fill_pow_int64(int64_t xfill,
+ int64_t yfill):
+ return xfill ** yfill
+
+
+cdef inline tuple block_op_eq_float64(ndarray x_,
+ BlockIndex xindex,
+ float64_t xfill,
+ ndarray y_,
+ BlockIndex yindex,
+ float64_t yfill):
+ '''
+ Binary operator on BlockIndex objects with fill values
+ '''
+
+ cdef:
+ BlockIndex out_index
+ Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices
+ int32_t xbp = 0, ybp = 0 # block positions
+ int32_t xloc, yloc
+ Py_ssize_t xblock = 0, yblock = 0 # block numbers
+
+ ndarray[float64_t, ndim=1] x, y
+ ndarray[uint8_t, ndim=1] out
+
+ # to suppress Cython warning
+ x = x_
+ y = y_
+
+ out_index = xindex.make_union(yindex)
+ out = np.empty(out_index.npoints, dtype=np.uint8)
+
+ # Wow, what a hack job. Need to do something about this
+
+ # walk the two SparseVectors, adding matched locations...
+ for out_i in range(out_index.npoints):
+ if yblock == yindex.nblocks:
+ # use y fill value
+ out[out_i] = x[xi] == yfill
+ xi += 1
+
+ # advance x location
+ xbp += 1
+ if xbp == xindex.lenbuf[xblock]:
+ xblock += 1
+ xbp = 0
+ continue
+
+ if xblock == xindex.nblocks:
+ # use x fill value
+ out[out_i] = xfill == y[yi]
+ yi += 1
+
+ # advance y location
+ ybp += 1
+ if ybp == yindex.lenbuf[yblock]:
+ yblock += 1
+ ybp = 0
+ continue
+
+ yloc = yindex.locbuf[yblock] + ybp
+ xloc = xindex.locbuf[xblock] + xbp
+
+ # each index in the out_index had to come from either x, y, or both
+ if xloc == yloc:
+ out[out_i] = x[xi] == y[yi]
+ xi += 1
+ yi += 1
+
+ # advance both locations
+ xbp += 1
+ if xbp == xindex.lenbuf[xblock]:
+ xblock += 1
+ xbp = 0
+
+ ybp += 1
+ if ybp == yindex.lenbuf[yblock]:
+ yblock += 1
+ ybp = 0
+
+ elif xloc < yloc:
+ # use y fill value
+ out[out_i] = x[xi] == yfill
+ xi += 1
+
+ # advance x location
+ xbp += 1
+ if xbp == xindex.lenbuf[xblock]:
+ xblock += 1
+ xbp = 0
+ else:
+ # use x fill value
+ out[out_i] = xfill == y[yi]
+ yi += 1
+
+ # advance y location
+ ybp += 1
+ if ybp == yindex.lenbuf[yblock]:
+ yblock += 1
+ ybp = 0
+
+ return out, out_index, xfill == yfill
+
+
+cdef inline tuple int_op_eq_float64(ndarray x_, IntIndex xindex,
+ float64_t xfill,
+ ndarray y_, IntIndex yindex,
+ float64_t yfill):
+ cdef:
+ IntIndex out_index
+ Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices
+ int32_t xloc, yloc
+ ndarray[int32_t, ndim=1] xindices, yindices, out_indices
+ ndarray[float64_t, ndim=1] x, y
+ ndarray[uint8_t, ndim=1] out
+
+ # suppress Cython compiler warnings due to inlining
+ x = x_
+ y = y_
+
+ # need to do this first to know size of result array
+ out_index = xindex.make_union(yindex)
+ out = np.empty(out_index.npoints, dtype=np.uint8)
+
+ xindices = xindex.indices
+ yindices = yindex.indices
+ out_indices = out_index.indices
+
+ # walk the two SparseVectors, adding matched locations...
+ for out_i in range(out_index.npoints):
+ if xi == xindex.npoints:
+ # use x fill value
+ out[out_i] = xfill == y[yi]
+ yi += 1
+ continue
+
+ if yi == yindex.npoints:
+ # use y fill value
+ out[out_i] = x[xi] == yfill
+ xi += 1
+ continue
+
+ xloc = xindices[xi]
+ yloc = yindices[yi]
+
+ # each index in the out_index had to come from either x, y, or both
+ if xloc == yloc:
+ out[out_i] = x[xi] == y[yi]
+ xi += 1
+ yi += 1
+ elif xloc < yloc:
+ # use y fill value
+ out[out_i] = x[xi] == yfill
+ xi += 1
+ else:
+ # use x fill value
+ out[out_i] = xfill == y[yi]
+ yi += 1
+
+ return out, out_index, xfill == yfill
+
+
+cpdef sparse_eq_float64(ndarray[float64_t, ndim=1] x,
+ SparseIndex xindex, float64_t xfill,
+ ndarray[float64_t, ndim=1] y,
+ SparseIndex yindex, float64_t yfill):
+
+ if isinstance(xindex, BlockIndex):
+ return block_op_eq_float64(x, xindex.to_block_index(), xfill,
+ y, yindex.to_block_index(), yfill)
+ elif isinstance(xindex, IntIndex):
+ return int_op_eq_float64(x, xindex.to_int_index(), xfill,
+ y, yindex.to_int_index(), yfill)
+ else:
+ raise NotImplementedError
+
+
+cpdef sparse_fill_eq_float64(float64_t xfill,
+ float64_t yfill):
+ return xfill == yfill
+
+
+cdef inline tuple block_op_eq_int64(ndarray x_,
+ BlockIndex xindex,
+ int64_t xfill,
+ ndarray y_,
+ BlockIndex yindex,
+ int64_t yfill):
+ '''
+ Binary operator on BlockIndex objects with fill values
+ '''
+
+ cdef:
+ BlockIndex out_index
+ Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices
+ int32_t xbp = 0, ybp = 0 # block positions
+ int32_t xloc, yloc
+ Py_ssize_t xblock = 0, yblock = 0 # block numbers
+
+ ndarray[int64_t, ndim=1] x, y
+ ndarray[uint8_t, ndim=1] out
+
+ # to suppress Cython warning
+ x = x_
+ y = y_
+
+ out_index = xindex.make_union(yindex)
+ out = np.empty(out_index.npoints, dtype=np.uint8)
+
+ # Wow, what a hack job. Need to do something about this
+
+ # walk the two SparseVectors, adding matched locations...
+ for out_i in range(out_index.npoints):
+ if yblock == yindex.nblocks:
+ # use y fill value
+ out[out_i] = x[xi] == yfill
+ xi += 1
+
+ # advance x location
+ xbp += 1
+ if xbp == xindex.lenbuf[xblock]:
+ xblock += 1
+ xbp = 0
+ continue
+
+ if xblock == xindex.nblocks:
+ # use x fill value
+ out[out_i] = xfill == y[yi]
+ yi += 1
+
+ # advance y location
+ ybp += 1
+ if ybp == yindex.lenbuf[yblock]:
+ yblock += 1
+ ybp = 0
+ continue
+
+ yloc = yindex.locbuf[yblock] + ybp
+ xloc = xindex.locbuf[xblock] + xbp
+
+ # each index in the out_index had to come from either x, y, or both
+ if xloc == yloc:
+ out[out_i] = x[xi] == y[yi]
+ xi += 1
+ yi += 1
+
+ # advance both locations
+ xbp += 1
+ if xbp == xindex.lenbuf[xblock]:
+ xblock += 1
+ xbp = 0
+
+ ybp += 1
+ if ybp == yindex.lenbuf[yblock]:
+ yblock += 1
+ ybp = 0
+
+ elif xloc < yloc:
+ # use y fill value
+ out[out_i] = x[xi] == yfill
+ xi += 1
+
+ # advance x location
+ xbp += 1
+ if xbp == xindex.lenbuf[xblock]:
+ xblock += 1
+ xbp = 0
+ else:
+ # use x fill value
+ out[out_i] = xfill == y[yi]
+ yi += 1
+
+ # advance y location
+ ybp += 1
+ if ybp == yindex.lenbuf[yblock]:
+ yblock += 1
+ ybp = 0
+
+ return out, out_index, xfill == yfill
+
+
+cdef inline tuple int_op_eq_int64(ndarray x_, IntIndex xindex,
+ int64_t xfill,
+ ndarray y_, IntIndex yindex,
+ int64_t yfill):
+ cdef:
+ IntIndex out_index
+ Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices
+ int32_t xloc, yloc
+ ndarray[int32_t, ndim=1] xindices, yindices, out_indices
+ ndarray[int64_t, ndim=1] x, y
+ ndarray[uint8_t, ndim=1] out
+
+ # suppress Cython compiler warnings due to inlining
+ x = x_
+ y = y_
+
+ # need to do this first to know size of result array
+ out_index = xindex.make_union(yindex)
+ out = np.empty(out_index.npoints, dtype=np.uint8)
+
+ xindices = xindex.indices
+ yindices = yindex.indices
+ out_indices = out_index.indices
+
+ # walk the two SparseVectors, adding matched locations...
+ for out_i in range(out_index.npoints):
+ if xi == xindex.npoints:
+ # use x fill value
+ out[out_i] = xfill == y[yi]
+ yi += 1
+ continue
+
+ if yi == yindex.npoints:
+ # use y fill value
+ out[out_i] = x[xi] == yfill
+ xi += 1
+ continue
+
+ xloc = xindices[xi]
+ yloc = yindices[yi]
+
+ # each index in the out_index had to come from either x, y, or both
+ if xloc == yloc:
+ out[out_i] = x[xi] == y[yi]
+ xi += 1
+ yi += 1
+ elif xloc < yloc:
+ # use y fill value
+ out[out_i] = x[xi] == yfill
+ xi += 1
+ else:
+ # use x fill value
+ out[out_i] = xfill == y[yi]
+ yi += 1
+
+ return out, out_index, xfill == yfill
+
+
+cpdef sparse_eq_int64(ndarray[int64_t, ndim=1] x,
+ SparseIndex xindex, int64_t xfill,
+ ndarray[int64_t, ndim=1] y,
+ SparseIndex yindex, int64_t yfill):
+
+ if isinstance(xindex, BlockIndex):
+ return block_op_eq_int64(x, xindex.to_block_index(), xfill,
+ y, yindex.to_block_index(), yfill)
+ elif isinstance(xindex, IntIndex):
+ return int_op_eq_int64(x, xindex.to_int_index(), xfill,
+ y, yindex.to_int_index(), yfill)
+ else:
+ raise NotImplementedError
+
+
+cpdef sparse_fill_eq_int64(int64_t xfill,
+ int64_t yfill):
+ return xfill == yfill
+
+
+cdef inline tuple block_op_ne_float64(ndarray x_,
+ BlockIndex xindex,
+ float64_t xfill,
+ ndarray y_,
+ BlockIndex yindex,
+ float64_t yfill):
+ '''
+ Binary operator on BlockIndex objects with fill values
+ '''
+
+ cdef:
+ BlockIndex out_index
+ Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices
+ int32_t xbp = 0, ybp = 0 # block positions
+ int32_t xloc, yloc
+ Py_ssize_t xblock = 0, yblock = 0 # block numbers
+
+ ndarray[float64_t, ndim=1] x, y
+ ndarray[uint8_t, ndim=1] out
+
+ # to suppress Cython warning
+ x = x_
+ y = y_
+
+ out_index = xindex.make_union(yindex)
+ out = np.empty(out_index.npoints, dtype=np.uint8)
+
+ # Wow, what a hack job. Need to do something about this
+
+ # walk the two SparseVectors, adding matched locations...
+ for out_i in range(out_index.npoints):
+ if yblock == yindex.nblocks:
+ # use y fill value
+ out[out_i] = x[xi] != yfill
+ xi += 1
+
+ # advance x location
+ xbp += 1
+ if xbp == xindex.lenbuf[xblock]:
+ xblock += 1
+ xbp = 0
+ continue
+
+ if xblock == xindex.nblocks:
+ # use x fill value
+ out[out_i] = xfill != y[yi]
+ yi += 1
+
+ # advance y location
+ ybp += 1
+ if ybp == yindex.lenbuf[yblock]:
+ yblock += 1
+ ybp = 0
+ continue
+
+ yloc = yindex.locbuf[yblock] + ybp
+ xloc = xindex.locbuf[xblock] + xbp
+
+ # each index in the out_index had to come from either x, y, or both
+ if xloc == yloc:
+ out[out_i] = x[xi] != y[yi]
+ xi += 1
+ yi += 1
+
+ # advance both locations
+ xbp += 1
+ if xbp == xindex.lenbuf[xblock]:
+ xblock += 1
+ xbp = 0
+
+ ybp += 1
+ if ybp == yindex.lenbuf[yblock]:
+ yblock += 1
+ ybp = 0
+
+ elif xloc < yloc:
+ # use y fill value
+ out[out_i] = x[xi] != yfill
+ xi += 1
+
+ # advance x location
+ xbp += 1
+ if xbp == xindex.lenbuf[xblock]:
+ xblock += 1
+ xbp = 0
+ else:
+ # use x fill value
+ out[out_i] = xfill != y[yi]
+ yi += 1
+
+ # advance y location
+ ybp += 1
+ if ybp == yindex.lenbuf[yblock]:
+ yblock += 1
+ ybp = 0
+
+ return out, out_index, xfill != yfill
+
+
+cdef inline tuple int_op_ne_float64(ndarray x_, IntIndex xindex,
+ float64_t xfill,
+ ndarray y_, IntIndex yindex,
+ float64_t yfill):
+ cdef:
+ IntIndex out_index
+ Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices
+ int32_t xloc, yloc
+ ndarray[int32_t, ndim=1] xindices, yindices, out_indices
+ ndarray[float64_t, ndim=1] x, y
+ ndarray[uint8_t, ndim=1] out
+
+ # suppress Cython compiler warnings due to inlining
+ x = x_
+ y = y_
+
+ # need to do this first to know size of result array
+ out_index = xindex.make_union(yindex)
+ out = np.empty(out_index.npoints, dtype=np.uint8)
+
+ xindices = xindex.indices
+ yindices = yindex.indices
+ out_indices = out_index.indices
+
+ # walk the two SparseVectors, adding matched locations...
+ for out_i in range(out_index.npoints):
+ if xi == xindex.npoints:
+ # use x fill value
+ out[out_i] = xfill != y[yi]
+ yi += 1
+ continue
+
+ if yi == yindex.npoints:
+ # use y fill value
+ out[out_i] = x[xi] != yfill
+ xi += 1
+ continue
+
+ xloc = xindices[xi]
+ yloc = yindices[yi]
+
+ # each index in the out_index had to come from either x, y, or both
+ if xloc == yloc:
+ out[out_i] = x[xi] != y[yi]
+ xi += 1
+ yi += 1
+ elif xloc < yloc:
+ # use y fill value
+ out[out_i] = x[xi] != yfill
+ xi += 1
+ else:
+ # use x fill value
+ out[out_i] = xfill != y[yi]
+ yi += 1
+
+ return out, out_index, xfill != yfill
+
+
+cpdef sparse_ne_float64(ndarray[float64_t, ndim=1] x,
+ SparseIndex xindex, float64_t xfill,
+ ndarray[float64_t, ndim=1] y,
+ SparseIndex yindex, float64_t yfill):
+
+ if isinstance(xindex, BlockIndex):
+ return block_op_ne_float64(x, xindex.to_block_index(), xfill,
+ y, yindex.to_block_index(), yfill)
+ elif isinstance(xindex, IntIndex):
+ return int_op_ne_float64(x, xindex.to_int_index(), xfill,
+ y, yindex.to_int_index(), yfill)
+ else:
+ raise NotImplementedError
+
+
+cpdef sparse_fill_ne_float64(float64_t xfill,
+ float64_t yfill):
+ return xfill != yfill
+
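+# Usage sketch (illustrative only): every sparse_<op>_<dtype> wrapper in this
+# file follows the pattern of sparse_ne_float64 above: dispatch on the
+# concrete index type (BlockIndex vs IntIndex), walk the union of the two
+# indices, and return a (values, union_index, fill_result) triple whose third
+# element is the operator applied to the two fill values. Assuming IntIndex
+# from this module, calling the cpdef wrapper from Python looks like:
+#
+#     import numpy as np
+#     x = np.array([1.0, 2.0], dtype=np.float64)   # values at positions 0, 2
+#     y = np.array([2.0, 3.0], dtype=np.float64)   # values at positions 2, 3
+#     xi = IntIndex(4, np.array([0, 2], dtype=np.int32))
+#     yi = IntIndex(4, np.array([2, 3], dtype=np.int32))
+#     out, out_index, out_fill = sparse_ne_float64(x, xi, 0.0, y, yi, 0.0)
+#     # out -> uint8 [1, 0, 1] at union positions [0, 2, 3]
+#     # out_fill -> False (0.0 != 0.0)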
+
+cdef inline tuple block_op_ne_int64(ndarray x_,
+ BlockIndex xindex,
+ int64_t xfill,
+ ndarray y_,
+ BlockIndex yindex,
+ int64_t yfill):
+ '''
+ Binary operator on BlockIndex objects with fill values
+ '''
+
+ cdef:
+ BlockIndex out_index
+ Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices
+ int32_t xbp = 0, ybp = 0 # block positions
+ int32_t xloc, yloc
+ Py_ssize_t xblock = 0, yblock = 0 # block numbers
+
+ ndarray[int64_t, ndim=1] x, y
+ ndarray[uint8_t, ndim=1] out
+
+ # to suppress Cython warning
+ x = x_
+ y = y_
+
+ out_index = xindex.make_union(yindex)
+ out = np.empty(out_index.npoints, dtype=np.uint8)
+
+ # Wow, what a hack job. Need to do something about this
+
+ # walk the two SparseVectors, adding matched locations...
+ for out_i in range(out_index.npoints):
+ if yblock == yindex.nblocks:
+ # use y fill value
+ out[out_i] = x[xi] != yfill
+ xi += 1
+
+ # advance x location
+ xbp += 1
+ if xbp == xindex.lenbuf[xblock]:
+ xblock += 1
+ xbp = 0
+ continue
+
+ if xblock == xindex.nblocks:
+ # use x fill value
+ out[out_i] = xfill != y[yi]
+ yi += 1
+
+ # advance y location
+ ybp += 1
+ if ybp == yindex.lenbuf[yblock]:
+ yblock += 1
+ ybp = 0
+ continue
+
+ yloc = yindex.locbuf[yblock] + ybp
+ xloc = xindex.locbuf[xblock] + xbp
+
+ # each index in the out_index had to come from either x, y, or both
+ if xloc == yloc:
+ out[out_i] = x[xi] != y[yi]
+ xi += 1
+ yi += 1
+
+ # advance both locations
+ xbp += 1
+ if xbp == xindex.lenbuf[xblock]:
+ xblock += 1
+ xbp = 0
+
+ ybp += 1
+ if ybp == yindex.lenbuf[yblock]:
+ yblock += 1
+ ybp = 0
+
+ elif xloc < yloc:
+ # use y fill value
+ out[out_i] = x[xi] != yfill
+ xi += 1
+
+ # advance x location
+ xbp += 1
+ if xbp == xindex.lenbuf[xblock]:
+ xblock += 1
+ xbp = 0
+ else:
+ # use x fill value
+ out[out_i] = xfill != y[yi]
+ yi += 1
+
+ # advance y location
+ ybp += 1
+ if ybp == yindex.lenbuf[yblock]:
+ yblock += 1
+ ybp = 0
+
+ return out, out_index, xfill != yfill
+
+
+cdef inline tuple int_op_ne_int64(ndarray x_, IntIndex xindex,
+ int64_t xfill,
+ ndarray y_, IntIndex yindex,
+ int64_t yfill):
+ cdef:
+ IntIndex out_index
+ Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices
+ int32_t xloc, yloc
+ ndarray[int32_t, ndim=1] xindices, yindices, out_indices
+ ndarray[int64_t, ndim=1] x, y
+ ndarray[uint8_t, ndim=1] out
+
+ # suppress Cython compiler warnings due to inlining
+ x = x_
+ y = y_
+
+ # need to do this first to know size of result array
+ out_index = xindex.make_union(yindex)
+ out = np.empty(out_index.npoints, dtype=np.uint8)
+
+ xindices = xindex.indices
+ yindices = yindex.indices
+ out_indices = out_index.indices
+
+ # walk the two SparseVectors, adding matched locations...
+ for out_i in range(out_index.npoints):
+ if xi == xindex.npoints:
+ # use x fill value
+ out[out_i] = xfill != y[yi]
+ yi += 1
+ continue
+
+ if yi == yindex.npoints:
+ # use y fill value
+ out[out_i] = x[xi] != yfill
+ xi += 1
+ continue
+
+ xloc = xindices[xi]
+ yloc = yindices[yi]
+
+ # each index in the out_index had to come from either x, y, or both
+ if xloc == yloc:
+ out[out_i] = x[xi] != y[yi]
+ xi += 1
+ yi += 1
+ elif xloc < yloc:
+ # use y fill value
+ out[out_i] = x[xi] != yfill
+ xi += 1
+ else:
+ # use x fill value
+ out[out_i] = xfill != y[yi]
+ yi += 1
+
+ return out, out_index, xfill != yfill
+
+
+cpdef sparse_ne_int64(ndarray[int64_t, ndim=1] x,
+ SparseIndex xindex, int64_t xfill,
+ ndarray[int64_t, ndim=1] y,
+ SparseIndex yindex, int64_t yfill):
+
+ if isinstance(xindex, BlockIndex):
+ return block_op_ne_int64(x, xindex.to_block_index(), xfill,
+ y, yindex.to_block_index(), yfill)
+ elif isinstance(xindex, IntIndex):
+ return int_op_ne_int64(x, xindex.to_int_index(), xfill,
+ y, yindex.to_int_index(), yfill)
+ else:
+ raise NotImplementedError
+
+
+cpdef sparse_fill_ne_int64(int64_t xfill,
+ int64_t yfill):
+ return xfill != yfill
+
+
+cdef inline tuple block_op_lt_float64(ndarray x_,
+ BlockIndex xindex,
+ float64_t xfill,
+ ndarray y_,
+ BlockIndex yindex,
+ float64_t yfill):
+ '''
+ Binary operator on BlockIndex objects with fill values
+ '''
+
+ cdef:
+ BlockIndex out_index
+ Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices
+ int32_t xbp = 0, ybp = 0 # block positions
+ int32_t xloc, yloc
+ Py_ssize_t xblock = 0, yblock = 0 # block numbers
+
+ ndarray[float64_t, ndim=1] x, y
+ ndarray[uint8_t, ndim=1] out
+
+ # to suppress Cython warning
+ x = x_
+ y = y_
+
+ out_index = xindex.make_union(yindex)
+ out = np.empty(out_index.npoints, dtype=np.uint8)
+
+ # Wow, what a hack job. Need to do something about this
+
+ # walk the two SparseVectors, adding matched locations...
+ for out_i in range(out_index.npoints):
+ if yblock == yindex.nblocks:
+ # use y fill value
+ out[out_i] = x[xi] < yfill
+ xi += 1
+
+ # advance x location
+ xbp += 1
+ if xbp == xindex.lenbuf[xblock]:
+ xblock += 1
+ xbp = 0
+ continue
+
+ if xblock == xindex.nblocks:
+ # use x fill value
+ out[out_i] = xfill < y[yi]
+ yi += 1
+
+ # advance y location
+ ybp += 1
+ if ybp == yindex.lenbuf[yblock]:
+ yblock += 1
+ ybp = 0
+ continue
+
+ yloc = yindex.locbuf[yblock] + ybp
+ xloc = xindex.locbuf[xblock] + xbp
+
+ # each index in the out_index had to come from either x, y, or both
+ if xloc == yloc:
+ out[out_i] = x[xi] < y[yi]
+ xi += 1
+ yi += 1
+
+ # advance both locations
+ xbp += 1
+ if xbp == xindex.lenbuf[xblock]:
+ xblock += 1
+ xbp = 0
+
+ ybp += 1
+ if ybp == yindex.lenbuf[yblock]:
+ yblock += 1
+ ybp = 0
+
+ elif xloc < yloc:
+ # use y fill value
+ out[out_i] = x[xi] < yfill
+ xi += 1
+
+ # advance x location
+ xbp += 1
+ if xbp == xindex.lenbuf[xblock]:
+ xblock += 1
+ xbp = 0
+ else:
+ # use x fill value
+ out[out_i] = xfill < y[yi]
+ yi += 1
+
+ # advance y location
+ ybp += 1
+ if ybp == yindex.lenbuf[yblock]:
+ yblock += 1
+ ybp = 0
+
+ return out, out_index, xfill < yfill
+
+
+cdef inline tuple int_op_lt_float64(ndarray x_, IntIndex xindex,
+ float64_t xfill,
+ ndarray y_, IntIndex yindex,
+ float64_t yfill):
+ cdef:
+ IntIndex out_index
+ Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices
+ int32_t xloc, yloc
+ ndarray[int32_t, ndim=1] xindices, yindices, out_indices
+ ndarray[float64_t, ndim=1] x, y
+ ndarray[uint8_t, ndim=1] out
+
+ # suppress Cython compiler warnings due to inlining
+ x = x_
+ y = y_
+
+ # need to do this first to know size of result array
+ out_index = xindex.make_union(yindex)
+ out = np.empty(out_index.npoints, dtype=np.uint8)
+
+ xindices = xindex.indices
+ yindices = yindex.indices
+ out_indices = out_index.indices
+
+ # walk the two SparseVectors, adding matched locations...
+ for out_i in range(out_index.npoints):
+ if xi == xindex.npoints:
+ # use x fill value
+ out[out_i] = xfill < y[yi]
+ yi += 1
+ continue
+
+ if yi == yindex.npoints:
+ # use y fill value
+ out[out_i] = x[xi] < yfill
+ xi += 1
+ continue
+
+ xloc = xindices[xi]
+ yloc = yindices[yi]
+
+ # each index in the out_index had to come from either x, y, or both
+ if xloc == yloc:
+ out[out_i] = x[xi] < y[yi]
+ xi += 1
+ yi += 1
+ elif xloc < yloc:
+ # use y fill value
+ out[out_i] = x[xi] < yfill
+ xi += 1
+ else:
+ # use x fill value
+ out[out_i] = xfill < y[yi]
+ yi += 1
+
+ return out, out_index, xfill < yfill
+
+
+cpdef sparse_lt_float64(ndarray[float64_t, ndim=1] x,
+ SparseIndex xindex, float64_t xfill,
+ ndarray[float64_t, ndim=1] y,
+ SparseIndex yindex, float64_t yfill):
+
+ if isinstance(xindex, BlockIndex):
+ return block_op_lt_float64(x, xindex.to_block_index(), xfill,
+ y, yindex.to_block_index(), yfill)
+ elif isinstance(xindex, IntIndex):
+ return int_op_lt_float64(x, xindex.to_int_index(), xfill,
+ y, yindex.to_int_index(), yfill)
+ else:
+ raise NotImplementedError
+
+
+cpdef sparse_fill_lt_float64(float64_t xfill,
+ float64_t yfill):
+ return xfill < yfill
+
+
+cdef inline tuple block_op_lt_int64(ndarray x_,
+ BlockIndex xindex,
+ int64_t xfill,
+ ndarray y_,
+ BlockIndex yindex,
+ int64_t yfill):
+ '''
+ Binary operator on BlockIndex objects with fill values
+ '''
+
+ cdef:
+ BlockIndex out_index
+ Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices
+ int32_t xbp = 0, ybp = 0 # block positions
+ int32_t xloc, yloc
+ Py_ssize_t xblock = 0, yblock = 0 # block numbers
+
+ ndarray[int64_t, ndim=1] x, y
+ ndarray[uint8_t, ndim=1] out
+
+ # to suppress Cython warning
+ x = x_
+ y = y_
+
+ out_index = xindex.make_union(yindex)
+ out = np.empty(out_index.npoints, dtype=np.uint8)
+
+ # Wow, what a hack job. Need to do something about this
+
+ # walk the two SparseVectors, adding matched locations...
+ for out_i in range(out_index.npoints):
+ if yblock == yindex.nblocks:
+ # use y fill value
+ out[out_i] = x[xi] < yfill
+ xi += 1
+
+ # advance x location
+ xbp += 1
+ if xbp == xindex.lenbuf[xblock]:
+ xblock += 1
+ xbp = 0
+ continue
+
+ if xblock == xindex.nblocks:
+ # use x fill value
+ out[out_i] = xfill < y[yi]
+ yi += 1
+
+ # advance y location
+ ybp += 1
+ if ybp == yindex.lenbuf[yblock]:
+ yblock += 1
+ ybp = 0
+ continue
+
+ yloc = yindex.locbuf[yblock] + ybp
+ xloc = xindex.locbuf[xblock] + xbp
+
+ # each index in the out_index had to come from either x, y, or both
+ if xloc == yloc:
+ out[out_i] = x[xi] < y[yi]
+ xi += 1
+ yi += 1
+
+ # advance both locations
+ xbp += 1
+ if xbp == xindex.lenbuf[xblock]:
+ xblock += 1
+ xbp = 0
+
+ ybp += 1
+ if ybp == yindex.lenbuf[yblock]:
+ yblock += 1
+ ybp = 0
+
+ elif xloc < yloc:
+ # use y fill value
+ out[out_i] = x[xi] < yfill
+ xi += 1
+
+ # advance x location
+ xbp += 1
+ if xbp == xindex.lenbuf[xblock]:
+ xblock += 1
+ xbp = 0
+ else:
+ # use x fill value
+ out[out_i] = xfill < y[yi]
+ yi += 1
+
+ # advance y location
+ ybp += 1
+ if ybp == yindex.lenbuf[yblock]:
+ yblock += 1
+ ybp = 0
+
+ return out, out_index, xfill < yfill
+
+
+cdef inline tuple int_op_lt_int64(ndarray x_, IntIndex xindex,
+ int64_t xfill,
+ ndarray y_, IntIndex yindex,
+ int64_t yfill):
+ cdef:
+ IntIndex out_index
+ Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices
+ int32_t xloc, yloc
+ ndarray[int32_t, ndim=1] xindices, yindices, out_indices
+ ndarray[int64_t, ndim=1] x, y
+ ndarray[uint8_t, ndim=1] out
+
+ # suppress Cython compiler warnings due to inlining
+ x = x_
+ y = y_
+
+ # need to do this first to know size of result array
+ out_index = xindex.make_union(yindex)
+ out = np.empty(out_index.npoints, dtype=np.uint8)
+
+ xindices = xindex.indices
+ yindices = yindex.indices
+ out_indices = out_index.indices
+
+ # walk the two SparseVectors, adding matched locations...
+ for out_i in range(out_index.npoints):
+ if xi == xindex.npoints:
+ # use x fill value
+ out[out_i] = xfill < y[yi]
+ yi += 1
+ continue
+
+ if yi == yindex.npoints:
+ # use y fill value
+ out[out_i] = x[xi] < yfill
+ xi += 1
+ continue
+
+ xloc = xindices[xi]
+ yloc = yindices[yi]
+
+ # each index in the out_index had to come from either x, y, or both
+ if xloc == yloc:
+ out[out_i] = x[xi] < y[yi]
+ xi += 1
+ yi += 1
+ elif xloc < yloc:
+ # use y fill value
+ out[out_i] = x[xi] < yfill
+ xi += 1
+ else:
+ # use x fill value
+ out[out_i] = xfill < y[yi]
+ yi += 1
+
+ return out, out_index, xfill < yfill
+
+
+cpdef sparse_lt_int64(ndarray[int64_t, ndim=1] x,
+ SparseIndex xindex, int64_t xfill,
+ ndarray[int64_t, ndim=1] y,
+ SparseIndex yindex, int64_t yfill):
+
+ if isinstance(xindex, BlockIndex):
+ return block_op_lt_int64(x, xindex.to_block_index(), xfill,
+ y, yindex.to_block_index(), yfill)
+ elif isinstance(xindex, IntIndex):
+ return int_op_lt_int64(x, xindex.to_int_index(), xfill,
+ y, yindex.to_int_index(), yfill)
+ else:
+ raise NotImplementedError
+
+
+cpdef sparse_fill_lt_int64(int64_t xfill,
+ int64_t yfill):
+ return xfill < yfill
+
+
+cdef inline tuple block_op_gt_float64(ndarray x_,
+ BlockIndex xindex,
+ float64_t xfill,
+ ndarray y_,
+ BlockIndex yindex,
+ float64_t yfill):
+ '''
+ Binary operator on BlockIndex objects with fill values
+ '''
+
+ cdef:
+ BlockIndex out_index
+ Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices
+ int32_t xbp = 0, ybp = 0 # block positions
+ int32_t xloc, yloc
+ Py_ssize_t xblock = 0, yblock = 0 # block numbers
+
+ ndarray[float64_t, ndim=1] x, y
+ ndarray[uint8_t, ndim=1] out
+
+ # to suppress Cython warning
+ x = x_
+ y = y_
+
+ out_index = xindex.make_union(yindex)
+ out = np.empty(out_index.npoints, dtype=np.uint8)
+
+ # Wow, what a hack job. Need to do something about this
+
+ # walk the two SparseVectors, adding matched locations...
+ for out_i in range(out_index.npoints):
+ if yblock == yindex.nblocks:
+ # use y fill value
+ out[out_i] = x[xi] > yfill
+ xi += 1
+
+ # advance x location
+ xbp += 1
+ if xbp == xindex.lenbuf[xblock]:
+ xblock += 1
+ xbp = 0
+ continue
+
+ if xblock == xindex.nblocks:
+ # use x fill value
+ out[out_i] = xfill > y[yi]
+ yi += 1
+
+ # advance y location
+ ybp += 1
+ if ybp == yindex.lenbuf[yblock]:
+ yblock += 1
+ ybp = 0
+ continue
+
+ yloc = yindex.locbuf[yblock] + ybp
+ xloc = xindex.locbuf[xblock] + xbp
+
+ # each index in the out_index had to come from either x, y, or both
+ if xloc == yloc:
+ out[out_i] = x[xi] > y[yi]
+ xi += 1
+ yi += 1
+
+ # advance both locations
+ xbp += 1
+ if xbp == xindex.lenbuf[xblock]:
+ xblock += 1
+ xbp = 0
+
+ ybp += 1
+ if ybp == yindex.lenbuf[yblock]:
+ yblock += 1
+ ybp = 0
+
+ elif xloc < yloc:
+ # use y fill value
+ out[out_i] = x[xi] > yfill
+ xi += 1
+
+ # advance x location
+ xbp += 1
+ if xbp == xindex.lenbuf[xblock]:
+ xblock += 1
+ xbp = 0
+ else:
+ # use x fill value
+ out[out_i] = xfill > y[yi]
+ yi += 1
+
+ # advance y location
+ ybp += 1
+ if ybp == yindex.lenbuf[yblock]:
+ yblock += 1
+ ybp = 0
+
+ return out, out_index, xfill > yfill
+
+
+cdef inline tuple int_op_gt_float64(ndarray x_, IntIndex xindex,
+ float64_t xfill,
+ ndarray y_, IntIndex yindex,
+ float64_t yfill):
+ cdef:
+ IntIndex out_index
+ Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices
+ int32_t xloc, yloc
+ ndarray[int32_t, ndim=1] xindices, yindices, out_indices
+ ndarray[float64_t, ndim=1] x, y
+ ndarray[uint8_t, ndim=1] out
+
+ # suppress Cython compiler warnings due to inlining
+ x = x_
+ y = y_
+
+ # need to do this first to know size of result array
+ out_index = xindex.make_union(yindex)
+ out = np.empty(out_index.npoints, dtype=np.uint8)
+
+ xindices = xindex.indices
+ yindices = yindex.indices
+ out_indices = out_index.indices
+
+ # walk the two SparseVectors, adding matched locations...
+ for out_i in range(out_index.npoints):
+ if xi == xindex.npoints:
+ # use x fill value
+ out[out_i] = xfill > y[yi]
+ yi += 1
+ continue
+
+ if yi == yindex.npoints:
+ # use y fill value
+ out[out_i] = x[xi] > yfill
+ xi += 1
+ continue
+
+ xloc = xindices[xi]
+ yloc = yindices[yi]
+
+ # each index in the out_index had to come from either x, y, or both
+ if xloc == yloc:
+ out[out_i] = x[xi] > y[yi]
+ xi += 1
+ yi += 1
+ elif xloc < yloc:
+ # use y fill value
+ out[out_i] = x[xi] > yfill
+ xi += 1
+ else:
+ # use x fill value
+ out[out_i] = xfill > y[yi]
+ yi += 1
+
+ return out, out_index, xfill > yfill
+
+
+cpdef sparse_gt_float64(ndarray[float64_t, ndim=1] x,
+ SparseIndex xindex, float64_t xfill,
+ ndarray[float64_t, ndim=1] y,
+ SparseIndex yindex, float64_t yfill):
+
+ if isinstance(xindex, BlockIndex):
+ return block_op_gt_float64(x, xindex.to_block_index(), xfill,
+ y, yindex.to_block_index(), yfill)
+ elif isinstance(xindex, IntIndex):
+ return int_op_gt_float64(x, xindex.to_int_index(), xfill,
+ y, yindex.to_int_index(), yfill)
+ else:
+ raise NotImplementedError
+
+
+cpdef sparse_fill_gt_float64(float64_t xfill,
+ float64_t yfill):
+ return xfill > yfill
+
+
+cdef inline tuple block_op_gt_int64(ndarray x_,
+ BlockIndex xindex,
+ int64_t xfill,
+ ndarray y_,
+ BlockIndex yindex,
+ int64_t yfill):
+ '''
+ Binary operator on BlockIndex objects with fill values
+ '''
+
+ cdef:
+ BlockIndex out_index
+ Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices
+ int32_t xbp = 0, ybp = 0 # block positions
+ int32_t xloc, yloc
+ Py_ssize_t xblock = 0, yblock = 0 # block numbers
+
+ ndarray[int64_t, ndim=1] x, y
+ ndarray[uint8_t, ndim=1] out
+
+ # to suppress Cython warning
+ x = x_
+ y = y_
+
+ out_index = xindex.make_union(yindex)
+ out = np.empty(out_index.npoints, dtype=np.uint8)
+
+ # Wow, what a hack job. Need to do something about this
+
+ # walk the two SparseVectors, adding matched locations...
+ for out_i in range(out_index.npoints):
+ if yblock == yindex.nblocks:
+ # use y fill value
+ out[out_i] = x[xi] > yfill
+ xi += 1
+
+ # advance x location
+ xbp += 1
+ if xbp == xindex.lenbuf[xblock]:
+ xblock += 1
+ xbp = 0
+ continue
+
+ if xblock == xindex.nblocks:
+ # use x fill value
+ out[out_i] = xfill > y[yi]
+ yi += 1
+
+ # advance y location
+ ybp += 1
+ if ybp == yindex.lenbuf[yblock]:
+ yblock += 1
+ ybp = 0
+ continue
+
+ yloc = yindex.locbuf[yblock] + ybp
+ xloc = xindex.locbuf[xblock] + xbp
+
+ # each index in the out_index had to come from either x, y, or both
+ if xloc == yloc:
+ out[out_i] = x[xi] > y[yi]
+ xi += 1
+ yi += 1
+
+ # advance both locations
+ xbp += 1
+ if xbp == xindex.lenbuf[xblock]:
+ xblock += 1
+ xbp = 0
+
+ ybp += 1
+ if ybp == yindex.lenbuf[yblock]:
+ yblock += 1
+ ybp = 0
+
+ elif xloc < yloc:
+ # use y fill value
+ out[out_i] = x[xi] > yfill
+ xi += 1
+
+ # advance x location
+ xbp += 1
+ if xbp == xindex.lenbuf[xblock]:
+ xblock += 1
+ xbp = 0
+ else:
+ # use x fill value
+ out[out_i] = xfill > y[yi]
+ yi += 1
+
+ # advance y location
+ ybp += 1
+ if ybp == yindex.lenbuf[yblock]:
+ yblock += 1
+ ybp = 0
+
+ return out, out_index, xfill > yfill
+
+
+cdef inline tuple int_op_gt_int64(ndarray x_, IntIndex xindex,
+ int64_t xfill,
+ ndarray y_, IntIndex yindex,
+ int64_t yfill):
+ cdef:
+ IntIndex out_index
+ Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices
+ int32_t xloc, yloc
+ ndarray[int32_t, ndim=1] xindices, yindices, out_indices
+ ndarray[int64_t, ndim=1] x, y
+ ndarray[uint8_t, ndim=1] out
+
+ # suppress Cython compiler warnings due to inlining
+ x = x_
+ y = y_
+
+ # need to do this first to know size of result array
+ out_index = xindex.make_union(yindex)
+ out = np.empty(out_index.npoints, dtype=np.uint8)
+
+ xindices = xindex.indices
+ yindices = yindex.indices
+ out_indices = out_index.indices
+
+ # walk the two SparseVectors, adding matched locations...
+ for out_i in range(out_index.npoints):
+ if xi == xindex.npoints:
+ # use x fill value
+ out[out_i] = xfill > y[yi]
+ yi += 1
+ continue
+
+ if yi == yindex.npoints:
+ # use y fill value
+ out[out_i] = x[xi] > yfill
+ xi += 1
+ continue
+
+ xloc = xindices[xi]
+ yloc = yindices[yi]
+
+ # each index in the out_index had to come from either x, y, or both
+ if xloc == yloc:
+ out[out_i] = x[xi] > y[yi]
+ xi += 1
+ yi += 1
+ elif xloc < yloc:
+ # use y fill value
+ out[out_i] = x[xi] > yfill
+ xi += 1
+ else:
+ # use x fill value
+ out[out_i] = xfill > y[yi]
+ yi += 1
+
+ return out, out_index, xfill > yfill
+
+
+cpdef sparse_gt_int64(ndarray[int64_t, ndim=1] x,
+ SparseIndex xindex, int64_t xfill,
+ ndarray[int64_t, ndim=1] y,
+ SparseIndex yindex, int64_t yfill):
+
+ if isinstance(xindex, BlockIndex):
+ return block_op_gt_int64(x, xindex.to_block_index(), xfill,
+ y, yindex.to_block_index(), yfill)
+ elif isinstance(xindex, IntIndex):
+ return int_op_gt_int64(x, xindex.to_int_index(), xfill,
+ y, yindex.to_int_index(), yfill)
+ else:
+ raise NotImplementedError
+
+
+cpdef sparse_fill_gt_int64(int64_t xfill,
+ int64_t yfill):
+ return xfill > yfill
+
+
+cdef inline tuple block_op_le_float64(ndarray x_,
+ BlockIndex xindex,
+ float64_t xfill,
+ ndarray y_,
+ BlockIndex yindex,
+ float64_t yfill):
+ '''
+ Binary operator on BlockIndex objects with fill values
+ '''
+
+ cdef:
+ BlockIndex out_index
+ Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices
+ int32_t xbp = 0, ybp = 0 # block positions
+ int32_t xloc, yloc
+ Py_ssize_t xblock = 0, yblock = 0 # block numbers
+
+ ndarray[float64_t, ndim=1] x, y
+ ndarray[uint8_t, ndim=1] out
+
+ # to suppress Cython warning
+ x = x_
+ y = y_
+
+ out_index = xindex.make_union(yindex)
+ out = np.empty(out_index.npoints, dtype=np.uint8)
+
+ # Wow, what a hack job. Need to do something about this
+
+ # walk the two SparseVectors, adding matched locations...
+ for out_i in range(out_index.npoints):
+ if yblock == yindex.nblocks:
+ # use y fill value
+ out[out_i] = x[xi] <= yfill
+ xi += 1
+
+ # advance x location
+ xbp += 1
+ if xbp == xindex.lenbuf[xblock]:
+ xblock += 1
+ xbp = 0
+ continue
+
+ if xblock == xindex.nblocks:
+ # use x fill value
+ out[out_i] = xfill <= y[yi]
+ yi += 1
+
+ # advance y location
+ ybp += 1
+ if ybp == yindex.lenbuf[yblock]:
+ yblock += 1
+ ybp = 0
+ continue
+
+ yloc = yindex.locbuf[yblock] + ybp
+ xloc = xindex.locbuf[xblock] + xbp
+
+ # each index in the out_index had to come from either x, y, or both
+ if xloc == yloc:
+ out[out_i] = x[xi] <= y[yi]
+ xi += 1
+ yi += 1
+
+ # advance both locations
+ xbp += 1
+ if xbp == xindex.lenbuf[xblock]:
+ xblock += 1
+ xbp = 0
+
+ ybp += 1
+ if ybp == yindex.lenbuf[yblock]:
+ yblock += 1
+ ybp = 0
+
+ elif xloc < yloc:
+ # use y fill value
+ out[out_i] = x[xi] <= yfill
+ xi += 1
+
+ # advance x location
+ xbp += 1
+ if xbp == xindex.lenbuf[xblock]:
+ xblock += 1
+ xbp = 0
+ else:
+ # use x fill value
+ out[out_i] = xfill <= y[yi]
+ yi += 1
+
+ # advance y location
+ ybp += 1
+ if ybp == yindex.lenbuf[yblock]:
+ yblock += 1
+ ybp = 0
+
+ return out, out_index, xfill <= yfill
+
+
+cdef inline tuple int_op_le_float64(ndarray x_, IntIndex xindex,
+ float64_t xfill,
+ ndarray y_, IntIndex yindex,
+ float64_t yfill):
+ cdef:
+ IntIndex out_index
+ Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices
+ int32_t xloc, yloc
+ ndarray[int32_t, ndim=1] xindices, yindices, out_indices
+ ndarray[float64_t, ndim=1] x, y
+ ndarray[uint8_t, ndim=1] out
+
+ # suppress Cython compiler warnings due to inlining
+ x = x_
+ y = y_
+
+ # need to do this first to know size of result array
+ out_index = xindex.make_union(yindex)
+ out = np.empty(out_index.npoints, dtype=np.uint8)
+
+ xindices = xindex.indices
+ yindices = yindex.indices
+ out_indices = out_index.indices
+
+ # walk the two SparseVectors, adding matched locations...
+ for out_i in range(out_index.npoints):
+ if xi == xindex.npoints:
+ # use x fill value
+ out[out_i] = xfill <= y[yi]
+ yi += 1
+ continue
+
+ if yi == yindex.npoints:
+ # use y fill value
+ out[out_i] = x[xi] <= yfill
+ xi += 1
+ continue
+
+ xloc = xindices[xi]
+ yloc = yindices[yi]
+
+ # each index in the out_index had to come from either x, y, or both
+ if xloc == yloc:
+ out[out_i] = x[xi] <= y[yi]
+ xi += 1
+ yi += 1
+ elif xloc < yloc:
+ # use y fill value
+ out[out_i] = x[xi] <= yfill
+ xi += 1
+ else:
+ # use x fill value
+ out[out_i] = xfill <= y[yi]
+ yi += 1
+
+ return out, out_index, xfill <= yfill
+
+
+cpdef sparse_le_float64(ndarray[float64_t, ndim=1] x,
+ SparseIndex xindex, float64_t xfill,
+ ndarray[float64_t, ndim=1] y,
+ SparseIndex yindex, float64_t yfill):
+
+ if isinstance(xindex, BlockIndex):
+ return block_op_le_float64(x, xindex.to_block_index(), xfill,
+ y, yindex.to_block_index(), yfill)
+ elif isinstance(xindex, IntIndex):
+ return int_op_le_float64(x, xindex.to_int_index(), xfill,
+ y, yindex.to_int_index(), yfill)
+ else:
+ raise NotImplementedError
+
+
+cpdef sparse_fill_le_float64(float64_t xfill,
+ float64_t yfill):
+ return xfill <= yfill
+
+
+cdef inline tuple block_op_le_int64(ndarray x_,
+ BlockIndex xindex,
+ int64_t xfill,
+ ndarray y_,
+ BlockIndex yindex,
+ int64_t yfill):
+ '''
+ Binary operator on BlockIndex objects with fill values
+ '''
+
+ cdef:
+ BlockIndex out_index
+ Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices
+ int32_t xbp = 0, ybp = 0 # block positions
+ int32_t xloc, yloc
+ Py_ssize_t xblock = 0, yblock = 0 # block numbers
+
+ ndarray[int64_t, ndim=1] x, y
+ ndarray[uint8_t, ndim=1] out
+
+ # to suppress Cython warning
+ x = x_
+ y = y_
+
+ out_index = xindex.make_union(yindex)
+ out = np.empty(out_index.npoints, dtype=np.uint8)
+
+ # Wow, what a hack job. Need to do something about this
+
+ # walk the two SparseVectors, adding matched locations...
+ for out_i in range(out_index.npoints):
+ if yblock == yindex.nblocks:
+ # use y fill value
+ out[out_i] = x[xi] <= yfill
+ xi += 1
+
+ # advance x location
+ xbp += 1
+ if xbp == xindex.lenbuf[xblock]:
+ xblock += 1
+ xbp = 0
+ continue
+
+ if xblock == xindex.nblocks:
+ # use x fill value
+ out[out_i] = xfill <= y[yi]
+ yi += 1
+
+ # advance y location
+ ybp += 1
+ if ybp == yindex.lenbuf[yblock]:
+ yblock += 1
+ ybp = 0
+ continue
+
+ yloc = yindex.locbuf[yblock] + ybp
+ xloc = xindex.locbuf[xblock] + xbp
+
+ # each index in the out_index had to come from either x, y, or both
+ if xloc == yloc:
+ out[out_i] = x[xi] <= y[yi]
+ xi += 1
+ yi += 1
+
+ # advance both locations
+ xbp += 1
+ if xbp == xindex.lenbuf[xblock]:
+ xblock += 1
+ xbp = 0
+
+ ybp += 1
+ if ybp == yindex.lenbuf[yblock]:
+ yblock += 1
+ ybp = 0
+
+ elif xloc < yloc:
+ # use y fill value
+ out[out_i] = x[xi] <= yfill
+ xi += 1
+
+ # advance x location
+ xbp += 1
+ if xbp == xindex.lenbuf[xblock]:
+ xblock += 1
+ xbp = 0
+ else:
+ # use x fill value
+ out[out_i] = xfill <= y[yi]
+ yi += 1
+
+ # advance y location
+ ybp += 1
+ if ybp == yindex.lenbuf[yblock]:
+ yblock += 1
+ ybp = 0
+
+ return out, out_index, xfill <= yfill
+
+
+cdef inline tuple int_op_le_int64(ndarray x_, IntIndex xindex,
+ int64_t xfill,
+ ndarray y_, IntIndex yindex,
+ int64_t yfill):
+ cdef:
+ IntIndex out_index
+ Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices
+ int32_t xloc, yloc
+ ndarray[int32_t, ndim=1] xindices, yindices, out_indices
+ ndarray[int64_t, ndim=1] x, y
+ ndarray[uint8_t, ndim=1] out
+
+ # suppress Cython compiler warnings due to inlining
+ x = x_
+ y = y_
+
+ # need to do this first to know size of result array
+ out_index = xindex.make_union(yindex)
+ out = np.empty(out_index.npoints, dtype=np.uint8)
+
+ xindices = xindex.indices
+ yindices = yindex.indices
+ out_indices = out_index.indices
+
+ # walk the two SparseVectors, adding matched locations...
+ for out_i in range(out_index.npoints):
+ if xi == xindex.npoints:
+ # use x fill value
+ out[out_i] = xfill <= y[yi]
+ yi += 1
+ continue
+
+ if yi == yindex.npoints:
+ # use y fill value
+ out[out_i] = x[xi] <= yfill
+ xi += 1
+ continue
+
+ xloc = xindices[xi]
+ yloc = yindices[yi]
+
+ # each index in the out_index had to come from either x, y, or both
+ if xloc == yloc:
+ out[out_i] = x[xi] <= y[yi]
+ xi += 1
+ yi += 1
+ elif xloc < yloc:
+ # use y fill value
+ out[out_i] = x[xi] <= yfill
+ xi += 1
+ else:
+ # use x fill value
+ out[out_i] = xfill <= y[yi]
+ yi += 1
+
+ return out, out_index, xfill <= yfill
+
+
+cpdef sparse_le_int64(ndarray[int64_t, ndim=1] x,
+ SparseIndex xindex, int64_t xfill,
+ ndarray[int64_t, ndim=1] y,
+ SparseIndex yindex, int64_t yfill):
+
+ if isinstance(xindex, BlockIndex):
+ return block_op_le_int64(x, xindex.to_block_index(), xfill,
+ y, yindex.to_block_index(), yfill)
+ elif isinstance(xindex, IntIndex):
+ return int_op_le_int64(x, xindex.to_int_index(), xfill,
+ y, yindex.to_int_index(), yfill)
+ else:
+ raise NotImplementedError
+
+
+cpdef sparse_fill_le_int64(int64_t xfill,
+ int64_t yfill):
+ return xfill <= yfill
+
+
+cdef inline tuple block_op_ge_float64(ndarray x_,
+ BlockIndex xindex,
+ float64_t xfill,
+ ndarray y_,
+ BlockIndex yindex,
+ float64_t yfill):
+ '''
+ Binary operator on BlockIndex objects with fill values
+ '''
+
+ cdef:
+ BlockIndex out_index
+ Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices
+ int32_t xbp = 0, ybp = 0 # block positions
+ int32_t xloc, yloc
+ Py_ssize_t xblock = 0, yblock = 0 # block numbers
+
+ ndarray[float64_t, ndim=1] x, y
+ ndarray[uint8_t, ndim=1] out
+
+ # to suppress Cython warning
+ x = x_
+ y = y_
+
+ out_index = xindex.make_union(yindex)
+ out = np.empty(out_index.npoints, dtype=np.uint8)
+
+ # Wow, what a hack job. Need to do something about this
+
+ # walk the two SparseVectors, adding matched locations...
+ for out_i in range(out_index.npoints):
+ if yblock == yindex.nblocks:
+ # use y fill value
+ out[out_i] = x[xi] >= yfill
+ xi += 1
+
+ # advance x location
+ xbp += 1
+ if xbp == xindex.lenbuf[xblock]:
+ xblock += 1
+ xbp = 0
+ continue
+
+ if xblock == xindex.nblocks:
+ # use x fill value
+ out[out_i] = xfill >= y[yi]
+ yi += 1
+
+ # advance y location
+ ybp += 1
+ if ybp == yindex.lenbuf[yblock]:
+ yblock += 1
+ ybp = 0
+ continue
+
+ yloc = yindex.locbuf[yblock] + ybp
+ xloc = xindex.locbuf[xblock] + xbp
+
+ # each index in the out_index had to come from either x, y, or both
+ if xloc == yloc:
+ out[out_i] = x[xi] >= y[yi]
+ xi += 1
+ yi += 1
+
+ # advance both locations
+ xbp += 1
+ if xbp == xindex.lenbuf[xblock]:
+ xblock += 1
+ xbp = 0
+
+ ybp += 1
+ if ybp == yindex.lenbuf[yblock]:
+ yblock += 1
+ ybp = 0
+
+ elif xloc < yloc:
+ # use y fill value
+ out[out_i] = x[xi] >= yfill
+ xi += 1
+
+ # advance x location
+ xbp += 1
+ if xbp == xindex.lenbuf[xblock]:
+ xblock += 1
+ xbp = 0
+ else:
+ # use x fill value
+ out[out_i] = xfill >= y[yi]
+ yi += 1
+
+ # advance y location
+ ybp += 1
+ if ybp == yindex.lenbuf[yblock]:
+ yblock += 1
+ ybp = 0
+
+ return out, out_index, xfill >= yfill
+
+
+cdef inline tuple int_op_ge_float64(ndarray x_, IntIndex xindex,
+ float64_t xfill,
+ ndarray y_, IntIndex yindex,
+ float64_t yfill):
+ cdef:
+ IntIndex out_index
+ Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices
+ int32_t xloc, yloc
+ ndarray[int32_t, ndim=1] xindices, yindices, out_indices
+ ndarray[float64_t, ndim=1] x, y
+ ndarray[uint8_t, ndim=1] out
+
+ # suppress Cython compiler warnings due to inlining
+ x = x_
+ y = y_
+
+ # need to do this first to know size of result array
+ out_index = xindex.make_union(yindex)
+ out = np.empty(out_index.npoints, dtype=np.uint8)
+
+ xindices = xindex.indices
+ yindices = yindex.indices
+ out_indices = out_index.indices
+
+ # walk the two SparseVectors, adding matched locations...
+ for out_i in range(out_index.npoints):
+ if xi == xindex.npoints:
+ # use x fill value
+ out[out_i] = xfill >= y[yi]
+ yi += 1
+ continue
+
+ if yi == yindex.npoints:
+ # use y fill value
+ out[out_i] = x[xi] >= yfill
+ xi += 1
+ continue
+
+ xloc = xindices[xi]
+ yloc = yindices[yi]
+
+ # each index in the out_index had to come from either x, y, or both
+ if xloc == yloc:
+ out[out_i] = x[xi] >= y[yi]
+ xi += 1
+ yi += 1
+ elif xloc < yloc:
+ # use y fill value
+ out[out_i] = x[xi] >= yfill
+ xi += 1
+ else:
+ # use x fill value
+ out[out_i] = xfill >= y[yi]
+ yi += 1
+
+ return out, out_index, xfill >= yfill
+
+
+cpdef sparse_ge_float64(ndarray[float64_t, ndim=1] x,
+ SparseIndex xindex, float64_t xfill,
+ ndarray[float64_t, ndim=1] y,
+ SparseIndex yindex, float64_t yfill):
+
+ if isinstance(xindex, BlockIndex):
+ return block_op_ge_float64(x, xindex.to_block_index(), xfill,
+ y, yindex.to_block_index(), yfill)
+ elif isinstance(xindex, IntIndex):
+ return int_op_ge_float64(x, xindex.to_int_index(), xfill,
+ y, yindex.to_int_index(), yfill)
+ else:
+ raise NotImplementedError
+
+
+cpdef sparse_fill_ge_float64(float64_t xfill,
+ float64_t yfill):
+ return xfill >= yfill
+
+
+cdef inline tuple block_op_ge_int64(ndarray x_,
+ BlockIndex xindex,
+ int64_t xfill,
+ ndarray y_,
+ BlockIndex yindex,
+ int64_t yfill):
+ '''
+ Binary operator on BlockIndex objects with fill values
+ '''
+
+ cdef:
+ BlockIndex out_index
+ Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices
+ int32_t xbp = 0, ybp = 0 # block positions
+ int32_t xloc, yloc
+ Py_ssize_t xblock = 0, yblock = 0 # block numbers
+
+ ndarray[int64_t, ndim=1] x, y
+ ndarray[uint8_t, ndim=1] out
+
+ # to suppress Cython warning
+ x = x_
+ y = y_
+
+ out_index = xindex.make_union(yindex)
+ out = np.empty(out_index.npoints, dtype=np.uint8)
+
+ # Wow, what a hack job. Need to do something about this
+
+ # walk the two SparseVectors, adding matched locations...
+ for out_i in range(out_index.npoints):
+ if yblock == yindex.nblocks:
+ # use y fill value
+ out[out_i] = x[xi] >= yfill
+ xi += 1
+
+ # advance x location
+ xbp += 1
+ if xbp == xindex.lenbuf[xblock]:
+ xblock += 1
+ xbp = 0
+ continue
+
+ if xblock == xindex.nblocks:
+ # use x fill value
+ out[out_i] = xfill >= y[yi]
+ yi += 1
+
+ # advance y location
+ ybp += 1
+ if ybp == yindex.lenbuf[yblock]:
+ yblock += 1
+ ybp = 0
+ continue
+
+ yloc = yindex.locbuf[yblock] + ybp
+ xloc = xindex.locbuf[xblock] + xbp
+
+ # each index in the out_index had to come from either x, y, or both
+ if xloc == yloc:
+ out[out_i] = x[xi] >= y[yi]
+ xi += 1
+ yi += 1
+
+ # advance both locations
+ xbp += 1
+ if xbp == xindex.lenbuf[xblock]:
+ xblock += 1
+ xbp = 0
+
+ ybp += 1
+ if ybp == yindex.lenbuf[yblock]:
+ yblock += 1
+ ybp = 0
+
+ elif xloc < yloc:
+ # use y fill value
+ out[out_i] = x[xi] >= yfill
+ xi += 1
+
+ # advance x location
+ xbp += 1
+ if xbp == xindex.lenbuf[xblock]:
+ xblock += 1
+ xbp = 0
+ else:
+ # use x fill value
+ out[out_i] = xfill >= y[yi]
+ yi += 1
+
+ # advance y location
+ ybp += 1
+ if ybp == yindex.lenbuf[yblock]:
+ yblock += 1
+ ybp = 0
+
+ return out, out_index, xfill >= yfill
+
+
+cdef inline tuple int_op_ge_int64(ndarray x_, IntIndex xindex,
+ int64_t xfill,
+ ndarray y_, IntIndex yindex,
+ int64_t yfill):
+ cdef:
+ IntIndex out_index
+ Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices
+ int32_t xloc, yloc
+ ndarray[int32_t, ndim=1] xindices, yindices, out_indices
+ ndarray[int64_t, ndim=1] x, y
+ ndarray[uint8_t, ndim=1] out
+
+ # suppress Cython compiler warnings due to inlining
+ x = x_
+ y = y_
+
+ # need to do this first to know size of result array
+ out_index = xindex.make_union(yindex)
+ out = np.empty(out_index.npoints, dtype=np.uint8)
+
+ xindices = xindex.indices
+ yindices = yindex.indices
+ out_indices = out_index.indices
+
+ # walk the two SparseVectors, adding matched locations...
+ for out_i in range(out_index.npoints):
+ if xi == xindex.npoints:
+ # use x fill value
+ out[out_i] = xfill >= y[yi]
+ yi += 1
+ continue
+
+ if yi == yindex.npoints:
+ # use y fill value
+ out[out_i] = x[xi] >= yfill
+ xi += 1
+ continue
+
+ xloc = xindices[xi]
+ yloc = yindices[yi]
+
+ # each index in the out_index had to come from either x, y, or both
+ if xloc == yloc:
+ out[out_i] = x[xi] >= y[yi]
+ xi += 1
+ yi += 1
+ elif xloc < yloc:
+ # use y fill value
+ out[out_i] = x[xi] >= yfill
+ xi += 1
+ else:
+ # use x fill value
+ out[out_i] = xfill >= y[yi]
+ yi += 1
+
+ return out, out_index, xfill >= yfill
+
+
+cpdef sparse_ge_int64(ndarray[int64_t, ndim=1] x,
+ SparseIndex xindex, int64_t xfill,
+ ndarray[int64_t, ndim=1] y,
+ SparseIndex yindex, int64_t yfill):
+
+ if isinstance(xindex, BlockIndex):
+ return block_op_ge_int64(x, xindex.to_block_index(), xfill,
+ y, yindex.to_block_index(), yfill)
+ elif isinstance(xindex, IntIndex):
+ return int_op_ge_int64(x, xindex.to_int_index(), xfill,
+ y, yindex.to_int_index(), yfill)
+ else:
+ raise NotImplementedError
+
+
+cpdef sparse_fill_ge_int64(int64_t xfill,
+ int64_t yfill):
+ return xfill >= yfill
+
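+# The remaining helpers (and_* / or_*) combine values with the bitwise
+# operators & and |, so they are generated only for the int64 and uint8
+# (mask-like) dtypes rather than for float64.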
+
+cdef inline tuple block_op_and_int64(ndarray x_,
+ BlockIndex xindex,
+ int64_t xfill,
+ ndarray y_,
+ BlockIndex yindex,
+ int64_t yfill):
+ '''
+ Binary operator on BlockIndex objects with fill values
+ '''
+
+ cdef:
+ BlockIndex out_index
+ Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices
+ int32_t xbp = 0, ybp = 0 # block positions
+ int32_t xloc, yloc
+ Py_ssize_t xblock = 0, yblock = 0 # block numbers
+
+ ndarray[int64_t, ndim=1] x, y
+ ndarray[uint8_t, ndim=1] out
+
+ # to suppress Cython warning
+ x = x_
+ y = y_
+
+ out_index = xindex.make_union(yindex)
+ out = np.empty(out_index.npoints, dtype=np.uint8)
+
+ # Wow, what a hack job. Need to do something about this
+
+ # walk the two SparseVectors, adding matched locations...
+ for out_i in range(out_index.npoints):
+ if yblock == yindex.nblocks:
+ # use y fill value
+ out[out_i] = x[xi] & yfill
+ xi += 1
+
+ # advance x location
+ xbp += 1
+ if xbp == xindex.lenbuf[xblock]:
+ xblock += 1
+ xbp = 0
+ continue
+
+ if xblock == xindex.nblocks:
+ # use x fill value
+ out[out_i] = xfill & y[yi]
+ yi += 1
+
+ # advance y location
+ ybp += 1
+ if ybp == yindex.lenbuf[yblock]:
+ yblock += 1
+ ybp = 0
+ continue
+
+ yloc = yindex.locbuf[yblock] + ybp
+ xloc = xindex.locbuf[xblock] + xbp
+
+ # each index in the out_index had to come from either x, y, or both
+ if xloc == yloc:
+ out[out_i] = x[xi] & y[yi]
+ xi += 1
+ yi += 1
+
+ # advance both locations
+ xbp += 1
+ if xbp == xindex.lenbuf[xblock]:
+ xblock += 1
+ xbp = 0
+
+ ybp += 1
+ if ybp == yindex.lenbuf[yblock]:
+ yblock += 1
+ ybp = 0
+
+ elif xloc < yloc:
+ # use y fill value
+ out[out_i] = x[xi] & yfill
+ xi += 1
+
+ # advance x location
+ xbp += 1
+ if xbp == xindex.lenbuf[xblock]:
+ xblock += 1
+ xbp = 0
+ else:
+ # use x fill value
+ out[out_i] = xfill & y[yi]
+ yi += 1
+
+ # advance y location
+ ybp += 1
+ if ybp == yindex.lenbuf[yblock]:
+ yblock += 1
+ ybp = 0
+
+ return out, out_index, xfill & yfill
+
+
+cdef inline tuple int_op_and_int64(ndarray x_, IntIndex xindex,
+ int64_t xfill,
+ ndarray y_, IntIndex yindex,
+ int64_t yfill):
+ cdef:
+ IntIndex out_index
+ Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices
+ int32_t xloc, yloc
+ ndarray[int32_t, ndim=1] xindices, yindices, out_indices
+ ndarray[int64_t, ndim=1] x, y
+ ndarray[uint8_t, ndim=1] out
+
+ # suppress Cython compiler warnings due to inlining
+ x = x_
+ y = y_
+
+ # need to do this first to know size of result array
+ out_index = xindex.make_union(yindex)
+ out = np.empty(out_index.npoints, dtype=np.uint8)
+
+ xindices = xindex.indices
+ yindices = yindex.indices
+ out_indices = out_index.indices
+
+ # walk the two SparseVectors, adding matched locations...
+ for out_i in range(out_index.npoints):
+ if xi == xindex.npoints:
+ # use x fill value
+ out[out_i] = xfill & y[yi]
+ yi += 1
+ continue
+
+ if yi == yindex.npoints:
+ # use y fill value
+ out[out_i] = x[xi] & yfill
+ xi += 1
+ continue
+
+ xloc = xindices[xi]
+ yloc = yindices[yi]
+
+ # each index in the out_index had to come from either x, y, or both
+ if xloc == yloc:
+ out[out_i] = x[xi] & y[yi]
+ xi += 1
+ yi += 1
+ elif xloc < yloc:
+ # use y fill value
+ out[out_i] = x[xi] & yfill
+ xi += 1
+ else:
+ # use x fill value
+ out[out_i] = xfill & y[yi]
+ yi += 1
+
+ return out, out_index, xfill & yfill
+
+
+cpdef sparse_and_int64(ndarray[int64_t, ndim=1] x,
+ SparseIndex xindex, int64_t xfill,
+ ndarray[int64_t, ndim=1] y,
+ SparseIndex yindex, int64_t yfill):
+
+ if isinstance(xindex, BlockIndex):
+ return block_op_and_int64(x, xindex.to_block_index(), xfill,
+ y, yindex.to_block_index(), yfill)
+ elif isinstance(xindex, IntIndex):
+ return int_op_and_int64(x, xindex.to_int_index(), xfill,
+ y, yindex.to_int_index(), yfill)
+ else:
+ raise NotImplementedError
+
+
+cpdef sparse_fill_and_int64(int64_t xfill,
+ int64_t yfill):
+ return xfill & yfill
+
+
+cdef inline tuple block_op_and_uint8(ndarray x_,
+ BlockIndex xindex,
+ uint8_t xfill,
+ ndarray y_,
+ BlockIndex yindex,
+ uint8_t yfill):
+ '''
+ Binary operator on BlockIndex objects with fill values
+ '''
+
+ cdef:
+ BlockIndex out_index
+ Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices
+ int32_t xbp = 0, ybp = 0 # block positions
+ int32_t xloc, yloc
+ Py_ssize_t xblock = 0, yblock = 0 # block numbers
+
+ ndarray[uint8_t, ndim=1] x, y
+ ndarray[uint8_t, ndim=1] out
+
+ # to suppress Cython warning
+ x = x_
+ y = y_
+
+ out_index = xindex.make_union(yindex)
+ out = np.empty(out_index.npoints, dtype=np.uint8)
+
+ # Wow, what a hack job. Need to do something about this
+
+ # walk the two SparseVectors, adding matched locations...
+ for out_i in range(out_index.npoints):
+ if yblock == yindex.nblocks:
+ # use y fill value
+ out[out_i] = x[xi] & yfill
+ xi += 1
+
+ # advance x location
+ xbp += 1
+ if xbp == xindex.lenbuf[xblock]:
+ xblock += 1
+ xbp = 0
+ continue
+
+ if xblock == xindex.nblocks:
+ # use x fill value
+ out[out_i] = xfill & y[yi]
+ yi += 1
+
+ # advance y location
+ ybp += 1
+ if ybp == yindex.lenbuf[yblock]:
+ yblock += 1
+ ybp = 0
+ continue
+
+ yloc = yindex.locbuf[yblock] + ybp
+ xloc = xindex.locbuf[xblock] + xbp
+
+ # each index in the out_index had to come from either x, y, or both
+ if xloc == yloc:
+ out[out_i] = x[xi] & y[yi]
+ xi += 1
+ yi += 1
+
+ # advance both locations
+ xbp += 1
+ if xbp == xindex.lenbuf[xblock]:
+ xblock += 1
+ xbp = 0
+
+ ybp += 1
+ if ybp == yindex.lenbuf[yblock]:
+ yblock += 1
+ ybp = 0
+
+ elif xloc < yloc:
+ # use y fill value
+ out[out_i] = x[xi] & yfill
+ xi += 1
+
+ # advance x location
+ xbp += 1
+ if xbp == xindex.lenbuf[xblock]:
+ xblock += 1
+ xbp = 0
+ else:
+ # use x fill value
+ out[out_i] = xfill & y[yi]
+ yi += 1
+
+ # advance y location
+ ybp += 1
+ if ybp == yindex.lenbuf[yblock]:
+ yblock += 1
+ ybp = 0
+
+ return out, out_index, xfill & yfill
+
+
+cdef inline tuple int_op_and_uint8(ndarray x_, IntIndex xindex,
+ uint8_t xfill,
+ ndarray y_, IntIndex yindex,
+ uint8_t yfill):
+ cdef:
+ IntIndex out_index
+ Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices
+ int32_t xloc, yloc
+ ndarray[int32_t, ndim=1] xindices, yindices, out_indices
+ ndarray[uint8_t, ndim=1] x, y
+ ndarray[uint8_t, ndim=1] out
+
+ # suppress Cython compiler warnings due to inlining
+ x = x_
+ y = y_
+
+ # need to do this first to know size of result array
+ out_index = xindex.make_union(yindex)
+ out = np.empty(out_index.npoints, dtype=np.uint8)
+
+ xindices = xindex.indices
+ yindices = yindex.indices
+ out_indices = out_index.indices
+
+ # walk the two SparseVectors, adding matched locations...
+ for out_i in range(out_index.npoints):
+ if xi == xindex.npoints:
+ # use x fill value
+ out[out_i] = xfill & y[yi]
+ yi += 1
+ continue
+
+ if yi == yindex.npoints:
+ # use y fill value
+ out[out_i] = x[xi] & yfill
+ xi += 1
+ continue
+
+ xloc = xindices[xi]
+ yloc = yindices[yi]
+
+ # each index in the out_index had to come from either x, y, or both
+ if xloc == yloc:
+ out[out_i] = x[xi] & y[yi]
+ xi += 1
+ yi += 1
+ elif xloc < yloc:
+ # use y fill value
+ out[out_i] = x[xi] & yfill
+ xi += 1
+ else:
+ # use x fill value
+ out[out_i] = xfill & y[yi]
+ yi += 1
+
+ return out, out_index, xfill & yfill
+
+
+cpdef sparse_and_uint8(ndarray[uint8_t, ndim=1] x,
+ SparseIndex xindex, uint8_t xfill,
+ ndarray[uint8_t, ndim=1] y,
+ SparseIndex yindex, uint8_t yfill):
+
+ if isinstance(xindex, BlockIndex):
+ return block_op_and_uint8(x, xindex.to_block_index(), xfill,
+ y, yindex.to_block_index(), yfill)
+ elif isinstance(xindex, IntIndex):
+ return int_op_and_uint8(x, xindex.to_int_index(), xfill,
+ y, yindex.to_int_index(), yfill)
+ else:
+ raise NotImplementedError
+
+
+cpdef sparse_fill_and_uint8(uint8_t xfill,
+ uint8_t yfill):
+ return xfill & yfill
+
+
+cdef inline tuple block_op_or_int64(ndarray x_,
+ BlockIndex xindex,
+ int64_t xfill,
+ ndarray y_,
+ BlockIndex yindex,
+ int64_t yfill):
+ '''
+ Binary operator on BlockIndex objects with fill values
+ '''
+
+ cdef:
+ BlockIndex out_index
+ Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices
+ int32_t xbp = 0, ybp = 0 # block positions
+ int32_t xloc, yloc
+ Py_ssize_t xblock = 0, yblock = 0 # block numbers
+
+ ndarray[int64_t, ndim=1] x, y
+ ndarray[uint8_t, ndim=1] out
+
+ # to suppress Cython warning
+ x = x_
+ y = y_
+
+ out_index = xindex.make_union(yindex)
+ out = np.empty(out_index.npoints, dtype=np.uint8)
+
+ # Wow, what a hack job. Need to do something about this
+
+ # walk the two SparseVectors, adding matched locations...
+ for out_i in range(out_index.npoints):
+ if yblock == yindex.nblocks:
+ # use y fill value
+ out[out_i] = x[xi] | yfill
+ xi += 1
+
+ # advance x location
+ xbp += 1
+ if xbp == xindex.lenbuf[xblock]:
+ xblock += 1
+ xbp = 0
+ continue
+
+ if xblock == xindex.nblocks:
+ # use x fill value
+ out[out_i] = xfill | y[yi]
+ yi += 1
+
+ # advance y location
+ ybp += 1
+ if ybp == yindex.lenbuf[yblock]:
+ yblock += 1
+ ybp = 0
+ continue
+
+ yloc = yindex.locbuf[yblock] + ybp
+ xloc = xindex.locbuf[xblock] + xbp
+
+ # each index in the out_index had to come from either x, y, or both
+ if xloc == yloc:
+ out[out_i] = x[xi] | y[yi]
+ xi += 1
+ yi += 1
+
+ # advance both locations
+ xbp += 1
+ if xbp == xindex.lenbuf[xblock]:
+ xblock += 1
+ xbp = 0
+
+ ybp += 1
+ if ybp == yindex.lenbuf[yblock]:
+ yblock += 1
+ ybp = 0
+
+ elif xloc < yloc:
+ # use y fill value
+ out[out_i] = x[xi] | yfill
+ xi += 1
+
+ # advance x location
+ xbp += 1
+ if xbp == xindex.lenbuf[xblock]:
+ xblock += 1
+ xbp = 0
+ else:
+ # use x fill value
+ out[out_i] = xfill | y[yi]
+ yi += 1
+
+ # advance y location
+ ybp += 1
+ if ybp == yindex.lenbuf[yblock]:
+ yblock += 1
+ ybp = 0
+
+ return out, out_index, xfill | yfill
+
+
+cdef inline tuple int_op_or_int64(ndarray x_, IntIndex xindex,
+ int64_t xfill,
+ ndarray y_, IntIndex yindex,
+ int64_t yfill):
+ cdef:
+ IntIndex out_index
+ Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices
+ int32_t xloc, yloc
+ ndarray[int32_t, ndim=1] xindices, yindices, out_indices
+ ndarray[int64_t, ndim=1] x, y
+ ndarray[uint8_t, ndim=1] out
+
+ # suppress Cython compiler warnings due to inlining
+ x = x_
+ y = y_
+
+ # need to do this first to know size of result array
+ out_index = xindex.make_union(yindex)
+ out = np.empty(out_index.npoints, dtype=np.uint8)
+
+ xindices = xindex.indices
+ yindices = yindex.indices
+ out_indices = out_index.indices
+
+ # walk the two SparseVectors, adding matched locations...
+ for out_i in range(out_index.npoints):
+ if xi == xindex.npoints:
+ # use x fill value
+ out[out_i] = xfill | y[yi]
+ yi += 1
+ continue
+
+ if yi == yindex.npoints:
+ # use y fill value
+ out[out_i] = x[xi] | yfill
+ xi += 1
+ continue
+
+ xloc = xindices[xi]
+ yloc = yindices[yi]
+
+ # each index in the out_index had to come from either x, y, or both
+ if xloc == yloc:
+ out[out_i] = x[xi] | y[yi]
+ xi += 1
+ yi += 1
+ elif xloc < yloc:
+ # use y fill value
+ out[out_i] = x[xi] | yfill
+ xi += 1
+ else:
+ # use x fill value
+ out[out_i] = xfill | y[yi]
+ yi += 1
+
+ return out, out_index, xfill | yfill
+
+
+cpdef sparse_or_int64(ndarray[int64_t, ndim=1] x,
+ SparseIndex xindex, int64_t xfill,
+ ndarray[int64_t, ndim=1] y,
+ SparseIndex yindex, int64_t yfill):
+
+ if isinstance(xindex, BlockIndex):
+ return block_op_or_int64(x, xindex.to_block_index(), xfill,
+ y, yindex.to_block_index(), yfill)
+ elif isinstance(xindex, IntIndex):
+ return int_op_or_int64(x, xindex.to_int_index(), xfill,
+ y, yindex.to_int_index(), yfill)
+ else:
+ raise NotImplementedError
+
+
+cpdef sparse_fill_or_int64(int64_t xfill,
+ int64_t yfill):
+ return xfill | yfill
+
+
+cdef inline tuple block_op_or_uint8(ndarray x_,
+ BlockIndex xindex,
+ uint8_t xfill,
+ ndarray y_,
+ BlockIndex yindex,
+ uint8_t yfill):
+ '''
+ Binary operator on BlockIndex objects with fill values
+ '''
+
+ cdef:
+ BlockIndex out_index
+ Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices
+ int32_t xbp = 0, ybp = 0 # block positions
+ int32_t xloc, yloc
+ Py_ssize_t xblock = 0, yblock = 0 # block numbers
+
+ ndarray[uint8_t, ndim=1] x, y
+ ndarray[uint8_t, ndim=1] out
+
+ # to suppress Cython warning
+ x = x_
+ y = y_
+
+ out_index = xindex.make_union(yindex)
+ out = np.empty(out_index.npoints, dtype=np.uint8)
+
+ # Wow, what a hack job. Need to do something about this
+
+ # walk the two SparseVectors, adding matched locations...
+ for out_i in range(out_index.npoints):
+ if yblock == yindex.nblocks:
+ # use y fill value
+ out[out_i] = x[xi] | yfill
+ xi += 1
+
+ # advance x location
+ xbp += 1
+ if xbp == xindex.lenbuf[xblock]:
+ xblock += 1
+ xbp = 0
+ continue
+
+ if xblock == xindex.nblocks:
+ # use x fill value
+ out[out_i] = xfill | y[yi]
+ yi += 1
+
+ # advance y location
+ ybp += 1
+ if ybp == yindex.lenbuf[yblock]:
+ yblock += 1
+ ybp = 0
+ continue
+
+ yloc = yindex.locbuf[yblock] + ybp
+ xloc = xindex.locbuf[xblock] + xbp
+
+ # each index in the out_index had to come from either x, y, or both
+ if xloc == yloc:
+ out[out_i] = x[xi] | y[yi]
+ xi += 1
+ yi += 1
+
+ # advance both locations
+ xbp += 1
+ if xbp == xindex.lenbuf[xblock]:
+ xblock += 1
+ xbp = 0
+
+ ybp += 1
+ if ybp == yindex.lenbuf[yblock]:
+ yblock += 1
+ ybp = 0
+
+ elif xloc < yloc:
+ # use y fill value
+ out[out_i] = x[xi] | yfill
+ xi += 1
+
+ # advance x location
+ xbp += 1
+ if xbp == xindex.lenbuf[xblock]:
+ xblock += 1
+ xbp = 0
+ else:
+ # use x fill value
+ out[out_i] = xfill | y[yi]
+ yi += 1
+
+ # advance y location
+ ybp += 1
+ if ybp == yindex.lenbuf[yblock]:
+ yblock += 1
+ ybp = 0
+
+ return out, out_index, xfill | yfill
+
+
+cdef inline tuple int_op_or_uint8(ndarray x_, IntIndex xindex,
+ uint8_t xfill,
+ ndarray y_, IntIndex yindex,
+ uint8_t yfill):
+ cdef:
+ IntIndex out_index
+ Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices
+ int32_t xloc, yloc
+ ndarray[int32_t, ndim=1] xindices, yindices, out_indices
+ ndarray[uint8_t, ndim=1] x, y
+ ndarray[uint8_t, ndim=1] out
+
+ # suppress Cython compiler warnings due to inlining
+ x = x_
+ y = y_
+
+ # need to do this first to know size of result array
+ out_index = xindex.make_union(yindex)
+ out = np.empty(out_index.npoints, dtype=np.uint8)
+
+ xindices = xindex.indices
+ yindices = yindex.indices
+ out_indices = out_index.indices
+
+ # walk the two SparseVectors, adding matched locations...
+ for out_i in range(out_index.npoints):
+ if xi == xindex.npoints:
+ # use x fill value
+ out[out_i] = xfill | y[yi]
+ yi += 1
+ continue
+
+ if yi == yindex.npoints:
+ # use y fill value
+ out[out_i] = x[xi] | yfill
+ xi += 1
+ continue
+
+ xloc = xindices[xi]
+ yloc = yindices[yi]
+
+ # each index in the out_index had to come from either x, y, or both
+ if xloc == yloc:
+ out[out_i] = x[xi] | y[yi]
+ xi += 1
+ yi += 1
+ elif xloc < yloc:
+ # use y fill value
+ out[out_i] = x[xi] | yfill
+ xi += 1
+ else:
+ # use x fill value
+ out[out_i] = xfill | y[yi]
+ yi += 1
+
+ return out, out_index, xfill | yfill
+
+
+cpdef sparse_or_uint8(ndarray[uint8_t, ndim=1] x,
+ SparseIndex xindex, uint8_t xfill,
+ ndarray[uint8_t, ndim=1] y,
+ SparseIndex yindex, uint8_t yfill):
+
+ if isinstance(xindex, BlockIndex):
+ return block_op_or_uint8(x, xindex.to_block_index(), xfill,
+ y, yindex.to_block_index(), yfill)
+ elif isinstance(xindex, IntIndex):
+ return int_op_or_uint8(x, xindex.to_int_index(), xfill,
+ y, yindex.to_int_index(), yfill)
+ else:
+ raise NotImplementedError
+
+
+cpdef sparse_fill_or_uint8(uint8_t xfill,
+ uint8_t yfill):
+ return xfill | yfill
diff --git a/contrib/python/pandas/py2/pandas/_libs/src/compat_helper.h b/contrib/python/pandas/py2/pandas/_libs/src/compat_helper.h
new file mode 100644
index 00000000000..462f53392ad
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/_libs/src/compat_helper.h
@@ -0,0 +1,50 @@
+/*
+Copyright (c) 2016, PyData Development Team
+All rights reserved.
+
+Distributed under the terms of the BSD Simplified License.
+
+The full license is in the LICENSE file, distributed with this software.
+*/
+
+#ifndef PANDAS__LIBS_SRC_COMPAT_HELPER_H_
+#define PANDAS__LIBS_SRC_COMPAT_HELPER_H_
+
+#include "Python.h"
+#include "inline_helper.h"
+
+/*
+PySlice_GetIndicesEx changed its signature in Python 3,
+and 3.6.1 in particular changed the behavior of this function slightly:
+https://bugs.python.org/issue27867
+
+
+In 3.6.1 PySlice_GetIndicesEx was changed to a macro,
+inadvertently breaking ABI compatibility. For now, undef
+the macro, which restores compatibility.
+https://github.com/pandas-dev/pandas/issues/15961
+https://bugs.python.org/issue29943
+*/
+
+#ifndef PYPY_VERSION
+# if PY_VERSION_HEX < 0x03070000 && defined(PySlice_GetIndicesEx)
+# undef PySlice_GetIndicesEx
+# endif
+#endif
+
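+/*
+slice_get_indices gives callers a single signature across Python versions:
+it forwards to PySlice_GetIndicesEx, which extracts and clips the slice's
+start/stop/step for a sequence of the given length and stores the number of
+selected elements in *slicelength; it returns 0 on success and -1 (with an
+exception set) on error.
+*/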
+PANDAS_INLINE int slice_get_indices(PyObject *s,
+ Py_ssize_t length,
+ Py_ssize_t *start,
+ Py_ssize_t *stop,
+ Py_ssize_t *step,
+ Py_ssize_t *slicelength) {
+#if PY_VERSION_HEX >= 0x03000000
+ return PySlice_GetIndicesEx(s, length, start, stop,
+ step, slicelength);
+#else
+ return PySlice_GetIndicesEx((PySliceObject *)s, length, start,
+ stop, step, slicelength);
+#endif
+}
+
+#endif // PANDAS__LIBS_SRC_COMPAT_HELPER_H_
diff --git a/contrib/python/pandas/py2/pandas/_libs/src/headers/cmath b/contrib/python/pandas/py2/pandas/_libs/src/headers/cmath
new file mode 100644
index 00000000000..632e1fc2390
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/_libs/src/headers/cmath
@@ -0,0 +1,36 @@
+#ifndef _PANDAS_MATH_H_
+#define _PANDAS_MATH_H_
+
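+// Note: std::notnan is not a standard <cmath> function; it is a small helper
+// defined here for every compiler branch (x == x is false only when x is
+// NaN), with MSVC-specific shims below where that identity is unreliable.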
+// MSVC 2017 has a bug where `x == x` can be true for NaNs.
+// MSC_VER from https://stackoverflow.com/a/70630/1889400
+// Place upper bound on this check once a fixed MSVC is released.
+#if defined(_MSC_VER) && (_MSC_VER < 1800)
+#include <cmath>
+// Older versions of Visual Studio did not define std::signbit;
+// this defines it using _copysign.
+namespace std {
+ __inline int isnan(double x) { return _isnan(x); }
+ __inline int signbit(double num) { return _copysign(1.0, num) < 0; }
+ __inline int notnan(double x) { return !isnan(x); }
+}
+#elif defined(_MSC_VER) && (_MSC_VER >= 1900)
+#include <cmath>
+namespace std {
+ __inline int isnan(double x) { return _isnan(x); }
+ __inline int notnan(double x) { return !isnan(x); }
+}
+#elif defined(_MSC_VER)
+#include <cmath>
+namespace std {
+ __inline int isnan(double x) { return _isnan(x); }
+ __inline int notnan(double x) { return x == x; }
+}
+#else
+#include <cmath>
+
+namespace std {
+ __inline int notnan(double x) { return x == x; }
+}
+
+#endif
+#endif
diff --git a/contrib/python/pandas/py2/pandas/_libs/src/headers/ms_stdint.h b/contrib/python/pandas/py2/pandas/_libs/src/headers/ms_stdint.h
new file mode 100644
index 00000000000..c66fbb817c0
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/_libs/src/headers/ms_stdint.h
@@ -0,0 +1,247 @@
+// ISO C9x compliant stdint.h for Microsoft Visual Studio
+// Based on ISO/IEC 9899:TC2 Committee draft (May 6, 2005) WG14/N1124
+//
+// Copyright (c) 2006-2008 Alexander Chemeris
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+//
+// 1. Redistributions of source code must retain the above copyright notice,
+// this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. The name of the author may be used to endorse or promote products
+// derived from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+// MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
+// EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+// OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+// WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
+// OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
+// ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+///////////////////////////////////////////////////////////////////////////////
+
+#ifndef _MSC_VER // [
+#error "Use this header only with Microsoft Visual C++ compilers!"
+#endif // _MSC_VER ]
+
+#ifndef _MSC_STDINT_H_ // [
+#define _MSC_STDINT_H_
+
+#if _MSC_VER > 1000
+#pragma once
+#endif
+
+#include <limits.h>
+
+// For Visual Studio 6 in C++ mode and for many Visual Studio versions when
+// compiling for ARM we should wrap <wchar.h> include with 'extern "C++" {}'
+// or compiler give many errors like this:
+// error C2733: second C linkage of overloaded function 'wmemchr' not allowed
+#ifdef __cplusplus
+extern "C" {
+#endif
+# include <wchar.h>
+#ifdef __cplusplus
+}
+#endif
+
+// Define _W64 macros to mark types changing their size, like intptr_t.
+#ifndef _W64
+# if !defined(__midl) && (defined(_X86_) || defined(_M_IX86)) && _MSC_VER >= 1300
+# define _W64 __w64
+# else
+# define _W64
+# endif
+#endif
+
+
+// 7.18.1 Integer types
+
+// 7.18.1.1 Exact-width integer types
+
+// Visual Studio 6 and Embedded Visual C++ 4 doesn't
+// realize that, e.g. char has the same size as __int8
+// so we give up on __intX for them.
+#if (_MSC_VER < 1300)
+ typedef signed char int8_t;
+ typedef signed short int16_t;
+ typedef signed int int32_t;
+ typedef unsigned char uint8_t;
+ typedef unsigned short uint16_t;
+ typedef unsigned int uint32_t;
+#else
+ typedef signed __int8 int8_t;
+ typedef signed __int16 int16_t;
+ typedef signed __int32 int32_t;
+ typedef unsigned __int8 uint8_t;
+ typedef unsigned __int16 uint16_t;
+ typedef unsigned __int32 uint32_t;
+#endif
+typedef signed __int64 int64_t;
+typedef unsigned __int64 uint64_t;
+
+
+// 7.18.1.2 Minimum-width integer types
+typedef int8_t int_least8_t;
+typedef int16_t int_least16_t;
+typedef int32_t int_least32_t;
+typedef int64_t int_least64_t;
+typedef uint8_t uint_least8_t;
+typedef uint16_t uint_least16_t;
+typedef uint32_t uint_least32_t;
+typedef uint64_t uint_least64_t;
+
+// 7.18.1.3 Fastest minimum-width integer types
+typedef int8_t int_fast8_t;
+typedef int16_t int_fast16_t;
+typedef int32_t int_fast32_t;
+typedef int64_t int_fast64_t;
+typedef uint8_t uint_fast8_t;
+typedef uint16_t uint_fast16_t;
+typedef uint32_t uint_fast32_t;
+typedef uint64_t uint_fast64_t;
+
+// 7.18.1.4 Integer types capable of holding object pointers
+#ifdef _WIN64 // [
+ typedef signed __int64 intptr_t;
+ typedef unsigned __int64 uintptr_t;
+#else // _WIN64 ][
+ typedef _W64 signed int intptr_t;
+ typedef _W64 unsigned int uintptr_t;
+#endif // _WIN64 ]
+
+// 7.18.1.5 Greatest-width integer types
+typedef int64_t intmax_t;
+typedef uint64_t uintmax_t;
+
+
+// 7.18.2 Limits of specified-width integer types
+
+#if !defined(__cplusplus) || defined(__STDC_LIMIT_MACROS) // [ See footnote 220 at page 257 and footnote 221 at page 259
+
+// 7.18.2.1 Limits of exact-width integer types
+#define INT8_MIN ((int8_t)_I8_MIN)
+#define INT8_MAX _I8_MAX
+#define INT16_MIN ((int16_t)_I16_MIN)
+#define INT16_MAX _I16_MAX
+#define INT32_MIN ((int32_t)_I32_MIN)
+#define INT32_MAX _I32_MAX
+#define INT64_MIN ((int64_t)_I64_MIN)
+#define INT64_MAX _I64_MAX
+#define UINT8_MAX _UI8_MAX
+#define UINT16_MAX _UI16_MAX
+#define UINT32_MAX _UI32_MAX
+#define UINT64_MAX _UI64_MAX
+
+// 7.18.2.2 Limits of minimum-width integer types
+#define INT_LEAST8_MIN INT8_MIN
+#define INT_LEAST8_MAX INT8_MAX
+#define INT_LEAST16_MIN INT16_MIN
+#define INT_LEAST16_MAX INT16_MAX
+#define INT_LEAST32_MIN INT32_MIN
+#define INT_LEAST32_MAX INT32_MAX
+#define INT_LEAST64_MIN INT64_MIN
+#define INT_LEAST64_MAX INT64_MAX
+#define UINT_LEAST8_MAX UINT8_MAX
+#define UINT_LEAST16_MAX UINT16_MAX
+#define UINT_LEAST32_MAX UINT32_MAX
+#define UINT_LEAST64_MAX UINT64_MAX
+
+// 7.18.2.3 Limits of fastest minimum-width integer types
+#define INT_FAST8_MIN INT8_MIN
+#define INT_FAST8_MAX INT8_MAX
+#define INT_FAST16_MIN INT16_MIN
+#define INT_FAST16_MAX INT16_MAX
+#define INT_FAST32_MIN INT32_MIN
+#define INT_FAST32_MAX INT32_MAX
+#define INT_FAST64_MIN INT64_MIN
+#define INT_FAST64_MAX INT64_MAX
+#define UINT_FAST8_MAX UINT8_MAX
+#define UINT_FAST16_MAX UINT16_MAX
+#define UINT_FAST32_MAX UINT32_MAX
+#define UINT_FAST64_MAX UINT64_MAX
+
+// 7.18.2.4 Limits of integer types capable of holding object pointers
+#ifdef _WIN64 // [
+# define INTPTR_MIN INT64_MIN
+# define INTPTR_MAX INT64_MAX
+# define UINTPTR_MAX UINT64_MAX
+#else // _WIN64 ][
+# define INTPTR_MIN INT32_MIN
+# define INTPTR_MAX INT32_MAX
+# define UINTPTR_MAX UINT32_MAX
+#endif // _WIN64 ]
+
+// 7.18.2.5 Limits of greatest-width integer types
+#define INTMAX_MIN INT64_MIN
+#define INTMAX_MAX INT64_MAX
+#define UINTMAX_MAX UINT64_MAX
+
+// 7.18.3 Limits of other integer types
+
+#ifdef _WIN64 // [
+# define PTRDIFF_MIN _I64_MIN
+# define PTRDIFF_MAX _I64_MAX
+#else // _WIN64 ][
+# define PTRDIFF_MIN _I32_MIN
+# define PTRDIFF_MAX _I32_MAX
+#endif // _WIN64 ]
+
+#define SIG_ATOMIC_MIN INT_MIN
+#define SIG_ATOMIC_MAX INT_MAX
+
+#ifndef SIZE_MAX // [
+# ifdef _WIN64 // [
+# define SIZE_MAX _UI64_MAX
+# else // _WIN64 ][
+# define SIZE_MAX _UI32_MAX
+# endif // _WIN64 ]
+#endif // SIZE_MAX ]
+
+// WCHAR_MIN and WCHAR_MAX are also defined in <wchar.h>
+#ifndef WCHAR_MIN // [
+# define WCHAR_MIN 0
+#endif // WCHAR_MIN ]
+#ifndef WCHAR_MAX // [
+# define WCHAR_MAX _UI16_MAX
+#endif // WCHAR_MAX ]
+
+#define WINT_MIN 0
+#define WINT_MAX _UI16_MAX
+
+#endif // __STDC_LIMIT_MACROS ]
+
+
+// 7.18.4 Macros for integer constants
+
+#if !defined(__cplusplus) || defined(__STDC_CONSTANT_MACROS) // [ See footnote 224 at page 260
+
+// 7.18.4.1 Macros for minimum-width integer constants
+
+#define INT8_C(val) val##i8
+#define INT16_C(val) val##i16
+#define INT32_C(val) val##i32
+#define INT64_C(val) val##i64
+
+#define UINT8_C(val) val##ui8
+#define UINT16_C(val) val##ui16
+#define UINT32_C(val) val##ui32
+#define UINT64_C(val) val##ui64
+
+// 7.18.4.2 Macros for greatest-width integer constants
+#define INTMAX_C INT64_C
+#define UINTMAX_C UINT64_C
+
+#endif // __STDC_CONSTANT_MACROS ]
+
+
+#endif // _MSC_STDINT_H_ ]
diff --git a/contrib/python/pandas/py2/pandas/_libs/src/headers/portable.h b/contrib/python/pandas/py2/pandas/_libs/src/headers/portable.h
new file mode 100644
index 00000000000..1976addace3
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/_libs/src/headers/portable.h
@@ -0,0 +1,15 @@
+#ifndef _PANDAS_PORTABLE_H_
+#define _PANDAS_PORTABLE_H_
+
+#if defined(_MSC_VER)
+#define strcasecmp( s1, s2 ) _stricmp( s1, s2 )
+#endif
+
+// GH-23516 - works around locale perf issues
+// from MUSL libc, MIT Licensed - see LICENSES
+#define isdigit_ascii(c) (((unsigned)(c) - '0') < 10u)
+#define isspace_ascii(c) (((c) == ' ') || (((unsigned)(c) - '\t') < 5))
+#define toupper_ascii(c) ((((unsigned)(c) - 'a') < 26) ? ((c) & 0x5f) : (c))
+#define tolower_ascii(c) ((((unsigned)(c) - 'A') < 26) ? ((c) | 0x20) : (c))
+
+#endif
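
The *_ascii macros above are locale-independent replacements for <ctype.h>: casting to unsigned and subtracting the range start turns a range test into a single unsigned comparison. A minimal standalone sketch (the macros are copied from the header above so the snippet compiles on its own):

#include <stdio.h>

#define isdigit_ascii(c) (((unsigned)(c) - '0') < 10u)
#define tolower_ascii(c) ((((unsigned)(c) - 'A') < 26) ? ((c) | 0x20) : (c))

int main(void) {
    printf("%d %d\n", isdigit_ascii('7'), isdigit_ascii('x'));   /* 1 0 */
    printf("%c %c\n", tolower_ascii('G'), tolower_ascii('g'));   /* g g */
    return 0;
}
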
diff --git a/contrib/python/pandas/py2/pandas/_libs/src/headers/stdint.h b/contrib/python/pandas/py2/pandas/_libs/src/headers/stdint.h
new file mode 100644
index 00000000000..8746bf132d0
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/_libs/src/headers/stdint.h
@@ -0,0 +1,10 @@
+#ifndef _PANDAS_STDINT_H_
+#define _PANDAS_STDINT_H_
+
+#if defined(_MSC_VER) && (_MSC_VER < 1900)
+#include "ms_stdint.h"
+#else
+#include <stdint.h>
+#endif
+
+#endif
diff --git a/contrib/python/pandas/py2/pandas/_libs/src/inline_helper.h b/contrib/python/pandas/py2/pandas/_libs/src/inline_helper.h
new file mode 100644
index 00000000000..397ec8e7b2c
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/_libs/src/inline_helper.h
@@ -0,0 +1,25 @@
+/*
+Copyright (c) 2016, PyData Development Team
+All rights reserved.
+
+Distributed under the terms of the BSD Simplified License.
+
+The full license is in the LICENSE file, distributed with this software.
+*/
+
+#ifndef PANDAS__LIBS_SRC_INLINE_HELPER_H_
+#define PANDAS__LIBS_SRC_INLINE_HELPER_H_
+
+#ifndef PANDAS_INLINE
+ #if defined(__GNUC__)
+ #define PANDAS_INLINE static __inline__
+ #elif defined(_MSC_VER)
+ #define PANDAS_INLINE static __inline
+ #elif defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L
+ #define PANDAS_INLINE static inline
+ #else
+ #define PANDAS_INLINE
+ #endif
+#endif
+
+#endif // PANDAS__LIBS_SRC_INLINE_HELPER_H_
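
PANDAS_INLINE picks the compiler-specific spelling of a static inline function, so small helpers can be defined in headers that are included from several translation units without multiple-definition errors. A hypothetical usage sketch (clamp_int is illustrative, not part of pandas; the include path assumes the layout above):

#include "inline_helper.h"

/* Hypothetical header-only helper: PANDAS_INLINE expands to a static
   inline form, so every includer gets its own private copy. */
PANDAS_INLINE int clamp_int(int v, int lo, int hi) {
    return v < lo ? lo : (v > hi ? hi : v);
}
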
diff --git a/contrib/python/pandas/py2/pandas/_libs/src/klib/khash.h b/contrib/python/pandas/py2/pandas/_libs/src/klib/khash.h
new file mode 100644
index 00000000000..77ec519cc24
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/_libs/src/klib/khash.h
@@ -0,0 +1,569 @@
+/* The MIT License
+
+ Copyright (c) 2008, 2009, 2011 by Attractive Chaos <[email protected]>
+
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice shall be
+ included in all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
+*/
+
+/*
+ An example:
+
+#include "khash.h"
+KHASH_MAP_INIT_INT(32, char)
+int main() {
+ int ret, is_missing;
+ khiter_t k;
+ khash_t(32) *h = kh_init(32);
+ k = kh_put(32, h, 5, &ret);
+ if (!ret) kh_del(32, h, k);
+ kh_value(h, k) = 10;
+ k = kh_get(32, h, 10);
+ is_missing = (k == kh_end(h));
+ k = kh_get(32, h, 5);
+ kh_del(32, h, k);
+ for (k = kh_begin(h); k != kh_end(h); ++k)
+ if (kh_exist(h, k)) kh_value(h, k) = 1;
+ kh_destroy(32, h);
+ return 0;
+}
+*/
+
+/*
+ 2011-09-16 (0.2.6):
+
+ * The capacity is a power of 2. This seems to dramatically improve the
+ speed for simple keys. Thank Zilong Tan for the suggestion. Reference:
+
+ - https://github.com/stefanocasazza/ULib
+ - http://nothings.org/computer/judy/
+
+ * Allow to optionally use linear probing which usually has better
+ performance for random input. Double hashing is still the default as it
+ is more robust to certain non-random input.
+
+ * Added Wang's integer hash function (not used by default). This hash
+ function is more robust to certain non-random input.
+
+ 2011-02-14 (0.2.5):
+
+ * Allow to declare global functions.
+
+ 2009-09-26 (0.2.4):
+
+ * Improve portability
+
+ 2008-09-19 (0.2.3):
+
+ * Corrected the example
+ * Improved interfaces
+
+ 2008-09-11 (0.2.2):
+
+ * Improved speed a little in kh_put()
+
+ 2008-09-10 (0.2.1):
+
+ * Added kh_clear()
+ * Fixed a compiling error
+
+ 2008-09-02 (0.2.0):
+
+ * Changed to token concatenation which increases flexibility.
+
+ 2008-08-31 (0.1.2):
+
+ * Fixed a bug in kh_get(), which has not been tested previously.
+
+ 2008-08-31 (0.1.1):
+
+ * Added destructor
+*/
+
+
+#ifndef __AC_KHASH_H
+#define __AC_KHASH_H
+
+/*!
+ @header
+
+ Generic hash table library.
+ */
+
+#define AC_VERSION_KHASH_H "0.2.6"
+
+#include <stdlib.h>
+#include <string.h>
+#include <limits.h>
+#include "../inline_helper.h"
+
+
+#if UINT_MAX == 0xffffffffu
+typedef unsigned int khint32_t;
+#elif ULONG_MAX == 0xffffffffu
+typedef unsigned long khint32_t;
+#endif
+
+#if ULONG_MAX == ULLONG_MAX
+typedef unsigned long khuint64_t;
+typedef signed long khint64_t;
+#else
+typedef unsigned long long khuint64_t;
+typedef signed long long khint64_t;
+#endif
+
+typedef double khfloat64_t;
+
+typedef khint32_t khint_t;
+typedef khint_t khiter_t;
+
+#define __ac_isempty(flag, i) ((flag[i>>5]>>(i&0x1fU))&1)
+#define __ac_isdel(flag, i) (0)
+#define __ac_iseither(flag, i) __ac_isempty(flag, i)
+#define __ac_set_isdel_false(flag, i) (0)
+#define __ac_set_isempty_false(flag, i) (flag[i>>5]&=~(1ul<<(i&0x1fU)))
+#define __ac_set_isempty_true(flag, i) (flag[i>>5]|=(1ul<<(i&0x1fU)))
+#define __ac_set_isboth_false(flag, i) __ac_set_isempty_false(flag, i)
+#define __ac_set_isdel_true(flag, i) ((void)0)
+
+#ifdef KHASH_LINEAR
+#define __ac_inc(k, m) 1
+#else
+#define __ac_inc(k, m) (((k)>>3 ^ (k)<<3) | 1) & (m)
+#endif
+
+#define __ac_fsize(m) ((m) < 32? 1 : (m)>>5)
+
+#ifndef kroundup32
+#define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x))
+#endif
+
+static const double __ac_HASH_UPPER = 0.77;
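
Three building blocks above deserve a gloss: kroundup32 rounds a value up to the next power of two by smearing the highest set bit to the right, __ac_inc derives an odd probe step (so double hashing over a power-of-two table eventually visits every bucket), and __ac_HASH_UPPER is the 0.77 load factor that triggers a resize. A small standalone sketch mirroring the first two macros:

#include <stdio.h>

#define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, \
                       (x)|=(x)>>8, (x)|=(x)>>16, ++(x))

int main(void) {
    unsigned v = 37;
    unsigned k = 12345u, mask, inc;

    kroundup32(v);                                 /* 37 -> 64 (next power of two) */
    mask = v - 1;                                  /* 63                           */
    inc = ((((k) >> 3) ^ ((k) << 3)) | 1) & mask;  /* forced odd, hence coprime    */
    printf("%u %u\n", v, inc);                     /* with the table size 64       */
    return 0;
}
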
+
+#define KHASH_DECLARE(name, khkey_t, khval_t) \
+ typedef struct { \
+ khint_t n_buckets, size, n_occupied, upper_bound; \
+ khint32_t *flags; \
+ khkey_t *keys; \
+ khval_t *vals; \
+ } kh_##name##_t; \
+ extern kh_##name##_t *kh_init_##name(); \
+ extern void kh_destroy_##name(kh_##name##_t *h); \
+ extern void kh_clear_##name(kh_##name##_t *h); \
+ extern khint_t kh_get_##name(const kh_##name##_t *h, khkey_t key); \
+ extern void kh_resize_##name(kh_##name##_t *h, khint_t new_n_buckets); \
+ extern khint_t kh_put_##name(kh_##name##_t *h, khkey_t key, int *ret); \
+ extern void kh_del_##name(kh_##name##_t *h, khint_t x);
+
+#define KHASH_INIT2(name, SCOPE, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) \
+ typedef struct { \
+ khint_t n_buckets, size, n_occupied, upper_bound; \
+ khint32_t *flags; \
+ khkey_t *keys; \
+ khval_t *vals; \
+ } kh_##name##_t; \
+ SCOPE kh_##name##_t *kh_init_##name(void) { \
+ return (kh_##name##_t*)calloc(1, sizeof(kh_##name##_t)); \
+ } \
+ SCOPE void kh_destroy_##name(kh_##name##_t *h) \
+ { \
+ if (h) { \
+ free(h->keys); free(h->flags); \
+ free(h->vals); \
+ free(h); \
+ } \
+ } \
+ SCOPE void kh_clear_##name(kh_##name##_t *h) \
+ { \
+ if (h && h->flags) { \
+ memset(h->flags, 0xaa, __ac_fsize(h->n_buckets) * sizeof(khint32_t)); \
+ h->size = h->n_occupied = 0; \
+ } \
+ } \
+ SCOPE khint_t kh_get_##name(const kh_##name##_t *h, khkey_t key) \
+ { \
+ if (h->n_buckets) { \
+ khint_t inc, k, i, last, mask; \
+ mask = h->n_buckets - 1; \
+ k = __hash_func(key); i = k & mask; \
+ inc = __ac_inc(k, mask); last = i; /* inc==1 for linear probing */ \
+ while (!__ac_isempty(h->flags, i) && (__ac_isdel(h->flags, i) || !__hash_equal(h->keys[i], key))) { \
+ i = (i + inc) & mask; \
+ if (i == last) return h->n_buckets; \
+ } \
+ return __ac_iseither(h->flags, i)? h->n_buckets : i; \
+ } else return 0; \
+ } \
+ SCOPE void kh_resize_##name(kh_##name##_t *h, khint_t new_n_buckets) \
+	{ /* This function uses 0.25*n_buckets bytes of working space instead of [sizeof(key_t+val_t)+.25]*n_buckets. */ \
+ khint32_t *new_flags = 0; \
+ khint_t j = 1; \
+ { \
+ kroundup32(new_n_buckets); \
+ if (new_n_buckets < 4) new_n_buckets = 4; \
+ if (h->size >= (khint_t)(new_n_buckets * __ac_HASH_UPPER + 0.5)) j = 0; /* requested size is too small */ \
+ else { /* hash table size to be changed (shrink or expand); rehash */ \
+ new_flags = (khint32_t*)malloc(__ac_fsize(new_n_buckets) * sizeof(khint32_t)); \
+ memset(new_flags, 0xff, __ac_fsize(new_n_buckets) * sizeof(khint32_t)); \
+ if (h->n_buckets < new_n_buckets) { /* expand */ \
+ h->keys = (khkey_t*)realloc(h->keys, new_n_buckets * sizeof(khkey_t)); \
+ if (kh_is_map) h->vals = (khval_t*)realloc(h->vals, new_n_buckets * sizeof(khval_t)); \
+ } /* otherwise shrink */ \
+ } \
+ } \
+ if (j) { /* rehashing is needed */ \
+ for (j = 0; j != h->n_buckets; ++j) { \
+ if (__ac_iseither(h->flags, j) == 0) { \
+ khkey_t key = h->keys[j]; \
+ khval_t val; \
+ khint_t new_mask; \
+ new_mask = new_n_buckets - 1; \
+ if (kh_is_map) val = h->vals[j]; \
+ __ac_set_isempty_true(h->flags, j); \
+ while (1) { /* kick-out process; sort of like in Cuckoo hashing */ \
+ khint_t inc, k, i; \
+ k = __hash_func(key); \
+ i = k & new_mask; \
+ inc = __ac_inc(k, new_mask); \
+ while (!__ac_isempty(new_flags, i)) i = (i + inc) & new_mask; \
+ __ac_set_isempty_false(new_flags, i); \
+ if (i < h->n_buckets && __ac_iseither(h->flags, i) == 0) { /* kick out the existing element */ \
+ { khkey_t tmp = h->keys[i]; h->keys[i] = key; key = tmp; } \
+ if (kh_is_map) { khval_t tmp = h->vals[i]; h->vals[i] = val; val = tmp; } \
+ __ac_set_isempty_true(h->flags, i); /* mark it as deleted in the old hash table */ \
+ } else { /* write the element and jump out of the loop */ \
+ h->keys[i] = key; \
+ if (kh_is_map) h->vals[i] = val; \
+ break; \
+ } \
+ } \
+ } \
+ } \
+ if (h->n_buckets > new_n_buckets) { /* shrink the hash table */ \
+ h->keys = (khkey_t*)realloc(h->keys, new_n_buckets * sizeof(khkey_t)); \
+ if (kh_is_map) h->vals = (khval_t*)realloc(h->vals, new_n_buckets * sizeof(khval_t)); \
+ } \
+ free(h->flags); /* free the working space */ \
+ h->flags = new_flags; \
+ h->n_buckets = new_n_buckets; \
+ h->n_occupied = h->size; \
+ h->upper_bound = (khint_t)(h->n_buckets * __ac_HASH_UPPER + 0.5); \
+ } \
+ } \
+ SCOPE khint_t kh_put_##name(kh_##name##_t *h, khkey_t key, int *ret) \
+ { \
+ khint_t x; \
+ if (h->n_occupied >= h->upper_bound) { /* update the hash table */ \
+ if (h->n_buckets > (h->size<<1)) kh_resize_##name(h, h->n_buckets - 1); /* clear "deleted" elements */ \
+ else kh_resize_##name(h, h->n_buckets + 1); /* expand the hash table */ \
+ } /* TODO: to implement automatically shrinking; resize() already support shrinking */ \
+ { \
+ khint_t inc, k, i, site, last, mask = h->n_buckets - 1; \
+ x = site = h->n_buckets; k = __hash_func(key); i = k & mask; \
+ if (__ac_isempty(h->flags, i)) x = i; /* for speed up */ \
+ else { \
+ inc = __ac_inc(k, mask); last = i; \
+ while (!__ac_isempty(h->flags, i) && (__ac_isdel(h->flags, i) || !__hash_equal(h->keys[i], key))) { \
+ if (__ac_isdel(h->flags, i)) site = i; \
+ i = (i + inc) & mask; \
+ if (i == last) { x = site; break; } \
+ } \
+ if (x == h->n_buckets) { \
+ if (__ac_isempty(h->flags, i) && site != h->n_buckets) x = site; \
+ else x = i; \
+ } \
+ } \
+ } \
+ if (__ac_isempty(h->flags, x)) { /* not present at all */ \
+ h->keys[x] = key; \
+ __ac_set_isboth_false(h->flags, x); \
+ ++h->size; ++h->n_occupied; \
+ *ret = 1; \
+ } else if (__ac_isdel(h->flags, x)) { /* deleted */ \
+ h->keys[x] = key; \
+ __ac_set_isboth_false(h->flags, x); \
+ ++h->size; \
+ *ret = 2; \
+ } else *ret = 0; /* Don't touch h->keys[x] if present and not deleted */ \
+ return x; \
+ } \
+ SCOPE void kh_del_##name(kh_##name##_t *h, khint_t x) \
+ { \
+ if (x != h->n_buckets && !__ac_iseither(h->flags, x)) { \
+ __ac_set_isdel_true(h->flags, x); \
+ --h->size; \
+ } \
+ }
+
+#define KHASH_INIT(name, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) \
+ KHASH_INIT2(name, PANDAS_INLINE, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal)
+
+/* --- BEGIN OF HASH FUNCTIONS --- */
+
+/*! @function
+ @abstract Integer hash function
+ @param key The integer [khint32_t]
+ @return The hash value [khint_t]
+ */
+#define kh_int_hash_func(key) (khint32_t)(key)
+/*! @function
+ @abstract Integer comparison function
+ */
+#define kh_int_hash_equal(a, b) ((a) == (b))
+/*! @function
+ @abstract 64-bit integer hash function
+ @param key The integer [khint64_t]
+ @return The hash value [khint_t]
+ */
+#define kh_int64_hash_func(key) (khint32_t)((key)>>33^(key)^(key)<<11)
+/*! @function
+ @abstract 64-bit integer comparison function
+ */
+#define kh_int64_hash_equal(a, b) ((a) == (b))
+
+/*! @function
+ @abstract const char* hash function
+ @param s Pointer to a null terminated string
+ @return The hash value
+ */
+PANDAS_INLINE khint_t __ac_X31_hash_string(const char *s)
+{
+ khint_t h = *s;
+ if (h) for (++s ; *s; ++s) h = (h << 5) - h + *s;
+ return h;
+}
+/*! @function
+ @abstract Another interface to const char* hash function
+ @param key Pointer to a null terminated string [const char*]
+ @return The hash value [khint_t]
+ */
+#define kh_str_hash_func(key) __ac_X31_hash_string(key)
+/*! @function
+ @abstract Const char* comparison function
+ */
+#define kh_str_hash_equal(a, b) (strcmp(a, b) == 0)
+
+PANDAS_INLINE khint_t __ac_Wang_hash(khint_t key)
+{
+ key += ~(key << 15);
+ key ^= (key >> 10);
+ key += (key << 3);
+ key ^= (key >> 6);
+ key += ~(key << 11);
+ key ^= (key >> 16);
+ return key;
+}
+#define kh_int_hash_func2(k) __ac_Wang_hash((khint_t)key)
+
+/* --- END OF HASH FUNCTIONS --- */
+
+/* Other convenient macros... */
+
+/*!
+ @abstract Type of the hash table.
+ @param name Name of the hash table [symbol]
+ */
+#define khash_t(name) kh_##name##_t
+
+/*! @function
+ @abstract Initiate a hash table.
+ @param name Name of the hash table [symbol]
+ @return Pointer to the hash table [khash_t(name)*]
+ */
+#define kh_init(name) kh_init_##name()
+
+/*! @function
+ @abstract Destroy a hash table.
+ @param name Name of the hash table [symbol]
+ @param h Pointer to the hash table [khash_t(name)*]
+ */
+#define kh_destroy(name, h) kh_destroy_##name(h)
+
+/*! @function
+ @abstract Reset a hash table without deallocating memory.
+ @param name Name of the hash table [symbol]
+ @param h Pointer to the hash table [khash_t(name)*]
+ */
+#define kh_clear(name, h) kh_clear_##name(h)
+
+/*! @function
+ @abstract Resize a hash table.
+ @param name Name of the hash table [symbol]
+ @param h Pointer to the hash table [khash_t(name)*]
+ @param s New size [khint_t]
+ */
+#define kh_resize(name, h, s) kh_resize_##name(h, s)
+
+/*! @function
+ @abstract Insert a key to the hash table.
+ @param name Name of the hash table [symbol]
+ @param h Pointer to the hash table [khash_t(name)*]
+ @param k Key [type of keys]
+ @param r Extra return code: 0 if the key is present in the hash table;
+ 1 if the bucket is empty (never used); 2 if the element in
+ the bucket has been deleted [int*]
+ @return Iterator to the inserted element [khint_t]
+ */
+#define kh_put(name, h, k, r) kh_put_##name(h, k, r)
+
+/*! @function
+ @abstract Retrieve a key from the hash table.
+ @param name Name of the hash table [symbol]
+ @param h Pointer to the hash table [khash_t(name)*]
+ @param k Key [type of keys]
+  @return       Iterator to the found element, or kh_end(h) if the element is absent [khint_t]
+ */
+#define kh_get(name, h, k) kh_get_##name(h, k)
+
+/*! @function
+ @abstract Remove a key from the hash table.
+ @param name Name of the hash table [symbol]
+ @param h Pointer to the hash table [khash_t(name)*]
+ @param k Iterator to the element to be deleted [khint_t]
+ */
+#define kh_del(name, h, k) kh_del_##name(h, k)
+
+/*! @function
+ @abstract Test whether a bucket contains data.
+ @param h Pointer to the hash table [khash_t(name)*]
+ @param x Iterator to the bucket [khint_t]
+ @return 1 if containing data; 0 otherwise [int]
+ */
+#define kh_exist(h, x) (!__ac_iseither((h)->flags, (x)))
+
+/*! @function
+ @abstract Get key given an iterator
+ @param h Pointer to the hash table [khash_t(name)*]
+ @param x Iterator to the bucket [khint_t]
+ @return Key [type of keys]
+ */
+#define kh_key(h, x) ((h)->keys[x])
+
+/*! @function
+ @abstract Get value given an iterator
+ @param h Pointer to the hash table [khash_t(name)*]
+ @param x Iterator to the bucket [khint_t]
+ @return Value [type of values]
+  @discussion   For hash sets, calling this results in a segfault.
+ */
+#define kh_val(h, x) ((h)->vals[x])
+
+/*! @function
+ @abstract Alias of kh_val()
+ */
+#define kh_value(h, x) ((h)->vals[x])
+
+/*! @function
+ @abstract Get the start iterator
+ @param h Pointer to the hash table [khash_t(name)*]
+ @return The start iterator [khint_t]
+ */
+#define kh_begin(h) (khint_t)(0)
+
+/*! @function
+ @abstract Get the end iterator
+ @param h Pointer to the hash table [khash_t(name)*]
+ @return The end iterator [khint_t]
+ */
+#define kh_end(h) ((h)->n_buckets)
+
+/*! @function
+ @abstract Get the number of elements in the hash table
+ @param h Pointer to the hash table [khash_t(name)*]
+ @return Number of elements in the hash table [khint_t]
+ */
+#define kh_size(h) ((h)->size)
+
+/*! @function
+ @abstract Get the number of buckets in the hash table
+ @param h Pointer to the hash table [khash_t(name)*]
+ @return Number of buckets in the hash table [khint_t]
+ */
+#define kh_n_buckets(h) ((h)->n_buckets)
+
+/* More convenient interfaces */
+
+/*! @function
+ @abstract Instantiate a hash set containing integer keys
+ @param name Name of the hash table [symbol]
+ */
+#define KHASH_SET_INIT_INT(name) \
+ KHASH_INIT(name, khint32_t, char, 0, kh_int_hash_func, kh_int_hash_equal)
+
+/*! @function
+ @abstract Instantiate a hash map containing integer keys
+ @param name Name of the hash table [symbol]
+ @param khval_t Type of values [type]
+ */
+#define KHASH_MAP_INIT_INT(name, khval_t) \
+ KHASH_INIT(name, khint32_t, khval_t, 1, kh_int_hash_func, kh_int_hash_equal)
+
+/*! @function
+  @abstract     Instantiate a hash set containing 64-bit integer keys
+ @param name Name of the hash table [symbol]
+ */
+#define KHASH_SET_INIT_UINT64(name) \
+ KHASH_INIT(name, khuint64_t, char, 0, kh_int64_hash_func, kh_int64_hash_equal)
+
+#define KHASH_SET_INIT_INT64(name) \
+ KHASH_INIT(name, khint64_t, char, 0, kh_int64_hash_func, kh_int64_hash_equal)
+
+/*! @function
+ @abstract Instantiate a hash map containing 64-bit integer keys
+ @param name Name of the hash table [symbol]
+ @param khval_t Type of values [type]
+ */
+#define KHASH_MAP_INIT_UINT64(name, khval_t) \
+ KHASH_INIT(name, khuint64_t, khval_t, 1, kh_int64_hash_func, kh_int64_hash_equal)
+
+#define KHASH_MAP_INIT_INT64(name, khval_t) \
+ KHASH_INIT(name, khint64_t, khval_t, 1, kh_int64_hash_func, kh_int64_hash_equal)
+
+
+typedef const char *kh_cstr_t;
+/*! @function
+  @abstract     Instantiate a hash set containing const char* keys
+ @param name Name of the hash table [symbol]
+ */
+#define KHASH_SET_INIT_STR(name) \
+ KHASH_INIT(name, kh_cstr_t, char, 0, kh_str_hash_func, kh_str_hash_equal)
+
+/*! @function
+ @abstract Instantiate a hash map containing const char* keys
+ @param name Name of the hash table [symbol]
+ @param khval_t Type of values [type]
+ */
+#define KHASH_MAP_INIT_STR(name, khval_t) \
+ KHASH_INIT(name, kh_cstr_t, khval_t, 1, kh_str_hash_func, kh_str_hash_equal)
+
+
+#define kh_exist_str(h, k) (kh_exist(h, k))
+#define kh_exist_float64(h, k) (kh_exist(h, k))
+#define kh_exist_uint64(h, k) (kh_exist(h, k))
+#define kh_exist_int64(h, k) (kh_exist(h, k))
+#define kh_exist_int32(h, k) (kh_exist(h, k))
+
+KHASH_MAP_INIT_STR(str, size_t)
+KHASH_MAP_INIT_INT(int32, size_t)
+KHASH_MAP_INIT_INT64(int64, size_t)
+KHASH_MAP_INIT_UINT64(uint64, size_t)
+
+
+#endif /* __AC_KHASH_H */
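
For reference, a hedged usage sketch of the int64 -> size_t map instantiated just above (it calls the generated kh_*_int64 functions directly; the include assumes the header layout of this directory, including ../inline_helper.h):

#include <stdio.h>
#include "khash.h"              /* provides KHASH_MAP_INIT_INT64(int64, size_t) */

int main(void) {
    int ret;
    khiter_t it;
    kh_int64_t *h = kh_init_int64();

    it = kh_put_int64(h, 42, &ret);        /* ret != 0: the key was not present */
    kh_value(h, it) = 7;

    it = kh_get_int64(h, 42);
    if (it != kh_end(h))                   /* kh_end(h) means "not found" */
        printf("%zu\n", kh_value(h, it));  /* 7 */

    kh_del_int64(h, it);
    kh_destroy_int64(h);
    return 0;
}
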
diff --git a/contrib/python/pandas/py2/pandas/_libs/src/klib/khash_python.h b/contrib/python/pandas/py2/pandas/_libs/src/klib/khash_python.h
new file mode 100644
index 00000000000..45a93051f78
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/_libs/src/klib/khash_python.h
@@ -0,0 +1,86 @@
+#include <string.h>
+#include <Python.h>
+
+#include "khash.h"
+
+// Previously we were using the built-in CPython hash function for doubles
+// python 2.7 https://github.com/python/cpython/blob/2.7/Objects/object.c#L1021
+// python 3.5 https://github.com/python/cpython/blob/3.5/Python/pyhash.c#L85
+
+// The Python 3 hash function has the invariant hash(x) == hash(int(x)) == hash(decimal(x))
+// and the size of the hash may differ by platform / version (long in py2, Py_ssize_t in py3).
+// We don't need those invariants because types will be cast before hashing, and if Py_ssize_t
+// is 64 bits the truncation causes collision issues.  Given all that, we use our own
+// simple hash, viewing the double bytes as an int64 and using khash's default
+// hash for 64 bit integers.
+// GH 13436
+khint64_t PANDAS_INLINE asint64(double key) {
+ khint64_t val;
+ memcpy(&val, &key, sizeof(double));
+ return val;
+}
+
+// correct for all inputs but not -0.0 and NaNs
+#define kh_float64_hash_func_0_NAN(key) (khint32_t)((asint64(key))>>33^(asint64(key))^(asint64(key))<<11)
+
+// correct for all inputs but not NaNs
+#define kh_float64_hash_func_NAN(key) ((key) == 0.0 ? \
+ kh_float64_hash_func_0_NAN(0.0) : \
+ kh_float64_hash_func_0_NAN(key))
+
+// correct for all
+#define kh_float64_hash_func(key) ((key) != (key) ? \
+ kh_float64_hash_func_NAN(Py_NAN) : \
+ kh_float64_hash_func_NAN(key))
+
+#define kh_float64_hash_equal(a, b) ((a) == (b) || ((b) != (b) && (a) != (a)))
+
+#define KHASH_MAP_INIT_FLOAT64(name, khval_t) \
+ KHASH_INIT(name, khfloat64_t, khval_t, 1, kh_float64_hash_func, kh_float64_hash_equal)
+
+KHASH_MAP_INIT_FLOAT64(float64, size_t)
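
The layering above first canonicalizes every NaN to Py_NAN and routes anything equal to 0.0 (which includes -0.0) through one canonical call, and only then applies the bit-level hash, so all NaNs land in one bucket and 0.0/-0.0 in another. A standalone sketch of the underlying bit-mixing step (same idea as asint64 plus kh_int64_hash_func, without the Python dependency), showing why the canonicalization is needed:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* View the double's bytes as a 64-bit integer and apply khash's 64-bit mix. */
static uint32_t hash_double_bits(double key) {
    uint64_t v;
    memcpy(&v, &key, sizeof(double));
    return (uint32_t)((v >> 33) ^ v ^ (v << 11));
}

int main(void) {
    /* 0.0 and -0.0 compare equal but have different bit patterns, so their
       raw bit hashes differ; the header's (key) == 0.0 branch fixes that. */
    printf("%u %u\n", hash_double_bits(0.0), hash_double_bits(-0.0));
    return 0;
}
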
+
+
+int PANDAS_INLINE pyobject_cmp(PyObject* a, PyObject* b) {
+ int result = PyObject_RichCompareBool(a, b, Py_EQ);
+ if (result < 0) {
+ PyErr_Clear();
+ return 0;
+ }
+ if (result == 0) { // still could be two NaNs
+ return PyFloat_CheckExact(a) &&
+ PyFloat_CheckExact(b) &&
+ Py_IS_NAN(PyFloat_AS_DOUBLE(a)) &&
+ Py_IS_NAN(PyFloat_AS_DOUBLE(b));
+ }
+ return result;
+}
+
+// For PyObject_Hash the following holds:
+// hash(0.0) == 0 == hash(-0.0)
+// hash(X) == 0 if X is a NaN-value
+// so it is OK to use it directly
+#define kh_python_hash_func(key) (PyObject_Hash(key))
+#define kh_python_hash_equal(a, b) (pyobject_cmp(a, b))
+
+
+// Python object
+
+typedef PyObject* kh_pyobject_t;
+
+#define KHASH_MAP_INIT_PYOBJECT(name, khval_t) \
+ KHASH_INIT(name, kh_pyobject_t, khval_t, 1, \
+ kh_python_hash_func, kh_python_hash_equal)
+
+KHASH_MAP_INIT_PYOBJECT(pymap, Py_ssize_t)
+
+#define KHASH_SET_INIT_PYOBJECT(name) \
+ KHASH_INIT(name, kh_pyobject_t, char, 0, \
+ kh_python_hash_func, kh_python_hash_equal)
+
+KHASH_SET_INIT_PYOBJECT(pyset)
+
+#define kh_exist_pymap(h, k) (kh_exist(h, k))
+#define kh_exist_pyset(h, k) (kh_exist(h, k))
+
+KHASH_MAP_INIT_STR(strbox, kh_pyobject_t)
diff --git a/contrib/python/pandas/py2/pandas/_libs/src/msgpack/pack.h b/contrib/python/pandas/py2/pandas/_libs/src/msgpack/pack.h
new file mode 100644
index 00000000000..02379c91884
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/_libs/src/msgpack/pack.h
@@ -0,0 +1,103 @@
+/*
+ * MessagePack for Python packing routine
+ *
+ * Copyright (C) 2009 Naoki INADA
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <stddef.h>
+#include <stdlib.h>
+#include "sysdep.h"
+#include <limits.h>
+#include <string.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#if defined(_MSC_VER) && (_MSC_VER < 1900)
+#define inline __inline
+#endif
+
+typedef struct msgpack_packer {
+ char *buf;
+ size_t length;
+ size_t buf_size;
+ bool use_bin_type;
+} msgpack_packer;
+
+typedef struct Packer Packer;
+
+static inline int msgpack_pack_int(msgpack_packer* pk, int d);
+static inline int msgpack_pack_long(msgpack_packer* pk, long d);
+static inline int msgpack_pack_long_long(msgpack_packer* pk, long long d);
+static inline int msgpack_pack_unsigned_short(msgpack_packer* pk, unsigned short d);
+static inline int msgpack_pack_unsigned_int(msgpack_packer* pk, unsigned int d);
+static inline int msgpack_pack_unsigned_long(msgpack_packer* pk, unsigned long d);
+//static inline int msgpack_pack_unsigned_long_long(msgpack_packer* pk, unsigned long long d);
+
+static inline int msgpack_pack_uint8(msgpack_packer* pk, uint8_t d);
+static inline int msgpack_pack_uint16(msgpack_packer* pk, uint16_t d);
+static inline int msgpack_pack_uint32(msgpack_packer* pk, uint32_t d);
+static inline int msgpack_pack_uint64(msgpack_packer* pk, uint64_t d);
+static inline int msgpack_pack_int8(msgpack_packer* pk, int8_t d);
+static inline int msgpack_pack_int16(msgpack_packer* pk, int16_t d);
+static inline int msgpack_pack_int32(msgpack_packer* pk, int32_t d);
+static inline int msgpack_pack_int64(msgpack_packer* pk, int64_t d);
+
+static inline int msgpack_pack_float(msgpack_packer* pk, float d);
+static inline int msgpack_pack_double(msgpack_packer* pk, double d);
+
+static inline int msgpack_pack_nil(msgpack_packer* pk);
+static inline int msgpack_pack_true(msgpack_packer* pk);
+static inline int msgpack_pack_false(msgpack_packer* pk);
+
+static inline int msgpack_pack_array(msgpack_packer* pk, unsigned int n);
+
+static inline int msgpack_pack_map(msgpack_packer* pk, unsigned int n);
+
+static inline int msgpack_pack_raw(msgpack_packer* pk, size_t l);
+static inline int msgpack_pack_bin(msgpack_packer* pk, size_t l);
+static inline int msgpack_pack_raw_body(msgpack_packer* pk, const void* b, size_t l);
+
+static inline int msgpack_pack_ext(msgpack_packer* pk, char typecode, size_t l);
+
+static inline int msgpack_pack_write(msgpack_packer* pk, const char *data, size_t l)
+{
+ char* buf = pk->buf;
+ size_t bs = pk->buf_size;
+ size_t len = pk->length;
+
+ if (len + l > bs) {
+ bs = (len + l) * 2;
+ buf = (char*)realloc(buf, bs);
+ if (!buf) return -1;
+ }
+ memcpy(buf + len, data, l);
+ len += l;
+
+ pk->buf = buf;
+ pk->buf_size = bs;
+ pk->length = len;
+ return 0;
+}
+
+#define msgpack_pack_append_buffer(user, buf, len) \
+ return msgpack_pack_write(user, (const char*)buf, len)
+
+#include "pack_template.h"
+
+#ifdef __cplusplus
+}
+#endif
diff --git a/contrib/python/pandas/py2/pandas/_libs/src/msgpack/pack_template.h b/contrib/python/pandas/py2/pandas/_libs/src/msgpack/pack_template.h
new file mode 100644
index 00000000000..5d1088f4b7d
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/_libs/src/msgpack/pack_template.h
@@ -0,0 +1,785 @@
+/*
+ * MessagePack packing routine template
+ *
+ * Copyright (C) 2008-2010 FURUHASHI Sadayuki
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#if defined(__LITTLE_ENDIAN__)
+#define TAKE8_8(d) ((uint8_t*)&d)[0]
+#define TAKE8_16(d) ((uint8_t*)&d)[0]
+#define TAKE8_32(d) ((uint8_t*)&d)[0]
+#define TAKE8_64(d) ((uint8_t*)&d)[0]
+#elif defined(__BIG_ENDIAN__)
+#define TAKE8_8(d) ((uint8_t*)&d)[0]
+#define TAKE8_16(d) ((uint8_t*)&d)[1]
+#define TAKE8_32(d) ((uint8_t*)&d)[3]
+#define TAKE8_64(d) ((uint8_t*)&d)[7]
+#endif
+
+#ifndef msgpack_pack_append_buffer
+#error msgpack_pack_append_buffer callback is not defined
+#endif
+
+
+/*
+ * Integer
+ */
+
+#define msgpack_pack_real_uint8(x, d) \
+do { \
+ if(d < (1<<7)) { \
+ /* fixnum */ \
+ msgpack_pack_append_buffer(x, &TAKE8_8(d), 1); \
+ } else { \
+ /* unsigned 8 */ \
+ unsigned char buf[2] = {0xcc, TAKE8_8(d)}; \
+ msgpack_pack_append_buffer(x, buf, 2); \
+ } \
+} while(0)
+
+#define msgpack_pack_real_uint16(x, d) \
+do { \
+ if(d < (1<<7)) { \
+ /* fixnum */ \
+ msgpack_pack_append_buffer(x, &TAKE8_16(d), 1); \
+ } else if(d < (1<<8)) { \
+ /* unsigned 8 */ \
+ unsigned char buf[2] = {0xcc, TAKE8_16(d)}; \
+ msgpack_pack_append_buffer(x, buf, 2); \
+ } else { \
+ /* unsigned 16 */ \
+ unsigned char buf[3]; \
+ buf[0] = 0xcd; _msgpack_store16(&buf[1], (uint16_t)d); \
+ msgpack_pack_append_buffer(x, buf, 3); \
+ } \
+} while(0)
+
+#define msgpack_pack_real_uint32(x, d) \
+do { \
+ if(d < (1<<8)) { \
+ if(d < (1<<7)) { \
+ /* fixnum */ \
+ msgpack_pack_append_buffer(x, &TAKE8_32(d), 1); \
+ } else { \
+ /* unsigned 8 */ \
+ unsigned char buf[2] = {0xcc, TAKE8_32(d)}; \
+ msgpack_pack_append_buffer(x, buf, 2); \
+ } \
+ } else { \
+ if(d < (1<<16)) { \
+ /* unsigned 16 */ \
+ unsigned char buf[3]; \
+ buf[0] = 0xcd; _msgpack_store16(&buf[1], (uint16_t)d); \
+ msgpack_pack_append_buffer(x, buf, 3); \
+ } else { \
+ /* unsigned 32 */ \
+ unsigned char buf[5]; \
+ buf[0] = 0xce; _msgpack_store32(&buf[1], (uint32_t)d); \
+ msgpack_pack_append_buffer(x, buf, 5); \
+ } \
+ } \
+} while(0)
+
+#define msgpack_pack_real_uint64(x, d) \
+do { \
+ if(d < (1ULL<<8)) { \
+ if(d < (1ULL<<7)) { \
+ /* fixnum */ \
+ msgpack_pack_append_buffer(x, &TAKE8_64(d), 1); \
+ } else { \
+ /* unsigned 8 */ \
+ unsigned char buf[2] = {0xcc, TAKE8_64(d)}; \
+ msgpack_pack_append_buffer(x, buf, 2); \
+ } \
+ } else { \
+ if(d < (1ULL<<16)) { \
+ /* unsigned 16 */ \
+ unsigned char buf[3]; \
+ buf[0] = 0xcd; _msgpack_store16(&buf[1], (uint16_t)d); \
+ msgpack_pack_append_buffer(x, buf, 3); \
+ } else if(d < (1ULL<<32)) { \
+ /* unsigned 32 */ \
+ unsigned char buf[5]; \
+ buf[0] = 0xce; _msgpack_store32(&buf[1], (uint32_t)d); \
+ msgpack_pack_append_buffer(x, buf, 5); \
+ } else { \
+ /* unsigned 64 */ \
+ unsigned char buf[9]; \
+ buf[0] = 0xcf; _msgpack_store64(&buf[1], d); \
+ msgpack_pack_append_buffer(x, buf, 9); \
+ } \
+ } \
+} while(0)
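
The cascaded comparisons above implement MessagePack's unsigned-integer encoding: values below 2^7 are emitted as a single positive-fixint byte, and larger values get a 0xcc/0xcd/0xce/0xcf marker followed by a 1/2/4/8 byte big-endian payload. A hedged standalone sketch of just the marker selection (msgpack_uint_marker is illustrative and not part of the library):

#include <stdint.h>

/* Returns the marker byte the macros above would emit; 0x00 stands for a
   positive fixint, where the value itself is the encoded byte. */
static unsigned char msgpack_uint_marker(uint64_t d) {
    if (d < (1ULL << 7))  return 0x00;  /* positive fixint            */
    if (d < (1ULL << 8))  return 0xcc;  /* uint 8:  marker + 1 byte   */
    if (d < (1ULL << 16)) return 0xcd;  /* uint 16: marker + 2 bytes  */
    if (d < (1ULL << 32)) return 0xce;  /* uint 32: marker + 4 bytes  */
    return 0xcf;                        /* uint 64: marker + 8 bytes  */
}
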
+
+#define msgpack_pack_real_int8(x, d) \
+do { \
+ if(d < -(1<<5)) { \
+ /* signed 8 */ \
+ unsigned char buf[2] = {0xd0, TAKE8_8(d)}; \
+ msgpack_pack_append_buffer(x, buf, 2); \
+ } else { \
+ /* fixnum */ \
+ msgpack_pack_append_buffer(x, &TAKE8_8(d), 1); \
+ } \
+} while(0)
+
+#define msgpack_pack_real_int16(x, d) \
+do { \
+ if(d < -(1<<5)) { \
+ if(d < -(1<<7)) { \
+ /* signed 16 */ \
+ unsigned char buf[3]; \
+ buf[0] = 0xd1; _msgpack_store16(&buf[1], (int16_t)d); \
+ msgpack_pack_append_buffer(x, buf, 3); \
+ } else { \
+ /* signed 8 */ \
+ unsigned char buf[2] = {0xd0, TAKE8_16(d)}; \
+ msgpack_pack_append_buffer(x, buf, 2); \
+ } \
+ } else if(d < (1<<7)) { \
+ /* fixnum */ \
+ msgpack_pack_append_buffer(x, &TAKE8_16(d), 1); \
+ } else { \
+ if(d < (1<<8)) { \
+ /* unsigned 8 */ \
+ unsigned char buf[2] = {0xcc, TAKE8_16(d)}; \
+ msgpack_pack_append_buffer(x, buf, 2); \
+ } else { \
+ /* unsigned 16 */ \
+ unsigned char buf[3]; \
+ buf[0] = 0xcd; _msgpack_store16(&buf[1], (uint16_t)d); \
+ msgpack_pack_append_buffer(x, buf, 3); \
+ } \
+ } \
+} while(0)
+
+#define msgpack_pack_real_int32(x, d) \
+do { \
+ if(d < -(1<<5)) { \
+ if(d < -(1<<15)) { \
+ /* signed 32 */ \
+ unsigned char buf[5]; \
+ buf[0] = 0xd2; _msgpack_store32(&buf[1], (int32_t)d); \
+ msgpack_pack_append_buffer(x, buf, 5); \
+ } else if(d < -(1<<7)) { \
+ /* signed 16 */ \
+ unsigned char buf[3]; \
+ buf[0] = 0xd1; _msgpack_store16(&buf[1], (int16_t)d); \
+ msgpack_pack_append_buffer(x, buf, 3); \
+ } else { \
+ /* signed 8 */ \
+ unsigned char buf[2] = {0xd0, TAKE8_32(d)}; \
+ msgpack_pack_append_buffer(x, buf, 2); \
+ } \
+ } else if(d < (1<<7)) { \
+ /* fixnum */ \
+ msgpack_pack_append_buffer(x, &TAKE8_32(d), 1); \
+ } else { \
+ if(d < (1<<8)) { \
+ /* unsigned 8 */ \
+ unsigned char buf[2] = {0xcc, TAKE8_32(d)}; \
+ msgpack_pack_append_buffer(x, buf, 2); \
+ } else if(d < (1<<16)) { \
+ /* unsigned 16 */ \
+ unsigned char buf[3]; \
+ buf[0] = 0xcd; _msgpack_store16(&buf[1], (uint16_t)d); \
+ msgpack_pack_append_buffer(x, buf, 3); \
+ } else { \
+ /* unsigned 32 */ \
+ unsigned char buf[5]; \
+ buf[0] = 0xce; _msgpack_store32(&buf[1], (uint32_t)d); \
+ msgpack_pack_append_buffer(x, buf, 5); \
+ } \
+ } \
+} while(0)
+
+#define msgpack_pack_real_int64(x, d) \
+do { \
+ if(d < -(1LL<<5)) { \
+ if(d < -(1LL<<15)) { \
+ if(d < -(1LL<<31)) { \
+ /* signed 64 */ \
+ unsigned char buf[9]; \
+ buf[0] = 0xd3; _msgpack_store64(&buf[1], d); \
+ msgpack_pack_append_buffer(x, buf, 9); \
+ } else { \
+ /* signed 32 */ \
+ unsigned char buf[5]; \
+ buf[0] = 0xd2; _msgpack_store32(&buf[1], (int32_t)d); \
+ msgpack_pack_append_buffer(x, buf, 5); \
+ } \
+ } else { \
+ if(d < -(1<<7)) { \
+ /* signed 16 */ \
+ unsigned char buf[3]; \
+ buf[0] = 0xd1; _msgpack_store16(&buf[1], (int16_t)d); \
+ msgpack_pack_append_buffer(x, buf, 3); \
+ } else { \
+ /* signed 8 */ \
+ unsigned char buf[2] = {0xd0, TAKE8_64(d)}; \
+ msgpack_pack_append_buffer(x, buf, 2); \
+ } \
+ } \
+ } else if(d < (1<<7)) { \
+ /* fixnum */ \
+ msgpack_pack_append_buffer(x, &TAKE8_64(d), 1); \
+ } else { \
+ if(d < (1LL<<16)) { \
+ if(d < (1<<8)) { \
+ /* unsigned 8 */ \
+ unsigned char buf[2] = {0xcc, TAKE8_64(d)}; \
+ msgpack_pack_append_buffer(x, buf, 2); \
+ } else { \
+ /* unsigned 16 */ \
+ unsigned char buf[3]; \
+ buf[0] = 0xcd; _msgpack_store16(&buf[1], (uint16_t)d); \
+ msgpack_pack_append_buffer(x, buf, 3); \
+ } \
+ } else { \
+ if(d < (1LL<<32)) { \
+ /* unsigned 32 */ \
+ unsigned char buf[5]; \
+ buf[0] = 0xce; _msgpack_store32(&buf[1], (uint32_t)d); \
+ msgpack_pack_append_buffer(x, buf, 5); \
+ } else { \
+ /* unsigned 64 */ \
+ unsigned char buf[9]; \
+ buf[0] = 0xcf; _msgpack_store64(&buf[1], d); \
+ msgpack_pack_append_buffer(x, buf, 9); \
+ } \
+ } \
+ } \
+} while(0)
+
+
+static inline int msgpack_pack_uint8(msgpack_packer* x, uint8_t d)
+{
+ msgpack_pack_real_uint8(x, d);
+}
+
+static inline int msgpack_pack_uint16(msgpack_packer* x, uint16_t d)
+{
+ msgpack_pack_real_uint16(x, d);
+}
+
+static inline int msgpack_pack_uint32(msgpack_packer* x, uint32_t d)
+{
+ msgpack_pack_real_uint32(x, d);
+}
+
+static inline int msgpack_pack_uint64(msgpack_packer* x, uint64_t d)
+{
+ msgpack_pack_real_uint64(x, d);
+}
+
+static inline int msgpack_pack_int8(msgpack_packer* x, int8_t d)
+{
+ msgpack_pack_real_int8(x, d);
+}
+
+static inline int msgpack_pack_int16(msgpack_packer* x, int16_t d)
+{
+ msgpack_pack_real_int16(x, d);
+}
+
+static inline int msgpack_pack_int32(msgpack_packer* x, int32_t d)
+{
+ msgpack_pack_real_int32(x, d);
+}
+
+static inline int msgpack_pack_int64(msgpack_packer* x, int64_t d)
+{
+ msgpack_pack_real_int64(x, d);
+}
+
+
+//#ifdef msgpack_pack_inline_func_cint
+
+static inline int msgpack_pack_short(msgpack_packer* x, short d)
+{
+#if defined(SIZEOF_SHORT)
+#if SIZEOF_SHORT == 2
+ msgpack_pack_real_int16(x, d);
+#elif SIZEOF_SHORT == 4
+ msgpack_pack_real_int32(x, d);
+#else
+ msgpack_pack_real_int64(x, d);
+#endif
+
+#elif defined(SHRT_MAX)
+#if SHRT_MAX == 0x7fff
+ msgpack_pack_real_int16(x, d);
+#elif SHRT_MAX == 0x7fffffff
+ msgpack_pack_real_int32(x, d);
+#else
+ msgpack_pack_real_int64(x, d);
+#endif
+
+#else
+if(sizeof(short) == 2) {
+ msgpack_pack_real_int16(x, d);
+} else if(sizeof(short) == 4) {
+ msgpack_pack_real_int32(x, d);
+} else {
+ msgpack_pack_real_int64(x, d);
+}
+#endif
+}
+
+static inline int msgpack_pack_int(msgpack_packer* x, int d)
+{
+#if defined(SIZEOF_INT)
+#if SIZEOF_INT == 2
+ msgpack_pack_real_int16(x, d);
+#elif SIZEOF_INT == 4
+ msgpack_pack_real_int32(x, d);
+#else
+ msgpack_pack_real_int64(x, d);
+#endif
+
+#elif defined(INT_MAX)
+#if INT_MAX == 0x7fff
+ msgpack_pack_real_int16(x, d);
+#elif INT_MAX == 0x7fffffff
+ msgpack_pack_real_int32(x, d);
+#else
+ msgpack_pack_real_int64(x, d);
+#endif
+
+#else
+if(sizeof(int) == 2) {
+ msgpack_pack_real_int16(x, d);
+} else if(sizeof(int) == 4) {
+ msgpack_pack_real_int32(x, d);
+} else {
+ msgpack_pack_real_int64(x, d);
+}
+#endif
+}
+
+static inline int msgpack_pack_long(msgpack_packer* x, long d)
+{
+#if defined(SIZEOF_LONG)
+#if SIZEOF_LONG == 2
+ msgpack_pack_real_int16(x, d);
+#elif SIZEOF_LONG == 4
+ msgpack_pack_real_int32(x, d);
+#else
+ msgpack_pack_real_int64(x, d);
+#endif
+
+#elif defined(LONG_MAX)
+#if LONG_MAX == 0x7fffL
+ msgpack_pack_real_int16(x, d);
+#elif LONG_MAX == 0x7fffffffL
+ msgpack_pack_real_int32(x, d);
+#else
+ msgpack_pack_real_int64(x, d);
+#endif
+
+#else
+if(sizeof(long) == 2) {
+ msgpack_pack_real_int16(x, d);
+} else if(sizeof(long) == 4) {
+ msgpack_pack_real_int32(x, d);
+} else {
+ msgpack_pack_real_int64(x, d);
+}
+#endif
+}
+
+static inline int msgpack_pack_long_long(msgpack_packer* x, long long d)
+{
+#if defined(SIZEOF_LONG_LONG)
+#if SIZEOF_LONG_LONG == 2
+ msgpack_pack_real_int16(x, d);
+#elif SIZEOF_LONG_LONG == 4
+ msgpack_pack_real_int32(x, d);
+#else
+ msgpack_pack_real_int64(x, d);
+#endif
+
+#elif defined(LLONG_MAX)
+#if LLONG_MAX == 0x7fffL
+ msgpack_pack_real_int16(x, d);
+#elif LLONG_MAX == 0x7fffffffL
+ msgpack_pack_real_int32(x, d);
+#else
+ msgpack_pack_real_int64(x, d);
+#endif
+
+#else
+if(sizeof(long long) == 2) {
+ msgpack_pack_real_int16(x, d);
+} else if(sizeof(long long) == 4) {
+ msgpack_pack_real_int32(x, d);
+} else {
+ msgpack_pack_real_int64(x, d);
+}
+#endif
+}
+
+static inline int msgpack_pack_unsigned_short(msgpack_packer* x, unsigned short d)
+{
+#if defined(SIZEOF_SHORT)
+#if SIZEOF_SHORT == 2
+ msgpack_pack_real_uint16(x, d);
+#elif SIZEOF_SHORT == 4
+ msgpack_pack_real_uint32(x, d);
+#else
+ msgpack_pack_real_uint64(x, d);
+#endif
+
+#elif defined(USHRT_MAX)
+#if USHRT_MAX == 0xffffU
+ msgpack_pack_real_uint16(x, d);
+#elif USHRT_MAX == 0xffffffffU
+ msgpack_pack_real_uint32(x, d);
+#else
+ msgpack_pack_real_uint64(x, d);
+#endif
+
+#else
+if(sizeof(unsigned short) == 2) {
+ msgpack_pack_real_uint16(x, d);
+} else if(sizeof(unsigned short) == 4) {
+ msgpack_pack_real_uint32(x, d);
+} else {
+ msgpack_pack_real_uint64(x, d);
+}
+#endif
+}
+
+static inline int msgpack_pack_unsigned_int(msgpack_packer* x, unsigned int d)
+{
+#if defined(SIZEOF_INT)
+#if SIZEOF_INT == 2
+ msgpack_pack_real_uint16(x, d);
+#elif SIZEOF_INT == 4
+ msgpack_pack_real_uint32(x, d);
+#else
+ msgpack_pack_real_uint64(x, d);
+#endif
+
+#elif defined(UINT_MAX)
+#if UINT_MAX == 0xffffU
+ msgpack_pack_real_uint16(x, d);
+#elif UINT_MAX == 0xffffffffU
+ msgpack_pack_real_uint32(x, d);
+#else
+ msgpack_pack_real_uint64(x, d);
+#endif
+
+#else
+if(sizeof(unsigned int) == 2) {
+ msgpack_pack_real_uint16(x, d);
+} else if(sizeof(unsigned int) == 4) {
+ msgpack_pack_real_uint32(x, d);
+} else {
+ msgpack_pack_real_uint64(x, d);
+}
+#endif
+}
+
+static inline int msgpack_pack_unsigned_long(msgpack_packer* x, unsigned long d)
+{
+#if defined(SIZEOF_LONG)
+#if SIZEOF_LONG == 2
+ msgpack_pack_real_uint16(x, d);
+#elif SIZEOF_LONG == 4
+ msgpack_pack_real_uint32(x, d);
+#else
+ msgpack_pack_real_uint64(x, d);
+#endif
+
+#elif defined(ULONG_MAX)
+#if ULONG_MAX == 0xffffUL
+ msgpack_pack_real_uint16(x, d);
+#elif ULONG_MAX == 0xffffffffUL
+ msgpack_pack_real_uint32(x, d);
+#else
+ msgpack_pack_real_uint64(x, d);
+#endif
+
+#else
+if(sizeof(unsigned long) == 2) {
+ msgpack_pack_real_uint16(x, d);
+} else if(sizeof(unsigned long) == 4) {
+ msgpack_pack_real_uint32(x, d);
+} else {
+ msgpack_pack_real_uint64(x, d);
+}
+#endif
+}
+
+static inline int msgpack_pack_unsigned_long_long(msgpack_packer* x, unsigned long long d)
+{
+#if defined(SIZEOF_LONG_LONG)
+#if SIZEOF_LONG_LONG == 2
+ msgpack_pack_real_uint16(x, d);
+#elif SIZEOF_LONG_LONG == 4
+ msgpack_pack_real_uint32(x, d);
+#else
+ msgpack_pack_real_uint64(x, d);
+#endif
+
+#elif defined(ULLONG_MAX)
+#if ULLONG_MAX == 0xffffUL
+ msgpack_pack_real_uint16(x, d);
+#elif ULLONG_MAX == 0xffffffffUL
+ msgpack_pack_real_uint32(x, d);
+#else
+ msgpack_pack_real_uint64(x, d);
+#endif
+
+#else
+if(sizeof(unsigned long long) == 2) {
+ msgpack_pack_real_uint16(x, d);
+} else if(sizeof(unsigned long long) == 4) {
+ msgpack_pack_real_uint32(x, d);
+} else {
+ msgpack_pack_real_uint64(x, d);
+}
+#endif
+}
+
+//#undef msgpack_pack_inline_func_cint
+//#endif
+
+
+
+/*
+ * Float
+ */
+
+static inline int msgpack_pack_float(msgpack_packer* x, float d)
+{
+ union { float f; uint32_t i; } mem;
+ mem.f = d;
+ unsigned char buf[5];
+ buf[0] = 0xca; _msgpack_store32(&buf[1], mem.i);
+ msgpack_pack_append_buffer(x, buf, 5);
+}
+
+static inline int msgpack_pack_double(msgpack_packer* x, double d)
+{
+ union { double f; uint64_t i; } mem;
+ mem.f = d;
+ unsigned char buf[9];
+ buf[0] = 0xcb;
+#if defined(__arm__) && !(__ARM_EABI__) // arm-oabi
+ // https://github.com/msgpack/msgpack-perl/pull/1
+ mem.i = (mem.i & 0xFFFFFFFFUL) << 32UL | (mem.i >> 32UL);
+#endif
+ _msgpack_store64(&buf[1], mem.i);
+ msgpack_pack_append_buffer(x, buf, 9);
+}
+
+
+/*
+ * Nil
+ */
+
+static inline int msgpack_pack_nil(msgpack_packer* x)
+{
+ static const unsigned char d = 0xc0;
+ msgpack_pack_append_buffer(x, &d, 1);
+}
+
+
+/*
+ * Boolean
+ */
+
+static inline int msgpack_pack_true(msgpack_packer* x)
+{
+ static const unsigned char d = 0xc3;
+ msgpack_pack_append_buffer(x, &d, 1);
+}
+
+static inline int msgpack_pack_false(msgpack_packer* x)
+{
+ static const unsigned char d = 0xc2;
+ msgpack_pack_append_buffer(x, &d, 1);
+}
+
+
+/*
+ * Array
+ */
+
+static inline int msgpack_pack_array(msgpack_packer* x, unsigned int n)
+{
+ if(n < 16) {
+ unsigned char d = 0x90 | n;
+ msgpack_pack_append_buffer(x, &d, 1);
+ } else if(n < 65536) {
+ unsigned char buf[3];
+ buf[0] = 0xdc; _msgpack_store16(&buf[1], (uint16_t)n);
+ msgpack_pack_append_buffer(x, buf, 3);
+ } else {
+ unsigned char buf[5];
+ buf[0] = 0xdd; _msgpack_store32(&buf[1], (uint32_t)n);
+ msgpack_pack_append_buffer(x, buf, 5);
+ }
+}
+
+
+/*
+ * Map
+ */
+
+static inline int msgpack_pack_map(msgpack_packer* x, unsigned int n)
+{
+ if(n < 16) {
+ unsigned char d = 0x80 | n;
+ msgpack_pack_append_buffer(x, &TAKE8_8(d), 1);
+ } else if(n < 65536) {
+ unsigned char buf[3];
+ buf[0] = 0xde; _msgpack_store16(&buf[1], (uint16_t)n);
+ msgpack_pack_append_buffer(x, buf, 3);
+ } else {
+ unsigned char buf[5];
+ buf[0] = 0xdf; _msgpack_store32(&buf[1], (uint32_t)n);
+ msgpack_pack_append_buffer(x, buf, 5);
+ }
+}
+
+
+/*
+ * Raw
+ */
+
+static inline int msgpack_pack_raw(msgpack_packer* x, size_t l)
+{
+ if (l < 32) {
+ unsigned char d = 0xa0 | (uint8_t)l;
+ msgpack_pack_append_buffer(x, &TAKE8_8(d), 1);
+    } else if (x->use_bin_type && l < 256) { // str8 is a new format introduced with bin.
+ unsigned char buf[2] = {0xd9, (uint8_t)l};
+ msgpack_pack_append_buffer(x, buf, 2);
+ } else if (l < 65536) {
+ unsigned char buf[3];
+ buf[0] = 0xda; _msgpack_store16(&buf[1], (uint16_t)l);
+ msgpack_pack_append_buffer(x, buf, 3);
+ } else {
+ unsigned char buf[5];
+ buf[0] = 0xdb; _msgpack_store32(&buf[1], (uint32_t)l);
+ msgpack_pack_append_buffer(x, buf, 5);
+ }
+}
+
+/*
+ * bin
+ */
+static inline int msgpack_pack_bin(msgpack_packer *x, size_t l)
+{
+ if (!x->use_bin_type) {
+ return msgpack_pack_raw(x, l);
+ }
+ if (l < 256) {
+ unsigned char buf[2] = {0xc4, (unsigned char)l};
+ msgpack_pack_append_buffer(x, buf, 2);
+ } else if (l < 65536) {
+ unsigned char buf[3] = {0xc5};
+ _msgpack_store16(&buf[1], (uint16_t)l);
+ msgpack_pack_append_buffer(x, buf, 3);
+ } else {
+ unsigned char buf[5] = {0xc6};
+ _msgpack_store32(&buf[1], (uint32_t)l);
+ msgpack_pack_append_buffer(x, buf, 5);
+ }
+}
+
+static inline int msgpack_pack_raw_body(msgpack_packer* x, const void* b, size_t l)
+{
+ if (l > 0) msgpack_pack_append_buffer(x, (const unsigned char*)b, l);
+ return 0;
+}
+
+/*
+ * Ext
+ */
+static inline int msgpack_pack_ext(msgpack_packer* x, char typecode, size_t l)
+{
+ if (l == 1) {
+ unsigned char buf[2];
+ buf[0] = 0xd4;
+ buf[1] = (unsigned char)typecode;
+ msgpack_pack_append_buffer(x, buf, 2);
+ }
+ else if(l == 2) {
+ unsigned char buf[2];
+ buf[0] = 0xd5;
+ buf[1] = (unsigned char)typecode;
+ msgpack_pack_append_buffer(x, buf, 2);
+ }
+ else if(l == 4) {
+ unsigned char buf[2];
+ buf[0] = 0xd6;
+ buf[1] = (unsigned char)typecode;
+ msgpack_pack_append_buffer(x, buf, 2);
+ }
+ else if(l == 8) {
+ unsigned char buf[2];
+ buf[0] = 0xd7;
+ buf[1] = (unsigned char)typecode;
+ msgpack_pack_append_buffer(x, buf, 2);
+ }
+ else if(l == 16) {
+ unsigned char buf[2];
+ buf[0] = 0xd8;
+ buf[1] = (unsigned char)typecode;
+ msgpack_pack_append_buffer(x, buf, 2);
+ }
+ else if(l < 256) {
+ unsigned char buf[3];
+ buf[0] = 0xc7;
+ buf[1] = l;
+ buf[2] = (unsigned char)typecode;
+ msgpack_pack_append_buffer(x, buf, 3);
+ } else if(l < 65536) {
+ unsigned char buf[4];
+ buf[0] = 0xc8;
+ _msgpack_store16(&buf[1], (uint16_t)l);
+ buf[3] = (unsigned char)typecode;
+ msgpack_pack_append_buffer(x, buf, 4);
+ } else {
+ unsigned char buf[6];
+ buf[0] = 0xc9;
+ _msgpack_store32(&buf[1], (uint32_t)l);
+ buf[5] = (unsigned char)typecode;
+ msgpack_pack_append_buffer(x, buf, 6);
+ }
+
+}
+
+
+
+#undef msgpack_pack_append_buffer
+
+#undef TAKE8_8
+#undef TAKE8_16
+#undef TAKE8_32
+#undef TAKE8_64
+
+#undef msgpack_pack_real_uint8
+#undef msgpack_pack_real_uint16
+#undef msgpack_pack_real_uint32
+#undef msgpack_pack_real_uint64
+#undef msgpack_pack_real_int8
+#undef msgpack_pack_real_int16
+#undef msgpack_pack_real_int32
+#undef msgpack_pack_real_int64
diff --git a/contrib/python/pandas/py2/pandas/_libs/src/msgpack/sysdep.h b/contrib/python/pandas/py2/pandas/_libs/src/msgpack/sysdep.h
new file mode 100644
index 00000000000..ed9c1bc0b80
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/_libs/src/msgpack/sysdep.h
@@ -0,0 +1,194 @@
+/*
+ * MessagePack system dependencies
+ *
+ * Copyright (C) 2008-2010 FURUHASHI Sadayuki
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef MSGPACK_SYSDEP_H__
+#define MSGPACK_SYSDEP_H__
+
+#include <stdlib.h>
+#include <stddef.h>
+#if defined(_MSC_VER) && _MSC_VER < 1600
+typedef __int8 int8_t;
+typedef unsigned __int8 uint8_t;
+typedef __int16 int16_t;
+typedef unsigned __int16 uint16_t;
+typedef __int32 int32_t;
+typedef unsigned __int32 uint32_t;
+typedef __int64 int64_t;
+typedef unsigned __int64 uint64_t;
+#elif defined(_MSC_VER) // && _MSC_VER >= 1600
+#include <stdint.h>
+#else
+#include <stdint.h>
+#include <stdbool.h>
+#endif
+
+#ifdef _WIN32
+#define _msgpack_atomic_counter_header <windows.h>
+typedef long _msgpack_atomic_counter_t;
+#define _msgpack_sync_decr_and_fetch(ptr) InterlockedDecrement(ptr)
+#define _msgpack_sync_incr_and_fetch(ptr) InterlockedIncrement(ptr)
+#elif defined(__GNUC__) && ((__GNUC__*10 + __GNUC_MINOR__) < 41)
+#define _msgpack_atomic_counter_header "gcc_atomic.h"
+#else
+typedef unsigned int _msgpack_atomic_counter_t;
+#define _msgpack_sync_decr_and_fetch(ptr) __sync_sub_and_fetch(ptr, 1)
+#define _msgpack_sync_incr_and_fetch(ptr) __sync_add_and_fetch(ptr, 1)
+#endif
+
+#ifdef _WIN32
+
+#ifdef __cplusplus
+/* numeric_limits<T>::min,max */
+#ifdef max
+#undef max
+#endif
+#ifdef min
+#undef min
+#endif
+#endif
+
+#else
+#include <arpa/inet.h> /* __BYTE_ORDER */
+#endif
+
+#if !defined(__LITTLE_ENDIAN__) && !defined(__BIG_ENDIAN__)
+#if __BYTE_ORDER == __LITTLE_ENDIAN
+#define __LITTLE_ENDIAN__
+#elif __BYTE_ORDER == __BIG_ENDIAN
+#define __BIG_ENDIAN__
+#elif _WIN32
+#define __LITTLE_ENDIAN__
+#endif
+#endif
+
+
+#ifdef __LITTLE_ENDIAN__
+
+#ifdef _WIN32
+# if defined(ntohs)
+# define _msgpack_be16(x) ntohs(x)
+# elif defined(_byteswap_ushort) || (defined(_MSC_VER) && _MSC_VER >= 1400)
+# define _msgpack_be16(x) ((uint16_t)_byteswap_ushort((unsigned short)x))
+# else
+# define _msgpack_be16(x) ( \
+ ((((uint16_t)x) << 8) ) | \
+ ((((uint16_t)x) >> 8) ) )
+# endif
+#else
+# define _msgpack_be16(x) ntohs(x)
+#endif
+
+#ifdef _WIN32
+# if defined(ntohl)
+# define _msgpack_be32(x) ntohl(x)
+# elif defined(_byteswap_ulong) || (defined(_MSC_VER) && _MSC_VER >= 1400)
+# define _msgpack_be32(x) ((uint32_t)_byteswap_ulong((unsigned long)x))
+# else
+# define _msgpack_be32(x) \
+ ( ((((uint32_t)x) << 24) ) | \
+ ((((uint32_t)x) << 8) & 0x00ff0000U ) | \
+ ((((uint32_t)x) >> 8) & 0x0000ff00U ) | \
+ ((((uint32_t)x) >> 24) ) )
+# endif
+#else
+# define _msgpack_be32(x) ntohl(x)
+#endif
+
+#if defined(_byteswap_uint64) || (defined(_MSC_VER) && _MSC_VER >= 1400)
+# define _msgpack_be64(x) (_byteswap_uint64(x))
+#elif defined(bswap_64)
+# define _msgpack_be64(x) bswap_64(x)
+#elif defined(__DARWIN_OSSwapInt64)
+# define _msgpack_be64(x) __DARWIN_OSSwapInt64(x)
+#else
+#define _msgpack_be64(x) \
+ ( ((((uint64_t)x) << 56) ) | \
+ ((((uint64_t)x) << 40) & 0x00ff000000000000ULL ) | \
+ ((((uint64_t)x) << 24) & 0x0000ff0000000000ULL ) | \
+ ((((uint64_t)x) << 8) & 0x000000ff00000000ULL ) | \
+ ((((uint64_t)x) >> 8) & 0x00000000ff000000ULL ) | \
+ ((((uint64_t)x) >> 24) & 0x0000000000ff0000ULL ) | \
+ ((((uint64_t)x) >> 40) & 0x000000000000ff00ULL ) | \
+ ((((uint64_t)x) >> 56) ) )
+#endif
+
+#define _msgpack_load16(cast, from) ((cast)( \
+ (((uint16_t)((uint8_t*)(from))[0]) << 8) | \
+ (((uint16_t)((uint8_t*)(from))[1]) ) ))
+
+#define _msgpack_load32(cast, from) ((cast)( \
+ (((uint32_t)((uint8_t*)(from))[0]) << 24) | \
+ (((uint32_t)((uint8_t*)(from))[1]) << 16) | \
+ (((uint32_t)((uint8_t*)(from))[2]) << 8) | \
+ (((uint32_t)((uint8_t*)(from))[3]) ) ))
+
+#define _msgpack_load64(cast, from) ((cast)( \
+ (((uint64_t)((uint8_t*)(from))[0]) << 56) | \
+ (((uint64_t)((uint8_t*)(from))[1]) << 48) | \
+ (((uint64_t)((uint8_t*)(from))[2]) << 40) | \
+ (((uint64_t)((uint8_t*)(from))[3]) << 32) | \
+ (((uint64_t)((uint8_t*)(from))[4]) << 24) | \
+ (((uint64_t)((uint8_t*)(from))[5]) << 16) | \
+ (((uint64_t)((uint8_t*)(from))[6]) << 8) | \
+ (((uint64_t)((uint8_t*)(from))[7]) ) ))
+
+#else
+
+#define _msgpack_be16(x) (x)
+#define _msgpack_be32(x) (x)
+#define _msgpack_be64(x) (x)
+
+#define _msgpack_load16(cast, from) ((cast)( \
+ (((uint16_t)((uint8_t*)from)[0]) << 8) | \
+ (((uint16_t)((uint8_t*)from)[1]) ) ))
+
+#define _msgpack_load32(cast, from) ((cast)( \
+ (((uint32_t)((uint8_t*)from)[0]) << 24) | \
+ (((uint32_t)((uint8_t*)from)[1]) << 16) | \
+ (((uint32_t)((uint8_t*)from)[2]) << 8) | \
+ (((uint32_t)((uint8_t*)from)[3]) ) ))
+
+#define _msgpack_load64(cast, from) ((cast)( \
+ (((uint64_t)((uint8_t*)from)[0]) << 56) | \
+ (((uint64_t)((uint8_t*)from)[1]) << 48) | \
+ (((uint64_t)((uint8_t*)from)[2]) << 40) | \
+ (((uint64_t)((uint8_t*)from)[3]) << 32) | \
+ (((uint64_t)((uint8_t*)from)[4]) << 24) | \
+ (((uint64_t)((uint8_t*)from)[5]) << 16) | \
+ (((uint64_t)((uint8_t*)from)[6]) << 8) | \
+ (((uint64_t)((uint8_t*)from)[7]) ) ))
+#endif
+
+
+#define _msgpack_store16(to, num) \
+ do { uint16_t val = _msgpack_be16(num); memcpy(to, &val, 2); } while(0)
+#define _msgpack_store32(to, num) \
+ do { uint32_t val = _msgpack_be32(num); memcpy(to, &val, 4); } while(0)
+#define _msgpack_store64(to, num) \
+ do { uint64_t val = _msgpack_be64(num); memcpy(to, &val, 8); } while(0)
+
+/*
+#define _msgpack_load16(cast, from) \
+ ({ cast val; memcpy(&val, (char*)from, 2); _msgpack_be16(val); })
+#define _msgpack_load32(cast, from) \
+ ({ cast val; memcpy(&val, (char*)from, 4); _msgpack_be32(val); })
+#define _msgpack_load64(cast, from) \
+ ({ cast val; memcpy(&val, (char*)from, 8); _msgpack_be64(val); })
+*/
+
+
+#endif /* msgpack/sysdep.h */
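The byte-order layer above is what the packing and unpacking code relies on: _msgpack_storeN writes a value in big-endian (network) order through memcpy, and _msgpack_loadN reassembles it byte by byte so unaligned source pointers are safe on any host. A minimal round-trip sketch, assuming the vendored msgpack/sysdep.h is on the include path; the buffer and variable names are illustrative only:

    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>
    #include "msgpack/sysdep.h"   /* assumed to be reachable on the include path */

    int main(void) {
        unsigned char buf[4];
        uint32_t value = 0x11223344u;

        /* writes the bytes 0x11 0x22 0x33 0x44 regardless of host endianness */
        _msgpack_store32(buf, value);

        /* reassembles the big-endian bytes back into host order */
        uint32_t roundtrip = _msgpack_load32(uint32_t, buf);

        printf("stored %02x %02x %02x %02x, loaded 0x%08x\n",
               buf[0], buf[1], buf[2], buf[3], (unsigned)roundtrip);
        return roundtrip == value ? 0 : 1;
    }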
diff --git a/contrib/python/pandas/py2/pandas/_libs/src/msgpack/unpack.h b/contrib/python/pandas/py2/pandas/_libs/src/msgpack/unpack.h
new file mode 100644
index 00000000000..591fad1ae46
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/_libs/src/msgpack/unpack.h
@@ -0,0 +1,278 @@
+/*
+ * MessagePack for Python unpacking routine
+ *
+ * Copyright (C) 2009 Naoki INADA
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#define MSGPACK_EMBED_STACK_SIZE (1024)
+#include "unpack_define.h"
+
+typedef struct unpack_user {
+ int use_list;
+ PyObject *object_hook;
+ bool has_pairs_hook;
+ PyObject *list_hook;
+ PyObject *ext_hook;
+ const char *encoding;
+ const char *unicode_errors;
+ Py_ssize_t max_str_len, max_bin_len, max_array_len, max_map_len, max_ext_len;
+} unpack_user;
+
+typedef PyObject* msgpack_unpack_object;
+struct unpack_context;
+typedef struct unpack_context unpack_context;
+typedef int (*execute_fn)(unpack_context *ctx, const char* data, size_t len, size_t* off);
+
+static inline msgpack_unpack_object unpack_callback_root(unpack_user* u)
+{
+ return NULL;
+}
+
+static inline int unpack_callback_uint16(unpack_user* u, uint16_t d, msgpack_unpack_object* o)
+{
+ PyObject *p = PyInt_FromLong((long)d);
+ if (!p)
+ return -1;
+ *o = p;
+ return 0;
+}
+static inline int unpack_callback_uint8(unpack_user* u, uint8_t d, msgpack_unpack_object* o)
+{
+ return unpack_callback_uint16(u, d, o);
+}
+
+
+static inline int unpack_callback_uint32(unpack_user* u, uint32_t d, msgpack_unpack_object* o)
+{
+ PyObject *p = PyInt_FromSize_t((size_t)d);
+ if (!p)
+ return -1;
+ *o = p;
+ return 0;
+}
+
+static inline int unpack_callback_uint64(unpack_user* u, uint64_t d, msgpack_unpack_object* o)
+{
+ PyObject *p;
+ if (d > LONG_MAX) {
+ p = PyLong_FromUnsignedLongLong((unsigned PY_LONG_LONG)d);
+ } else {
+ p = PyInt_FromSize_t((size_t)d);
+ }
+ if (!p)
+ return -1;
+ *o = p;
+ return 0;
+}
+
+static inline int unpack_callback_int32(unpack_user* u, int32_t d, msgpack_unpack_object* o)
+{
+ PyObject *p = PyInt_FromLong(d);
+ if (!p)
+ return -1;
+ *o = p;
+ return 0;
+}
+
+static inline int unpack_callback_int16(unpack_user* u, int16_t d, msgpack_unpack_object* o)
+{
+ return unpack_callback_int32(u, d, o);
+}
+
+static inline int unpack_callback_int8(unpack_user* u, int8_t d, msgpack_unpack_object* o)
+{
+ return unpack_callback_int32(u, d, o);
+}
+
+static inline int unpack_callback_int64(unpack_user* u, int64_t d, msgpack_unpack_object* o)
+{
+    PyObject *p;
+    if (d > LONG_MAX || d < LONG_MIN) {
+        p = PyLong_FromLongLong((PY_LONG_LONG)d);
+    } else {
+        p = PyInt_FromLong((long)d);
+    }
+    if (!p)
+        return -1;
+    *o = p;
+    return 0;
+}
+
+static inline int unpack_callback_double(unpack_user* u, double d, msgpack_unpack_object* o)
+{
+ PyObject *p = PyFloat_FromDouble(d);
+ if (!p)
+ return -1;
+ *o = p;
+ return 0;
+}
+
+static inline int unpack_callback_float(unpack_user* u, float d, msgpack_unpack_object* o)
+{
+ return unpack_callback_double(u, d, o);
+}
+
+static inline int unpack_callback_nil(unpack_user* u, msgpack_unpack_object* o)
+{ Py_INCREF(Py_None); *o = Py_None; return 0; }
+
+static inline int unpack_callback_true(unpack_user* u, msgpack_unpack_object* o)
+{ Py_INCREF(Py_True); *o = Py_True; return 0; }
+
+static inline int unpack_callback_false(unpack_user* u, msgpack_unpack_object* o)
+{ Py_INCREF(Py_False); *o = Py_False; return 0; }
+
+static inline int unpack_callback_array(unpack_user* u, unsigned int n, msgpack_unpack_object* o)
+{
+ if (n > u->max_array_len) {
+ PyErr_Format(PyExc_ValueError, "%u exceeds max_array_len(%zd)", n, u->max_array_len);
+ return -1;
+ }
+ PyObject *p = u->use_list ? PyList_New(n) : PyTuple_New(n);
+
+ if (!p)
+ return -1;
+ *o = p;
+ return 0;
+}
+
+static inline int unpack_callback_array_item(unpack_user* u, unsigned int current, msgpack_unpack_object* c, msgpack_unpack_object o)
+{
+ if (u->use_list)
+ PyList_SET_ITEM(*c, current, o);
+ else
+ PyTuple_SET_ITEM(*c, current, o);
+ return 0;
+}
+
+static inline int unpack_callback_array_end(unpack_user* u, msgpack_unpack_object* c)
+{
+ if (u->list_hook) {
+ PyObject *new_c = PyObject_CallFunctionObjArgs(u->list_hook, *c, NULL);
+ if (!new_c)
+ return -1;
+ Py_DECREF(*c);
+ *c = new_c;
+ }
+ return 0;
+}
+
+static inline int unpack_callback_map(unpack_user* u, unsigned int n, msgpack_unpack_object* o)
+{
+ if (n > u->max_map_len) {
+ PyErr_Format(PyExc_ValueError, "%u exceeds max_map_len(%zd)", n, u->max_map_len);
+ return -1;
+ }
+ PyObject *p;
+ if (u->has_pairs_hook) {
+ p = PyList_New(n); // Or use tuple?
+ }
+ else {
+ p = PyDict_New();
+ }
+ if (!p)
+ return -1;
+ *o = p;
+ return 0;
+}
+
+static inline int unpack_callback_map_item(unpack_user* u, unsigned int current, msgpack_unpack_object* c, msgpack_unpack_object k, msgpack_unpack_object v)
+{
+ if (u->has_pairs_hook) {
+ msgpack_unpack_object item = PyTuple_Pack(2, k, v);
+ if (!item)
+ return -1;
+ Py_DECREF(k);
+ Py_DECREF(v);
+ PyList_SET_ITEM(*c, current, item);
+ return 0;
+ }
+ else if (PyDict_SetItem(*c, k, v) == 0) {
+ Py_DECREF(k);
+ Py_DECREF(v);
+ return 0;
+ }
+ return -1;
+}
+
+static inline int unpack_callback_map_end(unpack_user* u, msgpack_unpack_object* c)
+{
+ if (u->object_hook) {
+ PyObject *new_c = PyObject_CallFunctionObjArgs(u->object_hook, *c, NULL);
+ if (!new_c)
+ return -1;
+
+ Py_DECREF(*c);
+ *c = new_c;
+ }
+ return 0;
+}
+
+static inline int unpack_callback_raw(unpack_user* u, const char* b, const char* p, unsigned int l, msgpack_unpack_object* o)
+{
+ if (l > u->max_str_len) {
+ PyErr_Format(PyExc_ValueError, "%u exceeds max_str_len(%zd)", l, u->max_str_len);
+ return -1;
+ }
+
+ PyObject *py;
+ if(u->encoding) {
+ py = PyUnicode_Decode(p, l, u->encoding, u->unicode_errors);
+ } else {
+ py = PyBytes_FromStringAndSize(p, l);
+ }
+ if (!py)
+ return -1;
+ *o = py;
+ return 0;
+}
+
+static inline int unpack_callback_bin(unpack_user* u, const char* b, const char* p, unsigned int l, msgpack_unpack_object* o)
+{
+ if (l > u->max_bin_len) {
+ PyErr_Format(PyExc_ValueError, "%u exceeds max_bin_len(%zd)", l, u->max_bin_len);
+ return -1;
+ }
+
+ PyObject *py = PyBytes_FromStringAndSize(p, l);
+ if (!py)
+ return -1;
+ *o = py;
+ return 0;
+}
+
+static inline int unpack_callback_ext(unpack_user* u, const char* base, const char* pos,
+ unsigned int length, msgpack_unpack_object* o)
+{
+ PyObject *py;
+ int8_t typecode = (int8_t)*pos++;
+ if (!u->ext_hook) {
+ PyErr_SetString(PyExc_AssertionError, "u->ext_hook cannot be NULL");
+ return -1;
+ }
+ if (length-1 > u->max_ext_len) {
+ PyErr_Format(PyExc_ValueError, "%u exceeds max_ext_len(%zd)", length, u->max_ext_len);
+ return -1;
+ }
+ // length also includes the typecode, so the actual data is length-1
+#if PY_MAJOR_VERSION == 2
+ py = PyObject_CallFunction(u->ext_hook, (char*)"(is#)", typecode, pos, (Py_ssize_t)length-1);
+#else
+ py = PyObject_CallFunction(u->ext_hook, (char*)"(iy#)", typecode, pos, (Py_ssize_t)length-1);
+#endif
+ if (!py)
+ return -1;
+ *o = py;
+ return 0;
+}
+
+#include "unpack_template.h"
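All of the callbacks in this header follow the same contract: construct a Python object for the decoded value, store it through the msgpack_unpack_object* out-parameter, and return 0 on success or -1 with a Python exception set. Container callbacks allocate an exact-size container up front, and the *_item callbacks hand their references over via PyList_SET_ITEM / PyTuple_SET_ITEM, which steal them. A small stand-alone sketch of that reference-handling pattern, independent of the unpacker itself (the function name is illustrative, not part of this header, and assumes an initialized interpreter):

    #include <Python.h>

    /* Mirrors the unpack_callback_array / _array_item convention:
     * allocate an exact-size list, then fill slots with references
     * that the list steals (no extra DECREF needed on success). */
    static PyObject *build_list_like_callbacks(long n) {
        PyObject *list = PyList_New(n);       /* like unpack_callback_array */
        if (!list)
            return NULL;
        for (long i = 0; i < n; i++) {
            PyObject *item = PyLong_FromLong(i);
            if (!item) {
                Py_DECREF(list);              /* also frees items already stored */
                return NULL;
            }
            PyList_SET_ITEM(list, i, item);   /* like unpack_callback_array_item */
        }
        return list;                          /* like _array_end with no list_hook */
    }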
diff --git a/contrib/python/pandas/py2/pandas/_libs/src/msgpack/unpack_define.h b/contrib/python/pandas/py2/pandas/_libs/src/msgpack/unpack_define.h
new file mode 100644
index 00000000000..0dd708d17c3
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/_libs/src/msgpack/unpack_define.h
@@ -0,0 +1,95 @@
+/*
+ * MessagePack unpacking routine template
+ *
+ * Copyright (C) 2008-2010 FURUHASHI Sadayuki
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef MSGPACK_UNPACK_DEFINE_H__
+#define MSGPACK_UNPACK_DEFINE_H__
+
+#include "msgpack/sysdep.h"
+#include <stdlib.h>
+#include <string.h>
+#include <assert.h>
+#include <stdio.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+#ifndef MSGPACK_EMBED_STACK_SIZE
+#define MSGPACK_EMBED_STACK_SIZE 32
+#endif
+
+
+// CS is first byte & 0x1f
+typedef enum {
+ CS_HEADER = 0x00, // nil
+
+ //CS_ = 0x01,
+ //CS_ = 0x02, // false
+ //CS_ = 0x03, // true
+
+ CS_BIN_8 = 0x04,
+ CS_BIN_16 = 0x05,
+ CS_BIN_32 = 0x06,
+
+ CS_EXT_8 = 0x07,
+ CS_EXT_16 = 0x08,
+ CS_EXT_32 = 0x09,
+
+ CS_FLOAT = 0x0a,
+ CS_DOUBLE = 0x0b,
+ CS_UINT_8 = 0x0c,
+ CS_UINT_16 = 0x0d,
+ CS_UINT_32 = 0x0e,
+ CS_UINT_64 = 0x0f,
+ CS_INT_8 = 0x10,
+ CS_INT_16 = 0x11,
+ CS_INT_32 = 0x12,
+ CS_INT_64 = 0x13,
+
+ //CS_FIXEXT1 = 0x14,
+ //CS_FIXEXT2 = 0x15,
+ //CS_FIXEXT4 = 0x16,
+ //CS_FIXEXT8 = 0x17,
+ //CS_FIXEXT16 = 0x18,
+
+ CS_RAW_8 = 0x19,
+ CS_RAW_16 = 0x1a,
+ CS_RAW_32 = 0x1b,
+ CS_ARRAY_16 = 0x1c,
+ CS_ARRAY_32 = 0x1d,
+ CS_MAP_16 = 0x1e,
+ CS_MAP_32 = 0x1f,
+
+ ACS_RAW_VALUE,
+ ACS_BIN_VALUE,
+ ACS_EXT_VALUE,
+} msgpack_unpack_state;
+
+
+typedef enum {
+ CT_ARRAY_ITEM,
+ CT_MAP_KEY,
+ CT_MAP_VALUE,
+} msgpack_container_type;
+
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* msgpack/unpack_define.h */
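As the comment above notes, for the 0xc0-0xdf range the parser state is simply the wire byte masked with 0x1f, which is why the enum values line up with the low five bits of the corresponding msgpack type bytes (for example 0xcd & 0x1f == CS_UINT_16). A tiny self-contained check of that mapping; the enum values are repeated locally purely for illustration:

    #include <assert.h>

    int main(void) {
        /* values copied from msgpack_unpack_state above, for illustration only */
        enum { CS_UINT_16 = 0x0d, CS_RAW_32 = 0x1b, CS_MAP_32 = 0x1f };

        assert((0xcd & 0x1f) == CS_UINT_16);  /* uint16 header byte */
        assert((0xdb & 0x1f) == CS_RAW_32);   /* raw/str 32 header byte */
        assert((0xdf & 0x1f) == CS_MAP_32);   /* map 32 header byte */
        return 0;
    }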
diff --git a/contrib/python/pandas/py2/pandas/_libs/src/msgpack/unpack_template.h b/contrib/python/pandas/py2/pandas/_libs/src/msgpack/unpack_template.h
new file mode 100644
index 00000000000..402dcd48cb3
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/_libs/src/msgpack/unpack_template.h
@@ -0,0 +1,475 @@
+/*
+ * MessagePack unpacking routine template
+ *
+ * Copyright (C) 2008-2010 FURUHASHI Sadayuki
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef USE_CASE_RANGE
+#ifdef __GNUC__
+#define USE_CASE_RANGE
+#endif
+#endif
+
+typedef struct unpack_stack {
+ PyObject* obj;
+ size_t size;
+ size_t count;
+ unsigned int ct;
+ PyObject* map_key;
+} unpack_stack;
+
+struct unpack_context {
+ unpack_user user;
+ unsigned int cs;
+ unsigned int trail;
+ unsigned int top;
+ /*
+ unpack_stack* stack;
+ unsigned int stack_size;
+ unpack_stack embed_stack[MSGPACK_EMBED_STACK_SIZE];
+ */
+ unpack_stack stack[MSGPACK_EMBED_STACK_SIZE];
+};
+
+
+static inline void unpack_init(unpack_context* ctx)
+{
+ ctx->cs = CS_HEADER;
+ ctx->trail = 0;
+ ctx->top = 0;
+ /*
+ ctx->stack = ctx->embed_stack;
+ ctx->stack_size = MSGPACK_EMBED_STACK_SIZE;
+ */
+ ctx->stack[0].obj = unpack_callback_root(&ctx->user);
+}
+
+/*
+static inline void unpack_destroy(unpack_context* ctx)
+{
+ if(ctx->stack_size != MSGPACK_EMBED_STACK_SIZE) {
+ free(ctx->stack);
+ }
+}
+*/
+
+static inline PyObject* unpack_data(unpack_context* ctx)
+{
+ return (ctx)->stack[0].obj;
+}
+
+
+template <bool construct>
+static inline int unpack_execute(unpack_context* ctx, const char* data, size_t len, size_t* off)
+{
+ assert(len >= *off);
+
+ const unsigned char* p = (unsigned char*)data + *off;
+ const unsigned char* const pe = (unsigned char*)data + len;
+ const void* n = NULL;
+
+ unsigned int trail = ctx->trail;
+ unsigned int cs = ctx->cs;
+ unsigned int top = ctx->top;
+ unpack_stack* stack = ctx->stack;
+ /*
+ unsigned int stack_size = ctx->stack_size;
+ */
+ unpack_user* user = &ctx->user;
+
+ PyObject* obj = NULL;
+ unpack_stack* c = NULL;
+
+ int ret;
+
+#define construct_cb(name) \
+ construct && unpack_callback ## name
+
+#define push_simple_value(func) \
+ if(construct_cb(func)(user, &obj) < 0) { goto _failed; } \
+ goto _push
+#define push_fixed_value(func, arg) \
+ if(construct_cb(func)(user, arg, &obj) < 0) { goto _failed; } \
+ goto _push
+#define push_variable_value(func, base, pos, len) \
+ if(construct_cb(func)(user, \
+ (const char*)base, (const char*)pos, len, &obj) < 0) { goto _failed; } \
+ goto _push
+
+#define again_fixed_trail(_cs, trail_len) \
+ trail = trail_len; \
+ cs = _cs; \
+ goto _fixed_trail_again
+#define again_fixed_trail_if_zero(_cs, trail_len, ifzero) \
+ trail = trail_len; \
+ if(trail == 0) { goto ifzero; } \
+ cs = _cs; \
+ goto _fixed_trail_again
+
+#define start_container(func, count_, ct_) \
+ if(top >= MSGPACK_EMBED_STACK_SIZE) { goto _failed; } /* FIXME */ \
+ if(construct_cb(func)(user, count_, &stack[top].obj) < 0) { goto _failed; } \
+ if((count_) == 0) { obj = stack[top].obj; \
+ if (construct_cb(func##_end)(user, &obj) < 0) { goto _failed; } \
+ goto _push; } \
+ stack[top].ct = ct_; \
+ stack[top].size = count_; \
+ stack[top].count = 0; \
+ ++top; \
+ /*printf("container %d count %d stack %d\n",stack[top].obj,count_,top);*/ \
+ /*printf("stack push %d\n", top);*/ \
+ /* FIXME \
+ if(top >= stack_size) { \
+ if(stack_size == MSGPACK_EMBED_STACK_SIZE) { \
+ size_t csize = sizeof(unpack_stack) * MSGPACK_EMBED_STACK_SIZE; \
+ size_t nsize = csize * 2; \
+ unpack_stack* tmp = (unpack_stack*)malloc(nsize); \
+ if(tmp == NULL) { goto _failed; } \
+ memcpy(tmp, ctx->stack, csize); \
+ ctx->stack = stack = tmp; \
+ ctx->stack_size = stack_size = MSGPACK_EMBED_STACK_SIZE * 2; \
+ } else { \
+ size_t nsize = sizeof(unpack_stack) * ctx->stack_size * 2; \
+ unpack_stack* tmp = (unpack_stack*)realloc(ctx->stack, nsize); \
+ if(tmp == NULL) { goto _failed; } \
+ ctx->stack = stack = tmp; \
+ ctx->stack_size = stack_size = stack_size * 2; \
+ } \
+ } \
+ */ \
+ goto _header_again
+
+#define NEXT_CS(p) ((unsigned int)*p & 0x1f)
+
+#ifdef USE_CASE_RANGE
+#define SWITCH_RANGE_BEGIN switch(*p) {
+#define SWITCH_RANGE(FROM, TO) case FROM ... TO:
+#define SWITCH_RANGE_DEFAULT default:
+#define SWITCH_RANGE_END }
+#else
+#define SWITCH_RANGE_BEGIN { if(0) {
+#define SWITCH_RANGE(FROM, TO) } else if(FROM <= *p && *p <= TO) {
+#define SWITCH_RANGE_DEFAULT } else {
+#define SWITCH_RANGE_END } }
+#endif
+
+ if(p == pe) { goto _out; }
+ do {
+ switch(cs) {
+ case CS_HEADER:
+ SWITCH_RANGE_BEGIN
+ SWITCH_RANGE(0x00, 0x7f) // Positive Fixnum
+ push_fixed_value(_uint8, *(uint8_t*)p);
+ SWITCH_RANGE(0xe0, 0xff) // Negative Fixnum
+ push_fixed_value(_int8, *(int8_t*)p);
+ SWITCH_RANGE(0xc0, 0xdf) // Variable
+ switch(*p) {
+ case 0xc0: // nil
+ push_simple_value(_nil);
+ //case 0xc1: // never used
+ case 0xc2: // false
+ push_simple_value(_false);
+ case 0xc3: // true
+ push_simple_value(_true);
+ case 0xc4: // bin 8
+ again_fixed_trail(NEXT_CS(p), 1);
+ case 0xc5: // bin 16
+ again_fixed_trail(NEXT_CS(p), 2);
+ case 0xc6: // bin 32
+ again_fixed_trail(NEXT_CS(p), 4);
+ case 0xc7: // ext 8
+ again_fixed_trail(NEXT_CS(p), 1);
+ case 0xc8: // ext 16
+ again_fixed_trail(NEXT_CS(p), 2);
+ case 0xc9: // ext 32
+ again_fixed_trail(NEXT_CS(p), 4);
+ case 0xca: // float
+ case 0xcb: // double
+ case 0xcc: // unsigned int 8
+ case 0xcd: // unsigned int 16
+ case 0xce: // unsigned int 32
+ case 0xcf: // unsigned int 64
+ case 0xd0: // signed int 8
+ case 0xd1: // signed int 16
+ case 0xd2: // signed int 32
+ case 0xd3: // signed int 64
+ again_fixed_trail(NEXT_CS(p), 1 << (((unsigned int)*p) & 0x03));
+ case 0xd4: // fixext 1
+ case 0xd5: // fixext 2
+ case 0xd6: // fixext 4
+ case 0xd7: // fixext 8
+ again_fixed_trail_if_zero(ACS_EXT_VALUE,
+ (1 << (((unsigned int)*p) & 0x03))+1,
+ _ext_zero);
+ case 0xd8: // fixext 16
+ again_fixed_trail_if_zero(ACS_EXT_VALUE, 16+1, _ext_zero);
+ case 0xd9: // str 8
+ again_fixed_trail(NEXT_CS(p), 1);
+ case 0xda: // raw 16
+ case 0xdb: // raw 32
+ case 0xdc: // array 16
+ case 0xdd: // array 32
+ case 0xde: // map 16
+ case 0xdf: // map 32
+ again_fixed_trail(NEXT_CS(p), 2 << (((unsigned int)*p) & 0x01));
+ default:
+ goto _failed;
+ }
+ SWITCH_RANGE(0xa0, 0xbf) // FixRaw
+ again_fixed_trail_if_zero(ACS_RAW_VALUE, ((unsigned int)*p & 0x1f), _raw_zero);
+ SWITCH_RANGE(0x90, 0x9f) // FixArray
+ start_container(_array, ((unsigned int)*p) & 0x0f, CT_ARRAY_ITEM);
+ SWITCH_RANGE(0x80, 0x8f) // FixMap
+ start_container(_map, ((unsigned int)*p) & 0x0f, CT_MAP_KEY);
+
+ SWITCH_RANGE_DEFAULT
+ goto _failed;
+ SWITCH_RANGE_END
+ // end CS_HEADER
+
+
+ _fixed_trail_again:
+ ++p;
+
+ default:
+ if((size_t)(pe - p) < trail) { goto _out; }
+ n = p; p += trail - 1;
+ switch(cs) {
+ case CS_EXT_8:
+ again_fixed_trail_if_zero(ACS_EXT_VALUE, *(uint8_t*)n+1, _ext_zero);
+ case CS_EXT_16:
+ again_fixed_trail_if_zero(ACS_EXT_VALUE,
+ _msgpack_load16(uint16_t,n)+1,
+ _ext_zero);
+ case CS_EXT_32:
+ again_fixed_trail_if_zero(ACS_EXT_VALUE,
+ _msgpack_load32(uint32_t,n)+1,
+ _ext_zero);
+ case CS_FLOAT: {
+ union { uint32_t i; float f; } mem;
+ mem.i = _msgpack_load32(uint32_t,n);
+ push_fixed_value(_float, mem.f); }
+ case CS_DOUBLE: {
+ union { uint64_t i; double f; } mem;
+ mem.i = _msgpack_load64(uint64_t,n);
+#if defined(__arm__) && !(__ARM_EABI__) // arm-oabi
+ // https://github.com/msgpack/msgpack-perl/pull/1
+ mem.i = (mem.i & 0xFFFFFFFFUL) << 32UL | (mem.i >> 32UL);
+#endif
+ push_fixed_value(_double, mem.f); }
+ case CS_UINT_8:
+ push_fixed_value(_uint8, *(uint8_t*)n);
+ case CS_UINT_16:
+ push_fixed_value(_uint16, _msgpack_load16(uint16_t,n));
+ case CS_UINT_32:
+ push_fixed_value(_uint32, _msgpack_load32(uint32_t,n));
+ case CS_UINT_64:
+ push_fixed_value(_uint64, _msgpack_load64(uint64_t,n));
+
+ case CS_INT_8:
+ push_fixed_value(_int8, *(int8_t*)n);
+ case CS_INT_16:
+ push_fixed_value(_int16, _msgpack_load16(int16_t,n));
+ case CS_INT_32:
+ push_fixed_value(_int32, _msgpack_load32(int32_t,n));
+ case CS_INT_64:
+ push_fixed_value(_int64, _msgpack_load64(int64_t,n));
+
+ case CS_BIN_8:
+ again_fixed_trail_if_zero(ACS_BIN_VALUE, *(uint8_t*)n, _bin_zero);
+ case CS_BIN_16:
+ again_fixed_trail_if_zero(ACS_BIN_VALUE, _msgpack_load16(uint16_t,n), _bin_zero);
+ case CS_BIN_32:
+ again_fixed_trail_if_zero(ACS_BIN_VALUE, _msgpack_load32(uint32_t,n), _bin_zero);
+ case ACS_BIN_VALUE:
+ _bin_zero:
+ push_variable_value(_bin, data, n, trail);
+
+ case CS_RAW_8:
+ again_fixed_trail_if_zero(ACS_RAW_VALUE, *(uint8_t*)n, _raw_zero);
+ case CS_RAW_16:
+ again_fixed_trail_if_zero(ACS_RAW_VALUE, _msgpack_load16(uint16_t,n), _raw_zero);
+ case CS_RAW_32:
+ again_fixed_trail_if_zero(ACS_RAW_VALUE, _msgpack_load32(uint32_t,n), _raw_zero);
+ case ACS_RAW_VALUE:
+ _raw_zero:
+ push_variable_value(_raw, data, n, trail);
+
+ case ACS_EXT_VALUE:
+ _ext_zero:
+ push_variable_value(_ext, data, n, trail);
+
+ case CS_ARRAY_16:
+ start_container(_array, _msgpack_load16(uint16_t,n), CT_ARRAY_ITEM);
+ case CS_ARRAY_32:
+ /* FIXME security guard */
+ start_container(_array, _msgpack_load32(uint32_t,n), CT_ARRAY_ITEM);
+
+ case CS_MAP_16:
+ start_container(_map, _msgpack_load16(uint16_t,n), CT_MAP_KEY);
+ case CS_MAP_32:
+ /* FIXME security guard */
+ start_container(_map, _msgpack_load32(uint32_t,n), CT_MAP_KEY);
+
+ default:
+ goto _failed;
+ }
+ }
+
+_push:
+ if(top == 0) { goto _finish; }
+ c = &stack[top-1];
+ switch(c->ct) {
+ case CT_ARRAY_ITEM:
+ if(construct_cb(_array_item)(user, c->count, &c->obj, obj) < 0) { goto _failed; }
+ if(++c->count == c->size) {
+ obj = c->obj;
+ if (construct_cb(_array_end)(user, &obj) < 0) { goto _failed; }
+ --top;
+ /*printf("stack pop %d\n", top);*/
+ goto _push;
+ }
+ goto _header_again;
+ case CT_MAP_KEY:
+ c->map_key = obj;
+ c->ct = CT_MAP_VALUE;
+ goto _header_again;
+ case CT_MAP_VALUE:
+ if(construct_cb(_map_item)(user, c->count, &c->obj, c->map_key, obj) < 0) { goto _failed; }
+ if(++c->count == c->size) {
+ obj = c->obj;
+ if (construct_cb(_map_end)(user, &obj) < 0) { goto _failed; }
+ --top;
+ /*printf("stack pop %d\n", top);*/
+ goto _push;
+ }
+ c->ct = CT_MAP_KEY;
+ goto _header_again;
+
+ default:
+ goto _failed;
+ }
+
+_header_again:
+ cs = CS_HEADER;
+ ++p;
+ } while(p != pe);
+ goto _out;
+
+
+_finish:
+ if (!construct)
+ unpack_callback_nil(user, &obj);
+ stack[0].obj = obj;
+ ++p;
+ ret = 1;
+ /*printf("-- finish --\n"); */
+ goto _end;
+
+_failed:
+ /*printf("** FAILED **\n"); */
+ ret = -1;
+ goto _end;
+
+_out:
+ ret = 0;
+ goto _end;
+
+_end:
+ ctx->cs = cs;
+ ctx->trail = trail;
+ ctx->top = top;
+ *off = p - (const unsigned char*)data;
+
+ return ret;
+#undef construct_cb
+}
+
+#undef SWITCH_RANGE_BEGIN
+#undef SWITCH_RANGE
+#undef SWITCH_RANGE_DEFAULT
+#undef SWITCH_RANGE_END
+#undef push_simple_value
+#undef push_fixed_value
+#undef push_variable_value
+#undef again_fixed_trail
+#undef again_fixed_trail_if_zero
+#undef start_container
+
+template <unsigned int fixed_offset, unsigned int var_offset>
+static inline int unpack_container_header(unpack_context* ctx, const char* data, size_t len, size_t* off)
+{
+ assert(len >= *off);
+ uint32_t size;
+ const unsigned char *const p = (unsigned char*)data + *off;
+
+#define inc_offset(inc) \
+ if (len - *off < inc) \
+ return 0; \
+ *off += inc;
+
+ switch (*p) {
+ case var_offset:
+ inc_offset(3);
+ size = _msgpack_load16(uint16_t, p + 1);
+ break;
+ case var_offset + 1:
+ inc_offset(5);
+ size = _msgpack_load32(uint32_t, p + 1);
+ break;
+#ifdef USE_CASE_RANGE
+ case fixed_offset + 0x0 ... fixed_offset + 0xf:
+#else
+ case fixed_offset + 0x0:
+ case fixed_offset + 0x1:
+ case fixed_offset + 0x2:
+ case fixed_offset + 0x3:
+ case fixed_offset + 0x4:
+ case fixed_offset + 0x5:
+ case fixed_offset + 0x6:
+ case fixed_offset + 0x7:
+ case fixed_offset + 0x8:
+ case fixed_offset + 0x9:
+ case fixed_offset + 0xa:
+ case fixed_offset + 0xb:
+ case fixed_offset + 0xc:
+ case fixed_offset + 0xd:
+ case fixed_offset + 0xe:
+ case fixed_offset + 0xf:
+#endif
+ ++*off;
+ size = ((unsigned int)*p) & 0x0f;
+ break;
+ default:
+ PyErr_SetString(PyExc_ValueError, "Unexpected type header on stream");
+ return -1;
+ }
+ unpack_callback_uint32(&ctx->user, size, &ctx->stack[0].obj);
+ return 1;
+}
+
+#undef SWITCH_RANGE_BEGIN
+#undef SWITCH_RANGE
+#undef SWITCH_RANGE_DEFAULT
+#undef SWITCH_RANGE_END
+
+static const execute_fn unpack_construct = &unpack_execute<true>;
+static const execute_fn unpack_skip = &unpack_execute<false>;
+static const execute_fn read_array_header = &unpack_container_header<0x90, 0xdc>;
+static const execute_fn read_map_header = &unpack_container_header<0x80, 0xde>;
+
+#undef NEXT_CS
+
+/* vim: set ts=4 sw=4 sts=4 expandtab */
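unpack_container_header, instantiated above as read_array_header (0x90/0xdc) and read_map_header (0x80/0xde), dispatches on the first byte: a fixed header in fixed_offset + 0x0..0xf carries the element count in its low nibble, while var_offset and var_offset + 1 are followed by a big-endian 16- or 32-bit count. A stand-alone sketch of the same dispatch for array headers, without the Python plumbing (function and variable names are illustrative):

    #include <stddef.h>
    #include <stdint.h>
    #include <stdio.h>

    /* Decodes a msgpack array header at buf[0..len): 1 = ok (count in *size),
     * 0 = need more bytes, -1 = not an array header. */
    static int decode_array_header(const unsigned char *buf, size_t len, uint32_t *size) {
        if (len < 1) return 0;
        unsigned char b = buf[0];
        if (b >= 0x90 && b <= 0x9f) {          /* fixarray: count in low nibble */
            *size = b & 0x0f;
            return 1;
        }
        if (b == 0xdc) {                       /* array 16: big-endian uint16 follows */
            if (len < 3) return 0;
            *size = ((uint32_t)buf[1] << 8) | buf[2];
            return 1;
        }
        if (b == 0xdd) {                       /* array 32: big-endian uint32 follows */
            if (len < 5) return 0;
            *size = ((uint32_t)buf[1] << 24) | ((uint32_t)buf[2] << 16) |
                    ((uint32_t)buf[3] << 8)  | buf[4];
            return 1;
        }
        return -1;
    }

    int main(void) {
        const unsigned char fix[] = {0x93};              /* array of 3 */
        const unsigned char a16[] = {0xdc, 0x01, 0x00};  /* array of 256 */
        uint32_t n;
        if (decode_array_header(fix, sizeof fix, &n) == 1) printf("fixarray: %u\n", n);
        if (decode_array_header(a16, sizeof a16, &n) == 1) printf("array16: %u\n", n);
        return 0;
    }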
diff --git a/contrib/python/pandas/py2/pandas/_libs/src/parse_helper.h b/contrib/python/pandas/py2/pandas/_libs/src/parse_helper.h
new file mode 100644
index 00000000000..b71131bee70
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/_libs/src/parse_helper.h
@@ -0,0 +1,274 @@
+/*
+Copyright (c) 2016, PyData Development Team
+All rights reserved.
+
+Distributed under the terms of the BSD Simplified License.
+
+The full license is in the LICENSE file, distributed with this software.
+*/
+
+#ifndef PANDAS__LIBS_SRC_PARSE_HELPER_H_
+#define PANDAS__LIBS_SRC_PARSE_HELPER_H_
+
+#include <errno.h>
+#include <float.h>
+#include "inline_helper.h"
+#include "headers/portable.h"
+
+static double xstrtod(const char *p, char **q, char decimal, char sci,
+ int skip_trailing, int *maybe_int);
+
+int to_double(char *item, double *p_value, char sci, char decimal,
+ int *maybe_int) {
+ char *p_end = NULL;
+
+ *p_value = xstrtod(item, &p_end, decimal, sci, 1, maybe_int);
+
+ return (errno == 0) && (!*p_end);
+}
+
+#if PY_VERSION_HEX < 0x02060000
+#define PyBytes_Check PyString_Check
+#define PyBytes_AS_STRING PyString_AS_STRING
+#endif
+
+int floatify(PyObject *str, double *result, int *maybe_int) {
+ int status;
+ char *data;
+ PyObject *tmp = NULL;
+ const char sci = 'E';
+ const char dec = '.';
+
+ if (PyBytes_Check(str)) {
+ data = PyBytes_AS_STRING(str);
+ } else if (PyUnicode_Check(str)) {
+ tmp = PyUnicode_AsUTF8String(str);
+ data = PyBytes_AS_STRING(tmp);
+ } else {
+ PyErr_SetString(PyExc_TypeError, "Invalid object type");
+ return -1;
+ }
+
+ status = to_double(data, result, sci, dec, maybe_int);
+
+ if (!status) {
+ /* handle inf/-inf */
+ if (strlen(data) == 3) {
+ if (0 == strcasecmp(data, "inf")) {
+ *result = HUGE_VAL;
+ *maybe_int = 0;
+ } else {
+ goto parsingerror;
+ }
+ } else if (strlen(data) == 4) {
+ if (0 == strcasecmp(data, "-inf")) {
+ *result = -HUGE_VAL;
+ *maybe_int = 0;
+ } else if (0 == strcasecmp(data, "+inf")) {
+ *result = HUGE_VAL;
+ *maybe_int = 0;
+ } else {
+ goto parsingerror;
+ }
+ } else {
+ goto parsingerror;
+ }
+ }
+
+ Py_XDECREF(tmp);
+ return 0;
+
+parsingerror:
+ PyErr_Format(PyExc_ValueError, "Unable to parse string \"%s\"", data);
+ Py_XDECREF(tmp);
+ return -1;
+
+ /*
+ #if PY_VERSION_HEX >= 0x03000000
+ return PyFloat_FromString(str);
+ #else
+ return PyFloat_FromString(str, NULL);
+ #endif
+ */
+}
+
+// ---------------------------------------------------------------------------
+// Implementation of xstrtod
+
+//
+// strtod.c
+//
+// Convert string to double
+//
+// Copyright (C) 2002 Michael Ringgaard. All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions
+// are met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+// 3. Neither the name of the project nor the names of its contributors
+// may be used to endorse or promote products derived from this software
+// without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+// POSSIBILITY OF SUCH DAMAGE.
+//
+// -----------------------------------------------------------------------
+// Modifications by Warren Weckesser, March 2011:
+// * Rename strtod() to xstrtod().
+// * Added decimal and sci arguments.
+// * Skip trailing spaces.
+// * Commented out the other functions.
+//
+
+PANDAS_INLINE void lowercase(char *p) {
+ for (; *p; ++p) *p = tolower_ascii(*p);
+}
+
+PANDAS_INLINE void uppercase(char *p) {
+ for (; *p; ++p) *p = toupper_ascii(*p);
+}
+
+static double xstrtod(const char *str, char **endptr, char decimal, char sci,
+ int skip_trailing, int *maybe_int) {
+ double number;
+ int exponent;
+ int negative;
+ char *p = (char *)str;
+ double p10;
+ int n;
+ int num_digits;
+ int num_decimals;
+
+ errno = 0;
+ *maybe_int = 1;
+
+ // Skip leading whitespace
+ while (isspace(*p)) p++;
+
+ // Handle optional sign
+ negative = 0;
+ switch (*p) {
+ case '-':
+ negative = 1; // Fall through to increment position
+ case '+':
+ p++;
+ }
+
+ number = 0.;
+ exponent = 0;
+ num_digits = 0;
+ num_decimals = 0;
+
+ // Process string of digits
+ while (isdigit_ascii(*p)) {
+ number = number * 10. + (*p - '0');
+ p++;
+ num_digits++;
+ }
+
+ // Process decimal part
+ if (*p == decimal) {
+ *maybe_int = 0;
+ p++;
+
+ while (isdigit_ascii(*p)) {
+ number = number * 10. + (*p - '0');
+ p++;
+ num_digits++;
+ num_decimals++;
+ }
+
+ exponent -= num_decimals;
+ }
+
+ if (num_digits == 0) {
+ errno = ERANGE;
+ return 0.0;
+ }
+
+ // Correct for sign
+ if (negative) number = -number;
+
+ // Process an exponent string
+ if (toupper_ascii(*p) == toupper_ascii(sci)) {
+ *maybe_int = 0;
+
+ // Handle optional sign
+ negative = 0;
+ switch (*++p) {
+ case '-':
+ negative = 1; // Fall through to increment pos
+ case '+':
+ p++;
+ }
+
+ // Process string of digits
+ num_digits = 0;
+ n = 0;
+ while (isdigit_ascii(*p)) {
+ n = n * 10 + (*p - '0');
+ num_digits++;
+ p++;
+ }
+
+ if (negative)
+ exponent -= n;
+ else
+ exponent += n;
+
+    // If there are no digits after the 'e'/'E', un-consume it
+ if (num_digits == 0) p--;
+ }
+
+ if (exponent < DBL_MIN_EXP || exponent > DBL_MAX_EXP) {
+ errno = ERANGE;
+ return HUGE_VAL;
+ }
+
+ // Scale the result
+ p10 = 10.;
+ n = exponent;
+ if (n < 0) n = -n;
+ while (n) {
+ if (n & 1) {
+ if (exponent < 0)
+ number /= p10;
+ else
+ number *= p10;
+ }
+ n >>= 1;
+ p10 *= p10;
+ }
+
+ if (number == HUGE_VAL) {
+ errno = ERANGE;
+ }
+
+ if (skip_trailing) {
+ // Skip trailing whitespace
+ while (isspace_ascii(*p)) p++;
+ }
+
+ if (endptr) *endptr = p;
+
+ return number;
+}
+
+#endif // PANDAS__LIBS_SRC_PARSE_HELPER_H_
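One detail worth calling out in xstrtod above: after the mantissa and exponent digits are collected, the power of ten is applied with exponentiation by squaring (the loop that keeps squaring p10 and halving n), so scaling costs O(log exponent) multiplications rather than one per power of ten. A self-contained sketch of just that scaling step, with an illustrative function name:

    #include <stdio.h>

    /* Applies number * 10^exponent by squaring, mirroring the
     * "Scale the result" loop in xstrtod above. */
    static double scale_by_pow10(double number, int exponent) {
        double p10 = 10.0;
        int n = exponent < 0 ? -exponent : exponent;
        while (n) {
            if (n & 1) {
                if (exponent < 0)
                    number /= p10;
                else
                    number *= p10;
            }
            n >>= 1;
            p10 *= p10;
        }
        return number;
    }

    int main(void) {
        printf("%g\n", scale_by_pow10(1.5, 3));   /* 1500 */
        printf("%g\n", scale_by_pow10(2.0, -2));  /* 0.02 */
        return 0;
    }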
diff --git a/contrib/python/pandas/py2/pandas/_libs/src/parser/io.c b/contrib/python/pandas/py2/pandas/_libs/src/parser/io.c
new file mode 100644
index 00000000000..19271c78501
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/_libs/src/parser/io.c
@@ -0,0 +1,280 @@
+/*
+Copyright (c) 2016, PyData Development Team
+All rights reserved.
+
+Distributed under the terms of the BSD Simplified License.
+
+The full license is in the LICENSE file, distributed with this software.
+*/
+
+#include "io.h"
+
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+
+#ifndef O_BINARY
+#define O_BINARY 0
+#endif /* O_BINARY */
+
+/*
+ On-disk FILE, uncompressed
+*/
+
+void *new_file_source(char *fname, size_t buffer_size) {
+ file_source *fs = (file_source *)malloc(sizeof(file_source));
+ if (fs == NULL) {
+ return NULL;
+ }
+
+ fs->fd = open(fname, O_RDONLY | O_BINARY);
+ if (fs->fd == -1) {
+ free(fs);
+ return NULL;
+ }
+
+ // Only allocate this heap memory if we are not memory-mapping the file
+ fs->buffer = (char *)malloc((buffer_size + 1) * sizeof(char));
+
+ if (fs->buffer == NULL) {
+ close(fs->fd);
+ free(fs);
+ return NULL;
+ }
+
+ memset(fs->buffer, '\0', buffer_size + 1);
+ fs->size = buffer_size;
+
+ return (void *)fs;
+}
+
+void *new_rd_source(PyObject *obj) {
+ rd_source *rds = (rd_source *)malloc(sizeof(rd_source));
+
+ /* hold on to this object */
+ Py_INCREF(obj);
+ rds->obj = obj;
+ rds->buffer = NULL;
+ rds->position = 0;
+
+ return (void *)rds;
+}
+
+/*
+
+ Cleanup callbacks
+
+ */
+
+int del_file_source(void *ptr) {
+ file_source *fs = ptr;
+ if (fs == NULL) return 0;
+
+ free(fs->buffer);
+ close(fs->fd);
+ free(fs);
+
+ return 0;
+}
+
+int del_rd_source(void *rds) {
+ Py_XDECREF(RDS(rds)->obj);
+ Py_XDECREF(RDS(rds)->buffer);
+ free(rds);
+
+ return 0;
+}
+
+/*
+
+ IO callbacks
+
+ */
+
+void *buffer_file_bytes(void *source, size_t nbytes, size_t *bytes_read,
+ int *status) {
+ file_source *fs = FS(source);
+ ssize_t rv;
+
+ if (nbytes > fs->size) {
+ nbytes = fs->size;
+ }
+
+ rv = read(fs->fd, fs->buffer, nbytes);
+ switch (rv) {
+ case -1:
+ *status = CALLING_READ_FAILED;
+ *bytes_read = 0;
+ return NULL;
+ case 0:
+ *status = REACHED_EOF;
+ *bytes_read = 0;
+ return NULL;
+ default:
+ *status = 0;
+ *bytes_read = rv;
+ fs->buffer[rv] = '\0';
+ break;
+ }
+
+ return (void *)fs->buffer;
+}
+
+void *buffer_rd_bytes(void *source, size_t nbytes, size_t *bytes_read,
+ int *status) {
+ PyGILState_STATE state;
+ PyObject *result, *func, *args, *tmp;
+
+ void *retval;
+
+ size_t length;
+ rd_source *src = RDS(source);
+ state = PyGILState_Ensure();
+
+ /* delete old object */
+ Py_XDECREF(src->buffer);
+ src->buffer = NULL;
+ args = Py_BuildValue("(i)", nbytes);
+
+ func = PyObject_GetAttrString(src->obj, "read");
+
+ /* TODO: does this release the GIL? */
+ result = PyObject_CallObject(func, args);
+ Py_XDECREF(args);
+ Py_XDECREF(func);
+
+ if (result == NULL) {
+ PyGILState_Release(state);
+ *bytes_read = 0;
+ *status = CALLING_READ_FAILED;
+ return NULL;
+ } else if (!PyBytes_Check(result)) {
+ tmp = PyUnicode_AsUTF8String(result);
+ Py_DECREF(result);
+ if (tmp == NULL) {
+ PyGILState_Release(state);
+ return NULL;
+ }
+ result = tmp;
+ }
+
+ length = PySequence_Length(result);
+
+ if (length == 0)
+ *status = REACHED_EOF;
+ else
+ *status = 0;
+
+ /* hang on to the Python object */
+ src->buffer = result;
+ retval = (void *)PyBytes_AsString(result);
+
+ PyGILState_Release(state);
+
+ /* TODO: more error handling */
+ *bytes_read = length;
+
+ return retval;
+}
+
+#ifdef HAVE_MMAP
+
+#include <sys/mman.h>
+
+void *new_mmap(char *fname) {
+ memory_map *mm;
+ struct stat stat;
+ size_t filesize;
+
+ mm = (memory_map *)malloc(sizeof(memory_map));
+ if (mm == NULL) {
+ fprintf(stderr, "new_file_buffer: malloc() failed.\n");
+ return (NULL);
+ }
+ mm->fd = open(fname, O_RDONLY | O_BINARY);
+ if (mm->fd == -1) {
+ fprintf(stderr, "new_file_buffer: open(%s) failed. errno =%d\n",
+ fname, errno);
+ free(mm);
+ return NULL;
+ }
+
+ if (fstat(mm->fd, &stat) == -1) {
+ fprintf(stderr, "new_file_buffer: fstat() failed. errno =%d\n",
+ errno);
+ close(mm->fd);
+ free(mm);
+ return NULL;
+ }
+ filesize = stat.st_size; /* XXX This might be 32 bits. */
+
+ mm->memmap = mmap(NULL, filesize, PROT_READ, MAP_SHARED, mm->fd, 0);
+ if (mm->memmap == MAP_FAILED) {
+ /* XXX Eventually remove this print statement. */
+ fprintf(stderr, "new_file_buffer: mmap() failed.\n");
+ close(mm->fd);
+ free(mm);
+ return NULL;
+ }
+
+ mm->size = (off_t)filesize;
+ mm->position = 0;
+
+ return mm;
+}
+
+int del_mmap(void *ptr) {
+ memory_map *mm = ptr;
+
+ if (mm == NULL) return 0;
+
+ munmap(mm->memmap, mm->size);
+ close(mm->fd);
+ free(mm);
+
+ return 0;
+}
+
+void *buffer_mmap_bytes(void *source, size_t nbytes, size_t *bytes_read,
+ int *status) {
+ void *retval;
+ memory_map *src = source;
+ size_t remaining = src->size - src->position;
+
+ if (remaining == 0) {
+ *bytes_read = 0;
+ *status = REACHED_EOF;
+ return NULL;
+ }
+
+ if (nbytes > remaining) {
+ nbytes = remaining;
+ }
+
+ retval = src->memmap + src->position;
+
+ /* advance position in mmap data structure */
+ src->position += nbytes;
+
+ *bytes_read = nbytes;
+ *status = 0;
+
+ return retval;
+}
+
+#else
+
+/* kludgy */
+
+void *new_mmap(char *fname) { return NULL; }
+
+int del_mmap(void *src) { return 0; }
+
+/* don't use this! */
+
+void *buffer_mmap_bytes(void *source, size_t nbytes, size_t *bytes_read,
+ int *status) {
+ return NULL;
+}
+
+#endif
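new_file_source, new_rd_source and new_mmap each pair with a buffer_*_bytes reader that shares one callback shape (this is what the tokenizer's cb_io slot calls): given a requested byte count, return a pointer to at most that many bytes, report the actual count through *bytes_read, and set *status to 0, REACHED_EOF or CALLING_READ_FAILED. A minimal in-memory source following the same shape; the struct and the status constants below are local stand-ins for illustration, not part of the vendored code:

    #include <stddef.h>
    #include <stdio.h>

    /* Stand-in status codes; the real ones come from tokenizer.h. */
    enum { DEMO_OK = 0, DEMO_REACHED_EOF = 1 };

    typedef struct {
        const char *data;
        size_t size;
        size_t position;
    } demo_source;

    /* Same shape as buffer_mmap_bytes: hand out a window into the data,
     * advance the position, and signal EOF when nothing is left. */
    static const void *buffer_demo_bytes(demo_source *src, size_t nbytes,
                                         size_t *bytes_read, int *status) {
        size_t remaining = src->size - src->position;
        if (remaining == 0) {
            *bytes_read = 0;
            *status = DEMO_REACHED_EOF;
            return NULL;
        }
        if (nbytes > remaining)
            nbytes = remaining;
        const void *retval = src->data + src->position;
        src->position += nbytes;
        *bytes_read = nbytes;
        *status = DEMO_OK;
        return retval;
    }

    int main(void) {
        demo_source src = {"abcdef", 6, 0};
        size_t nread;
        int status;
        while (buffer_demo_bytes(&src, 4, &nread, &status) != NULL)
            printf("read %zu bytes, status=%d\n", nread, status);
        printf("final status=%d (EOF)\n", status);
        return 0;
    }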
diff --git a/contrib/python/pandas/py2/pandas/_libs/src/parser/io.h b/contrib/python/pandas/py2/pandas/_libs/src/parser/io.h
new file mode 100644
index 00000000000..d22e8ddaea8
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/_libs/src/parser/io.h
@@ -0,0 +1,70 @@
+/*
+Copyright (c) 2016, PyData Development Team
+All rights reserved.
+
+Distributed under the terms of the BSD Simplified License.
+
+The full license is in the LICENSE file, distributed with this software.
+*/
+
+#ifndef PANDAS__LIBS_SRC_PARSER_IO_H_
+#define PANDAS__LIBS_SRC_PARSER_IO_H_
+
+#include "Python.h"
+#include "tokenizer.h"
+
+typedef struct _file_source {
+ /* The file being read. */
+ int fd;
+
+ char *buffer;
+ size_t size;
+} file_source;
+
+#define FS(source) ((file_source *)source)
+
+#if !defined(_WIN32) && !defined(HAVE_MMAP)
+#define HAVE_MMAP
+#endif
+
+typedef struct _memory_map {
+    int fd;
+
+    char *memmap;
+
+    /* Size of the file, in bytes. */
+    size_t size;
+
+    size_t position;
+} memory_map;
+
+#define MM(src) ((memory_map *)src)
+
+void *new_mmap(char *fname);
+
+int del_mmap(void *src);
+
+void *buffer_mmap_bytes(void *source, size_t nbytes, size_t *bytes_read,
+ int *status);
+
+typedef struct _rd_source {
+ PyObject *obj;
+ PyObject *buffer;
+ size_t position;
+} rd_source;
+
+#define RDS(source) ((rd_source *)source)
+
+void *new_file_source(char *fname, size_t buffer_size);
+
+void *new_rd_source(PyObject *obj);
+
+int del_file_source(void *src);
+int del_rd_source(void *src);
+
+void *buffer_file_bytes(void *source, size_t nbytes, size_t *bytes_read,
+ int *status);
+
+void *buffer_rd_bytes(void *source, size_t nbytes, size_t *bytes_read,
+ int *status);
+
+#endif // PANDAS__LIBS_SRC_PARSER_IO_H_
diff --git a/contrib/python/pandas/py2/pandas/_libs/src/parser/tokenizer.c b/contrib/python/pandas/py2/pandas/_libs/src/parser/tokenizer.c
new file mode 100644
index 00000000000..a86af7c5416
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/_libs/src/parser/tokenizer.c
@@ -0,0 +1,2033 @@
+/*
+
+Copyright (c) 2012, Lambda Foundry, Inc., except where noted
+
+Incorporates components of WarrenWeckesser/textreader, licensed under 3-clause
+BSD
+
+See LICENSE for the license
+
+*/
+
+/*
+
+Low-level ascii-file processing for pandas. Combines some elements from
+Python's built-in csv module and Warren Weckesser's textreader project on
+GitHub. See Python Software Foundation License and BSD licenses for these.
+
+*/
+
+#include "tokenizer.h"
+
+#include <ctype.h>
+#include <float.h>
+#include <math.h>
+
+#include "../headers/portable.h"
+
+static void *safe_realloc(void *buffer, size_t size) {
+ void *result;
+ // OSX is weird.
+ // http://stackoverflow.com/questions/9560609/
+ // different-realloc-behaviour-in-linux-and-osx
+
+ result = realloc(buffer, size);
+ TRACE(("safe_realloc: buffer = %p, size = %zu, result = %p\n", buffer, size,
+ result))
+
+ return result;
+}
+
+void coliter_setup(coliter_t *self, parser_t *parser, int i, int start) {
+ // column i, starting at 0
+ self->words = parser->words;
+ self->col = i;
+ self->line_start = parser->line_start + start;
+}
+
+coliter_t *coliter_new(parser_t *self, int i) {
+ // column i, starting at 0
+ coliter_t *iter = (coliter_t *)malloc(sizeof(coliter_t));
+
+ if (NULL == iter) {
+ return NULL;
+ }
+
+ coliter_setup(iter, self, i, 0);
+ return iter;
+}
+
+static void free_if_not_null(void **ptr) {
+ TRACE(("free_if_not_null %p\n", *ptr))
+ if (*ptr != NULL) {
+ free(*ptr);
+ *ptr = NULL;
+ }
+}
+
+/*
+
+ Parser / tokenizer
+
+*/
+
+static void *grow_buffer(void *buffer, int64_t length, int64_t *capacity,
+ int64_t space, int64_t elsize, int *error) {
+ int64_t cap = *capacity;
+ void *newbuffer = buffer;
+
+ // Can we fit potentially nbytes tokens (+ null terminators) in the stream?
+ while ((length + space >= cap) && (newbuffer != NULL)) {
+ cap = cap ? cap << 1 : 2;
+ buffer = newbuffer;
+ newbuffer = safe_realloc(newbuffer, elsize * cap);
+ }
+
+ if (newbuffer == NULL) {
+ // realloc failed so don't change *capacity, set *error to errno
+ // and return the last good realloc'd buffer so it can be freed
+ *error = errno;
+ newbuffer = buffer;
+ } else {
+ // realloc worked, update *capacity and set *error to 0
+ // sigh, multiple return values
+ *capacity = cap;
+ *error = 0;
+ }
+ return newbuffer;
+}
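grow_buffer above keeps doubling the capacity (starting from 2) until length + space elements fit, reallocating as it grows, and on failure it reports errno through *error while handing back the last successfully allocated buffer so the caller can still free it. A tiny sketch of just the sizing rule, with illustrative names and no realloc:

    #include <stdint.h>
    #include <stdio.h>

    /* Mirrors grow_buffer's sizing rule: double (starting from 2)
     * until length + space elements fit.  Illustration only. */
    static int64_t next_capacity(int64_t cap, int64_t length, int64_t space) {
        while (length + space >= cap)
            cap = cap ? cap << 1 : 2;
        return cap;
    }

    int main(void) {
        printf("%lld\n", (long long)next_capacity(0, 5, 3));    /* 16 */
        printf("%lld\n", (long long)next_capacity(16, 10, 2));  /* 16 */
        return 0;
    }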
+
+void parser_set_default_options(parser_t *self) {
+ self->decimal = '.';
+ self->sci = 'E';
+
+ // For tokenization
+ self->state = START_RECORD;
+
+ self->delimiter = ','; // XXX
+ self->delim_whitespace = 0;
+
+ self->doublequote = 0;
+ self->quotechar = '"';
+ self->escapechar = 0;
+
+ self->lineterminator = '\0'; /* NUL->standard logic */
+
+ self->skipinitialspace = 0;
+ self->quoting = QUOTE_MINIMAL;
+ self->allow_embedded_newline = 1;
+ self->strict = 0;
+
+ self->expected_fields = -1;
+ self->error_bad_lines = 0;
+ self->warn_bad_lines = 0;
+
+ self->commentchar = '#';
+ self->thousands = '\0';
+
+ self->skipset = NULL;
+ self->skipfunc = NULL;
+ self->skip_first_N_rows = -1;
+ self->skip_footer = 0;
+}
+
+int get_parser_memory_footprint(parser_t *self) { return 0; }
+
+parser_t *parser_new() { return (parser_t *)calloc(1, sizeof(parser_t)); }
+
+int parser_clear_data_buffers(parser_t *self) {
+ free_if_not_null((void *)&self->stream);
+ free_if_not_null((void *)&self->words);
+ free_if_not_null((void *)&self->word_starts);
+ free_if_not_null((void *)&self->line_start);
+ free_if_not_null((void *)&self->line_fields);
+ return 0;
+}
+
+int parser_cleanup(parser_t *self) {
+ int status = 0;
+
+ // XXX where to put this
+ free_if_not_null((void *)&self->error_msg);
+ free_if_not_null((void *)&self->warn_msg);
+
+ if (self->skipset != NULL) {
+ kh_destroy_int64((kh_int64_t *)self->skipset);
+ self->skipset = NULL;
+ }
+
+ if (parser_clear_data_buffers(self) < 0) {
+ status = -1;
+ }
+
+ if (self->cb_cleanup != NULL) {
+ if (self->cb_cleanup(self->source) < 0) {
+ status = -1;
+ }
+ self->cb_cleanup = NULL;
+ }
+
+ return status;
+}
+
+int parser_init(parser_t *self) {
+ int64_t sz;
+
+ /*
+ Initialize data buffers
+ */
+
+ self->stream = NULL;
+ self->words = NULL;
+ self->word_starts = NULL;
+ self->line_start = NULL;
+ self->line_fields = NULL;
+ self->error_msg = NULL;
+ self->warn_msg = NULL;
+
+ // token stream
+ self->stream = (char *)malloc(STREAM_INIT_SIZE * sizeof(char));
+ if (self->stream == NULL) {
+ parser_cleanup(self);
+ return PARSER_OUT_OF_MEMORY;
+ }
+ self->stream_cap = STREAM_INIT_SIZE;
+ self->stream_len = 0;
+
+ // word pointers and metadata
+ sz = STREAM_INIT_SIZE / 10;
+ sz = sz ? sz : 1;
+ self->words = (char **)malloc(sz * sizeof(char *));
+ self->word_starts = (int64_t *)malloc(sz * sizeof(int64_t));
+ self->max_words_cap = sz;
+ self->words_cap = sz;
+ self->words_len = 0;
+
+ // line pointers and metadata
+ self->line_start = (int64_t *)malloc(sz * sizeof(int64_t));
+
+ self->line_fields = (int64_t *)malloc(sz * sizeof(int64_t));
+
+ self->lines_cap = sz;
+ self->lines = 0;
+ self->file_lines = 0;
+
+ if (self->stream == NULL || self->words == NULL ||
+ self->word_starts == NULL || self->line_start == NULL ||
+ self->line_fields == NULL) {
+ parser_cleanup(self);
+
+ return PARSER_OUT_OF_MEMORY;
+ }
+
+ /* amount of bytes buffered */
+ self->datalen = 0;
+ self->datapos = 0;
+
+ self->line_start[0] = 0;
+ self->line_fields[0] = 0;
+
+ self->pword_start = self->stream;
+ self->word_start = 0;
+
+ self->state = START_RECORD;
+
+ self->error_msg = NULL;
+ self->warn_msg = NULL;
+
+ self->commentchar = '\0';
+
+ return 0;
+}
+
+void parser_free(parser_t *self) {
+ // opposite of parser_init
+ parser_cleanup(self);
+}
+
+void parser_del(parser_t *self) {
+ free(self);
+}
+
+static int make_stream_space(parser_t *self, size_t nbytes) {
+ int64_t i, cap, length;
+ int status;
+ void *orig_ptr, *newptr;
+
+ // Can we fit potentially nbytes tokens (+ null terminators) in the stream?
+
+ /*
+ TOKEN STREAM
+ */
+
+ orig_ptr = (void *)self->stream;
+ TRACE(
+ ("\n\nmake_stream_space: nbytes = %zu. grow_buffer(self->stream...)\n",
+ nbytes))
+ self->stream = (char *)grow_buffer((void *)self->stream, self->stream_len,
+ (int64_t*)&self->stream_cap, nbytes * 2,
+ sizeof(char), &status);
+ TRACE(
+ ("make_stream_space: self->stream=%p, self->stream_len = %zu, "
+ "self->stream_cap=%zu, status=%zu\n",
+ self->stream, self->stream_len, self->stream_cap, status))
+
+ if (status != 0) {
+ return PARSER_OUT_OF_MEMORY;
+ }
+
+ // realloc sets errno when moving buffer?
+ if (self->stream != orig_ptr) {
+ self->pword_start = self->stream + self->word_start;
+
+ for (i = 0; i < self->words_len; ++i) {
+ self->words[i] = self->stream + self->word_starts[i];
+ }
+ }
+
+ /*
+ WORD VECTORS
+ */
+
+ cap = self->words_cap;
+
+ /**
+ * If we are reading in chunks, we need to be aware of the maximum number
+ * of words we have seen in previous chunks (self->max_words_cap), so
+ * that way, we can properly allocate when reading subsequent ones.
+ *
+ * Otherwise, we risk a buffer overflow if we mistakenly under-allocate
+ * just because a recent chunk did not have as many words.
+ */
+ if (self->words_len + nbytes < self->max_words_cap) {
+ length = self->max_words_cap - nbytes - 1;
+ } else {
+ length = self->words_len;
+ }
+
+ self->words =
+ (char **)grow_buffer((void *)self->words, length,
+ (int64_t*)&self->words_cap, nbytes,
+ sizeof(char *), &status);
+ TRACE(
+ ("make_stream_space: grow_buffer(self->self->words, %zu, %zu, %zu, "
+ "%d)\n",
+ self->words_len, self->words_cap, nbytes, status))
+ if (status != 0) {
+ return PARSER_OUT_OF_MEMORY;
+ }
+
+ // realloc took place
+ if (cap != self->words_cap) {
+ TRACE(
+ ("make_stream_space: cap != self->words_cap, nbytes = %d, "
+ "self->words_cap=%d\n",
+ nbytes, self->words_cap))
+ newptr = safe_realloc((void *)self->word_starts,
+ sizeof(int64_t) * self->words_cap);
+ if (newptr == NULL) {
+ return PARSER_OUT_OF_MEMORY;
+ } else {
+ self->word_starts = (int64_t *)newptr;
+ }
+ }
+
+ /*
+ LINE VECTORS
+ */
+ cap = self->lines_cap;
+ self->line_start =
+ (int64_t *)grow_buffer((void *)self->line_start, self->lines + 1,
+ (int64_t*)&self->lines_cap, nbytes,
+ sizeof(int64_t), &status);
+ TRACE((
+ "make_stream_space: grow_buffer(self->line_start, %zu, %zu, %zu, %d)\n",
+ self->lines + 1, self->lines_cap, nbytes, status))
+ if (status != 0) {
+ return PARSER_OUT_OF_MEMORY;
+ }
+
+ // realloc took place
+ if (cap != self->lines_cap) {
+ TRACE(("make_stream_space: cap != self->lines_cap, nbytes = %d\n",
+ nbytes))
+ newptr = safe_realloc((void *)self->line_fields,
+ sizeof(int64_t) * self->lines_cap);
+ if (newptr == NULL) {
+ return PARSER_OUT_OF_MEMORY;
+ } else {
+ self->line_fields = (int64_t *)newptr;
+ }
+ }
+
+ return 0;
+}
+
+static int push_char(parser_t *self, char c) {
+ TRACE(("push_char: self->stream[%zu] = %x, stream_cap=%zu\n",
+ self->stream_len + 1, c, self->stream_cap))
+ if (self->stream_len >= self->stream_cap) {
+ TRACE(
+ ("push_char: ERROR!!! self->stream_len(%d) >= "
+ "self->stream_cap(%d)\n",
+ self->stream_len, self->stream_cap))
+ int64_t bufsize = 100;
+ self->error_msg = (char *)malloc(bufsize);
+ snprintf(self->error_msg, bufsize,
+ "Buffer overflow caught - possible malformed input file.\n");
+ return PARSER_OUT_OF_MEMORY;
+ }
+ self->stream[self->stream_len++] = c;
+ return 0;
+}
+
+int PANDAS_INLINE end_field(parser_t *self) {
+ // XXX cruft
+ if (self->words_len >= self->words_cap) {
+ TRACE(
+ ("end_field: ERROR!!! self->words_len(%zu) >= "
+ "self->words_cap(%zu)\n",
+ self->words_len, self->words_cap))
+ int64_t bufsize = 100;
+ self->error_msg = (char *)malloc(bufsize);
+ snprintf(self->error_msg, bufsize,
+ "Buffer overflow caught - possible malformed input file.\n");
+ return PARSER_OUT_OF_MEMORY;
+ }
+
+ // null terminate token
+ push_char(self, '\0');
+
+ // set pointer and metadata
+ self->words[self->words_len] = self->pword_start;
+
+ TRACE(("end_field: Char diff: %d\n", self->pword_start - self->words[0]));
+
+ TRACE(("end_field: Saw word %s at: %d. Total: %d\n", self->pword_start,
+ self->word_start, self->words_len + 1))
+
+ self->word_starts[self->words_len] = self->word_start;
+ self->words_len++;
+
+ // increment line field count
+ self->line_fields[self->lines]++;
+
+    // New field begins in the stream
+ self->pword_start = self->stream + self->stream_len;
+ self->word_start = self->stream_len;
+
+ return 0;
+}
+
+static void append_warning(parser_t *self, const char *msg) {
+ int64_t ex_length;
+ int64_t length = strlen(msg);
+ void *newptr;
+
+ if (self->warn_msg == NULL) {
+ self->warn_msg = (char *)malloc(length + 1);
+ strncpy(self->warn_msg, msg, strlen(msg) + 1);
+ } else {
+ ex_length = strlen(self->warn_msg);
+ newptr = safe_realloc(self->warn_msg, ex_length + length + 1);
+ if (newptr != NULL) {
+ self->warn_msg = (char *)newptr;
+ strncpy(self->warn_msg + ex_length, msg, strlen(msg) + 1);
+ }
+ }
+}
+
+static int end_line(parser_t *self) {
+ char *msg;
+ int64_t fields;
+ int ex_fields = self->expected_fields;
+ int64_t bufsize = 100; // for error or warning messages
+
+ fields = self->line_fields[self->lines];
+
+ TRACE(("end_line: Line end, nfields: %d\n", fields));
+
+ TRACE(("end_line: lines: %d\n", self->lines));
+ if (self->lines > 0) {
+ if (self->expected_fields >= 0) {
+ ex_fields = self->expected_fields;
+ } else {
+ ex_fields = self->line_fields[self->lines - 1];
+ }
+ }
+ TRACE(("end_line: ex_fields: %d\n", ex_fields));
+
+ if (self->state == START_FIELD_IN_SKIP_LINE ||
+ self->state == IN_FIELD_IN_SKIP_LINE ||
+ self->state == IN_QUOTED_FIELD_IN_SKIP_LINE ||
+ self->state == QUOTE_IN_QUOTED_FIELD_IN_SKIP_LINE) {
+ TRACE(("end_line: Skipping row %d\n", self->file_lines));
+ // increment file line count
+ self->file_lines++;
+
+ // skip the tokens from this bad line
+ self->line_start[self->lines] += fields;
+
+ // reset field count
+ self->line_fields[self->lines] = 0;
+ return 0;
+ }
+
+ if (!(self->lines <= (int64_t) self->header_end + 1) &&
+ (self->expected_fields < 0 && fields > ex_fields) && !(self->usecols)) {
+ // increment file line count
+ self->file_lines++;
+
+ // skip the tokens from this bad line
+ self->line_start[self->lines] += fields;
+
+ // reset field count
+ self->line_fields[self->lines] = 0;
+
+ // file_lines is now the actual file line number (starting at 1)
+ if (self->error_bad_lines) {
+ self->error_msg = (char *)malloc(bufsize);
+ snprintf(self->error_msg, bufsize,
+ "Expected %d fields in line %lld, saw %lld\n",
+ ex_fields, (long long)self->file_lines, (long long)fields);
+
+ TRACE(("Error at line %d, %d fields\n", self->file_lines, fields));
+
+ return -1;
+ } else {
+ // simply skip bad lines
+ if (self->warn_bad_lines) {
+ // pass up error message
+ msg = (char *)malloc(bufsize);
+ snprintf(msg, bufsize,
+ "Skipping line %lld: expected %d fields, saw %lld\n",
+ (long long)self->file_lines, ex_fields,
+ (long long)fields);
+ append_warning(self, msg);
+ free(msg);
+ }
+ }
+ } else {
+ // missing trailing delimiters
+ if ((self->lines >= (int64_t) self->header_end + 1) &&
+ fields < ex_fields) {
+ // might overrun the buffer when closing fields
+ if (make_stream_space(self, ex_fields - fields) < 0) {
+ int64_t bufsize = 100;
+ self->error_msg = (char *)malloc(bufsize);
+ snprintf(self->error_msg, bufsize, "out of memory");
+ return -1;
+ }
+
+ while (fields < ex_fields) {
+ end_field(self);
+ fields++;
+ }
+ }
+
+ // increment both line counts
+ self->file_lines++;
+ self->lines++;
+
+ // good line, set new start point
+ if (self->lines >= self->lines_cap) {
+ TRACE((
+ "end_line: ERROR!!! self->lines(%zu) >= self->lines_cap(%zu)\n",
+ self->lines, self->lines_cap))
+ int64_t bufsize = 100;
+ self->error_msg = (char *)malloc(bufsize);
+ snprintf(self->error_msg, bufsize,
+ "Buffer overflow caught - "
+ "possible malformed input file.\n");
+ return PARSER_OUT_OF_MEMORY;
+ }
+ self->line_start[self->lines] =
+ (self->line_start[self->lines - 1] + fields);
+
+ TRACE(
+ ("end_line: new line start: %d\n", self->line_start[self->lines]));
+
+ // new line start with 0 fields
+ self->line_fields[self->lines] = 0;
+ }
+
+ TRACE(("end_line: Finished line, at %d\n", self->lines));
+
+ return 0;
+}
+
+int parser_add_skiprow(parser_t *self, int64_t row) {
+ khiter_t k;
+ kh_int64_t *set;
+ int ret = 0;
+
+ if (self->skipset == NULL) {
+ self->skipset = (void *)kh_init_int64();
+ }
+
+ set = (kh_int64_t *)self->skipset;
+
+ k = kh_put_int64(set, row, &ret);
+ set->keys[k] = row;
+
+ return 0;
+}
+
+int parser_set_skipfirstnrows(parser_t *self, int64_t nrows) {
+ // self->file_lines is zero based so subtract 1 from nrows
+ if (nrows > 0) {
+ self->skip_first_N_rows = nrows - 1;
+ }
+
+ return 0;
+}
+
+static int parser_buffer_bytes(parser_t *self, size_t nbytes) {
+ int status;
+ size_t bytes_read;
+
+ status = 0;
+ self->datapos = 0;
+ self->data = self->cb_io(self->source, nbytes, &bytes_read, &status);
+ TRACE((
+ "parser_buffer_bytes self->cb_io: nbytes=%zu, datalen: %d, status=%d\n",
+ nbytes, bytes_read, status));
+ self->datalen = bytes_read;
+
+ if (status != REACHED_EOF && self->data == NULL) {
+ int64_t bufsize = 200;
+ self->error_msg = (char *)malloc(bufsize);
+
+ if (status == CALLING_READ_FAILED) {
+ snprintf(self->error_msg, bufsize,
+ "Calling read(nbytes) on source failed. "
+ "Try engine='python'.");
+ } else {
+ snprintf(self->error_msg, bufsize, "Unknown error in IO callback");
+ }
+ return -1;
+ }
+
+ TRACE(("datalen: %d\n", self->datalen));
+
+ return status;
+}
+
+/*
+
+ Tokenization macros and state machine code
+
+*/
+
+#define PUSH_CHAR(c) \
+ TRACE( \
+ ("PUSH_CHAR: Pushing %c, slen= %d, stream_cap=%zu, stream_len=%zu\n", \
+ c, slen, self->stream_cap, self->stream_len)) \
+ if (slen >= self->stream_cap) { \
+ TRACE(("PUSH_CHAR: ERROR!!! slen(%d) >= stream_cap(%d)\n", slen, \
+ self->stream_cap)) \
+ int64_t bufsize = 100; \
+ self->error_msg = (char *)malloc(bufsize); \
+ snprintf(self->error_msg, bufsize, \
+ "Buffer overflow caught - possible malformed input file.\n");\
+ return PARSER_OUT_OF_MEMORY; \
+ } \
+ *stream++ = c; \
+ slen++;
+
+// This is a little bit of a hack but works for now
+
+#define END_FIELD() \
+ self->stream_len = slen; \
+ if (end_field(self) < 0) { \
+ goto parsingerror; \
+ } \
+ stream = self->stream + self->stream_len; \
+ slen = self->stream_len;
+
+#define END_LINE_STATE(STATE) \
+ self->stream_len = slen; \
+ if (end_line(self) < 0) { \
+ goto parsingerror; \
+ } \
+ stream = self->stream + self->stream_len; \
+ slen = self->stream_len; \
+ self->state = STATE; \
+ if (line_limit > 0 && self->lines == start_lines + (int64_t)line_limit) { \
+ goto linelimit; \
+ }
+
+#define END_LINE_AND_FIELD_STATE(STATE) \
+ self->stream_len = slen; \
+ if (end_line(self) < 0) { \
+ goto parsingerror; \
+ } \
+ if (end_field(self) < 0) { \
+ goto parsingerror; \
+ } \
+ stream = self->stream + self->stream_len; \
+ slen = self->stream_len; \
+ self->state = STATE; \
+ if (line_limit > 0 && self->lines == start_lines + (int64_t)line_limit) { \
+ goto linelimit; \
+ }
+
+#define END_LINE() END_LINE_STATE(START_RECORD)
+
+#define IS_WHITESPACE(c) ((c == ' ' || c == '\t'))
+
+#define IS_TERMINATOR(c) \
+ ((self->lineterminator == '\0' && c == '\n') || \
+ (self->lineterminator != '\0' && c == self->lineterminator))
+
+#define IS_QUOTE(c) ((c == self->quotechar && self->quoting != QUOTE_NONE))
+
+// don't parse '\r' with a custom line terminator
+#define IS_CARRIAGE(c) ((self->lineterminator == '\0' && c == '\r'))
+
+#define IS_COMMENT_CHAR(c) \
+ ((self->commentchar != '\0' && c == self->commentchar))
+
+#define IS_ESCAPE_CHAR(c) ((self->escapechar != '\0' && c == self->escapechar))
+
+#define IS_SKIPPABLE_SPACE(c) \
+ ((!self->delim_whitespace && c == ' ' && self->skipinitialspace))
+
+// applied when in a field
+#define IS_DELIMITER(c) \
+ ((!self->delim_whitespace && c == self->delimiter) || \
+ (self->delim_whitespace && IS_WHITESPACE(c)))
+
+#define _TOKEN_CLEANUP() \
+ self->stream_len = slen; \
+ self->datapos = i; \
+ TRACE(("_TOKEN_CLEANUP: datapos: %d, datalen: %d\n", self->datapos, \
+ self->datalen));
+
+#define CHECK_FOR_BOM() \
+ if (*buf == '\xef' && *(buf + 1) == '\xbb' && *(buf + 2) == '\xbf') { \
+ buf += 3; \
+ self->datapos += 3; \
+ }
+
+int skip_this_line(parser_t *self, int64_t rownum) {
+ int should_skip;
+ PyObject *result;
+ PyGILState_STATE state;
+
+ if (self->skipfunc != NULL) {
+ state = PyGILState_Ensure();
+ result = PyObject_CallFunction(self->skipfunc, "i", rownum);
+
+ // Error occurred. It will be processed
+ // and caught at the Cython level.
+ if (result == NULL) {
+ should_skip = -1;
+ } else {
+ should_skip = PyObject_IsTrue(result);
+ }
+
+ Py_XDECREF(result);
+ PyGILState_Release(state);
+
+ return should_skip;
+ } else if (self->skipset != NULL) {
+ return (kh_get_int64((kh_int64_t *)self->skipset, self->file_lines) !=
+ ((kh_int64_t *)self->skipset)->n_buckets);
+ } else {
+ return (rownum <= self->skip_first_N_rows);
+ }
+}
+
+int tokenize_bytes(parser_t *self, size_t line_limit, int64_t start_lines) {
+ int64_t i, slen;
+ int should_skip;
+ char c;
+ char *stream;
+ char *buf = self->data + self->datapos;
+
+ if (make_stream_space(self, self->datalen - self->datapos) < 0) {
+ int64_t bufsize = 100;
+ self->error_msg = (char *)malloc(bufsize);
+ snprintf(self->error_msg, bufsize, "out of memory");
+ return -1;
+ }
+
+ stream = self->stream + self->stream_len;
+ slen = self->stream_len;
+
+ TRACE(("%s\n", buf));
+
+ if (self->file_lines == 0) {
+ CHECK_FOR_BOM();
+ }
+
+ for (i = self->datapos; i < self->datalen; ++i) {
+ // next character in file
+ c = *buf++;
+
+ TRACE(
+ ("tokenize_bytes - Iter: %d Char: 0x%x Line %d field_count %d, "
+ "state %d\n",
+ i, c, self->file_lines + 1, self->line_fields[self->lines],
+ self->state));
+
+ switch (self->state) {
+ case START_FIELD_IN_SKIP_LINE:
+ if (IS_TERMINATOR(c)) {
+ END_LINE();
+ } else if (IS_CARRIAGE(c)) {
+ self->file_lines++;
+ self->state = EAT_CRNL_NOP;
+ } else if (IS_QUOTE(c)) {
+ self->state = IN_QUOTED_FIELD_IN_SKIP_LINE;
+ } else if (IS_DELIMITER(c)) {
+ // Do nothing, we're starting a new field again.
+ } else {
+ self->state = IN_FIELD_IN_SKIP_LINE;
+ }
+ break;
+
+ case IN_FIELD_IN_SKIP_LINE:
+ if (IS_TERMINATOR(c)) {
+ END_LINE();
+ } else if (IS_CARRIAGE(c)) {
+ self->file_lines++;
+ self->state = EAT_CRNL_NOP;
+ } else if (IS_DELIMITER(c)) {
+ self->state = START_FIELD_IN_SKIP_LINE;
+ }
+ break;
+
+ case IN_QUOTED_FIELD_IN_SKIP_LINE:
+ if (IS_QUOTE(c)) {
+ if (self->doublequote) {
+ self->state = QUOTE_IN_QUOTED_FIELD_IN_SKIP_LINE;
+ } else {
+ self->state = IN_FIELD_IN_SKIP_LINE;
+ }
+ }
+ break;
+
+ case QUOTE_IN_QUOTED_FIELD_IN_SKIP_LINE:
+ if (IS_QUOTE(c)) {
+ self->state = IN_QUOTED_FIELD_IN_SKIP_LINE;
+ } else if (IS_TERMINATOR(c)) {
+ END_LINE();
+ } else if (IS_CARRIAGE(c)) {
+ self->file_lines++;
+ self->state = EAT_CRNL_NOP;
+ } else if (IS_DELIMITER(c)) {
+ self->state = START_FIELD_IN_SKIP_LINE;
+ } else {
+ self->state = IN_FIELD_IN_SKIP_LINE;
+ }
+ break;
+
+ case WHITESPACE_LINE:
+ if (IS_TERMINATOR(c)) {
+ self->file_lines++;
+ self->state = START_RECORD;
+ break;
+ } else if (IS_CARRIAGE(c)) {
+ self->file_lines++;
+ self->state = EAT_CRNL_NOP;
+ break;
+ } else if (!self->delim_whitespace) {
+ if (IS_WHITESPACE(c) && c != self->delimiter) {
+ } else { // backtrack
+ // use i + 1 because buf has been incremented but not i
+ do {
+ --buf;
+ --i;
+ } while (i + 1 > self->datapos && !IS_TERMINATOR(*buf));
+
+ // reached a newline rather than the beginning
+ if (IS_TERMINATOR(*buf)) {
+ ++buf; // move pointer to first char after newline
+ ++i;
+ }
+ self->state = START_FIELD;
+ }
+ break;
+ }
+ // fall through
+
+ case EAT_WHITESPACE:
+ if (IS_TERMINATOR(c)) {
+ END_LINE();
+ self->state = START_RECORD;
+ break;
+ } else if (IS_CARRIAGE(c)) {
+ self->state = EAT_CRNL;
+ break;
+ } else if (IS_COMMENT_CHAR(c)) {
+ self->state = EAT_COMMENT;
+ break;
+ } else if (!IS_WHITESPACE(c)) {
+ self->state = START_FIELD;
+ // fall through to subsequent state
+ } else {
+ // if whitespace char, keep slurping
+ break;
+ }
+
+ case START_RECORD:
+ // start of record
+ should_skip = skip_this_line(self, self->file_lines);
+
+ if (should_skip == -1) {
+ goto parsingerror;
+ } else if (should_skip) {
+ if (IS_QUOTE(c)) {
+ self->state = IN_QUOTED_FIELD_IN_SKIP_LINE;
+ } else {
+ self->state = IN_FIELD_IN_SKIP_LINE;
+
+ if (IS_TERMINATOR(c)) {
+ END_LINE();
+ }
+ }
+ break;
+ } else if (IS_TERMINATOR(c)) {
+ // \n\r possible?
+ if (self->skip_empty_lines) {
+ self->file_lines++;
+ } else {
+ END_LINE();
+ }
+ break;
+ } else if (IS_CARRIAGE(c)) {
+ if (self->skip_empty_lines) {
+ self->file_lines++;
+ self->state = EAT_CRNL_NOP;
+ } else {
+ self->state = EAT_CRNL;
+ }
+ break;
+ } else if (IS_COMMENT_CHAR(c)) {
+ self->state = EAT_LINE_COMMENT;
+ break;
+ } else if (IS_WHITESPACE(c)) {
+ if (self->delim_whitespace) {
+ if (self->skip_empty_lines) {
+ self->state = WHITESPACE_LINE;
+ } else {
+ self->state = EAT_WHITESPACE;
+ }
+ break;
+ } else if (c != self->delimiter && self->skip_empty_lines) {
+ self->state = WHITESPACE_LINE;
+ break;
+ }
+ // fall through
+ }
+
+ // normal character - fall through
+ // to handle as START_FIELD
+ self->state = START_FIELD;
+
+ case START_FIELD:
+ // expecting field
+ if (IS_TERMINATOR(c)) {
+ END_FIELD();
+ END_LINE();
+ } else if (IS_CARRIAGE(c)) {
+ END_FIELD();
+ self->state = EAT_CRNL;
+ } else if (IS_QUOTE(c)) {
+ // start quoted field
+ self->state = IN_QUOTED_FIELD;
+ } else if (IS_ESCAPE_CHAR(c)) {
+ // possible escaped character
+ self->state = ESCAPED_CHAR;
+ } else if (IS_SKIPPABLE_SPACE(c)) {
+ // ignore space at start of field
+ } else if (IS_DELIMITER(c)) {
+ if (self->delim_whitespace) {
+ self->state = EAT_WHITESPACE;
+ } else {
+ // save empty field
+ END_FIELD();
+ }
+ } else if (IS_COMMENT_CHAR(c)) {
+ END_FIELD();
+ self->state = EAT_COMMENT;
+ } else {
+ // begin new unquoted field
+ PUSH_CHAR(c);
+ self->state = IN_FIELD;
+ }
+ break;
+
+ case ESCAPED_CHAR:
+ PUSH_CHAR(c);
+ self->state = IN_FIELD;
+ break;
+
+ case EAT_LINE_COMMENT:
+ if (IS_TERMINATOR(c)) {
+ self->file_lines++;
+ self->state = START_RECORD;
+ } else if (IS_CARRIAGE(c)) {
+ self->file_lines++;
+ self->state = EAT_CRNL_NOP;
+ }
+ break;
+
+ case IN_FIELD:
+ // in unquoted field
+ if (IS_TERMINATOR(c)) {
+ END_FIELD();
+ END_LINE();
+ } else if (IS_CARRIAGE(c)) {
+ END_FIELD();
+ self->state = EAT_CRNL;
+ } else if (IS_ESCAPE_CHAR(c)) {
+ // possible escaped character
+ self->state = ESCAPED_CHAR;
+ } else if (IS_DELIMITER(c)) {
+ // end of field - end of line not reached yet
+ END_FIELD();
+
+ if (self->delim_whitespace) {
+ self->state = EAT_WHITESPACE;
+ } else {
+ self->state = START_FIELD;
+ }
+ } else if (IS_COMMENT_CHAR(c)) {
+ END_FIELD();
+ self->state = EAT_COMMENT;
+ } else {
+ // normal character - save in field
+ PUSH_CHAR(c);
+ }
+ break;
+
+ case IN_QUOTED_FIELD:
+ // in quoted field
+ if (IS_ESCAPE_CHAR(c)) {
+ // possible escape character
+ self->state = ESCAPE_IN_QUOTED_FIELD;
+ } else if (IS_QUOTE(c)) {
+ if (self->doublequote) {
+ // double quote - " represented by ""
+ self->state = QUOTE_IN_QUOTED_FIELD;
+ } else {
+ // end of quote part of field
+ self->state = IN_FIELD;
+ }
+ } else {
+ // normal character - save in field
+ PUSH_CHAR(c);
+ }
+ break;
+
+ case ESCAPE_IN_QUOTED_FIELD:
+ PUSH_CHAR(c);
+ self->state = IN_QUOTED_FIELD;
+ break;
+
+ case QUOTE_IN_QUOTED_FIELD:
+                // double quote - seen a quote in a quoted field
+ if (IS_QUOTE(c)) {
+ // save "" as "
+
+ PUSH_CHAR(c);
+ self->state = IN_QUOTED_FIELD;
+ } else if (IS_DELIMITER(c)) {
+ // end of field - end of line not reached yet
+ END_FIELD();
+
+ if (self->delim_whitespace) {
+ self->state = EAT_WHITESPACE;
+ } else {
+ self->state = START_FIELD;
+ }
+ } else if (IS_TERMINATOR(c)) {
+ END_FIELD();
+ END_LINE();
+ } else if (IS_CARRIAGE(c)) {
+ END_FIELD();
+ self->state = EAT_CRNL;
+ } else if (!self->strict) {
+ PUSH_CHAR(c);
+ self->state = IN_FIELD;
+ } else {
+ int64_t bufsize = 100;
+ self->error_msg = (char *)malloc(bufsize);
+ snprintf(self->error_msg, bufsize,
+ "delimiter expected after quote in quote");
+ goto parsingerror;
+ }
+ break;
+
+ case EAT_COMMENT:
+ if (IS_TERMINATOR(c)) {
+ END_LINE();
+ } else if (IS_CARRIAGE(c)) {
+ self->state = EAT_CRNL;
+ }
+ break;
+
+ // only occurs with non-custom line terminator,
+ // which is why we directly check for '\n'
+ case EAT_CRNL:
+ if (c == '\n') {
+ END_LINE();
+ } else if (IS_DELIMITER(c)) {
+ if (self->delim_whitespace) {
+ END_LINE_STATE(EAT_WHITESPACE);
+ } else {
+ // Handle \r-delimited files
+ END_LINE_AND_FIELD_STATE(START_FIELD);
+ }
+ } else {
+ if (self->delim_whitespace) {
+ /* XXX
+ * first character of a new record--need to back up and
+                 * reread to handle properly...
+ */
+ i--;
+ buf--; // back up one character (HACK!)
+ END_LINE_STATE(START_RECORD);
+ } else {
+ // \r line terminator
+ // UGH. we don't actually want
+ // to consume the token. fix this later
+ self->stream_len = slen;
+ if (end_line(self) < 0) {
+ goto parsingerror;
+ }
+
+ stream = self->stream + self->stream_len;
+ slen = self->stream_len;
+ self->state = START_RECORD;
+
+ --i;
+ buf--; // let's try this character again (HACK!)
+ if (line_limit > 0 &&
+ self->lines == start_lines + line_limit) {
+ goto linelimit;
+ }
+ }
+ }
+ break;
+
+ // only occurs with non-custom line terminator,
+ // which is why we directly check for '\n'
+ case EAT_CRNL_NOP: // inside an ignored comment line
+ self->state = START_RECORD;
+ // \r line terminator -- parse this character again
+ if (c != '\n' && !IS_DELIMITER(c)) {
+ --i;
+ --buf;
+ }
+ break;
+ default:
+ break;
+ }
+ }
+
+ _TOKEN_CLEANUP();
+
+ TRACE(("Finished tokenizing input\n"))
+
+ return 0;
+
+parsingerror:
+ i++;
+ _TOKEN_CLEANUP();
+
+ return -1;
+
+linelimit:
+ i++;
+ _TOKEN_CLEANUP();
+
+ return 0;
+}
+
+static int parser_handle_eof(parser_t *self) {
+ int64_t bufsize = 100;
+
+ TRACE(
+ ("handling eof, datalen: %d, pstate: %d\n", self->datalen, self->state))
+
+ if (self->datalen != 0) return -1;
+
+ switch (self->state) {
+ case START_RECORD:
+ case WHITESPACE_LINE:
+ case EAT_CRNL_NOP:
+ case EAT_LINE_COMMENT:
+ return 0;
+
+ case ESCAPE_IN_QUOTED_FIELD:
+ case IN_QUOTED_FIELD:
+ self->error_msg = (char *)malloc(bufsize);
+ snprintf(self->error_msg, bufsize,
+ "EOF inside string starting at row %lld",
+ (long long)self->file_lines);
+ return -1;
+
+ case ESCAPED_CHAR:
+ self->error_msg = (char *)malloc(bufsize);
+ snprintf(self->error_msg, bufsize,
+ "EOF following escape character");
+ return -1;
+
+ case IN_FIELD:
+ case START_FIELD:
+ case QUOTE_IN_QUOTED_FIELD:
+ if (end_field(self) < 0) return -1;
+ break;
+
+ default:
+ break;
+ }
+
+ if (end_line(self) < 0)
+ return -1;
+ else
+ return 0;
+}
+
+int parser_consume_rows(parser_t *self, size_t nrows) {
+ int64_t i, offset, word_deletions, char_count;
+
+ if (nrows > self->lines) {
+ nrows = self->lines;
+ }
+
+ /* do nothing */
+ if (nrows == 0) return 0;
+
+ /* cannot guarantee that nrows + 1 has been observed */
+ word_deletions = self->line_start[nrows - 1] + self->line_fields[nrows - 1];
+ char_count = (self->word_starts[word_deletions - 1] +
+ strlen(self->words[word_deletions - 1]) + 1);
+
+ TRACE(("parser_consume_rows: Deleting %d words, %d chars\n", word_deletions,
+ char_count));
+
+ /* move stream, only if something to move */
+ if (char_count < self->stream_len) {
+ memmove((void *)self->stream, (void *)(self->stream + char_count),
+ self->stream_len - char_count);
+ }
+ /* buffer counts */
+ self->stream_len -= char_count;
+
+ /* move token metadata */
+ for (i = 0; i < self->words_len - word_deletions; ++i) {
+ offset = i + word_deletions;
+
+ self->words[i] = self->words[offset] - char_count;
+ self->word_starts[i] = self->word_starts[offset] - char_count;
+ }
+ self->words_len -= word_deletions;
+
+ /* move current word pointer to stream */
+ self->pword_start -= char_count;
+ self->word_start -= char_count;
+
+ /* move line metadata */
+ for (i = 0; i < self->lines - nrows + 1; ++i) {
+ offset = i + nrows;
+ self->line_start[i] = self->line_start[offset] - word_deletions;
+ self->line_fields[i] = self->line_fields[offset];
+ }
+ self->lines -= nrows;
+
+ return 0;
+}
+
+static size_t _next_pow2(size_t sz) {
+ size_t result = 1;
+ while (result < sz) result *= 2;
+ return result;
+}
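
_next_pow2 rounds a length up to the next power of two so the trimmed capacities computed below stay realloc-friendly. A few illustrative values (informal sketch, not part of the vendored file):

    _next_pow2(0);     /* -> 1    */
    _next_pow2(1000);  /* -> 1024 */
    _next_pow2(1024);  /* -> 1024 */
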
+
+int parser_trim_buffers(parser_t *self) {
+ /*
+ Free memory
+ */
+ size_t new_cap;
+ void *newptr;
+
+ int64_t i;
+
+ /**
+ * Before we free up space and trim, we should
+ * save how many words we saw when parsing, if
+ * it exceeds the maximum number we saw before.
+ *
+ * This is important for when we read in chunks,
+ * so that we can inform subsequent chunk parsing
+ * as to how many words we could possibly see.
+ */
+ if (self->words_cap > self->max_words_cap) {
+ self->max_words_cap = self->words_cap;
+ }
+
+ /* trim words, word_starts */
+ new_cap = _next_pow2(self->words_len) + 1;
+ if (new_cap < self->words_cap) {
+ TRACE(("parser_trim_buffers: new_cap < self->words_cap\n"));
+ newptr = safe_realloc((void *)self->words, new_cap * sizeof(char *));
+ if (newptr == NULL) {
+ return PARSER_OUT_OF_MEMORY;
+ } else {
+ self->words = (char **)newptr;
+ }
+ newptr = safe_realloc((void *)self->word_starts,
+ new_cap * sizeof(int64_t));
+ if (newptr == NULL) {
+ return PARSER_OUT_OF_MEMORY;
+ } else {
+ self->word_starts = (int64_t *)newptr;
+ self->words_cap = new_cap;
+ }
+ }
+
+ /* trim stream */
+ new_cap = _next_pow2(self->stream_len) + 1;
+ TRACE(
+ ("parser_trim_buffers: new_cap = %zu, stream_cap = %zu, lines_cap = "
+ "%zu\n",
+ new_cap, self->stream_cap, self->lines_cap));
+ if (new_cap < self->stream_cap) {
+ TRACE(
+ ("parser_trim_buffers: new_cap < self->stream_cap, calling "
+ "safe_realloc\n"));
+ newptr = safe_realloc((void *)self->stream, new_cap);
+ if (newptr == NULL) {
+ return PARSER_OUT_OF_MEMORY;
+ } else {
+            // Update the pointers in the self->words array (char **) if
+            // `safe_realloc` moved the `self->stream` buffer. This block
+            // mirrors a similar block in `make_stream_space`.
+ if (self->stream != newptr) {
+ self->pword_start = (char *)newptr + self->word_start;
+
+ for (i = 0; i < self->words_len; ++i) {
+ self->words[i] = (char *)newptr + self->word_starts[i];
+ }
+ }
+
+ self->stream = newptr;
+ self->stream_cap = new_cap;
+ }
+ }
+
+ /* trim line_start, line_fields */
+ new_cap = _next_pow2(self->lines) + 1;
+ if (new_cap < self->lines_cap) {
+ TRACE(("parser_trim_buffers: new_cap < self->lines_cap\n"));
+ newptr = safe_realloc((void *)self->line_start,
+ new_cap * sizeof(int64_t));
+ if (newptr == NULL) {
+ return PARSER_OUT_OF_MEMORY;
+ } else {
+ self->line_start = (int64_t *)newptr;
+ }
+ newptr = safe_realloc((void *)self->line_fields,
+ new_cap * sizeof(int64_t));
+ if (newptr == NULL) {
+ return PARSER_OUT_OF_MEMORY;
+ } else {
+ self->line_fields = (int64_t *)newptr;
+ self->lines_cap = new_cap;
+ }
+ }
+
+ return 0;
+}
+
+/*
+ nrows : number of rows to tokenize (or until reach EOF)
+ all : tokenize all the data vs. certain number of rows
+ */
+
+int _tokenize_helper(parser_t *self, size_t nrows, int all) {
+ int status = 0;
+ int64_t start_lines = self->lines;
+
+ if (self->state == FINISHED) {
+ return 0;
+ }
+
+ TRACE((
+ "_tokenize_helper: Asked to tokenize %d rows, datapos=%d, datalen=%d\n",
+ nrows, self->datapos, self->datalen));
+
+ while (1) {
+ if (!all && self->lines - start_lines >= nrows) break;
+
+ if (self->datapos == self->datalen) {
+ status = parser_buffer_bytes(self, self->chunksize);
+
+ if (status == REACHED_EOF) {
+ // close out last line
+ status = parser_handle_eof(self);
+ self->state = FINISHED;
+ break;
+ } else if (status != 0) {
+ return status;
+ }
+ }
+
+ TRACE(
+ ("_tokenize_helper: Trying to process %d bytes, datalen=%d, "
+ "datapos= %d\n",
+ self->datalen - self->datapos, self->datalen, self->datapos));
+
+ status = tokenize_bytes(self, nrows, start_lines);
+
+ if (status < 0) {
+ // XXX
+ TRACE(
+ ("_tokenize_helper: Status %d returned from tokenize_bytes, "
+ "breaking\n",
+ status));
+ status = -1;
+ break;
+ }
+ }
+ TRACE(("leaving tokenize_helper\n"));
+ return status;
+}
+
+int tokenize_nrows(parser_t *self, size_t nrows) {
+ int status = _tokenize_helper(self, nrows, 0);
+ return status;
+}
+
+int tokenize_all_rows(parser_t *self) {
+ int status = _tokenize_helper(self, -1, 1);
+ return status;
+}
+
+PANDAS_INLINE void uppercase(char *p) {
+ for (; *p; ++p) *p = toupper_ascii(*p);
+}
+
+int PANDAS_INLINE to_longlong(char *item, long long *p_value) {
+ char *p_end;
+
+ // Try integer conversion. We explicitly give the base to be 10. If
+ // we used 0, strtoll() would convert '012' to 10, because the leading 0 in
+ // '012' signals an octal number in C. For a general purpose reader, that
+ // would be a bug, not a feature.
+ *p_value = strtoll(item, &p_end, 10);
+
+ // Allow trailing spaces.
+ while (isspace_ascii(*p_end)) ++p_end;
+
+ return (errno == 0) && (!*p_end);
+}
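
As the comment in to_longlong notes, conversion is always done in base 10, so a leading zero is not treated as octal. A minimal usage sketch (hypothetical caller; the function inspects errno afterwards, so it is cleared first here):

    long long value;

    errno = 0;
    if (to_longlong("012  ", &value)) {
        /* value == 12 (decimal, not octal 10); trailing spaces are accepted */
    }
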
+
+int to_boolean(const char *item, uint8_t *val) {
+ char *tmp;
+ int i, status = 0;
+ int bufsize = sizeof(char) * (strlen(item) + 1);
+
+ static const char *tstrs[1] = {"TRUE"};
+ static const char *fstrs[1] = {"FALSE"};
+
+ tmp = malloc(bufsize);
+ strncpy(tmp, item, bufsize);
+ uppercase(tmp);
+
+ for (i = 0; i < 1; ++i) {
+ if (strcmp(tmp, tstrs[i]) == 0) {
+ *val = 1;
+ goto done;
+ }
+ }
+
+ for (i = 0; i < 1; ++i) {
+ if (strcmp(tmp, fstrs[i]) == 0) {
+ *val = 0;
+ goto done;
+ }
+ }
+
+ status = -1;
+
+done:
+ free(tmp);
+ return status;
+}
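
to_boolean only recognises case-insensitive TRUE/FALSE spellings; anything else leaves *val untouched and signals failure. A small sketch (hypothetical caller):

    uint8_t flag = 0;

    to_boolean("True", &flag);    /* returns 0, flag == 1 */
    to_boolean("FALSE", &flag);   /* returns 0, flag == 0 */
    to_boolean("yes", &flag);     /* returns -1, flag unchanged */
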
+
+#ifdef TEST
+
+int main(int argc, char *argv[]) {
+ double x, y;
+ long long xi;
+ int status;
+ char *s;
+
+ s = "123,789";
+ status = to_longlong_thousands(s, &xi, ',');
+ printf("s = '%s'\n", s);
+ printf("status = %d\n", status);
+ printf("x = %d\n", (int)xi);
+
+ return 0;
+}
+#endif
+
+// ---------------------------------------------------------------------------
+// Implementation of xstrtod
+
+//
+// strtod.c
+//
+// Convert string to double
+//
+// Copyright (C) 2002 Michael Ringgaard. All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions
+// are met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+// 3. Neither the name of the project nor the names of its contributors
+// may be used to endorse or promote products derived from this software
+// without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+// POSSIBILITY OF SUCH DAMAGE.
+//
+// -----------------------------------------------------------------------
+// Modifications by Warren Weckesser, March 2011:
+// * Rename strtod() to xstrtod().
+// * Added decimal and sci arguments.
+// * Skip trailing spaces.
+// * Commented out the other functions.
+// Modifications by Richard T Guy, August 2013:
+// * Add tsep argument for thousands separator
+//
+
+double xstrtod(const char *str, char **endptr, char decimal, char sci,
+ char tsep, int skip_trailing) {
+ double number;
+ int exponent;
+ int negative;
+ char *p = (char *)str;
+ double p10;
+ int n;
+ int num_digits;
+ int num_decimals;
+
+ errno = 0;
+
+ // Skip leading whitespace.
+ while (isspace_ascii(*p)) p++;
+
+ // Handle optional sign.
+ negative = 0;
+ switch (*p) {
+ case '-':
+ negative = 1; // Fall through to increment position.
+ case '+':
+ p++;
+ }
+
+ number = 0.;
+ exponent = 0;
+ num_digits = 0;
+ num_decimals = 0;
+
+ // Process string of digits.
+ while (isdigit_ascii(*p)) {
+ number = number * 10. + (*p - '0');
+ p++;
+ num_digits++;
+
+ p += (tsep != '\0' && *p == tsep);
+ }
+
+ // Process decimal part.
+ if (*p == decimal) {
+ p++;
+
+ while (isdigit_ascii(*p)) {
+ number = number * 10. + (*p - '0');
+ p++;
+ num_digits++;
+ num_decimals++;
+ }
+
+ exponent -= num_decimals;
+ }
+
+ if (num_digits == 0) {
+ errno = ERANGE;
+ return 0.0;
+ }
+
+ // Correct for sign.
+ if (negative) number = -number;
+
+ // Process an exponent string.
+ if (toupper_ascii(*p) == toupper_ascii(sci)) {
+ // Handle optional sign.
+ negative = 0;
+ switch (*++p) {
+ case '-':
+ negative = 1; // Fall through to increment pos.
+ case '+':
+ p++;
+ }
+
+ // Process string of digits.
+ num_digits = 0;
+ n = 0;
+ while (isdigit_ascii(*p)) {
+ n = n * 10 + (*p - '0');
+ num_digits++;
+ p++;
+ }
+
+ if (negative)
+ exponent -= n;
+ else
+ exponent += n;
+
+        // If no digits after the 'e'/'E', un-consume it.
+ if (num_digits == 0) p--;
+ }
+
+ if (exponent < DBL_MIN_EXP || exponent > DBL_MAX_EXP) {
+ errno = ERANGE;
+ return HUGE_VAL;
+ }
+
+ // Scale the result.
+ p10 = 10.;
+ n = exponent;
+ if (n < 0) n = -n;
+ while (n) {
+ if (n & 1) {
+ if (exponent < 0)
+ number /= p10;
+ else
+ number *= p10;
+ }
+ n >>= 1;
+ p10 *= p10;
+ }
+
+ if (number == HUGE_VAL) {
+ errno = ERANGE;
+ }
+
+ if (skip_trailing) {
+ // Skip trailing whitespace.
+ while (isspace_ascii(*p)) p++;
+ }
+
+ if (endptr) *endptr = p;
+
+ return number;
+}
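
The decimal, sci and tsep arguments added by the modifications listed above let xstrtod parse locale-styled numbers directly. A hedged usage sketch (hypothetical inputs; real callers pass the parser's configured characters):

    char *end;

    /* US-style: '.' decimal point, ',' thousands separator */
    double a = xstrtod("1,234.5", &end, '.', 'e', ',', 1);    /* 1234.5 */

    /* European-style: ',' decimal point, '.' thousands separator */
    double b = xstrtod("1.234,5e2", &end, ',', 'e', '.', 1);  /* 123450.0 */

Note that a thousands separator is only skipped when it directly follows a digit.
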
+
+double precise_xstrtod(const char *str, char **endptr, char decimal, char sci,
+ char tsep, int skip_trailing) {
+ double number;
+ int exponent;
+ int negative;
+ char *p = (char *)str;
+ int num_digits;
+ int num_decimals;
+ int max_digits = 17;
+ int n;
+ // Cache powers of 10 in memory.
+ static double e[] = {
+ 1., 1e1, 1e2, 1e3, 1e4, 1e5, 1e6, 1e7, 1e8, 1e9,
+ 1e10, 1e11, 1e12, 1e13, 1e14, 1e15, 1e16, 1e17, 1e18, 1e19,
+ 1e20, 1e21, 1e22, 1e23, 1e24, 1e25, 1e26, 1e27, 1e28, 1e29,
+ 1e30, 1e31, 1e32, 1e33, 1e34, 1e35, 1e36, 1e37, 1e38, 1e39,
+ 1e40, 1e41, 1e42, 1e43, 1e44, 1e45, 1e46, 1e47, 1e48, 1e49,
+ 1e50, 1e51, 1e52, 1e53, 1e54, 1e55, 1e56, 1e57, 1e58, 1e59,
+ 1e60, 1e61, 1e62, 1e63, 1e64, 1e65, 1e66, 1e67, 1e68, 1e69,
+ 1e70, 1e71, 1e72, 1e73, 1e74, 1e75, 1e76, 1e77, 1e78, 1e79,
+ 1e80, 1e81, 1e82, 1e83, 1e84, 1e85, 1e86, 1e87, 1e88, 1e89,
+ 1e90, 1e91, 1e92, 1e93, 1e94, 1e95, 1e96, 1e97, 1e98, 1e99,
+ 1e100, 1e101, 1e102, 1e103, 1e104, 1e105, 1e106, 1e107, 1e108, 1e109,
+ 1e110, 1e111, 1e112, 1e113, 1e114, 1e115, 1e116, 1e117, 1e118, 1e119,
+ 1e120, 1e121, 1e122, 1e123, 1e124, 1e125, 1e126, 1e127, 1e128, 1e129,
+ 1e130, 1e131, 1e132, 1e133, 1e134, 1e135, 1e136, 1e137, 1e138, 1e139,
+ 1e140, 1e141, 1e142, 1e143, 1e144, 1e145, 1e146, 1e147, 1e148, 1e149,
+ 1e150, 1e151, 1e152, 1e153, 1e154, 1e155, 1e156, 1e157, 1e158, 1e159,
+ 1e160, 1e161, 1e162, 1e163, 1e164, 1e165, 1e166, 1e167, 1e168, 1e169,
+ 1e170, 1e171, 1e172, 1e173, 1e174, 1e175, 1e176, 1e177, 1e178, 1e179,
+ 1e180, 1e181, 1e182, 1e183, 1e184, 1e185, 1e186, 1e187, 1e188, 1e189,
+ 1e190, 1e191, 1e192, 1e193, 1e194, 1e195, 1e196, 1e197, 1e198, 1e199,
+ 1e200, 1e201, 1e202, 1e203, 1e204, 1e205, 1e206, 1e207, 1e208, 1e209,
+ 1e210, 1e211, 1e212, 1e213, 1e214, 1e215, 1e216, 1e217, 1e218, 1e219,
+ 1e220, 1e221, 1e222, 1e223, 1e224, 1e225, 1e226, 1e227, 1e228, 1e229,
+ 1e230, 1e231, 1e232, 1e233, 1e234, 1e235, 1e236, 1e237, 1e238, 1e239,
+ 1e240, 1e241, 1e242, 1e243, 1e244, 1e245, 1e246, 1e247, 1e248, 1e249,
+ 1e250, 1e251, 1e252, 1e253, 1e254, 1e255, 1e256, 1e257, 1e258, 1e259,
+ 1e260, 1e261, 1e262, 1e263, 1e264, 1e265, 1e266, 1e267, 1e268, 1e269,
+ 1e270, 1e271, 1e272, 1e273, 1e274, 1e275, 1e276, 1e277, 1e278, 1e279,
+ 1e280, 1e281, 1e282, 1e283, 1e284, 1e285, 1e286, 1e287, 1e288, 1e289,
+ 1e290, 1e291, 1e292, 1e293, 1e294, 1e295, 1e296, 1e297, 1e298, 1e299,
+ 1e300, 1e301, 1e302, 1e303, 1e304, 1e305, 1e306, 1e307, 1e308};
+ errno = 0;
+
+ // Skip leading whitespace.
+ while (isspace_ascii(*p)) p++;
+
+ // Handle optional sign.
+ negative = 0;
+ switch (*p) {
+ case '-':
+ negative = 1; // Fall through to increment position.
+ case '+':
+ p++;
+ }
+
+ number = 0.;
+ exponent = 0;
+ num_digits = 0;
+ num_decimals = 0;
+
+ // Process string of digits.
+ while (isdigit_ascii(*p)) {
+ if (num_digits < max_digits) {
+ number = number * 10. + (*p - '0');
+ num_digits++;
+ } else {
+ ++exponent;
+ }
+
+ p++;
+ p += (tsep != '\0' && *p == tsep);
+ }
+
+ // Process decimal part
+ if (*p == decimal) {
+ p++;
+
+ while (num_digits < max_digits && isdigit_ascii(*p)) {
+ number = number * 10. + (*p - '0');
+ p++;
+ num_digits++;
+ num_decimals++;
+ }
+
+ if (num_digits >= max_digits) // Consume extra decimal digits.
+ while (isdigit_ascii(*p)) ++p;
+
+ exponent -= num_decimals;
+ }
+
+ if (num_digits == 0) {
+ errno = ERANGE;
+ return 0.0;
+ }
+
+ // Correct for sign.
+ if (negative) number = -number;
+
+ // Process an exponent string.
+ if (toupper_ascii(*p) == toupper_ascii(sci)) {
+ // Handle optional sign
+ negative = 0;
+ switch (*++p) {
+ case '-':
+ negative = 1; // Fall through to increment pos.
+ case '+':
+ p++;
+ }
+
+ // Process string of digits.
+ num_digits = 0;
+ n = 0;
+ while (isdigit_ascii(*p)) {
+ n = n * 10 + (*p - '0');
+ num_digits++;
+ p++;
+ }
+
+ if (negative)
+ exponent -= n;
+ else
+ exponent += n;
+
+ // If no digits after the 'e'/'E', un-consume it.
+ if (num_digits == 0) p--;
+ }
+
+ if (exponent > 308) {
+ errno = ERANGE;
+ return HUGE_VAL;
+ } else if (exponent > 0) {
+ number *= e[exponent];
+ } else if (exponent < -308) { // Subnormal
+        if (exponent < -616) {  // Prevent invalid array access.
+            number = 0.;
+        } else {
+            number /= e[-308 - exponent];
+            number /= e[308];
+        }
+ } else {
+ number /= e[-exponent];
+ }
+
+ if (number == HUGE_VAL || number == -HUGE_VAL) errno = ERANGE;
+
+ if (skip_trailing) {
+ // Skip trailing whitespace.
+ while (isspace_ascii(*p)) p++;
+ }
+
+ if (endptr) *endptr = p;
+ return number;
+}
+
+double round_trip(const char *p, char **q, char decimal, char sci, char tsep,
+ int skip_trailing) {
+ double r = PyOS_string_to_double(p, q, 0);
+ PyErr_Clear();
+ return r;
+}
+
+// End of xstrtod code
+// ---------------------------------------------------------------------------
+
+void uint_state_init(uint_state *self) {
+ self->seen_sint = 0;
+ self->seen_uint = 0;
+ self->seen_null = 0;
+}
+
+int uint64_conflict(uint_state *self) {
+ return self->seen_uint && (self->seen_sint || self->seen_null);
+}
+
+int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max,
+ int *error, char tsep) {
+ const char *p = (const char *)p_item;
+ int isneg = 0;
+ int64_t number = 0;
+ int d;
+
+ // Skip leading spaces.
+ while (isspace_ascii(*p)) {
+ ++p;
+ }
+
+ // Handle sign.
+ if (*p == '-') {
+ isneg = 1;
+ ++p;
+ } else if (*p == '+') {
+ p++;
+ }
+
+ // Check that there is a first digit.
+ if (!isdigit_ascii(*p)) {
+ // Error...
+ *error = ERROR_NO_DIGITS;
+ return 0;
+ }
+
+ if (isneg) {
+ // If number is greater than pre_min, at least one more digit
+ // can be processed without overflowing.
+ int dig_pre_min = -(int_min % 10);
+ int64_t pre_min = int_min / 10;
+
+ // Process the digits.
+ d = *p;
+ if (tsep != '\0') {
+ while (1) {
+ if (d == tsep) {
+ d = *++p;
+ continue;
+ } else if (!isdigit_ascii(d)) {
+ break;
+ }
+ if ((number > pre_min) ||
+ ((number == pre_min) && (d - '0' <= dig_pre_min))) {
+ number = number * 10 - (d - '0');
+ d = *++p;
+ } else {
+ *error = ERROR_OVERFLOW;
+ return 0;
+ }
+ }
+ } else {
+ while (isdigit_ascii(d)) {
+ if ((number > pre_min) ||
+ ((number == pre_min) && (d - '0' <= dig_pre_min))) {
+ number = number * 10 - (d - '0');
+ d = *++p;
+ } else {
+ *error = ERROR_OVERFLOW;
+ return 0;
+ }
+ }
+ }
+ } else {
+ // If number is less than pre_max, at least one more digit
+ // can be processed without overflowing.
+ int64_t pre_max = int_max / 10;
+ int dig_pre_max = int_max % 10;
+
+ // Process the digits.
+ d = *p;
+ if (tsep != '\0') {
+ while (1) {
+ if (d == tsep) {
+ d = *++p;
+ continue;
+ } else if (!isdigit_ascii(d)) {
+ break;
+ }
+ if ((number < pre_max) ||
+ ((number == pre_max) && (d - '0' <= dig_pre_max))) {
+ number = number * 10 + (d - '0');
+ d = *++p;
+
+ } else {
+ *error = ERROR_OVERFLOW;
+ return 0;
+ }
+ }
+ } else {
+ while (isdigit_ascii(d)) {
+ if ((number < pre_max) ||
+ ((number == pre_max) && (d - '0' <= dig_pre_max))) {
+ number = number * 10 + (d - '0');
+ d = *++p;
+
+ } else {
+ *error = ERROR_OVERFLOW;
+ return 0;
+ }
+ }
+ }
+ }
+
+ // Skip trailing spaces.
+ while (isspace_ascii(*p)) {
+ ++p;
+ }
+
+ // Did we use up all the characters?
+ if (*p) {
+ *error = ERROR_INVALID_CHARS;
+ return 0;
+ }
+
+ *error = 0;
+ return number;
+}
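
str_to_int64 avoids signed overflow by checking one digit ahead: pre_max/pre_min hold the largest (smallest) value that can still absorb another digit, and dig_pre_max/dig_pre_min bound what that final digit may be. A small sketch of the boundary behaviour (hypothetical caller, assuming the <stdint.h> limits):

    int err = 0;

    /* INT64_MAX parses cleanly; thousands separators are skipped */
    int64_t v = str_to_int64("9,223,372,036,854,775,807",
                             INT64_MIN, INT64_MAX, &err, ',');
    /* err == 0, v == INT64_MAX */

    /* one past INT64_MAX trips the look-ahead check */
    str_to_int64("9223372036854775808", INT64_MIN, INT64_MAX, &err, '\0');
    /* err == ERROR_OVERFLOW */
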
+
+uint64_t str_to_uint64(uint_state *state, const char *p_item, int64_t int_max,
+ uint64_t uint_max, int *error, char tsep) {
+ const char *p = (const char *)p_item;
+ uint64_t pre_max = uint_max / 10;
+ int dig_pre_max = uint_max % 10;
+ uint64_t number = 0;
+ int d;
+
+ // Skip leading spaces.
+ while (isspace_ascii(*p)) {
+ ++p;
+ }
+
+ // Handle sign.
+ if (*p == '-') {
+ state->seen_sint = 1;
+ *error = 0;
+ return 0;
+ } else if (*p == '+') {
+ p++;
+ }
+
+ // Check that there is a first digit.
+ if (!isdigit_ascii(*p)) {
+ // Error...
+ *error = ERROR_NO_DIGITS;
+ return 0;
+ }
+
+ // If number is less than pre_max, at least one more digit
+ // can be processed without overflowing.
+ //
+ // Process the digits.
+ d = *p;
+ if (tsep != '\0') {
+ while (1) {
+ if (d == tsep) {
+ d = *++p;
+ continue;
+ } else if (!isdigit_ascii(d)) {
+ break;
+ }
+ if ((number < pre_max) ||
+ ((number == pre_max) && (d - '0' <= dig_pre_max))) {
+ number = number * 10 + (d - '0');
+ d = *++p;
+
+ } else {
+ *error = ERROR_OVERFLOW;
+ return 0;
+ }
+ }
+ } else {
+ while (isdigit_ascii(d)) {
+ if ((number < pre_max) ||
+ ((number == pre_max) && (d - '0' <= dig_pre_max))) {
+ number = number * 10 + (d - '0');
+ d = *++p;
+
+ } else {
+ *error = ERROR_OVERFLOW;
+ return 0;
+ }
+ }
+ }
+
+ // Skip trailing spaces.
+ while (isspace_ascii(*p)) {
+ ++p;
+ }
+
+ // Did we use up all the characters?
+ if (*p) {
+ *error = ERROR_INVALID_CHARS;
+ return 0;
+ }
+
+ if (number > (uint64_t)int_max) {
+ state->seen_uint = 1;
+ }
+
+ *error = 0;
+ return number;
+}
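
The uint_state flags let the caller detect a column that mixes uint64-only magnitudes with negative values, which no single integer type can hold. A hedged sketch of how the pieces combine (hypothetical caller):

    uint_state st;
    int err = 0;

    uint_state_init(&st);

    /* larger than INT64_MAX: the value is returned and st.seen_uint is set */
    str_to_uint64(&st, "18446744073709551615", INT64_MAX, UINT64_MAX, &err, '\0');

    /* a negative entry sets st.seen_sint and returns 0 */
    str_to_uint64(&st, "-1", INT64_MAX, UINT64_MAX, &err, '\0');

    if (uint64_conflict(&st)) {
        /* column fits neither int64 nor uint64; the caller must fall back */
    }
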
diff --git a/contrib/python/pandas/py2/pandas/_libs/src/parser/tokenizer.h b/contrib/python/pandas/py2/pandas/_libs/src/parser/tokenizer.h
new file mode 100644
index 00000000000..c32c061c7fa
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/_libs/src/parser/tokenizer.h
@@ -0,0 +1,270 @@
+/*
+
+Copyright (c) 2012, Lambda Foundry, Inc., except where noted
+
+Incorporates components of WarrenWeckesser/textreader, licensed under 3-clause
+BSD
+
+See LICENSE for the license
+
+*/
+
+#ifndef PANDAS__LIBS_SRC_PARSER_TOKENIZER_H_
+#define PANDAS__LIBS_SRC_PARSER_TOKENIZER_H_
+
+#include <errno.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <time.h>
+#include "Python.h"
+
+#include <ctype.h>
+
+#define ERROR_OK 0
+#define ERROR_NO_DIGITS 1
+#define ERROR_OVERFLOW 2
+#define ERROR_INVALID_CHARS 3
+
+#include "../headers/stdint.h"
+#include "../inline_helper.h"
+
+#include "khash.h"
+
+#define CHUNKSIZE 1024 * 256
+#define KB 1024
+#define MB 1024 * KB
+#define STREAM_INIT_SIZE 32
+
+#define REACHED_EOF 1
+#define CALLING_READ_FAILED 2
+
+
+#if defined(_MSC_VER)
+#define strtoll _strtoi64
+#endif
+
+/*
+
+ C flat file parsing low level code for pandas / NumPy
+
+ */
+
+#define FALSE 0
+#define TRUE 1
+
+// Maximum number of columns in a file.
+#define MAX_NUM_COLUMNS 2000
+
+// Maximum number of characters in single field.
+#define FIELD_BUFFER_SIZE 2000
+
+/*
+ * Common set of error types for the read_rows() and tokenize()
+ * functions.
+ */
+#define ERROR_OUT_OF_MEMORY 1
+#define ERROR_INVALID_COLUMN_INDEX 10
+#define ERROR_CHANGED_NUMBER_OF_FIELDS 12
+#define ERROR_TOO_MANY_CHARS 21
+#define ERROR_TOO_MANY_FIELDS 22
+#define ERROR_NO_DATA 23
+
+// #define VERBOSE
+#if defined(VERBOSE)
+#define TRACE(X) printf X;
+#else
+#define TRACE(X)
+#endif
+
+#define PARSER_OUT_OF_MEMORY -1
+
+/*
+ * XXX Might want to couple count_rows() with read_rows() to avoid duplication
+ * of some file I/O.
+ */
+
+/*
+ * WORD_BUFFER_SIZE determines the maximum amount of non-delimiter
+ * text in a row.
+ */
+#define WORD_BUFFER_SIZE 4000
+
+typedef enum {
+ START_RECORD,
+ START_FIELD,
+ ESCAPED_CHAR,
+ IN_FIELD,
+ IN_QUOTED_FIELD,
+ ESCAPE_IN_QUOTED_FIELD,
+ QUOTE_IN_QUOTED_FIELD,
+ EAT_CRNL,
+ EAT_CRNL_NOP,
+ EAT_WHITESPACE,
+ EAT_COMMENT,
+ EAT_LINE_COMMENT,
+ WHITESPACE_LINE,
+ START_FIELD_IN_SKIP_LINE,
+ IN_FIELD_IN_SKIP_LINE,
+ IN_QUOTED_FIELD_IN_SKIP_LINE,
+ QUOTE_IN_QUOTED_FIELD_IN_SKIP_LINE,
+ FINISHED
+} ParserState;
+
+typedef enum {
+ QUOTE_MINIMAL,
+ QUOTE_ALL,
+ QUOTE_NONNUMERIC,
+ QUOTE_NONE
+} QuoteStyle;
+
+typedef void *(*io_callback)(void *src, size_t nbytes, size_t *bytes_read,
+ int *status);
+typedef int (*io_cleanup)(void *src);
+
+typedef struct parser_t {
+ void *source;
+ io_callback cb_io;
+ io_cleanup cb_cleanup;
+
+ int64_t chunksize; // Number of bytes to prepare for each chunk
+ char *data; // pointer to data to be processed
+ int64_t datalen; // amount of data available
+ int64_t datapos;
+
+ // where to write out tokenized data
+ char *stream;
+ int64_t stream_len;
+ int64_t stream_cap;
+
+ // Store words in (potentially ragged) matrix for now, hmm
+ char **words;
+ int64_t *word_starts; // where we are in the stream
+ int64_t words_len;
+ int64_t words_cap;
+ int64_t max_words_cap; // maximum word cap encountered
+
+ char *pword_start; // pointer to stream start of current field
+ int64_t word_start; // position start of current field
+
+ int64_t *line_start; // position in words for start of line
+ int64_t *line_fields; // Number of fields in each line
+ int64_t lines; // Number of (good) lines observed
+ int64_t file_lines; // Number of lines (including bad or skipped)
+ int64_t lines_cap; // Vector capacity
+
+ // Tokenizing stuff
+ ParserState state;
+ int doublequote; /* is " represented by ""? */
+ char delimiter; /* field separator */
+ int delim_whitespace; /* delimit by consuming space/tabs instead */
+ char quotechar; /* quote character */
+ char escapechar; /* escape character */
+ char lineterminator;
+ int skipinitialspace; /* ignore spaces following delimiter? */
+ int quoting; /* style of quoting to write */
+
+ // krufty, hmm =/
+ int numeric_field;
+
+ char commentchar;
+ int allow_embedded_newline;
+ int strict; /* raise exception on bad CSV */
+
+ int usecols; // Boolean: 1: usecols provided, 0: none provided
+
+ int expected_fields;
+ int error_bad_lines;
+ int warn_bad_lines;
+
+ // floating point options
+ char decimal;
+ char sci;
+
+ // thousands separator (comma, period)
+ char thousands;
+
+ int header; // Boolean: 1: has header, 0: no header
+ int64_t header_start; // header row start
+ int64_t header_end; // header row end
+
+ void *skipset;
+ PyObject *skipfunc;
+ int64_t skip_first_N_rows;
+ int skip_footer;
+ // pick one, depending on whether the converter requires GIL
+ double (*double_converter_nogil)(const char *, char **,
+ char, char, char, int);
+ double (*double_converter_withgil)(const char *, char **,
+ char, char, char, int);
+
+ // error handling
+ char *warn_msg;
+ char *error_msg;
+
+ int skip_empty_lines;
+} parser_t;
+
+typedef struct coliter_t {
+ char **words;
+ int64_t *line_start;
+ int col;
+} coliter_t;
+
+void coliter_setup(coliter_t *self, parser_t *parser, int i, int start);
+coliter_t *coliter_new(parser_t *self, int i);
+
+#define COLITER_NEXT(iter, word) \
+ do { \
+ const int64_t i = *iter.line_start++ + iter.col; \
+ word = i < *iter.line_start ? iter.words[i] : ""; \
+ } while (0)
+
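
COLITER_NEXT walks a single column across the (potentially ragged) rows recorded in line_start, yielding an empty string for any row that has fewer fields than the requested column. A hedged usage sketch (assuming coliter_setup's last two arguments are the column index and the starting line, as in the accompanying tokenizer.c):

    coliter_t it;
    char *word;
    int64_t row;

    coliter_setup(&it, parser, 2, 0);      /* third column, starting at line 0 */
    for (row = 0; row < parser->lines; ++row) {
        COLITER_NEXT(it, word);            /* "" when the row is short */
        /* ... convert `word` ... */
    }
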
+parser_t *parser_new(void);
+
+int parser_init(parser_t *self);
+
+int parser_consume_rows(parser_t *self, size_t nrows);
+
+int parser_trim_buffers(parser_t *self);
+
+int parser_add_skiprow(parser_t *self, int64_t row);
+
+int parser_set_skipfirstnrows(parser_t *self, int64_t nrows);
+
+void parser_free(parser_t *self);
+
+void parser_del(parser_t *self);
+
+void parser_set_default_options(parser_t *self);
+
+int tokenize_nrows(parser_t *self, size_t nrows);
+
+int tokenize_all_rows(parser_t *self);
+
+// Have parsed / type-converted a chunk of data
+// and want to free memory from the token stream
+
+typedef struct uint_state {
+ int seen_sint;
+ int seen_uint;
+ int seen_null;
+} uint_state;
+
+void uint_state_init(uint_state *self);
+
+int uint64_conflict(uint_state *self);
+
+uint64_t str_to_uint64(uint_state *state, const char *p_item, int64_t int_max,
+ uint64_t uint_max, int *error, char tsep);
+int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max,
+ int *error, char tsep);
+double xstrtod(const char *p, char **q, char decimal, char sci, char tsep,
+ int skip_trailing);
+double precise_xstrtod(const char *p, char **q, char decimal, char sci,
+ char tsep, int skip_trailing);
+double round_trip(const char *p, char **q, char decimal, char sci, char tsep,
+ int skip_trailing);
+int to_boolean(const char *item, uint8_t *val);
+
+#endif // PANDAS__LIBS_SRC_PARSER_TOKENIZER_H_
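
Taken together, the declarations above outline the chunked driver loop that a caller (normally the Cython wrapper) runs: buffer and tokenize some rows, convert them, then release the consumed rows and shrink the buffers. A hedged outline, not a drop-in implementation; the source/cb_io/cb_cleanup wiring is assumed to happen elsewhere:

    parser_t *self = parser_new();

    parser_set_default_options(self);
    /* self->source, self->cb_io and self->cb_cleanup must be set by the caller */

    if (parser_init(self) == 0) {
        do {
            if (tokenize_nrows(self, 1000) != 0) break;   /* I/O or parse error */
            /* ... type-convert the words for self->lines rows ... */
            parser_consume_rows(self, self->lines);       /* drop converted rows */
            parser_trim_buffers(self);                    /* shrink the buffers */
        } while (self->state != FINISHED);
    }

    parser_free(self);   /* release internal buffers */
    parser_del(self);    /* release the parser struct */
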
diff --git a/contrib/python/pandas/py2/pandas/_libs/src/skiplist.h b/contrib/python/pandas/py2/pandas/_libs/src/skiplist.h
new file mode 100644
index 00000000000..60c1a567277
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/_libs/src/skiplist.h
@@ -0,0 +1,279 @@
+/*
+Copyright (c) 2016, PyData Development Team
+All rights reserved.
+
+Distributed under the terms of the BSD Simplified License.
+
+The full license is in the LICENSE file, distributed with this software.
+
+Flexibly-sized, index-able skiplist data structure for maintaining a sorted
+list of values
+
+Port of Wes McKinney's Cython version of Raymond Hettinger's original pure
+Python recipe (http://rhettinger.wordpress.com/2010/02/06/lost-knowledge/)
+*/
+
+#ifndef PANDAS__LIBS_SRC_SKIPLIST_H_
+#define PANDAS__LIBS_SRC_SKIPLIST_H_
+
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "inline_helper.h"
+
+PANDAS_INLINE float __skiplist_nanf(void) {
+ const union {
+ int __i;
+ float __f;
+ } __bint = {0x7fc00000UL};
+ return __bint.__f;
+}
+#define PANDAS_NAN ((double)__skiplist_nanf())
+
+PANDAS_INLINE double Log2(double val) { return log(val) / log(2.); }
+
+typedef struct node_t node_t;
+
+struct node_t {
+ node_t **next;
+ int *width;
+ double value;
+ int is_nil;
+ int levels;
+ int ref_count;
+};
+
+typedef struct {
+ node_t *head;
+ node_t **tmp_chain;
+ int *tmp_steps;
+ int size;
+ int maxlevels;
+} skiplist_t;
+
+PANDAS_INLINE double urand(void) {
+ return ((double)rand() + 1) / ((double)RAND_MAX + 2);
+}
+
+PANDAS_INLINE int int_min(int a, int b) { return a < b ? a : b; }
+
+PANDAS_INLINE node_t *node_init(double value, int levels) {
+ node_t *result;
+ result = (node_t *)malloc(sizeof(node_t));
+ if (result) {
+ result->value = value;
+ result->levels = levels;
+ result->is_nil = 0;
+ result->ref_count = 0;
+ result->next = (node_t **)malloc(levels * sizeof(node_t *));
+ result->width = (int *)malloc(levels * sizeof(int));
+ if (!(result->next && result->width) && (levels != 0)) {
+ free(result->next);
+ free(result->width);
+ free(result);
+ return NULL;
+ }
+ }
+ return result;
+}
+
+// do this ourselves
+PANDAS_INLINE void node_incref(node_t *node) { ++(node->ref_count); }
+
+PANDAS_INLINE void node_decref(node_t *node) { --(node->ref_count); }
+
+static void node_destroy(node_t *node) {
+ int i;
+ if (node) {
+ if (node->ref_count <= 1) {
+ for (i = 0; i < node->levels; ++i) {
+ node_destroy(node->next[i]);
+ }
+ free(node->next);
+ free(node->width);
+ // printf("Reference count was 1, freeing\n");
+ free(node);
+ } else {
+ node_decref(node);
+ }
+ // pretty sure that freeing the struct above will be enough
+ }
+}
+
+PANDAS_INLINE void skiplist_destroy(skiplist_t *skp) {
+ if (skp) {
+ node_destroy(skp->head);
+ free(skp->tmp_steps);
+ free(skp->tmp_chain);
+ free(skp);
+ }
+}
+
+PANDAS_INLINE skiplist_t *skiplist_init(int expected_size) {
+ skiplist_t *result;
+ node_t *NIL, *head;
+ int maxlevels, i;
+
+ maxlevels = 1 + Log2((double)expected_size);
+ result = (skiplist_t *)malloc(sizeof(skiplist_t));
+ if (!result) {
+ return NULL;
+ }
+ result->tmp_chain = (node_t **)malloc(maxlevels * sizeof(node_t *));
+ result->tmp_steps = (int *)malloc(maxlevels * sizeof(int));
+ result->maxlevels = maxlevels;
+ result->size = 0;
+
+ head = result->head = node_init(PANDAS_NAN, maxlevels);
+ NIL = node_init(0.0, 0);
+
+ if (!(result->tmp_chain && result->tmp_steps && result->head && NIL)) {
+ skiplist_destroy(result);
+ node_destroy(NIL);
+ return NULL;
+ }
+
+ node_incref(head);
+
+ NIL->is_nil = 1;
+
+ for (i = 0; i < maxlevels; ++i) {
+ head->next[i] = NIL;
+ head->width[i] = 1;
+ node_incref(NIL);
+ }
+
+ return result;
+}
+
+// 1 if left < right, 0 if left == right, -1 if left > right
+PANDAS_INLINE int _node_cmp(node_t *node, double value) {
+ if (node->is_nil || node->value > value) {
+ return -1;
+ } else if (node->value < value) {
+ return 1;
+ } else {
+ return 0;
+ }
+}
+
+PANDAS_INLINE double skiplist_get(skiplist_t *skp, int i, int *ret) {
+ node_t *node;
+ int level;
+
+ if (i < 0 || i >= skp->size) {
+ *ret = 0;
+ return 0;
+ }
+
+ node = skp->head;
+ ++i;
+ for (level = skp->maxlevels - 1; level >= 0; --level) {
+ while (node->width[level] <= i) {
+ i -= node->width[level];
+ node = node->next[level];
+ }
+ }
+
+ *ret = 1;
+ return node->value;
+}
+
+PANDAS_INLINE int skiplist_insert(skiplist_t *skp, double value) {
+ node_t *node, *prevnode, *newnode, *next_at_level;
+ int *steps_at_level;
+ int size, steps, level;
+ node_t **chain;
+
+ chain = skp->tmp_chain;
+
+ steps_at_level = skp->tmp_steps;
+ memset(steps_at_level, 0, skp->maxlevels * sizeof(int));
+
+ node = skp->head;
+
+ for (level = skp->maxlevels - 1; level >= 0; --level) {
+ next_at_level = node->next[level];
+ while (_node_cmp(next_at_level, value) >= 0) {
+ steps_at_level[level] += node->width[level];
+ node = next_at_level;
+ next_at_level = node->next[level];
+ }
+ chain[level] = node;
+ }
+
+ size = int_min(skp->maxlevels, 1 - ((int)Log2(urand())));
+
+ newnode = node_init(value, size);
+ if (!newnode) {
+ return -1;
+ }
+ steps = 0;
+
+ for (level = 0; level < size; ++level) {
+ prevnode = chain[level];
+ newnode->next[level] = prevnode->next[level];
+
+ prevnode->next[level] = newnode;
+ node_incref(newnode); // increment the reference count
+
+ newnode->width[level] = prevnode->width[level] - steps;
+ prevnode->width[level] = steps + 1;
+
+ steps += steps_at_level[level];
+ }
+
+ for (level = size; level < skp->maxlevels; ++level) {
+ chain[level]->width[level] += 1;
+ }
+
+ ++(skp->size);
+
+ return 1;
+}
+
+PANDAS_INLINE int skiplist_remove(skiplist_t *skp, double value) {
+ int level, size;
+ node_t *node, *prevnode, *tmpnode, *next_at_level;
+ node_t **chain;
+
+ chain = skp->tmp_chain;
+ node = skp->head;
+
+ for (level = skp->maxlevels - 1; level >= 0; --level) {
+ next_at_level = node->next[level];
+ while (_node_cmp(next_at_level, value) > 0) {
+ node = next_at_level;
+ next_at_level = node->next[level];
+ }
+ chain[level] = node;
+ }
+
+ if (value != chain[0]->next[0]->value) {
+ return 0;
+ }
+
+ size = chain[0]->next[0]->levels;
+
+ for (level = 0; level < size; ++level) {
+ prevnode = chain[level];
+
+ tmpnode = prevnode->next[level];
+
+ prevnode->width[level] += tmpnode->width[level] - 1;
+ prevnode->next[level] = tmpnode->next[level];
+
+ tmpnode->next[level] = NULL;
+ node_destroy(tmpnode); // decrement refcount or free
+ }
+
+ for (level = size; level < skp->maxlevels; ++level) {
+ --(chain[level]->width[level]);
+ }
+
+ --(skp->size);
+ return 1;
+}
+
+#endif // PANDAS__LIBS_SRC_SKIPLIST_H_
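
The skiplist above keeps its values sorted and supports rank lookups in O(log n), which is what the rolling-window routines rely on for running medians and quantiles. A minimal usage sketch (hypothetical driver):

    int ok = 0;
    skiplist_t *skp = skiplist_init(16);       /* expected-size hint */

    skiplist_insert(skp, 3.0);
    skiplist_insert(skp, 1.0);
    skiplist_insert(skp, 2.0);

    /* values are kept sorted: 0-based rank 1 is the median here */
    double mid = skiplist_get(skp, 1, &ok);    /* mid == 2.0, ok == 1 */

    skiplist_remove(skp, 3.0);
    skiplist_destroy(skp);
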
diff --git a/contrib/python/pandas/py2/pandas/_libs/src/ujson/lib/ultrajson.h b/contrib/python/pandas/py2/pandas/_libs/src/ujson/lib/ultrajson.h
new file mode 100644
index 00000000000..0470fef450d
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/_libs/src/ujson/lib/ultrajson.h
@@ -0,0 +1,317 @@
+/*
+Copyright (c) 2011-2013, ESN Social Software AB and Jonas Tarnstrom
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ * Neither the name of the ESN Social Software AB nor the
+ names of its contributors may be used to endorse or promote products
+ derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL ESN SOCIAL SOFTWARE AB OR JONAS TARNSTROM BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+
+Portions of code from MODP_ASCII - Ascii transformations (upper/lower, etc)
+https://github.com/client9/stringencoders
+Copyright (c) 2007 Nick Galbreath -- nickg [at] modp [dot] com. All rights reserved.
+
+Numeric decoder derived from the TCL library
+http://www.opensource.apple.com/source/tcl/tcl-14/tcl/license.terms
+ * Copyright (c) 1988-1993 The Regents of the University of California.
+ * Copyright (c) 1994 Sun Microsystems, Inc.
+*/
+
+/*
+Ultra fast JSON encoder and decoder
+Developed by Jonas Tarnstrom ([email protected]).
+
+Encoder notes:
+------------------
+
+:: Cyclic references ::
+Cyclically referenced objects are not detected.
+Set JSONObjectEncoder.recursionMax to a suitable value or make sure the input
+object tree doesn't have cyclic references.
+
+*/
+
+#ifndef PANDAS__LIBS_SRC_UJSON_LIB_ULTRAJSON_H_
+#define PANDAS__LIBS_SRC_UJSON_LIB_ULTRAJSON_H_
+
+#include <stdio.h>
+#include <wchar.h>
+
+// Don't output any extra whitespaces when encoding
+#define JSON_NO_EXTRA_WHITESPACE
+
+// Max decimals to encode double floating point numbers with
+#ifndef JSON_DOUBLE_MAX_DECIMALS
+#define JSON_DOUBLE_MAX_DECIMALS 15
+#endif
+
+// Max recursion depth, default for encoder
+#ifndef JSON_MAX_RECURSION_DEPTH
+#define JSON_MAX_RECURSION_DEPTH 1024
+#endif
+
+// Max recursion depth, default for decoder
+#ifndef JSON_MAX_OBJECT_DEPTH
+#define JSON_MAX_OBJECT_DEPTH 1024
+#endif
+
+/*
+Dictates and limits how much stack space UltraJSON will use for buffers before resorting to the provided heap functions */
+#ifndef JSON_MAX_STACK_BUFFER_SIZE
+#define JSON_MAX_STACK_BUFFER_SIZE 131072
+#endif
+
+#ifdef _WIN32
+
+typedef __int64 JSINT64;
+typedef unsigned __int64 JSUINT64;
+
+typedef __int32 JSINT32;
+typedef unsigned __int32 JSUINT32;
+typedef unsigned __int8 JSUINT8;
+typedef unsigned __int16 JSUTF16;
+typedef unsigned __int32 JSUTF32;
+typedef __int64 JSLONG;
+
+#define EXPORTFUNCTION __declspec(dllexport)
+
+#define FASTCALL_MSVC __fastcall
+#define FASTCALL_ATTR
+#define INLINE_PREFIX static __inline
+
+#else
+
+#include <stdint.h>
+typedef int64_t JSINT64;
+typedef uint64_t JSUINT64;
+
+typedef int32_t JSINT32;
+typedef uint32_t JSUINT32;
+
+#define FASTCALL_MSVC
+
+#if !defined __x86_64__
+#define FASTCALL_ATTR __attribute__((fastcall))
+#else
+#define FASTCALL_ATTR
+#endif
+
+#define INLINE_PREFIX static inline
+
+typedef uint8_t JSUINT8;
+typedef uint16_t JSUTF16;
+typedef uint32_t JSUTF32;
+
+typedef int64_t JSLONG;
+
+#define EXPORTFUNCTION
+#endif
+
+#if !(defined(__LITTLE_ENDIAN__) || defined(__BIG_ENDIAN__))
+
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+#define __LITTLE_ENDIAN__
+#else
+
+#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
+#define __BIG_ENDIAN__
+#endif
+
+#endif
+
+#endif
+
+#if !defined(__LITTLE_ENDIAN__) && !defined(__BIG_ENDIAN__)
+#error "Endianness not supported"
+#endif
+
+enum JSTYPES {
+ JT_NULL, // NULL
+ JT_TRUE, // boolean true
+ JT_FALSE, // boolean false
+ JT_INT, // (JSINT32 (signed 32-bit))
+ JT_LONG, // (JSINT64 (signed 64-bit))
+ JT_DOUBLE, // (double)
+ JT_UTF8, // (char 8-bit)
+ JT_ARRAY, // Array structure
+ JT_OBJECT, // Key/Value structure
+ JT_INVALID, // Internal, do not return nor expect
+};
+
+typedef void * JSOBJ;
+typedef void * JSITER;
+
+typedef struct __JSONTypeContext {
+ int type;
+ void *encoder;
+ void *prv;
+} JSONTypeContext;
+
+/*
+Function pointer declarations, suitable for implementing UltraJSON */
+typedef void (*JSPFN_ITERBEGIN)(JSOBJ obj, JSONTypeContext *tc);
+typedef int (*JSPFN_ITERNEXT)(JSOBJ obj, JSONTypeContext *tc);
+typedef void (*JSPFN_ITEREND)(JSOBJ obj, JSONTypeContext *tc);
+typedef JSOBJ (*JSPFN_ITERGETVALUE)(JSOBJ obj, JSONTypeContext *tc);
+typedef char *(*JSPFN_ITERGETNAME)(JSOBJ obj, JSONTypeContext *tc,
+ size_t *outLen);
+typedef void *(*JSPFN_MALLOC)(size_t size);
+typedef void (*JSPFN_FREE)(void *pptr);
+typedef void *(*JSPFN_REALLOC)(void *base, size_t size);
+
+typedef struct __JSONObjectEncoder {
+ void (*beginTypeContext)(JSOBJ obj, JSONTypeContext *tc);
+ void (*endTypeContext)(JSOBJ obj, JSONTypeContext *tc);
+ const char *(*getStringValue)(JSOBJ obj, JSONTypeContext *tc,
+ size_t *_outLen);
+ JSINT64 (*getLongValue)(JSOBJ obj, JSONTypeContext *tc);
+ JSINT32 (*getIntValue)(JSOBJ obj, JSONTypeContext *tc);
+ double (*getDoubleValue)(JSOBJ obj, JSONTypeContext *tc);
+
+ /*
+    Begin iteration of an iterable object (JT_ARRAY or JT_OBJECT).
+    The implementor should set up the iteration state in ti->prv.
+ */
+ JSPFN_ITERBEGIN iterBegin;
+
+ /*
+    Retrieve the next object in an iteration. Should return 0 to indicate the iteration has reached its end, or 1 if there are more items.
+    The implementor is responsible for keeping the state of the iteration. Use the ti->prv fields for this.
+ */
+ JSPFN_ITERNEXT iterNext;
+
+ /*
+    Ends the iteration of an iterable object.
+ Any iteration state stored in ti->prv can be freed here
+ */
+ JSPFN_ITEREND iterEnd;
+
+ /*
+    Returns a reference to the value object of an iterator.
+    The implementor is responsible for the life-cycle of the returned object. Use iterNext/iterEnd and ti->prv to keep track of the current object.
+ */
+ JSPFN_ITERGETVALUE iterGetValue;
+
+ /*
+    Return the name of the iterator's current item.
+    The implementor is responsible for the life-cycle of the returned string. Use iterNext/iterEnd and ti->prv to keep track of the current object.
+ */
+ JSPFN_ITERGETNAME iterGetName;
+
+ /*
+ Release a value as indicated by setting ti->release = 1 in the previous getValue call.
+ The ti->prv array should contain the necessary context to release the value
+ */
+ void (*releaseObject)(JSOBJ obj);
+
+ /* Library functions
+ Set to NULL to use STDLIB malloc,realloc,free */
+ JSPFN_MALLOC malloc;
+ JSPFN_REALLOC realloc;
+ JSPFN_FREE free;
+
+ /*
+ Configuration for max recursion, set to 0 to use default (see JSON_MAX_RECURSION_DEPTH)*/
+ int recursionMax;
+
+ /*
+ Configuration for max decimals of double floating point numbers to encode (0-9) */
+ int doublePrecision;
+
+ /*
+    If true, output will be ASCII with all characters above 127 encoded as \uXXXX. If false, output will be UTF-8 or whatever charset the strings are in. */
+ int forceASCII;
+
+ /*
+ If true, '<', '>', and '&' characters will be encoded as \u003c, \u003e, and \u0026, respectively. If false, no special encoding will be used. */
+ int encodeHTMLChars;
+
+ /*
+    Set to an error message if an error occurred */
+ const char *errorMsg;
+ JSOBJ errorObj;
+
+ /* Buffer stuff */
+ char *start;
+ char *offset;
+ char *end;
+ int heap;
+ int level;
+} JSONObjectEncoder;
+
+/*
+Encode an object structure into JSON.
+
+Arguments:
+obj - An anonymous type representing the object
+enc - Function definitions for querying JSOBJ type
+buffer - Preallocated buffer to store the result in. If NULL, the function allocates its own buffer
+cbBuffer - Length of buffer (ignored if buffer is NULL)
+
+Returns:
+Encoded JSON object as a null terminated char string.
+
+NOTE:
+If the supplied buffer wasn't large enough to hold the result, the function will allocate a new buffer.
+The life cycle of the provided buffer must still be handled by the caller.
+
+If the return value doesn't equal the specified buffer, the caller must release the memory using
+JSONObjectEncoder.free or free() as specified when calling this function.
+*/
+EXPORTFUNCTION char *JSON_EncodeObject(JSOBJ obj, JSONObjectEncoder *enc,
+ char *buffer, size_t cbBuffer);
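
The buffer contract documented above means the caller has to check whether the encoder fell back to a heap allocation. A hedged sketch (obj and enc are assumed to be fully populated elsewhere):

    char stackbuf[65536];
    char *out = JSON_EncodeObject(obj, &enc, stackbuf, sizeof(stackbuf));

    if (out != NULL) {
        /* ... use the NUL-terminated JSON in `out` ... */
        if (out != stackbuf) {
            /* the encoder allocated its own buffer: release it as documented */
            enc.free ? enc.free(out) : free(out);
        }
    }
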
+
+typedef struct __JSONObjectDecoder {
+ JSOBJ (*newString)(void *prv, wchar_t *start, wchar_t *end);
+ int (*objectAddKey)(void *prv, JSOBJ obj, JSOBJ name, JSOBJ value);
+ int (*arrayAddItem)(void *prv, JSOBJ obj, JSOBJ value);
+ JSOBJ (*newTrue)(void *prv);
+ JSOBJ (*newFalse)(void *prv);
+ JSOBJ (*newNull)(void *prv);
+ JSOBJ (*newObject)(void *prv, void *decoder);
+ JSOBJ (*endObject)(void *prv, JSOBJ obj);
+ JSOBJ (*newArray)(void *prv, void *decoder);
+ JSOBJ (*endArray)(void *prv, JSOBJ obj);
+ JSOBJ (*newInt)(void *prv, JSINT32 value);
+ JSOBJ (*newLong)(void *prv, JSINT64 value);
+ JSOBJ (*newDouble)(void *prv, double value);
+ void (*releaseObject)(void *prv, JSOBJ obj, void *decoder);
+ JSPFN_MALLOC malloc;
+ JSPFN_FREE free;
+ JSPFN_REALLOC realloc;
+ char *errorStr;
+ char *errorOffset;
+ int preciseFloat;
+ void *prv;
+} JSONObjectDecoder;
+
+EXPORTFUNCTION JSOBJ JSON_DecodeObject(JSONObjectDecoder *dec,
+ const char *buffer, size_t cbBuffer);
+EXPORTFUNCTION void encode(JSOBJ, JSONObjectEncoder *, const char *, size_t);
+
+#define Buffer_Reserve(__enc, __len) \
+ if ((size_t)((__enc)->end - (__enc)->offset) < (size_t)(__len)) { \
+ Buffer_Realloc((__enc), (__len)); \
+ }
+
+void Buffer_Realloc(JSONObjectEncoder *enc, size_t cbNeeded);
+
+#endif // PANDAS__LIBS_SRC_UJSON_LIB_ULTRAJSON_H_
diff --git a/contrib/python/pandas/py2/pandas/_libs/src/ujson/lib/ultrajsondec.c b/contrib/python/pandas/py2/pandas/_libs/src/ujson/lib/ultrajsondec.c
new file mode 100644
index 00000000000..a847b0f5d51
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/_libs/src/ujson/lib/ultrajsondec.c
@@ -0,0 +1,1151 @@
+/*
+Copyright (c) 2011-2013, ESN Social Software AB and Jonas Tarnstrom
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+* Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+* Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+* Neither the name of the ESN Social Software AB nor the
+names of its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL ESN SOCIAL SOFTWARE AB OR JONAS TARNSTROM BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+
+Portions of code from MODP_ASCII - Ascii transformations (upper/lower, etc)
+https://github.com/client9/stringencoders
+Copyright (c) 2007 Nick Galbreath -- nickg [at] modp [dot] com. All rights
+reserved.
+
+Numeric decoder derived from the TCL library
+http://www.opensource.apple.com/source/tcl/tcl-14/tcl/license.terms
+* Copyright (c) 1988-1993 The Regents of the University of California.
+* Copyright (c) 1994 Sun Microsystems, Inc.
+*/
+
+#include <assert.h>
+#include <errno.h>
+#include <limits.h>
+#include <locale.h>
+#include <math.h>
+#include <stdlib.h>
+#include <string.h>
+#include <wchar.h>
+#include "ultrajson.h"
+
+#ifndef TRUE
+#define TRUE 1
+#define FALSE 0
+#endif
+#ifndef NULL
+#define NULL 0
+#endif
+
+struct DecoderState {
+ char *start;
+ char *end;
+ wchar_t *escStart;
+ wchar_t *escEnd;
+ int escHeap;
+ int lastType;
+ JSUINT32 objDepth;
+ void *prv;
+ JSONObjectDecoder *dec;
+};
+
+JSOBJ FASTCALL_MSVC decode_any(struct DecoderState *ds) FASTCALL_ATTR;
+typedef JSOBJ (*PFN_DECODER)(struct DecoderState *ds);
+
+static JSOBJ SetError(struct DecoderState *ds, int offset,
+ const char *message) {
+ ds->dec->errorOffset = ds->start + offset;
+ ds->dec->errorStr = (char *)message;
+ return NULL;
+}
+
+double createDouble(double intNeg, double intValue, double frcValue,
+ int frcDecimalCount) {
+ static const double g_pow10[] = {1.0,
+ 0.1,
+ 0.01,
+ 0.001,
+ 0.0001,
+ 0.00001,
+ 0.000001,
+ 0.0000001,
+ 0.00000001,
+ 0.000000001,
+ 0.0000000001,
+ 0.00000000001,
+ 0.000000000001,
+ 0.0000000000001,
+ 0.00000000000001,
+ 0.000000000000001};
+ return (intValue + (frcValue * g_pow10[frcDecimalCount])) * intNeg;
+}
+
+FASTCALL_ATTR JSOBJ FASTCALL_MSVC decodePreciseFloat(struct DecoderState *ds) {
+ char *end;
+ double value;
+ errno = 0;
+
+ value = strtod(ds->start, &end);
+
+ if (errno == ERANGE) {
+ return SetError(ds, -1, "Range error when decoding numeric as double");
+ }
+
+ ds->start = end;
+ return ds->dec->newDouble(ds->prv, value);
+}
+
+FASTCALL_ATTR JSOBJ FASTCALL_MSVC decode_numeric(struct DecoderState *ds) {
+ int intNeg = 1;
+ int mantSize = 0;
+ JSUINT64 intValue;
+ int chr;
+ int decimalCount = 0;
+ double frcValue = 0.0;
+ double expNeg;
+ double expValue;
+ char *offset = ds->start;
+
+ JSUINT64 overflowLimit = LLONG_MAX;
+
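+    /* For a leading '-', overflowLimit is raised to (JSUINT64)LLONG_MIN,
+       i.e. the magnitude of the most negative 64-bit value, so the full
+       signed range can still be decoded. */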
+ if (*(offset) == '-') {
+ offset++;
+ intNeg = -1;
+ overflowLimit = LLONG_MIN;
+ }
+
+ // Scan integer part
+ intValue = 0;
+
+ while (1) {
+ chr = (int)(unsigned char)*(offset);
+
+ switch (chr) {
+ case '0':
+ case '1':
+ case '2':
+ case '3':
+ case '4':
+ case '5':
+ case '6':
+ case '7':
+ case '8':
+ case '9': {
+                // FIXME: Check for arithmetic overflow here
+ // PERF: Don't do 64-bit arithmetic here unless we know we have
+ // to
+ intValue = intValue * 10ULL + (JSLONG)(chr - 48);
+
+ if (intValue > overflowLimit) {
+ return SetError(ds, -1, overflowLimit == LLONG_MAX
+ ? "Value is too big"
+ : "Value is too small");
+ }
+
+ offset++;
+ mantSize++;
+ break;
+ }
+ case '.': {
+ offset++;
+ goto DECODE_FRACTION;
+ break;
+ }
+ case 'e':
+ case 'E': {
+ offset++;
+ goto DECODE_EXPONENT;
+ break;
+ }
+
+ default: {
+ goto BREAK_INT_LOOP;
+ break;
+ }
+ }
+ }
+
+BREAK_INT_LOOP:
+
+ ds->lastType = JT_INT;
+ ds->start = offset;
+
+ if ((intValue >> 31)) {
+ return ds->dec->newLong(ds->prv, (JSINT64)(intValue * (JSINT64)intNeg));
+ } else {
+ return ds->dec->newInt(ds->prv, (JSINT32)(intValue * intNeg));
+ }
+
+DECODE_FRACTION:
+
+ if (ds->dec->preciseFloat) {
+ return decodePreciseFloat(ds);
+ }
+
+ // Scan fraction part
+ frcValue = 0.0;
+ for (;;) {
+ chr = (int)(unsigned char)*(offset);
+
+ switch (chr) {
+ case '0':
+ case '1':
+ case '2':
+ case '3':
+ case '4':
+ case '5':
+ case '6':
+ case '7':
+ case '8':
+ case '9': {
+ if (decimalCount < JSON_DOUBLE_MAX_DECIMALS) {
+ frcValue = frcValue * 10.0 + (double)(chr - 48);
+ decimalCount++;
+ }
+ offset++;
+ break;
+ }
+ case 'e':
+ case 'E': {
+ offset++;
+ goto DECODE_EXPONENT;
+ break;
+ }
+ default: { goto BREAK_FRC_LOOP; }
+ }
+ }
+
+BREAK_FRC_LOOP:
+    // FIXME: Check for arithmetic overflow here
+ ds->lastType = JT_DOUBLE;
+ ds->start = offset;
+ return ds->dec->newDouble(
+ ds->prv,
+ createDouble((double)intNeg, (double)intValue, frcValue, decimalCount));
+
+DECODE_EXPONENT:
+ if (ds->dec->preciseFloat) {
+ return decodePreciseFloat(ds);
+ }
+
+ expNeg = 1.0;
+
+ if (*(offset) == '-') {
+ expNeg = -1.0;
+ offset++;
+ } else if (*(offset) == '+') {
+ expNeg = +1.0;
+ offset++;
+ }
+
+ expValue = 0.0;
+
+ for (;;) {
+ chr = (int)(unsigned char)*(offset);
+
+ switch (chr) {
+ case '0':
+ case '1':
+ case '2':
+ case '3':
+ case '4':
+ case '5':
+ case '6':
+ case '7':
+ case '8':
+ case '9': {
+ expValue = expValue * 10.0 + (double)(chr - 48);
+ offset++;
+ break;
+ }
+ default: { goto BREAK_EXP_LOOP; }
+ }
+ }
+
+BREAK_EXP_LOOP:
+    // FIXME: Check for arithmetic overflow here
+ ds->lastType = JT_DOUBLE;
+ ds->start = offset;
+ return ds->dec->newDouble(
+ ds->prv,
+ createDouble((double)intNeg, (double)intValue, frcValue, decimalCount) *
+ pow(10.0, expValue * expNeg));
+}
+
+FASTCALL_ATTR JSOBJ FASTCALL_MSVC decode_true(struct DecoderState *ds) {
+ char *offset = ds->start;
+ offset++;
+
+ if (*(offset++) != 'r') goto SETERROR;
+ if (*(offset++) != 'u') goto SETERROR;
+ if (*(offset++) != 'e') goto SETERROR;
+
+ ds->lastType = JT_TRUE;
+ ds->start = offset;
+ return ds->dec->newTrue(ds->prv);
+
+SETERROR:
+ return SetError(ds, -1, "Unexpected character found when decoding 'true'");
+}
+
+FASTCALL_ATTR JSOBJ FASTCALL_MSVC decode_false(struct DecoderState *ds) {
+ char *offset = ds->start;
+ offset++;
+
+ if (*(offset++) != 'a') goto SETERROR;
+ if (*(offset++) != 'l') goto SETERROR;
+ if (*(offset++) != 's') goto SETERROR;
+ if (*(offset++) != 'e') goto SETERROR;
+
+ ds->lastType = JT_FALSE;
+ ds->start = offset;
+ return ds->dec->newFalse(ds->prv);
+
+SETERROR:
+ return SetError(ds, -1, "Unexpected character found when decoding 'false'");
+}
+
+FASTCALL_ATTR JSOBJ FASTCALL_MSVC decode_null(struct DecoderState *ds) {
+ char *offset = ds->start;
+ offset++;
+
+ if (*(offset++) != 'u') goto SETERROR;
+ if (*(offset++) != 'l') goto SETERROR;
+ if (*(offset++) != 'l') goto SETERROR;
+
+ ds->lastType = JT_NULL;
+ ds->start = offset;
+ return ds->dec->newNull(ds->prv);
+
+SETERROR:
+ return SetError(ds, -1, "Unexpected character found when decoding 'null'");
+}
+
+FASTCALL_ATTR void FASTCALL_MSVC SkipWhitespace(struct DecoderState *ds) {
+ char *offset;
+
+ for (offset = ds->start; (ds->end - offset) > 0; offset++) {
+ switch (*offset) {
+ case ' ':
+ case '\t':
+ case '\r':
+ case '\n':
+ break;
+
+ default:
+ ds->start = offset;
+ return;
+ }
+ }
+
+ if (offset == ds->end) {
+ ds->start = ds->end;
+ }
+}
+
+enum DECODESTRINGSTATE {
+ DS_ISNULL = 0x32,
+ DS_ISQUOTE,
+ DS_ISESCAPE,
+ DS_UTFLENERROR,
+};
+
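+/*
+Per-byte dispatch table for decode_string:
+  1              copy the byte through as a single character
+  2 / 3 / 4      length of the UTF-8 sequence starting with this byte
+  DS_ISNULL      NUL terminator, DS_ISQUOTE closing '"', DS_ISESCAPE '\'
+  DS_UTFLENERROR invalid UTF-8 lead byte (0xf8-0xff)
+The special states start at 0x32, well above the length codes. */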
+static const JSUINT8 g_decoderLookup[256] = {
+ /* 0x00 */ DS_ISNULL,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ /* 0x10 */ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ /* 0x20 */ 1,
+ 1,
+ DS_ISQUOTE,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ /* 0x30 */ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ /* 0x40 */ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ /* 0x50 */ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ DS_ISESCAPE,
+ 1,
+ 1,
+ 1,
+ /* 0x60 */ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ /* 0x70 */ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ /* 0x80 */ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ /* 0x90 */ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ /* 0xa0 */ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ /* 0xb0 */ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ /* 0xc0 */ 2,
+ 2,
+ 2,
+ 2,
+ 2,
+ 2,
+ 2,
+ 2,
+ 2,
+ 2,
+ 2,
+ 2,
+ 2,
+ 2,
+ 2,
+ 2,
+ /* 0xd0 */ 2,
+ 2,
+ 2,
+ 2,
+ 2,
+ 2,
+ 2,
+ 2,
+ 2,
+ 2,
+ 2,
+ 2,
+ 2,
+ 2,
+ 2,
+ 2,
+ /* 0xe0 */ 3,
+ 3,
+ 3,
+ 3,
+ 3,
+ 3,
+ 3,
+ 3,
+ 3,
+ 3,
+ 3,
+ 3,
+ 3,
+ 3,
+ 3,
+ 3,
+ /* 0xf0 */ 4,
+ 4,
+ 4,
+ 4,
+ 4,
+ 4,
+ 4,
+ 4,
+ DS_UTFLENERROR,
+ DS_UTFLENERROR,
+ DS_UTFLENERROR,
+ DS_UTFLENERROR,
+ DS_UTFLENERROR,
+ DS_UTFLENERROR,
+ DS_UTFLENERROR,
+ DS_UTFLENERROR,
+};
+
+FASTCALL_ATTR JSOBJ FASTCALL_MSVC decode_string(struct DecoderState *ds) {
+ JSUTF16 sur[2] = {0};
+ int iSur = 0;
+ int index;
+ wchar_t *escOffset;
+ wchar_t *escStart;
+ size_t escLen = (ds->escEnd - ds->escStart);
+ JSUINT8 *inputOffset;
+ JSUINT8 oct;
+ JSUTF32 ucs;
+ ds->lastType = JT_INVALID;
+ ds->start++;
+
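+    /* Make sure the wchar_t escape buffer can hold the rest of the input;
+       the first time the initial stack buffer is outgrown it is replaced
+       by a heap allocation (tracked via escHeap). */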
+ if ((size_t)(ds->end - ds->start) > escLen) {
+ size_t newSize = (ds->end - ds->start);
+
+ if (ds->escHeap) {
+ if (newSize > (SIZE_MAX / sizeof(wchar_t))) {
+ return SetError(ds, -1, "Could not reserve memory block");
+ }
+ escStart = (wchar_t *)ds->dec->realloc(ds->escStart,
+ newSize * sizeof(wchar_t));
+ if (!escStart) {
+ ds->dec->free(ds->escStart);
+ return SetError(ds, -1, "Could not reserve memory block");
+ }
+ ds->escStart = escStart;
+ } else {
+ wchar_t *oldStart = ds->escStart;
+ if (newSize > (SIZE_MAX / sizeof(wchar_t))) {
+ return SetError(ds, -1, "Could not reserve memory block");
+ }
+ ds->escStart =
+ (wchar_t *)ds->dec->malloc(newSize * sizeof(wchar_t));
+ if (!ds->escStart) {
+ return SetError(ds, -1, "Could not reserve memory block");
+ }
+ ds->escHeap = 1;
+ memcpy(ds->escStart, oldStart, escLen * sizeof(wchar_t));
+ }
+
+ ds->escEnd = ds->escStart + newSize;
+ }
+
+ escOffset = ds->escStart;
+ inputOffset = (JSUINT8 *)ds->start;
+
+ for (;;) {
+ switch (g_decoderLookup[(JSUINT8)(*inputOffset)]) {
+ case DS_ISNULL: {
+ return SetError(ds, -1,
+                                "Unmatched '\"' when decoding 'string'");
+ }
+ case DS_ISQUOTE: {
+ ds->lastType = JT_UTF8;
+ inputOffset++;
+ ds->start += ((char *)inputOffset - (ds->start));
+ return ds->dec->newString(ds->prv, ds->escStart, escOffset);
+ }
+ case DS_UTFLENERROR: {
+ return SetError(
+ ds, -1,
+ "Invalid UTF-8 sequence length when decoding 'string'");
+ }
+ case DS_ISESCAPE:
+ inputOffset++;
+ switch (*inputOffset) {
+ case '\\':
+ *(escOffset++) = L'\\';
+ inputOffset++;
+ continue;
+ case '\"':
+ *(escOffset++) = L'\"';
+ inputOffset++;
+ continue;
+ case '/':
+ *(escOffset++) = L'/';
+ inputOffset++;
+ continue;
+ case 'b':
+ *(escOffset++) = L'\b';
+ inputOffset++;
+ continue;
+ case 'f':
+ *(escOffset++) = L'\f';
+ inputOffset++;
+ continue;
+ case 'n':
+ *(escOffset++) = L'\n';
+ inputOffset++;
+ continue;
+ case 'r':
+ *(escOffset++) = L'\r';
+ inputOffset++;
+ continue;
+ case 't':
+ *(escOffset++) = L'\t';
+ inputOffset++;
+ continue;
+
+ case 'u': {
+ int index;
+ inputOffset++;
+
+ for (index = 0; index < 4; index++) {
+ switch (*inputOffset) {
+ case '\0':
+ return SetError(ds, -1,
+ "Unterminated unicode "
+ "escape sequence when "
+ "decoding 'string'");
+ default:
+ return SetError(ds, -1,
+ "Unexpected character in "
+ "unicode escape sequence "
+ "when decoding 'string'");
+
+ case '0':
+ case '1':
+ case '2':
+ case '3':
+ case '4':
+ case '5':
+ case '6':
+ case '7':
+ case '8':
+ case '9':
+ sur[iSur] = (sur[iSur] << 4) +
+ (JSUTF16)(*inputOffset - '0');
+ break;
+
+ case 'a':
+ case 'b':
+ case 'c':
+ case 'd':
+ case 'e':
+ case 'f':
+ sur[iSur] = (sur[iSur] << 4) + 10 +
+ (JSUTF16)(*inputOffset - 'a');
+ break;
+
+ case 'A':
+ case 'B':
+ case 'C':
+ case 'D':
+ case 'E':
+ case 'F':
+ sur[iSur] = (sur[iSur] << 4) + 10 +
+ (JSUTF16)(*inputOffset - 'A');
+ break;
+ }
+
+ inputOffset++;
+ }
+
+ if (iSur == 0) {
+ if ((sur[iSur] & 0xfc00) == 0xd800) {
+ // First of a surrogate pair, continue parsing
+ iSur++;
+ break;
+ }
+ (*escOffset++) = (wchar_t)sur[iSur];
+ iSur = 0;
+ } else {
+ // Decode pair
+ if ((sur[1] & 0xfc00) != 0xdc00) {
+ return SetError(ds, -1,
+ "Unpaired high surrogate when "
+ "decoding 'string'");
+ }
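+                            /* On 16-bit wchar_t targets keep the UTF-16
+                               surrogate pair as-is; otherwise combine it
+                               into a single code point. */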
+#if WCHAR_MAX == 0xffff
+ (*escOffset++) = (wchar_t)sur[0];
+ (*escOffset++) = (wchar_t)sur[1];
+#else
+ (*escOffset++) =
+ (wchar_t)0x10000 +
+ (((sur[0] - 0xd800) << 10) | (sur[1] - 0xdc00));
+#endif
+ iSur = 0;
+ }
+ break;
+ }
+
+ case '\0':
+ return SetError(ds, -1,
+ "Unterminated escape sequence when "
+ "decoding 'string'");
+ default:
+ return SetError(ds, -1,
+ "Unrecognized escape sequence when "
+ "decoding 'string'");
+ }
+ break;
+
+ case 1: {
+ *(escOffset++) = (wchar_t)(*inputOffset++);
+ break;
+ }
+
+ case 2: {
+ ucs = (*inputOffset++) & 0x1f;
+ ucs <<= 6;
+ if (((*inputOffset) & 0x80) != 0x80) {
+ return SetError(ds, -1,
+ "Invalid octet in UTF-8 sequence when "
+ "decoding 'string'");
+ }
+ ucs |= (*inputOffset++) & 0x3f;
+ if (ucs < 0x80)
+ return SetError(ds, -1,
+ "Overlong 2 byte UTF-8 sequence detected "
+ "when decoding 'string'");
+ *(escOffset++) = (wchar_t)ucs;
+ break;
+ }
+
+ case 3: {
+ JSUTF32 ucs = 0;
+ ucs |= (*inputOffset++) & 0x0f;
+
+ for (index = 0; index < 2; index++) {
+ ucs <<= 6;
+ oct = (*inputOffset++);
+
+ if ((oct & 0x80) != 0x80) {
+ return SetError(ds, -1,
+ "Invalid octet in UTF-8 sequence when "
+ "decoding 'string'");
+ }
+
+ ucs |= oct & 0x3f;
+ }
+
+ if (ucs < 0x800)
+ return SetError(ds, -1,
+ "Overlong 3 byte UTF-8 sequence detected "
+                                    "when decoding 'string'");
+ *(escOffset++) = (wchar_t)ucs;
+ break;
+ }
+
+ case 4: {
+ JSUTF32 ucs = 0;
+ ucs |= (*inputOffset++) & 0x07;
+
+ for (index = 0; index < 3; index++) {
+ ucs <<= 6;
+ oct = (*inputOffset++);
+
+ if ((oct & 0x80) != 0x80) {
+ return SetError(ds, -1,
+ "Invalid octet in UTF-8 sequence when "
+ "decoding 'string'");
+ }
+
+ ucs |= oct & 0x3f;
+ }
+
+ if (ucs < 0x10000)
+ return SetError(ds, -1,
+ "Overlong 4 byte UTF-8 sequence detected "
+ "when decoding 'string'");
+
+#if WCHAR_MAX == 0xffff
+ if (ucs >= 0x10000) {
+ ucs -= 0x10000;
+ *(escOffset++) = (wchar_t)(ucs >> 10) + 0xd800;
+ *(escOffset++) = (wchar_t)(ucs & 0x3ff) + 0xdc00;
+ } else {
+ *(escOffset++) = (wchar_t)ucs;
+ }
+#else
+ *(escOffset++) = (wchar_t)ucs;
+#endif
+ break;
+ }
+ }
+ }
+}
+
+FASTCALL_ATTR JSOBJ FASTCALL_MSVC decode_array(struct DecoderState *ds) {
+ JSOBJ itemValue;
+ JSOBJ newObj;
+ int len;
+ ds->objDepth++;
+ if (ds->objDepth > JSON_MAX_OBJECT_DEPTH) {
+ return SetError(ds, -1, "Reached object decoding depth limit");
+ }
+
+ newObj = ds->dec->newArray(ds->prv, ds->dec);
+ len = 0;
+
+ ds->lastType = JT_INVALID;
+ ds->start++;
+
+ for (;;) {
+ SkipWhitespace(ds);
+
+ if ((*ds->start) == ']') {
+ ds->objDepth--;
+ if (len == 0) {
+ ds->start++;
+ return ds->dec->endArray(ds->prv, newObj);
+ }
+
+ ds->dec->releaseObject(ds->prv, newObj, ds->dec);
+ return SetError(
+ ds, -1,
+ "Unexpected character found when decoding array value (1)");
+ }
+
+ itemValue = decode_any(ds);
+
+ if (itemValue == NULL) {
+ ds->dec->releaseObject(ds->prv, newObj, ds->dec);
+ return NULL;
+ }
+
+ if (!ds->dec->arrayAddItem(ds->prv, newObj, itemValue)) {
+ ds->dec->releaseObject(ds->prv, newObj, ds->dec);
+ return NULL;
+ }
+
+ SkipWhitespace(ds);
+
+ switch (*(ds->start++)) {
+ case ']': {
+ ds->objDepth--;
+ return ds->dec->endArray(ds->prv, newObj);
+ }
+ case ',':
+ break;
+
+ default:
+ ds->dec->releaseObject(ds->prv, newObj, ds->dec);
+ return SetError(
+ ds, -1,
+ "Unexpected character found when decoding array value (2)");
+ }
+
+ len++;
+ }
+}
+
+FASTCALL_ATTR JSOBJ FASTCALL_MSVC decode_object(struct DecoderState *ds) {
+ JSOBJ itemName;
+ JSOBJ itemValue;
+ JSOBJ newObj;
+
+ ds->objDepth++;
+ if (ds->objDepth > JSON_MAX_OBJECT_DEPTH) {
+ return SetError(ds, -1, "Reached object decoding depth limit");
+ }
+
+ newObj = ds->dec->newObject(ds->prv, ds->dec);
+
+ ds->start++;
+
+ for (;;) {
+ SkipWhitespace(ds);
+
+ if ((*ds->start) == '}') {
+ ds->objDepth--;
+ ds->start++;
+ return ds->dec->endObject(ds->prv, newObj);
+ }
+
+ ds->lastType = JT_INVALID;
+ itemName = decode_any(ds);
+
+ if (itemName == NULL) {
+ ds->dec->releaseObject(ds->prv, newObj, ds->dec);
+ return NULL;
+ }
+
+ if (ds->lastType != JT_UTF8) {
+ ds->dec->releaseObject(ds->prv, newObj, ds->dec);
+ ds->dec->releaseObject(ds->prv, itemName, ds->dec);
+ return SetError(
+ ds, -1,
+ "Key name of object must be 'string' when decoding 'object'");
+ }
+
+ SkipWhitespace(ds);
+
+ if (*(ds->start++) != ':') {
+ ds->dec->releaseObject(ds->prv, newObj, ds->dec);
+ ds->dec->releaseObject(ds->prv, itemName, ds->dec);
+ return SetError(ds, -1, "No ':' found when decoding object value");
+ }
+
+ SkipWhitespace(ds);
+
+ itemValue = decode_any(ds);
+
+ if (itemValue == NULL) {
+ ds->dec->releaseObject(ds->prv, newObj, ds->dec);
+ ds->dec->releaseObject(ds->prv, itemName, ds->dec);
+ return NULL;
+ }
+
+ if (!ds->dec->objectAddKey(ds->prv, newObj, itemName, itemValue)) {
+ ds->dec->releaseObject(ds->prv, newObj, ds->dec);
+ ds->dec->releaseObject(ds->prv, itemName, ds->dec);
+ ds->dec->releaseObject(ds->prv, itemValue, ds->dec);
+ return NULL;
+ }
+
+ SkipWhitespace(ds);
+
+ switch (*(ds->start++)) {
+ case '}': {
+ ds->objDepth--;
+ return ds->dec->endObject(ds->prv, newObj);
+ }
+ case ',':
+ break;
+
+ default:
+ ds->dec->releaseObject(ds->prv, newObj, ds->dec);
+ return SetError(
+ ds, -1,
+ "Unexpected character found when decoding object value");
+ }
+ }
+}
+
+FASTCALL_ATTR JSOBJ FASTCALL_MSVC decode_any(struct DecoderState *ds) {
+ for (;;) {
+ switch (*ds->start) {
+ case '\"':
+ return decode_string(ds);
+ case '0':
+ case '1':
+ case '2':
+ case '3':
+ case '4':
+ case '5':
+ case '6':
+ case '7':
+ case '8':
+ case '9':
+ case '-':
+ return decode_numeric(ds);
+
+ case '[':
+ return decode_array(ds);
+ case '{':
+ return decode_object(ds);
+ case 't':
+ return decode_true(ds);
+ case 'f':
+ return decode_false(ds);
+ case 'n':
+ return decode_null(ds);
+
+ case ' ':
+ case '\t':
+ case '\r':
+ case '\n':
+ // White space
+ ds->start++;
+ break;
+
+ default:
+ return SetError(ds, -1, "Expected object or value");
+ }
+ }
+}
+
+JSOBJ JSON_DecodeObject(JSONObjectDecoder *dec, const char *buffer,
+ size_t cbBuffer) {
+ /*
+    FIXME: Base the size of escBuffer on that of cbBuffer so that the unicode
+ escaping doesn't run into the wall each time */
+ char *locale;
+ struct DecoderState ds;
+ wchar_t escBuffer[(JSON_MAX_STACK_BUFFER_SIZE / sizeof(wchar_t))];
+ JSOBJ ret;
+
+ ds.start = (char *)buffer;
+ ds.end = ds.start + cbBuffer;
+
+ ds.escStart = escBuffer;
+ ds.escEnd = ds.escStart + (JSON_MAX_STACK_BUFFER_SIZE / sizeof(wchar_t));
+ ds.escHeap = 0;
+ ds.prv = dec->prv;
+ ds.dec = dec;
+ ds.dec->errorStr = NULL;
+ ds.dec->errorOffset = NULL;
+ ds.objDepth = 0;
+
+ ds.dec = dec;
+
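+    /* strtod in decodePreciseFloat is locale sensitive; temporarily force
+       LC_NUMERIC to "C" so '.' is always accepted as the decimal separator. */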
+ locale = setlocale(LC_NUMERIC, NULL);
+ if (strcmp(locale, "C")) {
+ locale = strdup(locale);
+ if (!locale) {
+ return SetError(&ds, -1, "Could not reserve memory block");
+ }
+ setlocale(LC_NUMERIC, "C");
+ ret = decode_any(&ds);
+ setlocale(LC_NUMERIC, locale);
+ free(locale);
+ } else {
+ ret = decode_any(&ds);
+ }
+
+ if (ds.escHeap) {
+ dec->free(ds.escStart);
+ }
+
+ SkipWhitespace(&ds);
+
+ if (ds.start != ds.end && ret) {
+ dec->releaseObject(ds.prv, ret, ds.dec);
+ return SetError(&ds, -1, "Trailing data");
+ }
+
+ return ret;
+}
diff --git a/contrib/python/pandas/py2/pandas/_libs/src/ujson/lib/ultrajsonenc.c b/contrib/python/pandas/py2/pandas/_libs/src/ujson/lib/ultrajsonenc.c
new file mode 100644
index 00000000000..2d6c823a455
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/_libs/src/ujson/lib/ultrajsonenc.c
@@ -0,0 +1,1143 @@
+/*
+Copyright (c) 2011-2013, ESN Social Software AB and Jonas Tarnstrom
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ * Neither the name of the ESN Social Software AB nor the
+ names of its contributors may be used to endorse or promote products
+ derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL ESN SOCIAL SOFTWARE AB OR JONAS TARNSTROM BE
+LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+
+Portions of code from MODP_ASCII - Ascii transformations (upper/lower, etc)
+https://github.com/client9/stringencoders
+Copyright (c) 2007 Nick Galbreath -- nickg [at] modp [dot] com. All rights
+reserved.
+
+Numeric decoder derived from TCL library
+http://www.opensource.apple.com/source/tcl/tcl-14/tcl/license.terms
+ * Copyright (c) 1988-1993 The Regents of the University of California.
+ * Copyright (c) 1994 Sun Microsystems, Inc.
+*/
+
+#include <assert.h>
+#include <float.h>
+#include <locale.h>
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "ultrajson.h"
+
+#ifndef TRUE
+#define TRUE 1
+#endif
+#ifndef FALSE
+#define FALSE 0
+#endif
+
+/*
+Worst cases being:
+
+Control characters (ASCII < 32)
+0x00 (1 byte) input => \u0000 output (6 bytes)
+1 * 6 => 6 (6 bytes required)
+
+or UTF-16 surrogate pairs
+4 bytes input in UTF-8 => \uXXXX\uYYYY (12 bytes).
+
+4 * 6 => 24 bytes (12 bytes required)
+
+The extra 2 bytes are for the quotes around the string
+
+*/
+#define RESERVE_STRING(_len) (2 + ((_len)*6))
+
+static const double g_pow10[] = {1,
+ 10,
+ 100,
+ 1000,
+ 10000,
+ 100000,
+ 1000000,
+ 10000000,
+ 100000000,
+ 1000000000,
+ 10000000000,
+ 100000000000,
+ 1000000000000,
+ 10000000000000,
+ 100000000000000,
+ 1000000000000000};
+static const char g_hexChars[] = "0123456789abcdef";
+static const char g_escapeChars[] = "0123456789\\b\\t\\n\\f\\r\\\"\\\\\\/";
+
+/*
+FIXME: While this is fine and dandy and working, it's a magic value mess which
+probably only the author understands.
+Needs a cleanup and more documentation */
+
+/*
+Table for pure ascii output escaping all characters above 127 to \uXXXX */
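+/*
+Meaning of the values in g_asciiOutputTable (see Buffer_EscapeStringValidated):
+  0        NUL byte (end of input or \u0000 escape)
+  1        output verbatim
+  2 .. 4   length of the UTF-8 sequence starting with this byte
+  5, 6     unsupported UTF-8 sequence lengths (rejected)
+  10 .. 24 even offsets into g_escapeChars for the two-character escapes
+           \b \t \n \f \r \" \\ and \/
+  29       '&', '<', '>' - escaped as \u00XX only when encodeHTMLChars is set
+  30       remaining control characters, always escaped as \u00XX */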
+static const JSUINT8 g_asciiOutputTable[256] = {
+ /* 0x00 */ 0,
+ 30,
+ 30,
+ 30,
+ 30,
+ 30,
+ 30,
+ 30,
+ 10,
+ 12,
+ 14,
+ 30,
+ 16,
+ 18,
+ 30,
+ 30,
+ /* 0x10 */ 30,
+ 30,
+ 30,
+ 30,
+ 30,
+ 30,
+ 30,
+ 30,
+ 30,
+ 30,
+ 30,
+ 30,
+ 30,
+ 30,
+ 30,
+ 30,
+ /* 0x20 */ 1,
+ 1,
+ 20,
+ 1,
+ 1,
+ 1,
+ 29,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 24,
+ /* 0x30 */ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 29,
+ 1,
+ 29,
+ 1,
+ /* 0x40 */ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ /* 0x50 */ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 22,
+ 1,
+ 1,
+ 1,
+ /* 0x60 */ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ /* 0x70 */ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ /* 0x80 */ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ /* 0x90 */ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ /* 0xa0 */ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ /* 0xb0 */ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ /* 0xc0 */ 2,
+ 2,
+ 2,
+ 2,
+ 2,
+ 2,
+ 2,
+ 2,
+ 2,
+ 2,
+ 2,
+ 2,
+ 2,
+ 2,
+ 2,
+ 2,
+ /* 0xd0 */ 2,
+ 2,
+ 2,
+ 2,
+ 2,
+ 2,
+ 2,
+ 2,
+ 2,
+ 2,
+ 2,
+ 2,
+ 2,
+ 2,
+ 2,
+ 2,
+ /* 0xe0 */ 3,
+ 3,
+ 3,
+ 3,
+ 3,
+ 3,
+ 3,
+ 3,
+ 3,
+ 3,
+ 3,
+ 3,
+ 3,
+ 3,
+ 3,
+ 3,
+ /* 0xf0 */ 4,
+ 4,
+ 4,
+ 4,
+ 4,
+ 4,
+ 4,
+ 4,
+ 5,
+ 5,
+ 5,
+ 5,
+ 6,
+ 6,
+ 1,
+ 1};
+
+static void SetError(JSOBJ obj, JSONObjectEncoder *enc, const char *message) {
+ enc->errorMsg = message;
+ enc->errorObj = obj;
+}
+
+/*
+FIXME: Keep track of how big these get across several encoder calls and try to
+make an estimate.
+That way we won't run our head into the wall each call */
+void Buffer_Realloc(JSONObjectEncoder *enc, size_t cbNeeded) {
+ size_t curSize = enc->end - enc->start;
+ size_t newSize = curSize * 2;
+ size_t offset = enc->offset - enc->start;
+
+ while (newSize < curSize + cbNeeded) {
+ newSize *= 2;
+ }
+
+ if (enc->heap) {
+ enc->start = (char *)enc->realloc(enc->start, newSize);
+ if (!enc->start) {
+ SetError(NULL, enc, "Could not reserve memory block");
+ return;
+ }
+ } else {
+ char *oldStart = enc->start;
+ enc->heap = 1;
+ enc->start = (char *)enc->malloc(newSize);
+ if (!enc->start) {
+ SetError(NULL, enc, "Could not reserve memory block");
+ return;
+ }
+ memcpy(enc->start, oldStart, offset);
+ }
+ enc->offset = enc->start + offset;
+ enc->end = enc->start + newSize;
+}
+
+FASTCALL_ATTR INLINE_PREFIX void FASTCALL_MSVC
+Buffer_AppendShortHexUnchecked(char *outputOffset, unsigned short value) {
+ *(outputOffset++) = g_hexChars[(value & 0xf000) >> 12];
+ *(outputOffset++) = g_hexChars[(value & 0x0f00) >> 8];
+ *(outputOffset++) = g_hexChars[(value & 0x00f0) >> 4];
+ *(outputOffset++) = g_hexChars[(value & 0x000f) >> 0];
+}
+
+int Buffer_EscapeStringUnvalidated(JSONObjectEncoder *enc, const char *io,
+ const char *end) {
+ char *of = (char *)enc->offset;
+
+ for (;;) {
+ switch (*io) {
+ case 0x00: {
+ if (io < end) {
+ *(of++) = '\\';
+ *(of++) = 'u';
+ *(of++) = '0';
+ *(of++) = '0';
+ *(of++) = '0';
+ *(of++) = '0';
+ break;
+ } else {
+ enc->offset += (of - enc->offset);
+ return TRUE;
+ }
+ }
+ case '\"':
+ (*of++) = '\\';
+ (*of++) = '\"';
+ break;
+ case '\\':
+ (*of++) = '\\';
+ (*of++) = '\\';
+ break;
+ case '/':
+ (*of++) = '\\';
+ (*of++) = '/';
+ break;
+ case '\b':
+ (*of++) = '\\';
+ (*of++) = 'b';
+ break;
+ case '\f':
+ (*of++) = '\\';
+ (*of++) = 'f';
+ break;
+ case '\n':
+ (*of++) = '\\';
+ (*of++) = 'n';
+ break;
+ case '\r':
+ (*of++) = '\\';
+ (*of++) = 'r';
+ break;
+ case '\t':
+ (*of++) = '\\';
+ (*of++) = 't';
+ break;
+
+            case 0x26: // '&'
+ case 0x3c: // '<'
+ case 0x3e: // '>'
+ {
+ if (enc->encodeHTMLChars) {
+ // Fall through to \u00XX case below.
+ } else {
+ // Same as default case below.
+ (*of++) = (*io);
+ break;
+ }
+ }
+ case 0x01:
+ case 0x02:
+ case 0x03:
+ case 0x04:
+ case 0x05:
+ case 0x06:
+ case 0x07:
+ case 0x0b:
+ case 0x0e:
+ case 0x0f:
+ case 0x10:
+ case 0x11:
+ case 0x12:
+ case 0x13:
+ case 0x14:
+ case 0x15:
+ case 0x16:
+ case 0x17:
+ case 0x18:
+ case 0x19:
+ case 0x1a:
+ case 0x1b:
+ case 0x1c:
+ case 0x1d:
+ case 0x1e:
+ case 0x1f: {
+ *(of++) = '\\';
+ *(of++) = 'u';
+ *(of++) = '0';
+ *(of++) = '0';
+ *(of++) = g_hexChars[(unsigned char)(((*io) & 0xf0) >> 4)];
+ *(of++) = g_hexChars[(unsigned char)((*io) & 0x0f)];
+ break;
+ }
+ default:
+ (*of++) = (*io);
+ break;
+ }
+ io++;
+ }
+}
+
+int Buffer_EscapeStringValidated(JSOBJ obj, JSONObjectEncoder *enc,
+ const char *io, const char *end) {
+ JSUTF32 ucs;
+ char *of = (char *)enc->offset;
+
+ for (;;) {
+ JSUINT8 utflen = g_asciiOutputTable[(unsigned char)*io];
+
+ switch (utflen) {
+ case 0: {
+ if (io < end) {
+ *(of++) = '\\';
+ *(of++) = 'u';
+ *(of++) = '0';
+ *(of++) = '0';
+ *(of++) = '0';
+ *(of++) = '0';
+ io++;
+ continue;
+ } else {
+ enc->offset += (of - enc->offset);
+ return TRUE;
+ }
+ }
+
+ case 1: {
+ *(of++) = (*io++);
+ continue;
+ }
+
+ case 2: {
+ JSUTF32 in;
+ JSUTF16 in16;
+
+ if (end - io < 1) {
+ enc->offset += (of - enc->offset);
+ SetError(
+ obj, enc,
+ "Unterminated UTF-8 sequence when encoding string");
+ return FALSE;
+ }
+
+ memcpy(&in16, io, sizeof(JSUTF16));
+ in = (JSUTF32)in16;
+
+#ifdef __LITTLE_ENDIAN__
+ ucs = ((in & 0x1f) << 6) | ((in >> 8) & 0x3f);
+#else
+ ucs = ((in & 0x1f00) >> 2) | (in & 0x3f);
+#endif
+
+ if (ucs < 0x80) {
+ enc->offset += (of - enc->offset);
+ SetError(obj, enc,
+ "Overlong 2 byte UTF-8 sequence detected when "
+ "encoding string");
+ return FALSE;
+ }
+
+ io += 2;
+ break;
+ }
+
+ case 3: {
+ JSUTF32 in;
+ JSUTF16 in16;
+ JSUINT8 in8;
+
+ if (end - io < 2) {
+ enc->offset += (of - enc->offset);
+ SetError(
+ obj, enc,
+ "Unterminated UTF-8 sequence when encoding string");
+ return FALSE;
+ }
+
+ memcpy(&in16, io, sizeof(JSUTF16));
+ memcpy(&in8, io + 2, sizeof(JSUINT8));
+#ifdef __LITTLE_ENDIAN__
+ in = (JSUTF32)in16;
+ in |= in8 << 16;
+ ucs = ((in & 0x0f) << 12) | ((in & 0x3f00) >> 2) |
+ ((in & 0x3f0000) >> 16);
+#else
+ in = in16 << 8;
+ in |= in8;
+ ucs =
+ ((in & 0x0f0000) >> 4) | ((in & 0x3f00) >> 2) | (in & 0x3f);
+#endif
+
+ if (ucs < 0x800) {
+ enc->offset += (of - enc->offset);
+ SetError(obj, enc,
+ "Overlong 3 byte UTF-8 sequence detected when "
+ "encoding string");
+ return FALSE;
+ }
+
+ io += 3;
+ break;
+ }
+ case 4: {
+ JSUTF32 in;
+
+ if (end - io < 3) {
+ enc->offset += (of - enc->offset);
+ SetError(
+ obj, enc,
+ "Unterminated UTF-8 sequence when encoding string");
+ return FALSE;
+ }
+
+ memcpy(&in, io, sizeof(JSUTF32));
+#ifdef __LITTLE_ENDIAN__
+ ucs = ((in & 0x07) << 18) | ((in & 0x3f00) << 4) |
+ ((in & 0x3f0000) >> 10) | ((in & 0x3f000000) >> 24);
+#else
+ ucs = ((in & 0x07000000) >> 6) | ((in & 0x3f0000) >> 4) |
+ ((in & 0x3f00) >> 2) | (in & 0x3f);
+#endif
+ if (ucs < 0x10000) {
+ enc->offset += (of - enc->offset);
+ SetError(obj, enc,
+ "Overlong 4 byte UTF-8 sequence detected when "
+ "encoding string");
+ return FALSE;
+ }
+
+ io += 4;
+ break;
+ }
+
+ case 5:
+ case 6: {
+ enc->offset += (of - enc->offset);
+ SetError(
+ obj, enc,
+ "Unsupported UTF-8 sequence length when encoding string");
+ return FALSE;
+ }
+
+ case 29: {
+ if (enc->encodeHTMLChars) {
+ // Fall through to \u00XX case 30 below.
+ } else {
+ // Same as case 1 above.
+ *(of++) = (*io++);
+ continue;
+ }
+ }
+
+ case 30: {
+ // \uXXXX encode
+ *(of++) = '\\';
+ *(of++) = 'u';
+ *(of++) = '0';
+ *(of++) = '0';
+ *(of++) = g_hexChars[(unsigned char)(((*io) & 0xf0) >> 4)];
+ *(of++) = g_hexChars[(unsigned char)((*io) & 0x0f)];
+ io++;
+ continue;
+ }
+ case 10:
+ case 12:
+ case 14:
+ case 16:
+ case 18:
+ case 20:
+ case 22:
+ case 24: {
+ *(of++) = *((char *)(g_escapeChars + utflen + 0));
+ *(of++) = *((char *)(g_escapeChars + utflen + 1));
+ io++;
+ continue;
+ }
+ // This can never happen, it's here to make L4 VC++ happy
+ default: {
+ ucs = 0;
+ break;
+ }
+ }
+
+ /*
+ If the character is a UTF8 sequence of length > 1 we end up here */
+ if (ucs >= 0x10000) {
+ ucs -= 0x10000;
+ *(of++) = '\\';
+ *(of++) = 'u';
+ Buffer_AppendShortHexUnchecked(
+ of, (unsigned short)(ucs >> 10) + 0xd800);
+ of += 4;
+
+ *(of++) = '\\';
+ *(of++) = 'u';
+ Buffer_AppendShortHexUnchecked(
+ of, (unsigned short)(ucs & 0x3ff) + 0xdc00);
+ of += 4;
+ } else {
+ *(of++) = '\\';
+ *(of++) = 'u';
+ Buffer_AppendShortHexUnchecked(of, (unsigned short)ucs);
+ of += 4;
+ }
+ }
+}
+
+#define Buffer_AppendCharUnchecked(__enc, __chr) *((__enc)->offset++) = __chr;
+
+FASTCALL_ATTR INLINE_PREFIX void FASTCALL_MSVC strreverse(char *begin,
+ char *end) {
+ char aux;
+ while (end > begin) aux = *end, *end-- = *begin, *begin++ = aux;
+}
+
+void Buffer_AppendIntUnchecked(JSONObjectEncoder *enc, JSINT32 value) {
+ char *wstr;
+ JSUINT32 uvalue = (value < 0) ? -value : value;
+ wstr = enc->offset;
+
+ // Conversion. Number is reversed.
+ do {
+ *wstr++ = (char)(48 + (uvalue % 10));
+ } while (uvalue /= 10);
+ if (value < 0) *wstr++ = '-';
+
+ // Reverse string
+ strreverse(enc->offset, wstr - 1);
+ enc->offset += (wstr - (enc->offset));
+}
+
+void Buffer_AppendLongUnchecked(JSONObjectEncoder *enc, JSINT64 value) {
+ char *wstr;
+ JSUINT64 uvalue = (value < 0) ? -value : value;
+
+ wstr = enc->offset;
+ // Conversion. Number is reversed.
+
+ do {
+ *wstr++ = (char)(48 + (uvalue % 10ULL));
+ } while (uvalue /= 10ULL);
+ if (value < 0) *wstr++ = '-';
+
+ // Reverse string
+ strreverse(enc->offset, wstr - 1);
+ enc->offset += (wstr - (enc->offset));
+}
+
+int Buffer_AppendDoubleUnchecked(JSOBJ obj, JSONObjectEncoder *enc,
+ double value) {
+ /* if input is beyond the thresholds, revert to exponential */
+ const double thres_max = (double)1e16 - 1;
+ const double thres_min = (double)1e-15;
+ char precision_str[20];
+ int count;
+ double diff = 0.0;
+ char *str = enc->offset;
+ char *wstr = str;
+ unsigned long long whole;
+ double tmp;
+ unsigned long long frac;
+ int neg;
+ double pow10;
+
+ if (value == HUGE_VAL || value == -HUGE_VAL) {
+ SetError(obj, enc, "Invalid Inf value when encoding double");
+ return FALSE;
+ }
+
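+    /* (value == value) is false only when value is NaN */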
+ if (!(value == value)) {
+ SetError(obj, enc, "Invalid Nan value when encoding double");
+ return FALSE;
+ }
+
+ /* we'll work in positive values and deal with the
+ negative sign issue later */
+ neg = 0;
+ if (value < 0) {
+ neg = 1;
+ value = -value;
+ }
+
+ /*
+ for very large or small numbers switch back to native sprintf for
+ exponentials. anyone want to write code to replace this? */
+ if (value > thres_max || (value != 0.0 && fabs(value) < thres_min)) {
+ precision_str[0] = '%';
+ precision_str[1] = '.';
+#if defined(_WIN32) && defined(_MSC_VER)
+ sprintf_s(precision_str + 2, sizeof(precision_str) - 2, "%ug",
+ enc->doublePrecision);
+ enc->offset += sprintf_s(str, enc->end - enc->offset, precision_str,
+ neg ? -value : value);
+#else
+ snprintf(precision_str + 2, sizeof(precision_str) - 2, "%ug",
+ enc->doublePrecision);
+ enc->offset += snprintf(str, enc->end - enc->offset, precision_str,
+ neg ? -value : value);
+#endif
+ return TRUE;
+ }
+
+ pow10 = g_pow10[enc->doublePrecision];
+
+ whole = (unsigned long long)value;
+ tmp = (value - whole) * pow10;
+ frac = (unsigned long long)(tmp);
+ diff = tmp - frac;
+
+ if (diff > 0.5) {
+ ++frac;
+ } else if (diff == 0.5 && ((frac == 0) || (frac & 1))) {
+ /* if halfway, round up if odd, OR
+ if last digit is 0. That last part is strange */
+ ++frac;
+ }
+
+ // handle rollover, e.g.
+    // case 0.99 with prec 1 is 1.0 and case 0.95 with prec 1 is 1.0 as well
+ if (frac >= pow10) {
+ frac = 0;
+ ++whole;
+ }
+
+ if (enc->doublePrecision == 0) {
+ diff = value - whole;
+
+ if (diff > 0.5) {
+ /* greater than 0.5, round up, e.g. 1.6 -> 2 */
+ ++whole;
+ } else if (diff == 0.5 && (whole & 1)) {
+ /* exactly 0.5 and ODD, then round up */
+ /* 1.5 -> 2, but 2.5 -> 2 */
+ ++whole;
+ }
+
+ // vvvvvvvvvvvvvvvvvvv Diff from modp_dto2
+ } else if (frac) {
+ count = enc->doublePrecision;
+ // now do fractional part, as an unsigned number
+        // we know it is not 0 but it can have trailing zeros, which
+        // should be removed
+ while (!(frac % 10)) {
+ --count;
+ frac /= 10;
+ }
+ //^^^^^^^^^^^^^^^^^^^ Diff from modp_dto2
+
+ // now do fractional part, as an unsigned number
+ do {
+ --count;
+ *wstr++ = (char)(48 + (frac % 10));
+ } while (frac /= 10);
+ // add extra 0s
+ while (count-- > 0) {
+ *wstr++ = '0';
+ }
+ // add decimal
+ *wstr++ = '.';
+ } else {
+ *wstr++ = '0';
+ *wstr++ = '.';
+ }
+
+ // Do whole part. Take care of sign
+ // conversion. Number is reversed.
+ do {
+ *wstr++ = (char)(48 + (whole % 10));
+ } while (whole /= 10);
+
+ if (neg) {
+ *wstr++ = '-';
+ }
+ strreverse(str, wstr - 1);
+ enc->offset += (wstr - (enc->offset));
+
+ return TRUE;
+}
+
+/*
+FIXME:
+Handle integration functions returning NULL here */
+
+/*
+FIXME:
+Perhaps implement recursion detection */
+
+void encode(JSOBJ obj, JSONObjectEncoder *enc, const char *name,
+ size_t cbName) {
+ const char *value;
+ char *objName;
+ int count;
+ JSOBJ iterObj;
+ size_t szlen;
+ JSONTypeContext tc;
+ tc.encoder = enc;
+
+ if (enc->level > enc->recursionMax) {
+ SetError(obj, enc, "Maximum recursion level reached");
+ return;
+ }
+
+ /*
+ This reservation must hold
+
+ length of _name as encoded worst case +
+ maxLength of double to string OR maxLength of JSLONG to string
+ */
+
+ Buffer_Reserve(enc, 256 + RESERVE_STRING(cbName));
+ if (enc->errorMsg) {
+ return;
+ }
+
+ if (name) {
+ Buffer_AppendCharUnchecked(enc, '\"');
+
+ if (enc->forceASCII) {
+ if (!Buffer_EscapeStringValidated(obj, enc, name, name + cbName)) {
+ return;
+ }
+ } else {
+ if (!Buffer_EscapeStringUnvalidated(enc, name, name + cbName)) {
+ return;
+ }
+ }
+
+ Buffer_AppendCharUnchecked(enc, '\"');
+
+ Buffer_AppendCharUnchecked(enc, ':');
+#ifndef JSON_NO_EXTRA_WHITESPACE
+ Buffer_AppendCharUnchecked(enc, ' ');
+#endif
+ }
+
+ enc->beginTypeContext(obj, &tc);
+
+ switch (tc.type) {
+ case JT_INVALID: {
+ return;
+ }
+
+ case JT_ARRAY: {
+ count = 0;
+ enc->iterBegin(obj, &tc);
+
+ Buffer_AppendCharUnchecked(enc, '[');
+
+ while (enc->iterNext(obj, &tc)) {
+ if (count > 0) {
+ Buffer_AppendCharUnchecked(enc, ',');
+#ifndef JSON_NO_EXTRA_WHITESPACE
+                    Buffer_AppendCharUnchecked(enc, ' ');
+#endif
+ }
+
+ iterObj = enc->iterGetValue(obj, &tc);
+
+ enc->level++;
+ encode(iterObj, enc, NULL, 0);
+ count++;
+ }
+
+ enc->iterEnd(obj, &tc);
+ Buffer_Reserve(enc, 2);
+ Buffer_AppendCharUnchecked(enc, ']');
+ break;
+ }
+
+ case JT_OBJECT: {
+ count = 0;
+ enc->iterBegin(obj, &tc);
+
+ Buffer_AppendCharUnchecked(enc, '{');
+
+ while (enc->iterNext(obj, &tc)) {
+ if (count > 0) {
+ Buffer_AppendCharUnchecked(enc, ',');
+#ifndef JSON_NO_EXTRA_WHITESPACE
+ Buffer_AppendCharUnchecked(enc, ' ');
+#endif
+ }
+
+ iterObj = enc->iterGetValue(obj, &tc);
+ objName = enc->iterGetName(obj, &tc, &szlen);
+
+ enc->level++;
+ encode(iterObj, enc, objName, szlen);
+ count++;
+ }
+
+ enc->iterEnd(obj, &tc);
+ Buffer_Reserve(enc, 2);
+ Buffer_AppendCharUnchecked(enc, '}');
+ break;
+ }
+
+ case JT_LONG: {
+ Buffer_AppendLongUnchecked(enc, enc->getLongValue(obj, &tc));
+ break;
+ }
+
+ case JT_INT: {
+ Buffer_AppendIntUnchecked(enc, enc->getIntValue(obj, &tc));
+ break;
+ }
+
+ case JT_TRUE: {
+ Buffer_AppendCharUnchecked(enc, 't');
+ Buffer_AppendCharUnchecked(enc, 'r');
+ Buffer_AppendCharUnchecked(enc, 'u');
+ Buffer_AppendCharUnchecked(enc, 'e');
+ break;
+ }
+
+ case JT_FALSE: {
+ Buffer_AppendCharUnchecked(enc, 'f');
+ Buffer_AppendCharUnchecked(enc, 'a');
+ Buffer_AppendCharUnchecked(enc, 'l');
+ Buffer_AppendCharUnchecked(enc, 's');
+ Buffer_AppendCharUnchecked(enc, 'e');
+ break;
+ }
+
+ case JT_NULL: {
+ Buffer_AppendCharUnchecked(enc, 'n');
+ Buffer_AppendCharUnchecked(enc, 'u');
+ Buffer_AppendCharUnchecked(enc, 'l');
+ Buffer_AppendCharUnchecked(enc, 'l');
+ break;
+ }
+
+ case JT_DOUBLE: {
+ if (!Buffer_AppendDoubleUnchecked(obj, enc,
+ enc->getDoubleValue(obj, &tc))) {
+ enc->endTypeContext(obj, &tc);
+ enc->level--;
+ return;
+ }
+ break;
+ }
+
+ case JT_UTF8: {
+ value = enc->getStringValue(obj, &tc, &szlen);
+ Buffer_Reserve(enc, RESERVE_STRING(szlen));
+ if (enc->errorMsg) {
+ enc->endTypeContext(obj, &tc);
+ return;
+ }
+ Buffer_AppendCharUnchecked(enc, '\"');
+
+ if (enc->forceASCII) {
+ if (!Buffer_EscapeStringValidated(obj, enc, value,
+ value + szlen)) {
+ enc->endTypeContext(obj, &tc);
+ enc->level--;
+ return;
+ }
+ } else {
+ if (!Buffer_EscapeStringUnvalidated(enc, value,
+ value + szlen)) {
+ enc->endTypeContext(obj, &tc);
+ enc->level--;
+ return;
+ }
+ }
+
+ Buffer_AppendCharUnchecked(enc, '\"');
+ break;
+ }
+ }
+
+ enc->endTypeContext(obj, &tc);
+ enc->level--;
+}
+
+char *JSON_EncodeObject(JSOBJ obj, JSONObjectEncoder *enc, char *_buffer,
+ size_t _cbBuffer) {
+ char *locale;
+ enc->malloc = enc->malloc ? enc->malloc : malloc;
+ enc->free = enc->free ? enc->free : free;
+ enc->realloc = enc->realloc ? enc->realloc : realloc;
+ enc->errorMsg = NULL;
+ enc->errorObj = NULL;
+ enc->level = 0;
+
+ if (enc->recursionMax < 1) {
+ enc->recursionMax = JSON_MAX_RECURSION_DEPTH;
+ }
+
+ if (enc->doublePrecision < 0 ||
+ enc->doublePrecision > JSON_DOUBLE_MAX_DECIMALS) {
+ enc->doublePrecision = JSON_DOUBLE_MAX_DECIMALS;
+ }
+
+ if (_buffer == NULL) {
+ _cbBuffer = 32768;
+ enc->start = (char *)enc->malloc(_cbBuffer);
+ if (!enc->start) {
+ SetError(obj, enc, "Could not reserve memory block");
+ return NULL;
+ }
+ enc->heap = 1;
+ } else {
+ enc->start = _buffer;
+ enc->heap = 0;
+ }
+
+ enc->end = enc->start + _cbBuffer;
+ enc->offset = enc->start;
+
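+    /* Double formatting via snprintf is locale sensitive; temporarily force
+       LC_NUMERIC to "C" so the decimal separator is always '.'. */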
+ locale = setlocale(LC_NUMERIC, NULL);
+ if (strcmp(locale, "C")) {
+ locale = strdup(locale);
+ if (!locale) {
+ SetError(NULL, enc, "Could not reserve memory block");
+ return NULL;
+ }
+ setlocale(LC_NUMERIC, "C");
+ encode(obj, enc, NULL, 0);
+ setlocale(LC_NUMERIC, locale);
+ free(locale);
+ } else {
+ encode(obj, enc, NULL, 0);
+ }
+
+ Buffer_Reserve(enc, 1);
+ if (enc->errorMsg) {
+ return NULL;
+ }
+ Buffer_AppendCharUnchecked(enc, '\0');
+
+ return enc->start;
+}
diff --git a/contrib/python/pandas/py2/pandas/_libs/src/ujson/python/JSONtoObj.c b/contrib/python/pandas/py2/pandas/_libs/src/ujson/python/JSONtoObj.c
new file mode 100644
index 00000000000..85cf1d5e5e7
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/_libs/src/ujson/python/JSONtoObj.c
@@ -0,0 +1,638 @@
+/*
+Copyright (c) 2011-2013, ESN Social Software AB and Jonas Tarnstrom
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ * Neither the name of the ESN Social Software AB nor the
+ names of its contributors may be used to endorse or promote products
+ derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL ESN SOCIAL SOFTWARE AB OR JONAS TARNSTROM BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+
+Portions of code from MODP_ASCII - Ascii transformations (upper/lower, etc)
+https://github.com/client9/stringencoders
+Copyright (c) 2007 Nick Galbreath -- nickg [at] modp [dot] com. All rights reserved.
+
+Numeric decoder derived from TCL library
+http://www.opensource.apple.com/source/tcl/tcl-14/tcl/license.terms
+ * Copyright (c) 1988-1993 The Regents of the University of California.
+ * Copyright (c) 1994 Sun Microsystems, Inc.
+*/
+
+// "py_defines.h" needs to be included first to
+// avoid compilation errors, but it does violate
+// styleguide checks with regards to include order.
+#include "py_defines.h"
+#define PY_ARRAY_UNIQUE_SYMBOL UJSON_NUMPY
+#define NO_IMPORT_ARRAY
+#include <numpy/arrayobject.h> // NOLINT(build/include_order)
+#include <ultrajson.h> // NOLINT(build/include_order)
+
+#define PRINTMARK()
+
+typedef struct __PyObjectDecoder {
+ JSONObjectDecoder dec;
+
+ void *npyarr; // Numpy context buffer
+ void *npyarr_addr; // Ref to npyarr ptr to track DECREF calls
+ npy_intp curdim; // Current array dimension
+
+ PyArray_Descr *dtype;
+} PyObjectDecoder;
+
+typedef struct __NpyArrContext {
+ PyObject *ret;
+ PyObject *labels[2];
+ PyArray_Dims shape;
+
+ PyObjectDecoder *dec;
+
+ npy_intp i;
+ npy_intp elsize;
+ npy_intp elcount;
+} NpyArrContext;
+
+// Numpy handling based on numpy internal code, specifically the function
+// PyArray_FromIter.
+
+// numpy related functions are inter-dependent so declare them all here,
+// to ensure the compiler catches any errors
+
+// standard numpy array handling
+JSOBJ Object_npyNewArray(void *prv, void *decoder);
+JSOBJ Object_npyEndArray(void *prv, JSOBJ obj);
+int Object_npyArrayAddItem(void *prv, JSOBJ obj, JSOBJ value);
+
+// for more complex dtypes (object and string) fill a standard Python list
+// and convert to a numpy array when done.
+JSOBJ Object_npyNewArrayList(void *prv, void *decoder);
+JSOBJ Object_npyEndArrayList(void *prv, JSOBJ obj);
+int Object_npyArrayListAddItem(void *prv, JSOBJ obj, JSOBJ value);
+
+// labelled support, encode keys and values of JS object into separate numpy
+// arrays
+JSOBJ Object_npyNewObject(void *prv, void *decoder);
+JSOBJ Object_npyEndObject(void *prv, JSOBJ obj);
+int Object_npyObjectAddKey(void *prv, JSOBJ obj, JSOBJ name, JSOBJ value);
+
+// free the numpy context buffer
+void Npy_releaseContext(NpyArrContext *npyarr) {
+ PRINTMARK();
+ if (npyarr) {
+ if (npyarr->shape.ptr) {
+ PyObject_Free(npyarr->shape.ptr);
+ }
+ if (npyarr->dec) {
+ npyarr->dec->npyarr = NULL;
+ npyarr->dec->curdim = 0;
+ }
+ Py_XDECREF(npyarr->labels[0]);
+ Py_XDECREF(npyarr->labels[1]);
+ Py_XDECREF(npyarr->ret);
+ PyObject_Free(npyarr);
+ }
+}
+
+JSOBJ Object_npyNewArray(void *prv, void *_decoder) {
+ NpyArrContext *npyarr;
+ PyObjectDecoder *decoder = (PyObjectDecoder *)_decoder;
+ PRINTMARK();
+ if (decoder->curdim <= 0) {
+ // start of array - initialise the context buffer
+ npyarr = decoder->npyarr = PyObject_Malloc(sizeof(NpyArrContext));
+ decoder->npyarr_addr = npyarr;
+
+ if (!npyarr) {
+ PyErr_NoMemory();
+ return NULL;
+ }
+
+ npyarr->dec = decoder;
+ npyarr->labels[0] = npyarr->labels[1] = NULL;
+
+ npyarr->shape.ptr = PyObject_Malloc(sizeof(npy_intp) * NPY_MAXDIMS);
+ npyarr->shape.len = 1;
+ npyarr->ret = NULL;
+
+ npyarr->elsize = 0;
+ npyarr->elcount = 4;
+ npyarr->i = 0;
+ } else {
+ // starting a new dimension continue the current array (and reshape
+ // after)
+ npyarr = (NpyArrContext *)decoder->npyarr;
+ if (decoder->curdim >= npyarr->shape.len) {
+ npyarr->shape.len++;
+ }
+ }
+
+ npyarr->shape.ptr[decoder->curdim] = 0;
+ decoder->curdim++;
+ return npyarr;
+}
+
+PyObject *Npy_returnLabelled(NpyArrContext *npyarr) {
+ PyObject *ret = npyarr->ret;
+ npy_intp i;
+
+ if (npyarr->labels[0] || npyarr->labels[1]) {
+ // finished decoding, build tuple with values and labels
+ ret = PyTuple_New(npyarr->shape.len + 1);
+ for (i = 0; i < npyarr->shape.len; i++) {
+ if (npyarr->labels[i]) {
+ PyTuple_SET_ITEM(ret, i + 1, npyarr->labels[i]);
+ npyarr->labels[i] = NULL;
+ } else {
+ Py_INCREF(Py_None);
+ PyTuple_SET_ITEM(ret, i + 1, Py_None);
+ }
+ }
+ PyTuple_SET_ITEM(ret, 0, npyarr->ret);
+ }
+
+ return ret;
+}
+
+JSOBJ Object_npyEndArray(void *prv, JSOBJ obj) {
+ PyObject *ret;
+ char *new_data;
+ NpyArrContext *npyarr = (NpyArrContext *)obj;
+ int emptyType = NPY_DEFAULT_TYPE;
+ npy_intp i;
+ PRINTMARK();
+ if (!npyarr) {
+ return NULL;
+ }
+
+ ret = npyarr->ret;
+ i = npyarr->i;
+
+ npyarr->dec->curdim--;
+
+ if (i == 0 || !npyarr->ret) {
+ // empty array would not have been initialised so do it now.
+ if (npyarr->dec->dtype) {
+ emptyType = npyarr->dec->dtype->type_num;
+ }
+ npyarr->ret = ret =
+ PyArray_EMPTY(npyarr->shape.len, npyarr->shape.ptr, emptyType, 0);
+ } else if (npyarr->dec->curdim <= 0) {
+ // realloc to final size
+ new_data = PyDataMem_RENEW(PyArray_DATA(ret), i * npyarr->elsize);
+ if (new_data == NULL) {
+ PyErr_NoMemory();
+ Npy_releaseContext(npyarr);
+ return NULL;
+ }
+ ((PyArrayObject *)ret)->data = (void *)new_data;
+ // PyArray_BYTES(ret) = new_data;
+ }
+
+ if (npyarr->dec->curdim <= 0) {
+ // finished decoding array, reshape if necessary
+ if (npyarr->shape.len > 1) {
+ npyarr->ret = PyArray_Newshape((PyArrayObject *)ret, &npyarr->shape,
+ NPY_ANYORDER);
+ Py_DECREF(ret);
+ }
+
+ ret = Npy_returnLabelled(npyarr);
+
+ npyarr->ret = NULL;
+ Npy_releaseContext(npyarr);
+ }
+
+ return ret;
+}
+
+int Object_npyArrayAddItem(void *prv, JSOBJ obj, JSOBJ value) {
+ PyObject *type;
+ PyArray_Descr *dtype;
+ npy_intp i;
+ char *new_data, *item;
+ NpyArrContext *npyarr = (NpyArrContext *)obj;
+ PRINTMARK();
+ if (!npyarr) {
+ return 0;
+ }
+
+ i = npyarr->i;
+
+ npyarr->shape.ptr[npyarr->dec->curdim - 1]++;
+
+ if (PyArray_Check((PyObject *)value)) {
+ // multidimensional array, keep decoding values.
+ return 1;
+ }
+
+ if (!npyarr->ret) {
+ // Array not initialised yet.
+ // We do it here so we can 'sniff' the data type if none was provided
+ if (!npyarr->dec->dtype) {
+ type = PyObject_Type(value);
+ if (!PyArray_DescrConverter(type, &dtype)) {
+ Py_DECREF(type);
+ goto fail;
+ }
+ Py_INCREF(dtype);
+ Py_DECREF(type);
+ } else {
+ dtype = PyArray_DescrNew(npyarr->dec->dtype);
+ }
+
+ // If it's an object or string then fill a Python list and subsequently
+ // convert. Otherwise we would need to somehow mess about with
+ // reference counts when renewing memory.
+ npyarr->elsize = dtype->elsize;
+ if (PyDataType_REFCHK(dtype) || npyarr->elsize == 0) {
+ Py_XDECREF(dtype);
+
+ if (npyarr->dec->curdim > 1) {
+ PyErr_SetString(PyExc_ValueError,
+ "Cannot decode multidimensional arrays with "
+ "variable length elements to numpy");
+ goto fail;
+ }
+ npyarr->elcount = 0;
+ npyarr->ret = PyList_New(0);
+ if (!npyarr->ret) {
+ goto fail;
+ }
+ ((JSONObjectDecoder *)npyarr->dec)->newArray =
+ Object_npyNewArrayList;
+ ((JSONObjectDecoder *)npyarr->dec)->arrayAddItem =
+ Object_npyArrayListAddItem;
+ ((JSONObjectDecoder *)npyarr->dec)->endArray =
+ Object_npyEndArrayList;
+ return Object_npyArrayListAddItem(prv, obj, value);
+ }
+
+ npyarr->ret = PyArray_NewFromDescr(
+ &PyArray_Type, dtype, 1, &npyarr->elcount, NULL, NULL, 0, NULL);
+
+ if (!npyarr->ret) {
+ goto fail;
+ }
+ }
+
+ if (i >= npyarr->elcount) {
+ // Grow PyArray_DATA(ret):
+        // this is similar to the strategy for PyListObject, but we use
+ // 50% overallocation => 0, 4, 8, 14, 23, 36, 56, 86 ...
+ if (npyarr->elsize == 0) {
+ PyErr_SetString(PyExc_ValueError,
+ "Cannot decode multidimensional arrays with "
+ "variable length elements to numpy");
+ goto fail;
+ }
+
+ npyarr->elcount = (i >> 1) + (i < 4 ? 4 : 2) + i;
+ if (npyarr->elcount <= NPY_MAX_INTP / npyarr->elsize) {
+ new_data = PyDataMem_RENEW(PyArray_DATA(npyarr->ret),
+ npyarr->elcount * npyarr->elsize);
+ } else {
+ PyErr_NoMemory();
+ goto fail;
+ }
+ ((PyArrayObject *)npyarr->ret)->data = (void *)new_data;
+
+ // PyArray_BYTES(npyarr->ret) = new_data;
+ }
+
+ PyArray_DIMS(npyarr->ret)[0] = i + 1;
+
+ if ((item = PyArray_GETPTR1(npyarr->ret, i)) == NULL ||
+ PyArray_SETITEM(npyarr->ret, item, value) == -1) {
+ goto fail;
+ }
+
+ Py_DECREF((PyObject *)value);
+ npyarr->i++;
+ return 1;
+
+fail:
+
+ Npy_releaseContext(npyarr);
+ return 0;
+}
+
+JSOBJ Object_npyNewArrayList(void *prv, void *_decoder) {
+ PyObjectDecoder *decoder = (PyObjectDecoder *)_decoder;
+ PRINTMARK();
+ PyErr_SetString(
+ PyExc_ValueError,
+ "nesting not supported for object or variable length dtypes");
+ Npy_releaseContext(decoder->npyarr);
+ return NULL;
+}
+
+JSOBJ Object_npyEndArrayList(void *prv, JSOBJ obj) {
+ PyObject *list, *ret;
+ NpyArrContext *npyarr = (NpyArrContext *)obj;
+ PRINTMARK();
+ if (!npyarr) {
+ return NULL;
+ }
+
+ // convert decoded list to numpy array
+ list = (PyObject *)npyarr->ret;
+ npyarr->ret = PyArray_FROM_O(list);
+
+ ret = Npy_returnLabelled(npyarr);
+ npyarr->ret = list;
+
+ ((JSONObjectDecoder *)npyarr->dec)->newArray = Object_npyNewArray;
+ ((JSONObjectDecoder *)npyarr->dec)->arrayAddItem = Object_npyArrayAddItem;
+ ((JSONObjectDecoder *)npyarr->dec)->endArray = Object_npyEndArray;
+ Npy_releaseContext(npyarr);
+ return ret;
+}
+
+int Object_npyArrayListAddItem(void *prv, JSOBJ obj, JSOBJ value) {
+ NpyArrContext *npyarr = (NpyArrContext *)obj;
+ PRINTMARK();
+ if (!npyarr) {
+ return 0;
+ }
+ PyList_Append((PyObject *)npyarr->ret, value);
+ Py_DECREF((PyObject *)value);
+ npyarr->elcount++;
+ return 1;
+}
+
+JSOBJ Object_npyNewObject(void *prv, void *_decoder) {
+ PyObjectDecoder *decoder = (PyObjectDecoder *)_decoder;
+ PRINTMARK();
+ if (decoder->curdim > 1) {
+ PyErr_SetString(PyExc_ValueError,
+ "labels only supported up to 2 dimensions");
+ return NULL;
+ }
+
+ return ((JSONObjectDecoder *)decoder)->newArray(prv, decoder);
+}
+
+JSOBJ Object_npyEndObject(void *prv, JSOBJ obj) {
+ PyObject *list;
+ npy_intp labelidx;
+ NpyArrContext *npyarr = (NpyArrContext *)obj;
+ PRINTMARK();
+ if (!npyarr) {
+ return NULL;
+ }
+
+ labelidx = npyarr->dec->curdim - 1;
+
+ list = npyarr->labels[labelidx];
+ if (list) {
+ npyarr->labels[labelidx] = PyArray_FROM_O(list);
+ Py_DECREF(list);
+ }
+
+ return (PyObject *)((JSONObjectDecoder *)npyarr->dec)->endArray(prv, obj);
+}
+
+int Object_npyObjectAddKey(void *prv, JSOBJ obj, JSOBJ name, JSOBJ value) {
+ PyObject *label, *labels;
+ npy_intp labelidx;
+ // add key to label array, value to values array
+ NpyArrContext *npyarr = (NpyArrContext *)obj;
+ PRINTMARK();
+ if (!npyarr) {
+ return 0;
+ }
+
+ label = (PyObject *)name;
+ labelidx = npyarr->dec->curdim - 1;
+
+ if (!npyarr->labels[labelidx]) {
+ npyarr->labels[labelidx] = PyList_New(0);
+ }
+ labels = npyarr->labels[labelidx];
+ // only fill label array once, assumes all column labels are the same
+ // for 2-dimensional arrays.
+ if (PyList_Check(labels) && PyList_GET_SIZE(labels) <= npyarr->elcount) {
+ PyList_Append(labels, label);
+ }
+
+ if (((JSONObjectDecoder *)npyarr->dec)->arrayAddItem(prv, obj, value)) {
+ Py_DECREF(label);
+ return 1;
+ }
+ return 0;
+}
+
+int Object_objectAddKey(void *prv, JSOBJ obj, JSOBJ name, JSOBJ value) {
+ int ret = PyDict_SetItem(obj, name, value);
+ Py_DECREF((PyObject *)name);
+ Py_DECREF((PyObject *)value);
+ return ret == 0 ? 1 : 0;
+}
+
+int Object_arrayAddItem(void *prv, JSOBJ obj, JSOBJ value) {
+ int ret = PyList_Append(obj, value);
+ Py_DECREF((PyObject *)value);
+ return ret == 0 ? 1 : 0;
+}
+
+JSOBJ Object_newString(void *prv, wchar_t *start, wchar_t *end) {
+ return PyUnicode_FromWideChar(start, (end - start));
+}
+
+JSOBJ Object_newTrue(void *prv) { Py_RETURN_TRUE; }
+
+JSOBJ Object_newFalse(void *prv) { Py_RETURN_FALSE; }
+
+JSOBJ Object_newNull(void *prv) { Py_RETURN_NONE; }
+
+JSOBJ Object_newObject(void *prv, void *decoder) { return PyDict_New(); }
+
+JSOBJ Object_endObject(void *prv, JSOBJ obj) { return obj; }
+
+JSOBJ Object_newArray(void *prv, void *decoder) { return PyList_New(0); }
+
+JSOBJ Object_endArray(void *prv, JSOBJ obj) { return obj; }
+
+JSOBJ Object_newInteger(void *prv, JSINT32 value) {
+ return PyInt_FromLong((long)value);
+}
+
+JSOBJ Object_newLong(void *prv, JSINT64 value) {
+ return PyLong_FromLongLong(value);
+}
+
+JSOBJ Object_newDouble(void *prv, double value) {
+ return PyFloat_FromDouble(value);
+}
+
+static void Object_releaseObject(void *prv, JSOBJ obj, void *_decoder) {
+ PyObjectDecoder *decoder = (PyObjectDecoder *)_decoder;
+ if (obj != decoder->npyarr_addr) {
+ Py_XDECREF(((PyObject *)obj));
+ }
+}
+
+static char *g_kwlist[] = {"obj", "precise_float", "numpy",
+ "labelled", "dtype", NULL};
+
+PyObject *JSONToObj(PyObject *self, PyObject *args, PyObject *kwargs) {
+ PyObject *ret;
+ PyObject *sarg;
+ PyObject *arg;
+ PyObject *opreciseFloat = NULL;
+ JSONObjectDecoder *decoder;
+ PyObjectDecoder pyDecoder;
+ PyArray_Descr *dtype = NULL;
+ int numpy = 0, labelled = 0;
+
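+    /* Positional initializer: the order must match the field order of
+       JSONObjectDecoder in ultrajson.h. */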
+ JSONObjectDecoder dec = {
+ Object_newString, Object_objectAddKey, Object_arrayAddItem,
+ Object_newTrue, Object_newFalse, Object_newNull,
+ Object_newObject, Object_endObject, Object_newArray,
+ Object_endArray, Object_newInteger, Object_newLong,
+ Object_newDouble, Object_releaseObject, PyObject_Malloc,
+ PyObject_Free, PyObject_Realloc};
+
+ dec.preciseFloat = 0;
+ dec.prv = NULL;
+
+ pyDecoder.dec = dec;
+ pyDecoder.curdim = 0;
+ pyDecoder.npyarr = NULL;
+ pyDecoder.npyarr_addr = NULL;
+
+ decoder = (JSONObjectDecoder *)&pyDecoder;
+
+ if (!PyArg_ParseTupleAndKeywords(args, kwargs, "O|OiiO&", g_kwlist, &arg,
+ &opreciseFloat, &numpy, &labelled,
+ PyArray_DescrConverter2, &dtype)) {
+ Npy_releaseContext(pyDecoder.npyarr);
+ return NULL;
+ }
+
+ if (opreciseFloat && PyObject_IsTrue(opreciseFloat)) {
+ decoder->preciseFloat = 1;
+ }
+
+ if (PyString_Check(arg)) {
+ sarg = arg;
+ } else if (PyUnicode_Check(arg)) {
+ sarg = PyUnicode_AsUTF8String(arg);
+ if (sarg == NULL) {
+ // Exception raised above us by codec according to docs
+ return NULL;
+ }
+ } else {
+ PyErr_Format(PyExc_TypeError, "Expected String or Unicode");
+ return NULL;
+ }
+
+ decoder->errorStr = NULL;
+ decoder->errorOffset = NULL;
+
+ if (numpy) {
+ pyDecoder.dtype = dtype;
+ decoder->newArray = Object_npyNewArray;
+ decoder->endArray = Object_npyEndArray;
+ decoder->arrayAddItem = Object_npyArrayAddItem;
+
+ if (labelled) {
+ decoder->newObject = Object_npyNewObject;
+ decoder->endObject = Object_npyEndObject;
+ decoder->objectAddKey = Object_npyObjectAddKey;
+ }
+ }
+
+ ret = JSON_DecodeObject(decoder, PyString_AS_STRING(sarg),
+ PyString_GET_SIZE(sarg));
+
+ if (sarg != arg) {
+ Py_DECREF(sarg);
+ }
+
+ if (PyErr_Occurred()) {
+ if (ret) {
+ Py_DECREF((PyObject *)ret);
+ }
+ Npy_releaseContext(pyDecoder.npyarr);
+ return NULL;
+ }
+
+ if (decoder->errorStr) {
+ /*
+ FIXME: It's possible to give a much nicer error message here with actual
+ failing element in input etc*/
+
+ PyErr_Format(PyExc_ValueError, "%s", decoder->errorStr);
+
+ if (ret) {
+ Py_DECREF((PyObject *)ret);
+ }
+ Npy_releaseContext(pyDecoder.npyarr);
+
+ return NULL;
+ }
+
+ return ret;
+}
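+
+/* A rough usage sketch (an assumption for illustration: the method table in
+ * ujson.c exposes this function as pandas._libs.json.loads / .decode, as in
+ * upstream pandas):
+ *
+ *     >>> from pandas._libs import json
+ *     >>> json.loads('{"a": [1, 2.5, null]}', precise_float=True)
+ *     {'a': [1, 2.5, None]}
+ *
+ * Passing numpy=1 swaps in the Object_npy* array callbacks, and labelled=1
+ * (together with numpy=1) additionally routes JSON objects through
+ * Object_npyNewObject / Object_npyEndObject / Object_npyObjectAddKey.
+ */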
+
+PyObject *JSONFileToObj(PyObject *self, PyObject *args, PyObject *kwargs) {
+ PyObject *read;
+ PyObject *string;
+ PyObject *result;
+ PyObject *file = NULL;
+ PyObject *argtuple;
+
+ if (!PyArg_ParseTuple(args, "O", &file)) {
+ return NULL;
+ }
+
+ if (!PyObject_HasAttrString(file, "read")) {
+ PyErr_Format(PyExc_TypeError, "expected file");
+ return NULL;
+ }
+
+ read = PyObject_GetAttrString(file, "read");
+
+ if (!PyCallable_Check(read)) {
+ Py_XDECREF(read);
+ PyErr_Format(PyExc_TypeError, "expected file");
+ return NULL;
+ }
+
+ string = PyObject_CallObject(read, NULL);
+ Py_XDECREF(read);
+
+ if (string == NULL) {
+ return NULL;
+ }
+
+ argtuple = PyTuple_Pack(1, string);
+
+ result = JSONToObj(self, argtuple, kwargs);
+
+ Py_XDECREF(argtuple);
+ Py_XDECREF(string);
+
+ if (result == NULL) {
+ return NULL;
+ }
+
+ return result;
+}
diff --git a/contrib/python/pandas/py2/pandas/_libs/src/ujson/python/objToJSON.c b/contrib/python/pandas/py2/pandas/_libs/src/ujson/python/objToJSON.c
new file mode 100644
index 00000000000..d0caeb33335
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/_libs/src/ujson/python/objToJSON.c
@@ -0,0 +1,2539 @@
+/*
+Copyright (c) 2011-2013, ESN Social Software AB and Jonas Tarnstrom
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+* Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+* Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+* Neither the name of the ESN Social Software AB nor the
+names of its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL ESN SOCIAL SOFTWARE AB OR JONAS TARNSTROM BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+
+Portions of code from MODP_ASCII - Ascii transformations (upper/lower, etc)
+https://github.com/client9/stringencoders
+Copyright (c) 2007 Nick Galbreath -- nickg [at] modp [dot] com. All rights reserved.
+
+Numeric decoder derived from the TCL library
+http://www.opensource.apple.com/source/tcl/tcl-14/tcl/license.terms
+* Copyright (c) 1988-1993 The Regents of the University of California.
+* Copyright (c) 1994 Sun Microsystems, Inc.
+*/
+#define PY_ARRAY_UNIQUE_SYMBOL UJSON_NUMPY
+
+// "py_defines.h" needs to be included first to
+// avoid compilation errors, but it does violate
+// styleguide checks with regards to include order.
+#include "py_defines.h" // NOLINT(build/include_order)
+#include <math.h> // NOLINT(build/include_order)
+#include <numpy/arrayobject.h> // NOLINT(build/include_order)
+#include <numpy/arrayscalars.h> // NOLINT(build/include_order)
+#include <numpy/ndarraytypes.h> // NOLINT(build/include_order)
+#include <numpy/npy_math.h> // NOLINT(build/include_order)
+#include <stdio.h> // NOLINT(build/include_order)
+#include <ultrajson.h> // NOLINT(build/include_order)
+#include <../../../tslibs/src/datetime/np_datetime.h> // NOLINT(build/include_order)
+#include <../../../tslibs/src/datetime/np_datetime_strings.h> // NOLINT(build/include_order)
+#include "datetime.h"
+
+static PyObject *type_decimal;
+
+#define NPY_JSON_BUFSIZE 32768
+
+static PyTypeObject *cls_dataframe;
+static PyTypeObject *cls_series;
+static PyTypeObject *cls_index;
+static PyTypeObject *cls_nat;
+
+npy_int64 get_nat(void) { return NPY_MIN_INT64; }
+
+typedef void *(*PFN_PyTypeToJSON)(JSOBJ obj, JSONTypeContext *ti,
+ void *outValue, size_t *_outLen);
+
+#if (PY_VERSION_HEX < 0x02050000)
+typedef ssize_t Py_ssize_t;
+#endif
+
+typedef struct __NpyArrContext {
+ PyObject *array;
+ char *dataptr;
+ int curdim; // current dimension in array's order
+ int stridedim; // dimension we are striding over
+ int inc; // stride dimension increment (+/- 1)
+ npy_intp dim;
+ npy_intp stride;
+ npy_intp ndim;
+ npy_intp index[NPY_MAXDIMS];
+ int type_num;
+ PyArray_GetItemFunc *getitem;
+
+ char **rowLabels;
+ char **columnLabels;
+} NpyArrContext;
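+
+// A rough sketch of how this state is used: for a 2x3 C-contiguous array with
+// transpose == 0, NpyArr_iterBegin sets stridedim = 0 and inc = +1, so
+// NpyArr_iterNext walks the 2 rows at the outer level and NpyArr_iterNextItem
+// walks the 3 items within each row; with transpose == 1 iteration starts at
+// the last axis with inc = -1, so the roles of rows and columns are swapped.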
+
+typedef struct __PdBlockContext {
+ int colIdx;
+ int ncols;
+ int transpose;
+
+ int *cindices; // frame column -> block column map
+ NpyArrContext **npyCtxts; // NpyArrContext for each column
+} PdBlockContext;
+
+typedef struct __TypeContext {
+ JSPFN_ITERBEGIN iterBegin;
+ JSPFN_ITEREND iterEnd;
+ JSPFN_ITERNEXT iterNext;
+ JSPFN_ITERGETNAME iterGetName;
+ JSPFN_ITERGETVALUE iterGetValue;
+ PFN_PyTypeToJSON PyTypeToJSON;
+ PyObject *newObj;
+ PyObject *dictObj;
+ Py_ssize_t index;
+ Py_ssize_t size;
+ PyObject *itemValue;
+ PyObject *itemName;
+ PyObject *attrList;
+ PyObject *iterator;
+
+ double doubleValue;
+ JSINT64 longValue;
+
+ char *cStr;
+ NpyArrContext *npyarr;
+ PdBlockContext *pdblock;
+ int transpose;
+ char **rowLabels;
+ char **columnLabels;
+ npy_intp rowLabelsLen;
+ npy_intp columnLabelsLen;
+} TypeContext;
+
+typedef struct __PyObjectEncoder {
+ JSONObjectEncoder enc;
+
+ // pass through the NpyArrContext when encoding multi-dimensional arrays
+ NpyArrContext *npyCtxtPassthru;
+
+ // pass through the PdBlockContext when encoding blocks
+ PdBlockContext *blkCtxtPassthru;
+
+ // pass-through to encode numpy data directly
+ int npyType;
+ void *npyValue;
+ TypeContext basicTypeContext;
+
+ int datetimeIso;
+ NPY_DATETIMEUNIT datetimeUnit;
+
+ // output format style for pandas data types
+ int outputFormat;
+ int originalOutputFormat;
+
+ PyObject *defaultHandler;
+} PyObjectEncoder;
+
+#define GET_TC(__ptrtc) ((TypeContext *)((__ptrtc)->prv))
+
+enum PANDAS_FORMAT { SPLIT, RECORDS, INDEX, COLUMNS, VALUES };
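+// The formats above correspond to the pandas to_json() "orient" values;
+// roughly:
+//   SPLIT   -> {"columns": [...], "index": [...], "data": [[...], ...]}
+//   RECORDS -> [{"col": value, ...}, ...]
+//   INDEX   -> {"row": {"col": value, ...}, ...}
+//   COLUMNS -> {"col": {"row": value, ...}, ...}
+//   VALUES  -> [[value, ...], ...]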
+
+#define PRINTMARK()
+
+int PdBlock_iterNext(JSOBJ, JSONTypeContext *);
+
+// import_array() compat
+#if (PY_VERSION_HEX >= 0x03000000)
+void *initObjToJSON(void)
+#else
+void initObjToJSON(void)
+#endif
+{
+ PyObject *mod_pandas;
+ PyObject *mod_nattype;
+ PyObject *mod_decimal = PyImport_ImportModule("decimal");
+ type_decimal = PyObject_GetAttrString(mod_decimal, "Decimal");
+ Py_INCREF(type_decimal);
+ Py_DECREF(mod_decimal);
+
+ PyDateTime_IMPORT;
+
+ mod_pandas = PyImport_ImportModule("pandas");
+ if (mod_pandas) {
+ cls_dataframe =
+ (PyTypeObject *)PyObject_GetAttrString(mod_pandas, "DataFrame");
+ cls_index = (PyTypeObject *)PyObject_GetAttrString(mod_pandas, "Index");
+ cls_series =
+ (PyTypeObject *)PyObject_GetAttrString(mod_pandas, "Series");
+ Py_DECREF(mod_pandas);
+ }
+
+ mod_nattype = PyImport_ImportModule("pandas._libs.tslibs.nattype");
+ if (mod_nattype) {
+ cls_nat = (PyTypeObject *)PyObject_GetAttrString(mod_nattype,
+ "NaTType");
+ Py_DECREF(mod_nattype);
+ }
+
+ /* Initialise numpy API and use 2/3 compatible return */
+ import_array();
+ return NUMPY_IMPORT_ARRAY_RETVAL;
+}
+
+static TypeContext *createTypeContext(void) {
+ TypeContext *pc;
+
+ pc = PyObject_Malloc(sizeof(TypeContext));
+ if (!pc) {
+ PyErr_NoMemory();
+ return NULL;
+ }
+ pc->newObj = NULL;
+ pc->dictObj = NULL;
+ pc->itemValue = NULL;
+ pc->itemName = NULL;
+ pc->attrList = NULL;
+ pc->index = 0;
+ pc->size = 0;
+ pc->longValue = 0;
+ pc->doubleValue = 0.0;
+ pc->cStr = NULL;
+ pc->npyarr = NULL;
+ pc->pdblock = NULL;
+ pc->rowLabels = NULL;
+ pc->columnLabels = NULL;
+ pc->transpose = 0;
+ pc->rowLabelsLen = 0;
+ pc->columnLabelsLen = 0;
+
+ return pc;
+}
+
+static PyObject *get_values(PyObject *obj) {
+ PyObject *values = PyObject_GetAttrString(obj, "values");
+ PRINTMARK();
+
+ if (values && !PyArray_CheckExact(values)) {
+
+ if (PyObject_HasAttrString(values, "to_numpy")) {
+ values = PyObject_CallMethod(values, "to_numpy", NULL);
+ }
+
+ if (PyObject_HasAttrString(values, "values")) {
+ PyObject *subvals = get_values(values);
+ PyErr_Clear();
+ PRINTMARK();
+ // subvals are sometimes missing a dimension
+ if (subvals) {
+ PyArrayObject *reshape = (PyArrayObject *)subvals;
+ PyObject *shape = PyObject_GetAttrString(obj, "shape");
+ PyArray_Dims dims;
+ PRINTMARK();
+
+ if (!shape || !PyArray_IntpConverter(shape, &dims)) {
+ subvals = NULL;
+ } else {
+ subvals = PyArray_Newshape(reshape, &dims, NPY_ANYORDER);
+ PyDimMem_FREE(dims.ptr);
+ }
+ Py_DECREF(reshape);
+ Py_XDECREF(shape);
+ }
+ Py_DECREF(values);
+ values = subvals;
+ } else {
+ PRINTMARK();
+ Py_DECREF(values);
+ values = NULL;
+ }
+ }
+
+ if (!values && PyObject_HasAttrString(obj, "get_values")) {
+ PRINTMARK();
+ values = PyObject_CallMethod(obj, "get_values", NULL);
+ if (values && !PyArray_CheckExact(values)) {
+ PRINTMARK();
+ Py_DECREF(values);
+ values = NULL;
+ }
+ }
+
+ if (!values) {
+ PyObject *typeRepr = PyObject_Repr((PyObject *)Py_TYPE(obj));
+ PyObject *repr;
+ PRINTMARK();
+ if (PyObject_HasAttrString(obj, "dtype")) {
+ PyObject *dtype = PyObject_GetAttrString(obj, "dtype");
+ repr = PyObject_Repr(dtype);
+ Py_DECREF(dtype);
+ } else {
+ repr = PyString_FromString("<unknown dtype>");
+ }
+
+ PyErr_Format(PyExc_ValueError, "%R or %R are not JSON serializable yet",
+ repr, typeRepr);
+ Py_DECREF(repr);
+ Py_DECREF(typeRepr);
+
+ return NULL;
+ }
+
+ return values;
+}
+
+static PyObject *get_sub_attr(PyObject *obj, char *attr, char *subAttr) {
+ PyObject *tmp = PyObject_GetAttrString(obj, attr);
+ PyObject *ret;
+
+ if (tmp == 0) {
+ return 0;
+ }
+ ret = PyObject_GetAttrString(tmp, subAttr);
+ Py_DECREF(tmp);
+
+ return ret;
+}
+
+static int is_simple_frame(PyObject *obj) {
+ PyObject *check = get_sub_attr(obj, "_data", "is_mixed_type");
+ int ret = (check == Py_False);
+
+ if (!check) {
+ return 0;
+ }
+
+ Py_DECREF(check);
+ return ret;
+}
+
+static Py_ssize_t get_attr_length(PyObject *obj, char *attr) {
+ PyObject *tmp = PyObject_GetAttrString(obj, attr);
+ Py_ssize_t ret;
+
+ if (tmp == 0) {
+ return 0;
+ }
+ ret = PyObject_Length(tmp);
+ Py_DECREF(tmp);
+
+ if (ret == -1) {
+ return 0;
+ }
+
+ return ret;
+}
+
+static npy_int64 get_long_attr(PyObject *o, const char *attr) {
+ npy_int64 long_val;
+ PyObject *value = PyObject_GetAttrString(o, attr);
+ long_val = (PyLong_Check(value) ?
+ PyLong_AsLongLong(value) : PyInt_AS_LONG(value));
+ Py_DECREF(value);
+ return long_val;
+}
+
+static npy_float64 total_seconds(PyObject *td) {
+ npy_float64 double_val;
+ PyObject *value = PyObject_CallMethod(td, "total_seconds", NULL);
+ double_val = PyFloat_AS_DOUBLE(value);
+ Py_DECREF(value);
+ return double_val;
+}
+
+static PyObject *get_item(PyObject *obj, Py_ssize_t i) {
+ PyObject *tmp = PyInt_FromSsize_t(i);
+ PyObject *ret;
+
+ if (tmp == 0) {
+ return 0;
+ }
+ ret = PyObject_GetItem(obj, tmp);
+ Py_DECREF(tmp);
+
+ return ret;
+}
+
+static void *CDouble(JSOBJ obj, JSONTypeContext *tc, void *outValue,
+ size_t *_outLen) {
+ PRINTMARK();
+ *((double *)outValue) = GET_TC(tc)->doubleValue;
+ return NULL;
+}
+
+static void *CLong(JSOBJ obj, JSONTypeContext *tc, void *outValue,
+ size_t *_outLen) {
+ PRINTMARK();
+ *((JSINT64 *)outValue) = GET_TC(tc)->longValue;
+ return NULL;
+}
+
+#ifdef _LP64
+static void *PyIntToINT64(JSOBJ _obj, JSONTypeContext *tc, void *outValue,
+ size_t *_outLen) {
+ PyObject *obj = (PyObject *)_obj;
+ *((JSINT64 *)outValue) = PyInt_AS_LONG(obj);
+ return NULL;
+}
+#else
+static void *PyIntToINT32(JSOBJ _obj, JSONTypeContext *tc, void *outValue,
+ size_t *_outLen) {
+ PyObject *obj = (PyObject *)_obj;
+ *((JSINT32 *)outValue) = PyInt_AS_LONG(obj);
+ return NULL;
+}
+#endif
+
+static void *PyLongToINT64(JSOBJ _obj, JSONTypeContext *tc, void *outValue,
+ size_t *_outLen) {
+ *((JSINT64 *)outValue) = GET_TC(tc)->longValue;
+ return NULL;
+}
+
+static void *NpyFloatToDOUBLE(JSOBJ _obj, JSONTypeContext *tc, void *outValue,
+ size_t *_outLen) {
+ PyObject *obj = (PyObject *)_obj;
+ PyArray_CastScalarToCtype(obj, outValue, PyArray_DescrFromType(NPY_DOUBLE));
+ return NULL;
+}
+
+static void *PyFloatToDOUBLE(JSOBJ _obj, JSONTypeContext *tc, void *outValue,
+ size_t *_outLen) {
+ PyObject *obj = (PyObject *)_obj;
+ *((double *)outValue) = PyFloat_AsDouble(obj);
+ return NULL;
+}
+
+static void *PyStringToUTF8(JSOBJ _obj, JSONTypeContext *tc, void *outValue,
+ size_t *_outLen) {
+ PyObject *obj = (PyObject *)_obj;
+ *_outLen = PyString_GET_SIZE(obj);
+ return PyString_AS_STRING(obj);
+}
+
+static void *PyUnicodeToUTF8(JSOBJ _obj, JSONTypeContext *tc, void *outValue,
+ size_t *_outLen) {
+ PyObject *obj, *newObj;
+ obj = (PyObject *)_obj;
+
+#if (PY_VERSION_HEX >= 0x03030000)
+ if (PyUnicode_IS_COMPACT_ASCII(obj)) {
+ Py_ssize_t len;
+ char *data = (char*)PyUnicode_AsUTF8AndSize(obj, &len);
+ *_outLen = len;
+ return data;
+ }
+#endif
+
+ newObj = PyUnicode_AsUTF8String(obj);
+
+ GET_TC(tc)->newObj = newObj;
+
+ *_outLen = PyString_GET_SIZE(newObj);
+ return PyString_AS_STRING(newObj);
+}
+
+static void *PandasDateTimeStructToJSON(npy_datetimestruct *dts,
+ JSONTypeContext *tc, void *outValue,
+ size_t *_outLen) {
+ NPY_DATETIMEUNIT base = ((PyObjectEncoder *)tc->encoder)->datetimeUnit;
+
+ if (((PyObjectEncoder *)tc->encoder)->datetimeIso) {
+ PRINTMARK();
+ *_outLen = (size_t)get_datetime_iso_8601_strlen(0, base);
+ GET_TC(tc)->cStr = PyObject_Malloc(sizeof(char) * (*_outLen));
+ if (!GET_TC(tc)->cStr) {
+ PyErr_NoMemory();
+ ((JSONObjectEncoder *)tc->encoder)->errorMsg = "";
+ return NULL;
+ }
+
+ if (!make_iso_8601_datetime(dts, GET_TC(tc)->cStr, *_outLen, base)) {
+ PRINTMARK();
+ *_outLen = strlen(GET_TC(tc)->cStr);
+ return GET_TC(tc)->cStr;
+ } else {
+ PRINTMARK();
+ PyErr_SetString(PyExc_ValueError,
+ "Could not convert datetime value to string");
+ ((JSONObjectEncoder *)tc->encoder)->errorMsg = "";
+ PyObject_Free(GET_TC(tc)->cStr);
+ return NULL;
+ }
+ } else {
+ PRINTMARK();
+ *((JSINT64 *)outValue) = npy_datetimestruct_to_datetime(base, dts);
+ return NULL;
+ }
+}
+
+static void *NpyDateTimeScalarToJSON(JSOBJ _obj, JSONTypeContext *tc,
+ void *outValue, size_t *_outLen) {
+ npy_datetimestruct dts;
+ PyDatetimeScalarObject *obj = (PyDatetimeScalarObject *)_obj;
+ PRINTMARK();
+ // TODO(anyone): Does not appear to be reached in tests.
+
+ pandas_datetime_to_datetimestruct(obj->obval,
+ (NPY_DATETIMEUNIT)obj->obmeta.base, &dts);
+ return PandasDateTimeStructToJSON(&dts, tc, outValue, _outLen);
+}
+
+static void *PyDateTimeToJSON(JSOBJ _obj, JSONTypeContext *tc, void *outValue,
+ size_t *_outLen) {
+ npy_datetimestruct dts;
+ PyDateTime_Date *obj = (PyDateTime_Date *)_obj;
+
+ PRINTMARK();
+
+ if (!convert_pydatetime_to_datetimestruct(obj, &dts)) {
+ PRINTMARK();
+ return PandasDateTimeStructToJSON(&dts, tc, outValue, _outLen);
+ } else {
+ if (!PyErr_Occurred()) {
+ PyErr_SetString(PyExc_ValueError,
+ "Could not convert datetime value to string");
+ }
+ ((JSONObjectEncoder *)tc->encoder)->errorMsg = "";
+ return NULL;
+ }
+}
+
+static void *NpyDatetime64ToJSON(JSOBJ _obj, JSONTypeContext *tc,
+ void *outValue, size_t *_outLen) {
+ npy_datetimestruct dts;
+ PRINTMARK();
+
+ pandas_datetime_to_datetimestruct((npy_datetime)GET_TC(tc)->longValue,
+ NPY_FR_ns, &dts);
+ return PandasDateTimeStructToJSON(&dts, tc, outValue, _outLen);
+}
+
+static void *PyTimeToJSON(JSOBJ _obj, JSONTypeContext *tc, void *outValue,
+ size_t *outLen) {
+ PyObject *obj = (PyObject *)_obj;
+ PyObject *str;
+ PyObject *tmp;
+
+ str = PyObject_CallMethod(obj, "isoformat", NULL);
+ if (str == NULL) {
+ PRINTMARK();
+ *outLen = 0;
+ if (!PyErr_Occurred()) {
+ PyErr_SetString(PyExc_ValueError, "Failed to convert time");
+ }
+ ((JSONObjectEncoder *)tc->encoder)->errorMsg = "";
+ return NULL;
+ }
+ if (PyUnicode_Check(str)) {
+ tmp = str;
+ str = PyUnicode_AsUTF8String(str);
+ Py_DECREF(tmp);
+ }
+
+ GET_TC(tc)->newObj = str;
+
+ *outLen = PyString_GET_SIZE(str);
+ outValue = (void *)PyString_AS_STRING(str);
+ return outValue;
+}
+
+static int NpyTypeToJSONType(PyObject *obj, JSONTypeContext *tc, int npyType,
+ void *value) {
+ PyArray_VectorUnaryFunc *castfunc;
+ npy_double doubleVal;
+ npy_int64 longVal;
+
+ if (PyTypeNum_ISFLOAT(npyType)) {
+ PRINTMARK();
+ castfunc =
+ PyArray_GetCastFunc(PyArray_DescrFromType(npyType), NPY_DOUBLE);
+ if (!castfunc) {
+ PyErr_Format(PyExc_ValueError,
+ "Cannot cast numpy dtype %d to double", npyType);
+ }
+ castfunc(value, &doubleVal, 1, NULL, NULL);
+ if (npy_isnan(doubleVal) || npy_isinf(doubleVal)) {
+ PRINTMARK();
+ return JT_NULL;
+ }
+ GET_TC(tc)->doubleValue = (double)doubleVal;
+ GET_TC(tc)->PyTypeToJSON = CDouble;
+ return JT_DOUBLE;
+ }
+
+ if (PyTypeNum_ISDATETIME(npyType)) {
+ PRINTMARK();
+ castfunc =
+ PyArray_GetCastFunc(PyArray_DescrFromType(npyType), NPY_INT64);
+ if (!castfunc) {
+ PyErr_Format(PyExc_ValueError, "Cannot cast numpy dtype %d to long",
+ npyType);
+ }
+ castfunc(value, &longVal, 1, NULL, NULL);
+ if (longVal == get_nat()) {
+ PRINTMARK();
+ return JT_NULL;
+ }
+ GET_TC(tc)->longValue = (JSINT64)longVal;
+ GET_TC(tc)->PyTypeToJSON = NpyDatetime64ToJSON;
+ return ((PyObjectEncoder *)tc->encoder)->datetimeIso ? JT_UTF8
+ : JT_LONG;
+ }
+
+ if (PyTypeNum_ISINTEGER(npyType)) {
+ PRINTMARK();
+ castfunc =
+ PyArray_GetCastFunc(PyArray_DescrFromType(npyType), NPY_INT64);
+ if (!castfunc) {
+ PyErr_Format(PyExc_ValueError, "Cannot cast numpy dtype %d to long",
+ npyType);
+ }
+ castfunc(value, &longVal, 1, NULL, NULL);
+ GET_TC(tc)->longValue = (JSINT64)longVal;
+ GET_TC(tc)->PyTypeToJSON = CLong;
+ return JT_LONG;
+ }
+
+ if (PyTypeNum_ISBOOL(npyType)) {
+ PRINTMARK();
+ return *((npy_bool *)value) == NPY_TRUE ? JT_TRUE : JT_FALSE;
+ }
+
+ PRINTMARK();
+ return JT_INVALID;
+}
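+
+// Summarising the branches above with concrete examples: an NPY_FLOAT64 NaN
+// or +/-inf becomes JT_NULL and any other float becomes JT_DOUBLE (via
+// CDouble); NPY_INT64 becomes JT_LONG (via CLong); an NPY_DATETIME equal to
+// NaT becomes JT_NULL, otherwise JT_UTF8 (ISO string) or JT_LONG depending on
+// datetimeIso; NPY_BOOL maps directly to JT_TRUE / JT_FALSE.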
+
+//=============================================================================
+// Numpy array iteration functions
+//=============================================================================
+
+static void NpyArr_freeItemValue(JSOBJ _obj, JSONTypeContext *tc) {
+ if (GET_TC(tc)->npyarr &&
+ GET_TC(tc)->itemValue != GET_TC(tc)->npyarr->array) {
+ PRINTMARK();
+ Py_XDECREF(GET_TC(tc)->itemValue);
+ GET_TC(tc)->itemValue = NULL;
+ }
+}
+
+int NpyArr_iterNextNone(JSOBJ _obj, JSONTypeContext *tc) { return 0; }
+
+void NpyArr_iterBegin(JSOBJ _obj, JSONTypeContext *tc) {
+ PyArrayObject *obj;
+ NpyArrContext *npyarr;
+
+ if (GET_TC(tc)->newObj) {
+ obj = (PyArrayObject *)GET_TC(tc)->newObj;
+ } else {
+ obj = (PyArrayObject *)_obj;
+ }
+
+ if (PyArray_SIZE(obj) < 0) {
+ PRINTMARK();
+ GET_TC(tc)->iterNext = NpyArr_iterNextNone;
+ } else {
+ PRINTMARK();
+ npyarr = PyObject_Malloc(sizeof(NpyArrContext));
+ GET_TC(tc)->npyarr = npyarr;
+
+ if (!npyarr) {
+ PyErr_NoMemory();
+ GET_TC(tc)->iterNext = NpyArr_iterNextNone;
+ return;
+ }
+
+ npyarr->array = (PyObject *)obj;
+ npyarr->getitem = (PyArray_GetItemFunc *)PyArray_DESCR(obj)->f->getitem;
+ npyarr->dataptr = PyArray_DATA(obj);
+ npyarr->ndim = PyArray_NDIM(obj) - 1;
+ npyarr->curdim = 0;
+ npyarr->type_num = PyArray_DESCR(obj)->type_num;
+
+ if (GET_TC(tc)->transpose) {
+ npyarr->dim = PyArray_DIM(obj, npyarr->ndim);
+ npyarr->stride = PyArray_STRIDE(obj, npyarr->ndim);
+ npyarr->stridedim = npyarr->ndim;
+ npyarr->index[npyarr->ndim] = 0;
+ npyarr->inc = -1;
+ } else {
+ npyarr->dim = PyArray_DIM(obj, 0);
+ npyarr->stride = PyArray_STRIDE(obj, 0);
+ npyarr->stridedim = 0;
+ npyarr->index[0] = 0;
+ npyarr->inc = 1;
+ }
+
+ npyarr->columnLabels = GET_TC(tc)->columnLabels;
+ npyarr->rowLabels = GET_TC(tc)->rowLabels;
+ }
+}
+
+void NpyArr_iterEnd(JSOBJ obj, JSONTypeContext *tc) {
+ NpyArrContext *npyarr = GET_TC(tc)->npyarr;
+ PRINTMARK();
+
+ if (npyarr) {
+ NpyArr_freeItemValue(obj, tc);
+ PyObject_Free(npyarr);
+ }
+}
+
+void NpyArrPassThru_iterBegin(JSOBJ obj, JSONTypeContext *tc) { PRINTMARK(); }
+
+void NpyArrPassThru_iterEnd(JSOBJ obj, JSONTypeContext *tc) {
+ NpyArrContext *npyarr = GET_TC(tc)->npyarr;
+ PRINTMARK();
+ // finished this dimension, reset the data pointer
+ npyarr->curdim--;
+ npyarr->dataptr -= npyarr->stride * npyarr->index[npyarr->stridedim];
+ npyarr->stridedim -= npyarr->inc;
+ npyarr->dim = PyArray_DIM(npyarr->array, npyarr->stridedim);
+ npyarr->stride = PyArray_STRIDE(npyarr->array, npyarr->stridedim);
+ npyarr->dataptr += npyarr->stride;
+
+ NpyArr_freeItemValue(obj, tc);
+}
+
+int NpyArr_iterNextItem(JSOBJ obj, JSONTypeContext *tc) {
+ NpyArrContext *npyarr = GET_TC(tc)->npyarr;
+ PRINTMARK();
+
+ if (PyErr_Occurred()) {
+ return 0;
+ }
+
+ if (npyarr->index[npyarr->stridedim] >= npyarr->dim) {
+ PRINTMARK();
+ return 0;
+ }
+
+ NpyArr_freeItemValue(obj, tc);
+
+#if NPY_API_VERSION < 0x00000007
+ if (PyArray_ISDATETIME(npyarr->array)) {
+ PRINTMARK();
+ GET_TC(tc)
+ ->itemValue = PyArray_ToScalar(npyarr->dataptr, npyarr->array);
+ } else if (PyArray_ISNUMBER(npyarr->array)) // NOLINT
+#else
+ if (PyArray_ISNUMBER(npyarr->array) || PyArray_ISDATETIME(npyarr->array)) // NOLINT
+#endif
+ {
+ PRINTMARK();
+ GET_TC(tc)->itemValue = obj;
+ Py_INCREF(obj);
+ ((PyObjectEncoder *)tc->encoder)->npyType = PyArray_TYPE(npyarr->array);
+ ((PyObjectEncoder *)tc->encoder)->npyValue = npyarr->dataptr;
+ ((PyObjectEncoder *)tc->encoder)->npyCtxtPassthru = npyarr;
+ } else {
+ PRINTMARK();
+ GET_TC(tc)->itemValue = npyarr->getitem(npyarr->dataptr, npyarr->array);
+ }
+
+ npyarr->dataptr += npyarr->stride;
+ npyarr->index[npyarr->stridedim]++;
+ return 1;
+}
+
+int NpyArr_iterNext(JSOBJ _obj, JSONTypeContext *tc) {
+ NpyArrContext *npyarr = GET_TC(tc)->npyarr;
+ PRINTMARK();
+
+ if (PyErr_Occurred()) {
+ PRINTMARK();
+ return 0;
+ }
+
+ if (npyarr->curdim >= npyarr->ndim ||
+ npyarr->index[npyarr->stridedim] >= npyarr->dim) {
+ PRINTMARK();
+ // innermost dimension, start retrieving item values
+ GET_TC(tc)->iterNext = NpyArr_iterNextItem;
+ return NpyArr_iterNextItem(_obj, tc);
+ }
+
+ // dig a dimension deeper
+ npyarr->index[npyarr->stridedim]++;
+
+ npyarr->curdim++;
+ npyarr->stridedim += npyarr->inc;
+ npyarr->dim = PyArray_DIM(npyarr->array, npyarr->stridedim);
+ npyarr->stride = PyArray_STRIDE(npyarr->array, npyarr->stridedim);
+ npyarr->index[npyarr->stridedim] = 0;
+
+ ((PyObjectEncoder *)tc->encoder)->npyCtxtPassthru = npyarr;
+ GET_TC(tc)->itemValue = npyarr->array;
+ return 1;
+}
+
+JSOBJ NpyArr_iterGetValue(JSOBJ obj, JSONTypeContext *tc) {
+ PRINTMARK();
+ return GET_TC(tc)->itemValue;
+}
+
+static void NpyArr_getLabel(JSOBJ obj, JSONTypeContext *tc, size_t *outLen,
+ npy_intp idx, char **labels) {
+ JSONObjectEncoder *enc = (JSONObjectEncoder *)tc->encoder;
+ PRINTMARK();
+ *outLen = strlen(labels[idx]);
+ Buffer_Reserve(enc, *outLen);
+ memcpy(enc->offset, labels[idx], sizeof(char) * (*outLen));
+ enc->offset += *outLen;
+ *outLen = 0;
+}
+
+char *NpyArr_iterGetName(JSOBJ obj, JSONTypeContext *tc, size_t *outLen) {
+ NpyArrContext *npyarr = GET_TC(tc)->npyarr;
+ npy_intp idx;
+ PRINTMARK();
+
+ if (GET_TC(tc)->iterNext == NpyArr_iterNextItem) {
+ idx = npyarr->index[npyarr->stridedim] - 1;
+ NpyArr_getLabel(obj, tc, outLen, idx, npyarr->columnLabels);
+ } else {
+ idx = npyarr->index[npyarr->stridedim - npyarr->inc] - 1;
+ NpyArr_getLabel(obj, tc, outLen, idx, npyarr->rowLabels);
+ }
+ return NULL;
+}
+
+//=============================================================================
+// Pandas block iteration functions
+//
+// Serialises a DataFrame column by column to avoid unnecessary data copies
+// and to give a more representative serialisation when dealing with mixed
+// dtypes.
+//
+// Uses a dedicated NpyArrContext for each column.
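+//
+// For example (a sketch, assuming the usual BlockManager consolidation): a
+// frame with columns [a: int64, b: float64, c: int64] is typically stored as
+// an int64 block covering frame columns {0, 2} and a float64 block covering
+// {1}. For the int64 block, mgr_locs.as_array is [0, 2], so PdBlock_iterBegin
+// records cindices[0] = 0 and cindices[2] = 1 and points npyCtxts[0] /
+// npyCtxts[2] at the corresponding columns of that (transposed) block.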
+//=============================================================================
+
+void PdBlockPassThru_iterEnd(JSOBJ obj, JSONTypeContext *tc) {
+ PdBlockContext *blkCtxt = GET_TC(tc)->pdblock;
+ PRINTMARK();
+
+ if (blkCtxt->transpose) {
+ blkCtxt->colIdx++;
+ } else {
+ blkCtxt->colIdx = 0;
+ }
+
+ NpyArr_freeItemValue(obj, tc);
+}
+
+int PdBlock_iterNextItem(JSOBJ obj, JSONTypeContext *tc) {
+ PdBlockContext *blkCtxt = GET_TC(tc)->pdblock;
+ PRINTMARK();
+
+ if (blkCtxt->colIdx >= blkCtxt->ncols) {
+ return 0;
+ }
+
+ GET_TC(tc)->npyarr = blkCtxt->npyCtxts[blkCtxt->colIdx];
+ blkCtxt->colIdx++;
+ return NpyArr_iterNextItem(obj, tc);
+}
+
+char *PdBlock_iterGetName(JSOBJ obj, JSONTypeContext *tc, size_t *outLen) {
+ PdBlockContext *blkCtxt = GET_TC(tc)->pdblock;
+ NpyArrContext *npyarr = blkCtxt->npyCtxts[0];
+ npy_intp idx;
+ PRINTMARK();
+
+ if (GET_TC(tc)->iterNext == PdBlock_iterNextItem) {
+ idx = blkCtxt->colIdx - 1;
+ NpyArr_getLabel(obj, tc, outLen, idx, npyarr->columnLabels);
+ } else {
+ idx = GET_TC(tc)->iterNext != PdBlock_iterNext
+ ? npyarr->index[npyarr->stridedim - npyarr->inc] - 1
+ : npyarr->index[npyarr->stridedim];
+
+ NpyArr_getLabel(obj, tc, outLen, idx, npyarr->rowLabels);
+ }
+ return NULL;
+}
+
+char *PdBlock_iterGetName_Transpose(JSOBJ obj, JSONTypeContext *tc,
+ size_t *outLen) {
+ PdBlockContext *blkCtxt = GET_TC(tc)->pdblock;
+ NpyArrContext *npyarr = blkCtxt->npyCtxts[blkCtxt->colIdx];
+ npy_intp idx;
+ PRINTMARK();
+
+ if (GET_TC(tc)->iterNext == NpyArr_iterNextItem) {
+ idx = npyarr->index[npyarr->stridedim] - 1;
+ NpyArr_getLabel(obj, tc, outLen, idx, npyarr->columnLabels);
+ } else {
+ idx = blkCtxt->colIdx;
+ NpyArr_getLabel(obj, tc, outLen, idx, npyarr->rowLabels);
+ }
+ return NULL;
+}
+
+int PdBlock_iterNext(JSOBJ obj, JSONTypeContext *tc) {
+ PdBlockContext *blkCtxt = GET_TC(tc)->pdblock;
+ NpyArrContext *npyarr;
+ PRINTMARK();
+
+ if (PyErr_Occurred() || ((JSONObjectEncoder *)tc->encoder)->errorMsg) {
+ return 0;
+ }
+
+ if (blkCtxt->transpose) {
+ if (blkCtxt->colIdx >= blkCtxt->ncols) {
+ return 0;
+ }
+ } else {
+ npyarr = blkCtxt->npyCtxts[0];
+ if (npyarr->index[npyarr->stridedim] >= npyarr->dim) {
+ return 0;
+ }
+ }
+
+ ((PyObjectEncoder *)tc->encoder)->blkCtxtPassthru = blkCtxt;
+ GET_TC(tc)->itemValue = obj;
+
+ return 1;
+}
+
+void PdBlockPassThru_iterBegin(JSOBJ obj, JSONTypeContext *tc) {
+ PdBlockContext *blkCtxt = GET_TC(tc)->pdblock;
+ PRINTMARK();
+
+ if (blkCtxt->transpose) {
+ // if transposed we exhaust each column before moving to the next
+ GET_TC(tc)->iterNext = NpyArr_iterNextItem;
+ GET_TC(tc)->iterGetName = PdBlock_iterGetName_Transpose;
+ GET_TC(tc)->npyarr = blkCtxt->npyCtxts[blkCtxt->colIdx];
+ }
+}
+
+void PdBlock_iterBegin(JSOBJ _obj, JSONTypeContext *tc) {
+ PyObject *obj, *blocks, *block, *values, *tmp;
+ PyArrayObject *locs;
+ PdBlockContext *blkCtxt;
+ NpyArrContext *npyarr;
+ Py_ssize_t i;
+ PyArray_Descr *dtype;
+ NpyIter *iter;
+ NpyIter_IterNextFunc *iternext;
+ npy_int64 **dataptr;
+ npy_int64 colIdx;
+ npy_intp idx;
+
+ PRINTMARK();
+
+ i = 0;
+ blocks = NULL;
+ dtype = PyArray_DescrFromType(NPY_INT64);
+ obj = (PyObject *)_obj;
+
+ GET_TC(tc)
+ ->iterGetName = GET_TC(tc)->transpose ? PdBlock_iterGetName_Transpose
+ : PdBlock_iterGetName;
+
+ blkCtxt = PyObject_Malloc(sizeof(PdBlockContext));
+ if (!blkCtxt) {
+ PyErr_NoMemory();
+ GET_TC(tc)->iterNext = NpyArr_iterNextNone;
+ goto BLKRET;
+ }
+ GET_TC(tc)->pdblock = blkCtxt;
+
+ blkCtxt->colIdx = 0;
+ blkCtxt->transpose = GET_TC(tc)->transpose;
+ blkCtxt->ncols = get_attr_length(obj, "columns");
+
+ if (blkCtxt->ncols == 0) {
+ blkCtxt->npyCtxts = NULL;
+ blkCtxt->cindices = NULL;
+
+ GET_TC(tc)->iterNext = NpyArr_iterNextNone;
+ goto BLKRET;
+ }
+
+ blkCtxt->npyCtxts =
+ PyObject_Malloc(sizeof(NpyArrContext *) * blkCtxt->ncols);
+ if (!blkCtxt->npyCtxts) {
+ PyErr_NoMemory();
+ GET_TC(tc)->iterNext = NpyArr_iterNextNone;
+ goto BLKRET;
+ }
+ for (i = 0; i < blkCtxt->ncols; i++) {
+ blkCtxt->npyCtxts[i] = NULL;
+ }
+
+ blkCtxt->cindices = PyObject_Malloc(sizeof(int) * blkCtxt->ncols);
+ if (!blkCtxt->cindices) {
+ PyErr_NoMemory();
+ GET_TC(tc)->iterNext = NpyArr_iterNextNone;
+ goto BLKRET;
+ }
+
+ blocks = get_sub_attr(obj, "_data", "blocks");
+ if (!blocks) {
+ GET_TC(tc)->iterNext = NpyArr_iterNextNone;
+ goto BLKRET;
+ }
+
+ // force transpose so each NpyArrContext strides down its column
+ GET_TC(tc)->transpose = 1;
+
+ for (i = 0; i < PyObject_Length(blocks); i++) {
+ block = get_item(blocks, i);
+ if (!block) {
+ GET_TC(tc)->iterNext = NpyArr_iterNextNone;
+ goto BLKRET;
+ }
+
+ tmp = get_values(block);
+ if (!tmp) {
+ ((JSONObjectEncoder *)tc->encoder)->errorMsg = "";
+ Py_DECREF(block);
+ GET_TC(tc)->iterNext = NpyArr_iterNextNone;
+ goto BLKRET;
+ }
+
+ values = PyArray_Transpose((PyArrayObject *)tmp, NULL);
+ Py_DECREF(tmp);
+ if (!values) {
+ Py_DECREF(block);
+ GET_TC(tc)->iterNext = NpyArr_iterNextNone;
+ goto BLKRET;
+ }
+
+ locs = (PyArrayObject *)get_sub_attr(block, "mgr_locs", "as_array");
+ if (!locs) {
+ Py_DECREF(block);
+ Py_DECREF(values);
+ GET_TC(tc)->iterNext = NpyArr_iterNextNone;
+ goto BLKRET;
+ }
+
+ iter = NpyIter_New(locs, NPY_ITER_READONLY, NPY_KEEPORDER,
+ NPY_NO_CASTING, dtype);
+ if (!iter) {
+ Py_DECREF(block);
+ Py_DECREF(values);
+ Py_DECREF(locs);
+ GET_TC(tc)->iterNext = NpyArr_iterNextNone;
+ goto BLKRET;
+ }
+ iternext = NpyIter_GetIterNext(iter, NULL);
+ if (!iternext) {
+ NpyIter_Deallocate(iter);
+ Py_DECREF(block);
+ Py_DECREF(values);
+ Py_DECREF(locs);
+ GET_TC(tc)->iterNext = NpyArr_iterNextNone;
+ goto BLKRET;
+ }
+ dataptr = (npy_int64 **)NpyIter_GetDataPtrArray(iter);
+ do {
+ colIdx = **dataptr;
+ idx = NpyIter_GetIterIndex(iter);
+
+ blkCtxt->cindices[colIdx] = idx;
+
+ // Reference freed in PdBlock_iterEnd
+ Py_INCREF(values);
+ GET_TC(tc)->newObj = values;
+
+ // init a dedicated context for this column
+ NpyArr_iterBegin(obj, tc);
+ npyarr = GET_TC(tc)->npyarr;
+
+ // set the dataptr to our desired column and initialise
+ if (npyarr != NULL) {
+ npyarr->dataptr += npyarr->stride * idx;
+ NpyArr_iterNext(obj, tc);
+ }
+ GET_TC(tc)->itemValue = NULL;
+ ((PyObjectEncoder *)tc->encoder)->npyCtxtPassthru = NULL;
+
+ blkCtxt->npyCtxts[colIdx] = npyarr;
+ GET_TC(tc)->newObj = NULL;
+ } while (iternext(iter));
+
+ NpyIter_Deallocate(iter);
+ Py_DECREF(block);
+ Py_DECREF(values);
+ Py_DECREF(locs);
+ }
+ GET_TC(tc)->npyarr = blkCtxt->npyCtxts[0];
+
+BLKRET:
+ Py_XDECREF(dtype);
+ Py_XDECREF(blocks);
+}
+
+void PdBlock_iterEnd(JSOBJ obj, JSONTypeContext *tc) {
+ PdBlockContext *blkCtxt;
+ NpyArrContext *npyarr;
+ int i;
+ PRINTMARK();
+
+ GET_TC(tc)->itemValue = NULL;
+ npyarr = GET_TC(tc)->npyarr;
+
+ blkCtxt = GET_TC(tc)->pdblock;
+
+ if (blkCtxt) {
+ for (i = 0; i < blkCtxt->ncols; i++) {
+ npyarr = blkCtxt->npyCtxts[i];
+ if (npyarr) {
+ if (npyarr->array) {
+ Py_DECREF(npyarr->array);
+ npyarr->array = NULL;
+ }
+
+ GET_TC(tc)->npyarr = npyarr;
+ NpyArr_iterEnd(obj, tc);
+
+ blkCtxt->npyCtxts[i] = NULL;
+ }
+ }
+
+ if (blkCtxt->npyCtxts) {
+ PyObject_Free(blkCtxt->npyCtxts);
+ }
+ if (blkCtxt->cindices) {
+ PyObject_Free(blkCtxt->cindices);
+ }
+ PyObject_Free(blkCtxt);
+ }
+}
+
+//=============================================================================
+// Tuple iteration functions
+// itemValue is borrowed reference, no ref counting
+//=============================================================================
+void Tuple_iterBegin(JSOBJ obj, JSONTypeContext *tc) {
+ GET_TC(tc)->index = 0;
+ GET_TC(tc)->size = PyTuple_GET_SIZE((PyObject *)obj);
+ GET_TC(tc)->itemValue = NULL;
+}
+
+int Tuple_iterNext(JSOBJ obj, JSONTypeContext *tc) {
+ PyObject *item;
+
+ if (GET_TC(tc)->index >= GET_TC(tc)->size) {
+ return 0;
+ }
+
+ item = PyTuple_GET_ITEM(obj, GET_TC(tc)->index);
+
+ GET_TC(tc)->itemValue = item;
+ GET_TC(tc)->index++;
+ return 1;
+}
+
+void Tuple_iterEnd(JSOBJ obj, JSONTypeContext *tc) {}
+
+JSOBJ Tuple_iterGetValue(JSOBJ obj, JSONTypeContext *tc) {
+ return GET_TC(tc)->itemValue;
+}
+
+char *Tuple_iterGetName(JSOBJ obj, JSONTypeContext *tc, size_t *outLen) {
+ return NULL;
+}
+
+//=============================================================================
+// Iterator iteration functions
+// itemValue is borrowed reference, no ref counting
+//=============================================================================
+void Iter_iterBegin(JSOBJ obj, JSONTypeContext *tc) {
+ GET_TC(tc)->itemValue = NULL;
+ GET_TC(tc)->iterator = PyObject_GetIter(obj);
+}
+
+int Iter_iterNext(JSOBJ obj, JSONTypeContext *tc) {
+ PyObject *item;
+
+ if (GET_TC(tc)->itemValue) {
+ Py_DECREF(GET_TC(tc)->itemValue);
+ GET_TC(tc)->itemValue = NULL;
+ }
+
+ item = PyIter_Next(GET_TC(tc)->iterator);
+
+ if (item == NULL) {
+ return 0;
+ }
+
+ GET_TC(tc)->itemValue = item;
+ return 1;
+}
+
+void Iter_iterEnd(JSOBJ obj, JSONTypeContext *tc) {
+ if (GET_TC(tc)->itemValue) {
+ Py_DECREF(GET_TC(tc)->itemValue);
+ GET_TC(tc)->itemValue = NULL;
+ }
+
+ if (GET_TC(tc)->iterator) {
+ Py_DECREF(GET_TC(tc)->iterator);
+ GET_TC(tc)->iterator = NULL;
+ }
+}
+
+JSOBJ Iter_iterGetValue(JSOBJ obj, JSONTypeContext *tc) {
+ return GET_TC(tc)->itemValue;
+}
+
+char *Iter_iterGetName(JSOBJ obj, JSONTypeContext *tc, size_t *outLen) {
+ return NULL;
+}
+
+//=============================================================================
+// Dir iteration functions
+// itemName ref is borrowed from PyObject_Dir (attrList). No refcount
+// itemValue ref is from PyObject_GetAttr. Ref counted
+//=============================================================================
+void Dir_iterBegin(JSOBJ obj, JSONTypeContext *tc) {
+ GET_TC(tc)->attrList = PyObject_Dir(obj);
+ GET_TC(tc)->index = 0;
+ GET_TC(tc)->size = PyList_GET_SIZE(GET_TC(tc)->attrList);
+ PRINTMARK();
+}
+
+void Dir_iterEnd(JSOBJ obj, JSONTypeContext *tc) {
+ if (GET_TC(tc)->itemValue) {
+ Py_DECREF(GET_TC(tc)->itemValue);
+ GET_TC(tc)->itemValue = NULL;
+ }
+
+ if (GET_TC(tc)->itemName) {
+ Py_DECREF(GET_TC(tc)->itemName);
+ GET_TC(tc)->itemName = NULL;
+ }
+
+ Py_DECREF((PyObject *)GET_TC(tc)->attrList);
+ PRINTMARK();
+}
+
+int Dir_iterNext(JSOBJ _obj, JSONTypeContext *tc) {
+ PyObject *obj = (PyObject *)_obj;
+ PyObject *itemValue = GET_TC(tc)->itemValue;
+ PyObject *itemName = GET_TC(tc)->itemName;
+ PyObject *attr;
+ PyObject *attrName;
+ char *attrStr;
+
+ if (PyErr_Occurred() || ((JSONObjectEncoder *)tc->encoder)->errorMsg) {
+ return 0;
+ }
+
+ if (itemValue) {
+ Py_DECREF(GET_TC(tc)->itemValue);
+ GET_TC(tc)->itemValue = itemValue = NULL;
+ }
+
+ if (itemName) {
+ Py_DECREF(GET_TC(tc)->itemName);
+ GET_TC(tc)->itemName = itemName = NULL;
+ }
+
+ for (; GET_TC(tc)->index < GET_TC(tc)->size; GET_TC(tc)->index++) {
+ attrName = PyList_GET_ITEM(GET_TC(tc)->attrList, GET_TC(tc)->index);
+#if PY_MAJOR_VERSION >= 3
+ attr = PyUnicode_AsUTF8String(attrName);
+#else
+ attr = attrName;
+ Py_INCREF(attr);
+#endif
+ attrStr = PyString_AS_STRING(attr);
+
+ if (attrStr[0] == '_') {
+ PRINTMARK();
+ Py_DECREF(attr);
+ continue;
+ }
+
+ itemValue = PyObject_GetAttr(obj, attrName);
+ if (itemValue == NULL) {
+ PyErr_Clear();
+ Py_DECREF(attr);
+ PRINTMARK();
+ continue;
+ }
+
+ if (PyCallable_Check(itemValue)) {
+ Py_DECREF(itemValue);
+ Py_DECREF(attr);
+ PRINTMARK();
+ continue;
+ }
+
+ GET_TC(tc)->itemName = itemName;
+ GET_TC(tc)->itemValue = itemValue;
+ GET_TC(tc)->index++;
+
+ PRINTMARK();
+ itemName = attr;
+ break;
+ }
+
+ if (itemName == NULL) {
+ GET_TC(tc)->index = GET_TC(tc)->size;
+ GET_TC(tc)->itemValue = NULL;
+ return 0;
+ }
+
+ GET_TC(tc)->itemName = itemName;
+ GET_TC(tc)->itemValue = itemValue;
+ GET_TC(tc)->index++;
+
+ PRINTMARK();
+ return 1;
+}
+
+JSOBJ Dir_iterGetValue(JSOBJ obj, JSONTypeContext *tc) {
+ PRINTMARK();
+ return GET_TC(tc)->itemValue;
+}
+
+char *Dir_iterGetName(JSOBJ obj, JSONTypeContext *tc, size_t *outLen) {
+ PRINTMARK();
+ *outLen = PyString_GET_SIZE(GET_TC(tc)->itemName);
+ return PyString_AS_STRING(GET_TC(tc)->itemName);
+}
+
+//=============================================================================
+// List iteration functions
+// itemValue is borrowed from object (which is list). No refcounting
+//=============================================================================
+void List_iterBegin(JSOBJ obj, JSONTypeContext *tc) {
+ GET_TC(tc)->index = 0;
+ GET_TC(tc)->size = PyList_GET_SIZE((PyObject *)obj);
+}
+
+int List_iterNext(JSOBJ obj, JSONTypeContext *tc) {
+ if (GET_TC(tc)->index >= GET_TC(tc)->size) {
+ PRINTMARK();
+ return 0;
+ }
+
+ GET_TC(tc)->itemValue = PyList_GET_ITEM(obj, GET_TC(tc)->index);
+ GET_TC(tc)->index++;
+ return 1;
+}
+
+void List_iterEnd(JSOBJ obj, JSONTypeContext *tc) {}
+
+JSOBJ List_iterGetValue(JSOBJ obj, JSONTypeContext *tc) {
+ return GET_TC(tc)->itemValue;
+}
+
+char *List_iterGetName(JSOBJ obj, JSONTypeContext *tc, size_t *outLen) {
+ return NULL;
+}
+
+//=============================================================================
+// pandas Index iteration functions
+//=============================================================================
+void Index_iterBegin(JSOBJ obj, JSONTypeContext *tc) {
+ GET_TC(tc)->index = 0;
+ GET_TC(tc)->cStr = PyObject_Malloc(20 * sizeof(char));
+ if (!GET_TC(tc)->cStr) {
+ PyErr_NoMemory();
+ }
+ PRINTMARK();
+}
+
+int Index_iterNext(JSOBJ obj, JSONTypeContext *tc) {
+ Py_ssize_t index;
+ if (!GET_TC(tc)->cStr) {
+ return 0;
+ }
+
+ index = GET_TC(tc)->index;
+ Py_XDECREF(GET_TC(tc)->itemValue);
+ if (index == 0) {
+ memcpy(GET_TC(tc)->cStr, "name", sizeof(char) * 5);
+ GET_TC(tc)->itemValue = PyObject_GetAttrString(obj, "name");
+ } else if (index == 1) {
+ memcpy(GET_TC(tc)->cStr, "data", sizeof(char) * 5);
+ GET_TC(tc)->itemValue = get_values(obj);
+ if (!GET_TC(tc)->itemValue) {
+ return 0;
+ }
+ } else {
+ PRINTMARK();
+ return 0;
+ }
+
+ GET_TC(tc)->index++;
+ PRINTMARK();
+ return 1;
+}
+
+void Index_iterEnd(JSOBJ obj, JSONTypeContext *tc) { PRINTMARK(); }
+
+JSOBJ Index_iterGetValue(JSOBJ obj, JSONTypeContext *tc) {
+ return GET_TC(tc)->itemValue;
+}
+
+char *Index_iterGetName(JSOBJ obj, JSONTypeContext *tc, size_t *outLen) {
+ *outLen = strlen(GET_TC(tc)->cStr);
+ return GET_TC(tc)->cStr;
+}
+
+//=============================================================================
+// pandas Series iteration functions
+//=============================================================================
+void Series_iterBegin(JSOBJ obj, JSONTypeContext *tc) {
+ PyObjectEncoder *enc = (PyObjectEncoder *)tc->encoder;
+ GET_TC(tc)->index = 0;
+ GET_TC(tc)->cStr = PyObject_Malloc(20 * sizeof(char));
+ enc->outputFormat = VALUES; // for contained series
+ if (!GET_TC(tc)->cStr) {
+ PyErr_NoMemory();
+ }
+ PRINTMARK();
+}
+
+int Series_iterNext(JSOBJ obj, JSONTypeContext *tc) {
+ Py_ssize_t index;
+ if (!GET_TC(tc)->cStr) {
+ return 0;
+ }
+
+ index = GET_TC(tc)->index;
+ Py_XDECREF(GET_TC(tc)->itemValue);
+ if (index == 0) {
+ memcpy(GET_TC(tc)->cStr, "name", sizeof(char) * 5);
+ GET_TC(tc)->itemValue = PyObject_GetAttrString(obj, "name");
+ } else if (index == 1) {
+ memcpy(GET_TC(tc)->cStr, "index", sizeof(char) * 6);
+ GET_TC(tc)->itemValue = PyObject_GetAttrString(obj, "index");
+ } else if (index == 2) {
+ memcpy(GET_TC(tc)->cStr, "data", sizeof(char) * 5);
+ GET_TC(tc)->itemValue = get_values(obj);
+ if (!GET_TC(tc)->itemValue) {
+ return 0;
+ }
+ } else {
+ PRINTMARK();
+ return 0;
+ }
+
+ GET_TC(tc)->index++;
+ PRINTMARK();
+ return 1;
+}
+
+void Series_iterEnd(JSOBJ obj, JSONTypeContext *tc) {
+ PyObjectEncoder *enc = (PyObjectEncoder *)tc->encoder;
+ enc->outputFormat = enc->originalOutputFormat;
+ PRINTMARK();
+}
+
+JSOBJ Series_iterGetValue(JSOBJ obj, JSONTypeContext *tc) {
+ return GET_TC(tc)->itemValue;
+}
+
+char *Series_iterGetName(JSOBJ obj, JSONTypeContext *tc, size_t *outLen) {
+ *outLen = strlen(GET_TC(tc)->cStr);
+ return GET_TC(tc)->cStr;
+}
+
+//=============================================================================
+// pandas DataFrame iteration functions
+//=============================================================================
+void DataFrame_iterBegin(JSOBJ obj, JSONTypeContext *tc) {
+ PyObjectEncoder *enc = (PyObjectEncoder *)tc->encoder;
+ GET_TC(tc)->index = 0;
+ GET_TC(tc)->cStr = PyObject_Malloc(20 * sizeof(char));
+ enc->outputFormat = VALUES; // for contained series & index
+ if (!GET_TC(tc)->cStr) {
+ PyErr_NoMemory();
+ }
+ PRINTMARK();
+}
+
+int DataFrame_iterNext(JSOBJ obj, JSONTypeContext *tc) {
+ Py_ssize_t index;
+ if (!GET_TC(tc)->cStr) {
+ return 0;
+ }
+
+ index = GET_TC(tc)->index;
+ Py_XDECREF(GET_TC(tc)->itemValue);
+ if (index == 0) {
+ memcpy(GET_TC(tc)->cStr, "columns", sizeof(char) * 8);
+ GET_TC(tc)->itemValue = PyObject_GetAttrString(obj, "columns");
+ } else if (index == 1) {
+ memcpy(GET_TC(tc)->cStr, "index", sizeof(char) * 6);
+ GET_TC(tc)->itemValue = PyObject_GetAttrString(obj, "index");
+ } else if (index == 2) {
+ memcpy(GET_TC(tc)->cStr, "data", sizeof(char) * 5);
+ if (is_simple_frame(obj)) {
+ GET_TC(tc)->itemValue = get_values(obj);
+ if (!GET_TC(tc)->itemValue) {
+ return 0;
+ }
+ } else {
+ Py_INCREF(obj);
+ GET_TC(tc)->itemValue = obj;
+ }
+ } else {
+ PRINTMARK();
+ return 0;
+ }
+
+ GET_TC(tc)->index++;
+ PRINTMARK();
+ return 1;
+}
+
+void DataFrame_iterEnd(JSOBJ obj, JSONTypeContext *tc) {
+ PyObjectEncoder *enc = (PyObjectEncoder *)tc->encoder;
+ enc->outputFormat = enc->originalOutputFormat;
+ PRINTMARK();
+}
+
+JSOBJ DataFrame_iterGetValue(JSOBJ obj, JSONTypeContext *tc) {
+ return GET_TC(tc)->itemValue;
+}
+
+char *DataFrame_iterGetName(JSOBJ obj, JSONTypeContext *tc, size_t *outLen) {
+ *outLen = strlen(GET_TC(tc)->cStr);
+ return GET_TC(tc)->cStr;
+}
+
+//=============================================================================
+// Dict iteration functions
+// itemName might be converted to a string (via PyObject_Str). Do refcounting
+// itemValue is borrowed from object (which is dict). No refCounting
+//=============================================================================
+void Dict_iterBegin(JSOBJ obj, JSONTypeContext *tc) {
+ GET_TC(tc)->index = 0;
+ PRINTMARK();
+}
+
+int Dict_iterNext(JSOBJ obj, JSONTypeContext *tc) {
+#if PY_MAJOR_VERSION >= 3
+ PyObject *itemNameTmp;
+#endif
+
+ if (GET_TC(tc)->itemName) {
+ Py_DECREF(GET_TC(tc)->itemName);
+ GET_TC(tc)->itemName = NULL;
+ }
+
+ if (!PyDict_Next((PyObject *)GET_TC(tc)->dictObj, &GET_TC(tc)->index,
+ &GET_TC(tc)->itemName, &GET_TC(tc)->itemValue)) {
+ PRINTMARK();
+ return 0;
+ }
+
+ if (PyUnicode_Check(GET_TC(tc)->itemName)) {
+ GET_TC(tc)->itemName = PyUnicode_AsUTF8String(GET_TC(tc)->itemName);
+ } else if (!PyString_Check(GET_TC(tc)->itemName)) {
+ GET_TC(tc)->itemName = PyObject_Str(GET_TC(tc)->itemName);
+#if PY_MAJOR_VERSION >= 3
+ itemNameTmp = GET_TC(tc)->itemName;
+ GET_TC(tc)->itemName = PyUnicode_AsUTF8String(GET_TC(tc)->itemName);
+ Py_DECREF(itemNameTmp);
+#endif
+ } else {
+ Py_INCREF(GET_TC(tc)->itemName);
+ }
+ PRINTMARK();
+ return 1;
+}
+
+void Dict_iterEnd(JSOBJ obj, JSONTypeContext *tc) {
+ if (GET_TC(tc)->itemName) {
+ Py_DECREF(GET_TC(tc)->itemName);
+ GET_TC(tc)->itemName = NULL;
+ }
+ Py_DECREF(GET_TC(tc)->dictObj);
+ PRINTMARK();
+}
+
+JSOBJ Dict_iterGetValue(JSOBJ obj, JSONTypeContext *tc) {
+ return GET_TC(tc)->itemValue;
+}
+
+char *Dict_iterGetName(JSOBJ obj, JSONTypeContext *tc, size_t *outLen) {
+ *outLen = PyString_GET_SIZE(GET_TC(tc)->itemName);
+ return PyString_AS_STRING(GET_TC(tc)->itemName);
+}
+
+void NpyArr_freeLabels(char **labels, npy_intp len) {
+ npy_intp i;
+
+ if (labels) {
+ for (i = 0; i < len; i++) {
+ PyObject_Free(labels[i]);
+ }
+ PyObject_Free(labels);
+ }
+}
+
+char **NpyArr_encodeLabels(PyArrayObject *labels, JSONObjectEncoder *enc,
+ npy_intp num) {
+ // NOTE this function steals a reference to labels.
+ PyObjectEncoder *pyenc = (PyObjectEncoder *)enc;
+ PyObject *item = NULL;
+ npy_intp i, stride, len, need_quotes;
+ char **ret;
+ char *dataptr, *cLabel, *origend, *origst, *origoffset;
+ char labelBuffer[NPY_JSON_BUFSIZE];
+ PyArray_GetItemFunc *getitem;
+ int type_num;
+ PRINTMARK();
+
+ if (!labels) {
+ return 0;
+ }
+
+ if (PyArray_SIZE(labels) < num) {
+ PyErr_SetString(
+ PyExc_ValueError,
+ "Label array sizes do not match corresponding data shape");
+ Py_DECREF(labels);
+ return 0;
+ }
+
+ ret = PyObject_Malloc(sizeof(char *) * num);
+ if (!ret) {
+ PyErr_NoMemory();
+ Py_DECREF(labels);
+ return 0;
+ }
+
+ for (i = 0; i < num; i++) {
+ ret[i] = NULL;
+ }
+
+ origst = enc->start;
+ origend = enc->end;
+ origoffset = enc->offset;
+
+ stride = PyArray_STRIDE(labels, 0);
+ dataptr = PyArray_DATA(labels);
+ getitem = (PyArray_GetItemFunc *)PyArray_DESCR(labels)->f->getitem;
+ type_num = PyArray_TYPE(labels);
+
+ for (i = 0; i < num; i++) {
+#if NPY_API_VERSION < 0x00000007
+ if (PyTypeNum_ISDATETIME(type_num)) {
+ item = PyArray_ToScalar(dataptr, labels);
+ } else if (PyTypeNum_ISNUMBER(type_num)) // NOLINT
+#else
+ if (PyTypeNum_ISDATETIME(type_num) || PyTypeNum_ISNUMBER(type_num)) // NOLINT
+#endif
+ {
+ item = (PyObject *)labels;
+ pyenc->npyType = type_num;
+ pyenc->npyValue = dataptr;
+ } else {
+ item = getitem(dataptr, labels);
+ if (!item) {
+ NpyArr_freeLabels(ret, num);
+ ret = 0;
+ break;
+ }
+ }
+
+ cLabel = JSON_EncodeObject(item, enc, labelBuffer, NPY_JSON_BUFSIZE);
+
+ if (item != (PyObject *)labels) {
+ Py_DECREF(item);
+ }
+
+ if (PyErr_Occurred() || enc->errorMsg) {
+ NpyArr_freeLabels(ret, num);
+ ret = 0;
+ break;
+ }
+
+ need_quotes = ((*cLabel) != '"');
+ len = enc->offset - cLabel + 1 + 2 * need_quotes;
+ ret[i] = PyObject_Malloc(sizeof(char) * len);
+
+ if (!ret[i]) {
+ PyErr_NoMemory();
+ ret = 0;
+ break;
+ }
+
+ if (need_quotes) {
+ ret[i][0] = '"';
+ memcpy(ret[i] + 1, cLabel, sizeof(char) * (len - 4));
+ ret[i][len - 3] = '"';
+ } else {
+ memcpy(ret[i], cLabel, sizeof(char) * (len - 2));
+ }
+ ret[i][len - 2] = ':';
+ ret[i][len - 1] = '\0';
+ dataptr += stride;
+ }
+
+ enc->start = origst;
+ enc->end = origend;
+ enc->offset = origoffset;
+
+ Py_DECREF(labels);
+ return ret;
+}
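+
+// For example, an int64 label 2017 is stored here as the C string "2017":
+// including the surrounding quotes (added because the JSON encoder emitted a
+// bare number) and the trailing ':'. NpyArr_getLabel later copies the
+// pre-encoded label verbatim into the output buffer and reports a zero-length
+// name back to the encoder, so the name is not written a second time.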
+
+void Object_invokeDefaultHandler(PyObject *obj, PyObjectEncoder *enc) {
+ PyObject *tmpObj = NULL;
+ PRINTMARK();
+ tmpObj = PyObject_CallFunctionObjArgs(enc->defaultHandler, obj, NULL);
+ if (!PyErr_Occurred()) {
+ if (tmpObj == NULL) {
+ PyErr_SetString(PyExc_TypeError,
+ "Failed to execute default handler");
+ } else {
+ encode(tmpObj, (JSONObjectEncoder *)enc, NULL, 0);
+ }
+ }
+ Py_XDECREF(tmpObj);
+ return;
+}
+
+void Object_beginTypeContext(JSOBJ _obj, JSONTypeContext *tc) {
+ PyObject *obj, *exc, *toDictFunc, *tmpObj, *values;
+ TypeContext *pc;
+ PyObjectEncoder *enc;
+ double val;
+ npy_int64 value;
+ int base;
+ PRINTMARK();
+
+ tc->prv = NULL;
+
+ if (!_obj) {
+ tc->type = JT_INVALID;
+ return;
+ }
+
+ obj = (PyObject *)_obj;
+ enc = (PyObjectEncoder *)tc->encoder;
+
+ if (enc->npyType >= 0) {
+ PRINTMARK();
+ tc->prv = &(enc->basicTypeContext);
+ tc->type = NpyTypeToJSONType(obj, tc, enc->npyType, enc->npyValue);
+
+ if (tc->type == JT_INVALID) {
+ if (enc->defaultHandler) {
+ enc->npyType = -1;
+ PRINTMARK();
+ Object_invokeDefaultHandler(
+ enc->npyCtxtPassthru->getitem(enc->npyValue,
+ enc->npyCtxtPassthru->array),
+ enc);
+ } else {
+ PyErr_Format(PyExc_RuntimeError, "Unhandled numpy dtype %d",
+ enc->npyType);
+ }
+ }
+ enc->npyCtxtPassthru = NULL;
+ enc->npyType = -1;
+ return;
+ }
+
+ if (PyBool_Check(obj)) {
+ PRINTMARK();
+ tc->type = (obj == Py_True) ? JT_TRUE : JT_FALSE;
+ return;
+ } else if (obj == Py_None) {
+ PRINTMARK();
+ tc->type = JT_NULL;
+ return;
+ }
+
+ pc = createTypeContext();
+ if (!pc) {
+ tc->type = JT_INVALID;
+ return;
+ }
+ tc->prv = pc;
+
+ if (PyIter_Check(obj) ||
+ (PyArray_Check(obj) && !PyArray_CheckScalar(obj))) {
+ PRINTMARK();
+ goto ISITERABLE;
+ }
+
+ if (PyLong_Check(obj)) {
+ PRINTMARK();
+ pc->PyTypeToJSON = PyLongToINT64;
+ tc->type = JT_LONG;
+ GET_TC(tc)->longValue = PyLong_AsLongLong(obj);
+
+ exc = PyErr_Occurred();
+
+ if (exc && PyErr_ExceptionMatches(PyExc_OverflowError)) {
+ PRINTMARK();
+ goto INVALID;
+ }
+
+ return;
+ } else if (PyInt_Check(obj)) {
+ PRINTMARK();
+
+#ifdef _LP64
+ pc->PyTypeToJSON = PyIntToINT64;
+ tc->type = JT_LONG;
+#else
+ pc->PyTypeToJSON = PyIntToINT32;
+ tc->type = JT_INT;
+#endif
+ return;
+ } else if (PyFloat_Check(obj)) {
+ PRINTMARK();
+ val = PyFloat_AS_DOUBLE(obj);
+ if (npy_isnan(val) || npy_isinf(val)) {
+ tc->type = JT_NULL;
+ } else {
+ pc->PyTypeToJSON = PyFloatToDOUBLE;
+ tc->type = JT_DOUBLE;
+ }
+ return;
+ } else if (PyString_Check(obj)) {
+ PRINTMARK();
+ pc->PyTypeToJSON = PyStringToUTF8;
+ tc->type = JT_UTF8;
+ return;
+ } else if (PyUnicode_Check(obj)) {
+ PRINTMARK();
+ pc->PyTypeToJSON = PyUnicodeToUTF8;
+ tc->type = JT_UTF8;
+ return;
+ } else if (PyObject_IsInstance(obj, type_decimal)) {
+ PRINTMARK();
+ pc->PyTypeToJSON = PyFloatToDOUBLE;
+ tc->type = JT_DOUBLE;
+ return;
+ } else if (PyDateTime_Check(obj) || PyDate_Check(obj)) {
+ if (PyObject_TypeCheck(obj, cls_nat)) {
+ PRINTMARK();
+ tc->type = JT_NULL;
+ return;
+ }
+
+ PRINTMARK();
+ pc->PyTypeToJSON = PyDateTimeToJSON;
+ if (enc->datetimeIso) {
+ PRINTMARK();
+ tc->type = JT_UTF8;
+ } else {
+ PRINTMARK();
+ tc->type = JT_LONG;
+ }
+ return;
+ } else if (PyTime_Check(obj)) {
+ PRINTMARK();
+ pc->PyTypeToJSON = PyTimeToJSON;
+ tc->type = JT_UTF8;
+ return;
+ } else if (PyArray_IsScalar(obj, Datetime)) {
+ PRINTMARK();
+ if (((PyDatetimeScalarObject *)obj)->obval == get_nat()) {
+ PRINTMARK();
+ tc->type = JT_NULL;
+ return;
+ }
+
+ PRINTMARK();
+ pc->PyTypeToJSON = NpyDateTimeScalarToJSON;
+ tc->type = enc->datetimeIso ? JT_UTF8 : JT_LONG;
+ return;
+ } else if (PyDelta_Check(obj)) {
+ if (PyObject_HasAttrString(obj, "value")) {
+ PRINTMARK();
+ value = get_long_attr(obj, "value");
+ } else {
+ PRINTMARK();
+ value =
+ total_seconds(obj) * 1000000000LL; // nanoseconds per second
+ }
+
+ base = ((PyObjectEncoder *)tc->encoder)->datetimeUnit;
+ switch (base) {
+ case NPY_FR_ns:
+ break;
+ case NPY_FR_us:
+ value /= 1000LL;
+ break;
+ case NPY_FR_ms:
+ value /= 1000000LL;
+ break;
+ case NPY_FR_s:
+ value /= 1000000000LL;
+ break;
+ }
+
+ exc = PyErr_Occurred();
+
+ if (exc && PyErr_ExceptionMatches(PyExc_OverflowError)) {
+ PRINTMARK();
+ goto INVALID;
+ }
+
+ if (value == get_nat()) {
+ PRINTMARK();
+ tc->type = JT_NULL;
+ return;
+ }
+
+ GET_TC(tc)->longValue = value;
+
+ PRINTMARK();
+ pc->PyTypeToJSON = PyLongToINT64;
+ tc->type = JT_LONG;
+ return;
+ } else if (PyArray_IsScalar(obj, Integer)) {
+ PRINTMARK();
+ pc->PyTypeToJSON = PyLongToINT64;
+ tc->type = JT_LONG;
+ PyArray_CastScalarToCtype(obj, &(GET_TC(tc)->longValue),
+ PyArray_DescrFromType(NPY_INT64));
+
+ exc = PyErr_Occurred();
+
+ if (exc && PyErr_ExceptionMatches(PyExc_OverflowError)) {
+ PRINTMARK();
+ goto INVALID;
+ }
+
+ return;
+ } else if (PyArray_IsScalar(obj, Bool)) {
+ PRINTMARK();
+ PyArray_CastScalarToCtype(obj, &(GET_TC(tc)->longValue),
+ PyArray_DescrFromType(NPY_BOOL));
+ tc->type = (GET_TC(tc)->longValue) ? JT_TRUE : JT_FALSE;
+ return;
+ } else if (PyArray_IsScalar(obj, Float) || PyArray_IsScalar(obj, Double)) {
+ PRINTMARK();
+ pc->PyTypeToJSON = NpyFloatToDOUBLE;
+ tc->type = JT_DOUBLE;
+ return;
+ } else if (PyArray_Check(obj) && PyArray_CheckScalar(obj)) {
+ tmpObj = PyObject_Repr(obj);
+ PyErr_Format(PyExc_TypeError,
+ "%s (0d array) is not JSON serializable at the moment",
+ PyString_AS_STRING(tmpObj));
+ Py_DECREF(tmpObj);
+ goto INVALID;
+ }
+
+ISITERABLE:
+
+ if (PyObject_TypeCheck(obj, cls_index)) {
+ if (enc->outputFormat == SPLIT) {
+ PRINTMARK();
+ tc->type = JT_OBJECT;
+ pc->iterBegin = Index_iterBegin;
+ pc->iterEnd = Index_iterEnd;
+ pc->iterNext = Index_iterNext;
+ pc->iterGetValue = Index_iterGetValue;
+ pc->iterGetName = Index_iterGetName;
+ return;
+ }
+
+ pc->newObj = get_values(obj);
+ if (pc->newObj) {
+ PRINTMARK();
+ tc->type = JT_ARRAY;
+ pc->iterBegin = NpyArr_iterBegin;
+ pc->iterEnd = NpyArr_iterEnd;
+ pc->iterNext = NpyArr_iterNext;
+ pc->iterGetValue = NpyArr_iterGetValue;
+ pc->iterGetName = NpyArr_iterGetName;
+ } else {
+ goto INVALID;
+ }
+
+ return;
+ } else if (PyObject_TypeCheck(obj, cls_series)) {
+ if (enc->outputFormat == SPLIT) {
+ PRINTMARK();
+ tc->type = JT_OBJECT;
+ pc->iterBegin = Series_iterBegin;
+ pc->iterEnd = Series_iterEnd;
+ pc->iterNext = Series_iterNext;
+ pc->iterGetValue = Series_iterGetValue;
+ pc->iterGetName = Series_iterGetName;
+ return;
+ }
+
+ pc->newObj = get_values(obj);
+ if (!pc->newObj) {
+ goto INVALID;
+ }
+
+ if (enc->outputFormat == INDEX || enc->outputFormat == COLUMNS) {
+ PRINTMARK();
+ tc->type = JT_OBJECT;
+ tmpObj = PyObject_GetAttrString(obj, "index");
+ if (!tmpObj) {
+ goto INVALID;
+ }
+ values = get_values(tmpObj);
+ Py_DECREF(tmpObj);
+ if (!values) {
+ goto INVALID;
+ }
+ pc->columnLabelsLen = PyArray_DIM(pc->newObj, 0);
+ pc->columnLabels = NpyArr_encodeLabels((PyArrayObject *)values,
+ (JSONObjectEncoder *)enc,
+ pc->columnLabelsLen);
+ if (!pc->columnLabels) {
+ goto INVALID;
+ }
+ } else {
+ PRINTMARK();
+ tc->type = JT_ARRAY;
+ }
+ pc->iterBegin = NpyArr_iterBegin;
+ pc->iterEnd = NpyArr_iterEnd;
+ pc->iterNext = NpyArr_iterNext;
+ pc->iterGetValue = NpyArr_iterGetValue;
+ pc->iterGetName = NpyArr_iterGetName;
+ return;
+ } else if (PyArray_Check(obj)) {
+ if (enc->npyCtxtPassthru) {
+ PRINTMARK();
+ pc->npyarr = enc->npyCtxtPassthru;
+ tc->type = (pc->npyarr->columnLabels ? JT_OBJECT : JT_ARRAY);
+
+ pc->iterBegin = NpyArrPassThru_iterBegin;
+ pc->iterNext = NpyArr_iterNext;
+ pc->iterEnd = NpyArrPassThru_iterEnd;
+ pc->iterGetValue = NpyArr_iterGetValue;
+ pc->iterGetName = NpyArr_iterGetName;
+
+ enc->npyCtxtPassthru = NULL;
+ return;
+ }
+
+ PRINTMARK();
+ tc->type = JT_ARRAY;
+ pc->iterBegin = NpyArr_iterBegin;
+ pc->iterEnd = NpyArr_iterEnd;
+ pc->iterNext = NpyArr_iterNext;
+ pc->iterGetValue = NpyArr_iterGetValue;
+ pc->iterGetName = NpyArr_iterGetName;
+ return;
+ } else if (PyObject_TypeCheck(obj, cls_dataframe)) {
+ if (enc->blkCtxtPassthru) {
+ PRINTMARK();
+ pc->pdblock = enc->blkCtxtPassthru;
+ tc->type =
+ (pc->pdblock->npyCtxts[0]->columnLabels ? JT_OBJECT : JT_ARRAY);
+
+ pc->iterBegin = PdBlockPassThru_iterBegin;
+ pc->iterEnd = PdBlockPassThru_iterEnd;
+ pc->iterNext = PdBlock_iterNextItem;
+ pc->iterGetName = PdBlock_iterGetName;
+ pc->iterGetValue = NpyArr_iterGetValue;
+
+ enc->blkCtxtPassthru = NULL;
+ return;
+ }
+
+ if (enc->outputFormat == SPLIT) {
+ PRINTMARK();
+ tc->type = JT_OBJECT;
+ pc->iterBegin = DataFrame_iterBegin;
+ pc->iterEnd = DataFrame_iterEnd;
+ pc->iterNext = DataFrame_iterNext;
+ pc->iterGetValue = DataFrame_iterGetValue;
+ pc->iterGetName = DataFrame_iterGetName;
+ return;
+ }
+
+ PRINTMARK();
+ if (is_simple_frame(obj)) {
+ pc->iterBegin = NpyArr_iterBegin;
+ pc->iterEnd = NpyArr_iterEnd;
+ pc->iterNext = NpyArr_iterNext;
+ pc->iterGetName = NpyArr_iterGetName;
+
+ pc->newObj = get_values(obj);
+ if (!pc->newObj) {
+ goto INVALID;
+ }
+ } else {
+ pc->iterBegin = PdBlock_iterBegin;
+ pc->iterEnd = PdBlock_iterEnd;
+ pc->iterNext = PdBlock_iterNext;
+ pc->iterGetName = PdBlock_iterGetName;
+ }
+ pc->iterGetValue = NpyArr_iterGetValue;
+
+ if (enc->outputFormat == VALUES) {
+ PRINTMARK();
+ tc->type = JT_ARRAY;
+ } else if (enc->outputFormat == RECORDS) {
+ PRINTMARK();
+ tc->type = JT_ARRAY;
+ tmpObj = PyObject_GetAttrString(obj, "columns");
+ if (!tmpObj) {
+ goto INVALID;
+ }
+ values = get_values(tmpObj);
+ if (!values) {
+ Py_DECREF(tmpObj);
+ goto INVALID;
+ }
+ pc->columnLabelsLen = PyObject_Size(tmpObj);
+ pc->columnLabels = NpyArr_encodeLabels((PyArrayObject *)values,
+ (JSONObjectEncoder *)enc,
+ pc->columnLabelsLen);
+ Py_DECREF(tmpObj);
+ if (!pc->columnLabels) {
+ goto INVALID;
+ }
+ } else if (enc->outputFormat == INDEX || enc->outputFormat == COLUMNS) {
+ PRINTMARK();
+ tc->type = JT_OBJECT;
+ tmpObj = (enc->outputFormat == INDEX
+ ? PyObject_GetAttrString(obj, "index")
+ : PyObject_GetAttrString(obj, "columns"));
+ if (!tmpObj) {
+ goto INVALID;
+ }
+ values = get_values(tmpObj);
+ if (!values) {
+ Py_DECREF(tmpObj);
+ goto INVALID;
+ }
+ pc->rowLabelsLen = PyObject_Size(tmpObj);
+ pc->rowLabels =
+ NpyArr_encodeLabels((PyArrayObject *)values,
+ (JSONObjectEncoder *)enc, pc->rowLabelsLen);
+ Py_DECREF(tmpObj);
+ tmpObj = (enc->outputFormat == INDEX
+ ? PyObject_GetAttrString(obj, "columns")
+ : PyObject_GetAttrString(obj, "index"));
+ if (!tmpObj) {
+ NpyArr_freeLabels(pc->rowLabels, pc->rowLabelsLen);
+ pc->rowLabels = NULL;
+ goto INVALID;
+ }
+ values = get_values(tmpObj);
+ if (!values) {
+ Py_DECREF(tmpObj);
+ NpyArr_freeLabels(pc->rowLabels, pc->rowLabelsLen);
+ pc->rowLabels = NULL;
+ goto INVALID;
+ }
+ pc->columnLabelsLen = PyObject_Size(tmpObj);
+ pc->columnLabels = NpyArr_encodeLabels((PyArrayObject *)values,
+ (JSONObjectEncoder *)enc,
+ pc->columnLabelsLen);
+ Py_DECREF(tmpObj);
+ if (!pc->columnLabels) {
+ NpyArr_freeLabels(pc->rowLabels, pc->rowLabelsLen);
+ pc->rowLabels = NULL;
+ goto INVALID;
+ }
+
+ if (enc->outputFormat == COLUMNS) {
+ PRINTMARK();
+ pc->transpose = 1;
+ }
+ } else {
+ goto INVALID;
+ }
+ return;
+ } else if (PyDict_Check(obj)) {
+ PRINTMARK();
+ tc->type = JT_OBJECT;
+ pc->iterBegin = Dict_iterBegin;
+ pc->iterEnd = Dict_iterEnd;
+ pc->iterNext = Dict_iterNext;
+ pc->iterGetValue = Dict_iterGetValue;
+ pc->iterGetName = Dict_iterGetName;
+ pc->dictObj = obj;
+ Py_INCREF(obj);
+
+ return;
+ } else if (PyList_Check(obj)) {
+ PRINTMARK();
+ tc->type = JT_ARRAY;
+ pc->iterBegin = List_iterBegin;
+ pc->iterEnd = List_iterEnd;
+ pc->iterNext = List_iterNext;
+ pc->iterGetValue = List_iterGetValue;
+ pc->iterGetName = List_iterGetName;
+ return;
+ } else if (PyTuple_Check(obj)) {
+ PRINTMARK();
+ tc->type = JT_ARRAY;
+ pc->iterBegin = Tuple_iterBegin;
+ pc->iterEnd = Tuple_iterEnd;
+ pc->iterNext = Tuple_iterNext;
+ pc->iterGetValue = Tuple_iterGetValue;
+ pc->iterGetName = Tuple_iterGetName;
+ return;
+ } else if (PyAnySet_Check(obj)) {
+ PRINTMARK();
+ tc->type = JT_ARRAY;
+ pc->iterBegin = Iter_iterBegin;
+ pc->iterEnd = Iter_iterEnd;
+ pc->iterNext = Iter_iterNext;
+ pc->iterGetValue = Iter_iterGetValue;
+ pc->iterGetName = Iter_iterGetName;
+ return;
+ }
+
+ toDictFunc = PyObject_GetAttrString(obj, "toDict");
+
+ if (toDictFunc) {
+ PyObject *tuple = PyTuple_New(0);
+ PyObject *toDictResult = PyObject_Call(toDictFunc, tuple, NULL);
+ Py_DECREF(tuple);
+ Py_DECREF(toDictFunc);
+
+ if (toDictResult == NULL) {
+ PyErr_Clear();
+ tc->type = JT_NULL;
+ return;
+ }
+
+ if (!PyDict_Check(toDictResult)) {
+ Py_DECREF(toDictResult);
+ tc->type = JT_NULL;
+ return;
+ }
+
+ PRINTMARK();
+ tc->type = JT_OBJECT;
+ pc->iterBegin = Dict_iterBegin;
+ pc->iterEnd = Dict_iterEnd;
+ pc->iterNext = Dict_iterNext;
+ pc->iterGetValue = Dict_iterGetValue;
+ pc->iterGetName = Dict_iterGetName;
+ pc->dictObj = toDictResult;
+ return;
+ }
+
+ PyErr_Clear();
+
+ if (enc->defaultHandler) {
+ Object_invokeDefaultHandler(obj, enc);
+ goto INVALID;
+ }
+
+ PRINTMARK();
+ tc->type = JT_OBJECT;
+ pc->iterBegin = Dir_iterBegin;
+ pc->iterEnd = Dir_iterEnd;
+ pc->iterNext = Dir_iterNext;
+ pc->iterGetValue = Dir_iterGetValue;
+ pc->iterGetName = Dir_iterGetName;
+ return;
+
+INVALID:
+ tc->type = JT_INVALID;
+ PyObject_Free(tc->prv);
+ tc->prv = NULL;
+ return;
+}
+
+void Object_endTypeContext(JSOBJ obj, JSONTypeContext *tc) {
+ PRINTMARK();
+ if (tc->prv) {
+ Py_XDECREF(GET_TC(tc)->newObj);
+ GET_TC(tc)->newObj = NULL;
+ NpyArr_freeLabels(GET_TC(tc)->rowLabels, GET_TC(tc)->rowLabelsLen);
+ GET_TC(tc)->rowLabels = NULL;
+ NpyArr_freeLabels(GET_TC(tc)->columnLabels,
+ GET_TC(tc)->columnLabelsLen);
+ GET_TC(tc)->columnLabels = NULL;
+
+ PyObject_Free(GET_TC(tc)->cStr);
+ GET_TC(tc)->cStr = NULL;
+ if (tc->prv != &(((PyObjectEncoder *)tc->encoder)->basicTypeContext)) { // NOLINT
+ PyObject_Free(tc->prv);
+ }
+ tc->prv = NULL;
+ }
+}
+
+const char *Object_getStringValue(JSOBJ obj, JSONTypeContext *tc,
+ size_t *_outLen) {
+ return GET_TC(tc)->PyTypeToJSON(obj, tc, NULL, _outLen);
+}
+
+JSINT64 Object_getLongValue(JSOBJ obj, JSONTypeContext *tc) {
+ JSINT64 ret;
+ GET_TC(tc)->PyTypeToJSON(obj, tc, &ret, NULL);
+ return ret;
+}
+
+JSINT32 Object_getIntValue(JSOBJ obj, JSONTypeContext *tc) {
+ JSINT32 ret;
+ GET_TC(tc)->PyTypeToJSON(obj, tc, &ret, NULL);
+ return ret;
+}
+
+double Object_getDoubleValue(JSOBJ obj, JSONTypeContext *tc) {
+ double ret;
+ GET_TC(tc)->PyTypeToJSON(obj, tc, &ret, NULL);
+ return ret;
+}
+
+static void Object_releaseObject(JSOBJ _obj) { Py_DECREF((PyObject *)_obj); }
+
+void Object_iterBegin(JSOBJ obj, JSONTypeContext *tc) {
+ GET_TC(tc)->iterBegin(obj, tc);
+}
+
+int Object_iterNext(JSOBJ obj, JSONTypeContext *tc) {
+ return GET_TC(tc)->iterNext(obj, tc);
+}
+
+void Object_iterEnd(JSOBJ obj, JSONTypeContext *tc) {
+ GET_TC(tc)->iterEnd(obj, tc);
+}
+
+JSOBJ Object_iterGetValue(JSOBJ obj, JSONTypeContext *tc) {
+ return GET_TC(tc)->iterGetValue(obj, tc);
+}
+
+char *Object_iterGetName(JSOBJ obj, JSONTypeContext *tc, size_t *outLen) {
+ return GET_TC(tc)->iterGetName(obj, tc, outLen);
+}
+
+PyObject *objToJSON(PyObject *self, PyObject *args, PyObject *kwargs) {
+ static char *kwlist[] = {
+ "obj", "ensure_ascii", "double_precision", "encode_html_chars",
+ "orient", "date_unit", "iso_dates", "default_handler",
+ NULL};
+
+ char buffer[65536];
+ char *ret;
+ PyObject *newobj;
+ PyObject *oinput = NULL;
+ PyObject *oensureAscii = NULL;
+ int idoublePrecision = 10; // default double precision setting
+ PyObject *oencodeHTMLChars = NULL;
+ char *sOrient = NULL;
+ char *sdateFormat = NULL;
+ PyObject *oisoDates = 0;
+ PyObject *odefHandler = 0;
+
+ PyObjectEncoder pyEncoder = {{
+ Object_beginTypeContext,
+ Object_endTypeContext,
+ Object_getStringValue,
+ Object_getLongValue,
+ Object_getIntValue,
+ Object_getDoubleValue,
+ Object_iterBegin,
+ Object_iterNext,
+ Object_iterEnd,
+ Object_iterGetValue,
+ Object_iterGetName,
+ Object_releaseObject,
+ PyObject_Malloc,
+ PyObject_Realloc,
+ PyObject_Free,
+ -1, // recursionMax
+ idoublePrecision,
+ 1, // forceAscii
+ 0, // encodeHTMLChars
+ }};
+ JSONObjectEncoder *encoder = (JSONObjectEncoder *)&pyEncoder;
+
+ pyEncoder.npyCtxtPassthru = NULL;
+ pyEncoder.blkCtxtPassthru = NULL;
+ pyEncoder.npyType = -1;
+ pyEncoder.npyValue = NULL;
+ pyEncoder.datetimeIso = 0;
+ pyEncoder.datetimeUnit = NPY_FR_ms;
+ pyEncoder.outputFormat = COLUMNS;
+ pyEncoder.defaultHandler = 0;
+ pyEncoder.basicTypeContext.newObj = NULL;
+ pyEncoder.basicTypeContext.dictObj = NULL;
+ pyEncoder.basicTypeContext.itemValue = NULL;
+ pyEncoder.basicTypeContext.itemName = NULL;
+ pyEncoder.basicTypeContext.attrList = NULL;
+ pyEncoder.basicTypeContext.iterator = NULL;
+ pyEncoder.basicTypeContext.cStr = NULL;
+ pyEncoder.basicTypeContext.npyarr = NULL;
+ pyEncoder.basicTypeContext.rowLabels = NULL;
+ pyEncoder.basicTypeContext.columnLabels = NULL;
+
+ PRINTMARK();
+
+ if (!PyArg_ParseTupleAndKeywords(args, kwargs, "O|OiOssOO", kwlist, &oinput,
+ &oensureAscii, &idoublePrecision,
+ &oencodeHTMLChars, &sOrient, &sdateFormat,
+ &oisoDates, &odefHandler)) {
+ return NULL;
+ }
+
+ if (oensureAscii != NULL && !PyObject_IsTrue(oensureAscii)) {
+ encoder->forceASCII = 0;
+ }
+
+ if (oencodeHTMLChars != NULL && PyObject_IsTrue(oencodeHTMLChars)) {
+ encoder->encodeHTMLChars = 1;
+ }
+
+ if (idoublePrecision > JSON_DOUBLE_MAX_DECIMALS || idoublePrecision < 0) {
+ PyErr_Format(
+ PyExc_ValueError,
+ "Invalid value '%d' for option 'double_precision', max is '%u'",
+ idoublePrecision, JSON_DOUBLE_MAX_DECIMALS);
+ return NULL;
+ }
+ encoder->doublePrecision = idoublePrecision;
+
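+    /* Illustrative mapping (sketch): "records" encodes a DataFrame as an
+     * array of row objects, "index"/"columns" as nested objects keyed by
+     * row/column labels, "split" as an object with "columns", "index" and
+     * "data" entries, and "values" as bare nested arrays; "columns" is the
+     * default. */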
+ if (sOrient != NULL) {
+ if (strcmp(sOrient, "records") == 0) {
+ pyEncoder.outputFormat = RECORDS;
+ } else if (strcmp(sOrient, "index") == 0) {
+ pyEncoder.outputFormat = INDEX;
+ } else if (strcmp(sOrient, "split") == 0) {
+ pyEncoder.outputFormat = SPLIT;
+ } else if (strcmp(sOrient, "values") == 0) {
+ pyEncoder.outputFormat = VALUES;
+ } else if (strcmp(sOrient, "columns") != 0) {
+ PyErr_Format(PyExc_ValueError,
+ "Invalid value '%s' for option 'orient'", sOrient);
+ return NULL;
+ }
+ }
+
+ if (sdateFormat != NULL) {
+ if (strcmp(sdateFormat, "s") == 0) {
+ pyEncoder.datetimeUnit = NPY_FR_s;
+ } else if (strcmp(sdateFormat, "ms") == 0) {
+ pyEncoder.datetimeUnit = NPY_FR_ms;
+ } else if (strcmp(sdateFormat, "us") == 0) {
+ pyEncoder.datetimeUnit = NPY_FR_us;
+ } else if (strcmp(sdateFormat, "ns") == 0) {
+ pyEncoder.datetimeUnit = NPY_FR_ns;
+ } else {
+ PyErr_Format(PyExc_ValueError,
+ "Invalid value '%s' for option 'date_unit'",
+ sdateFormat);
+ return NULL;
+ }
+ }
+
+ if (oisoDates != NULL && PyObject_IsTrue(oisoDates)) {
+ pyEncoder.datetimeIso = 1;
+ }
+
+ if (odefHandler != NULL && odefHandler != Py_None) {
+ if (!PyCallable_Check(odefHandler)) {
+ PyErr_SetString(PyExc_TypeError, "Default handler is not callable");
+ return NULL;
+ }
+ pyEncoder.defaultHandler = odefHandler;
+ }
+
+ pyEncoder.originalOutputFormat = pyEncoder.outputFormat;
+ PRINTMARK();
+ ret = JSON_EncodeObject(oinput, encoder, buffer, sizeof(buffer));
+ PRINTMARK();
+
+ if (PyErr_Occurred()) {
+ PRINTMARK();
+ return NULL;
+ }
+
+ if (encoder->errorMsg) {
+ PRINTMARK();
+ if (ret != buffer) {
+ encoder->free(ret);
+ }
+
+ PyErr_Format(PyExc_OverflowError, "%s", encoder->errorMsg);
+ return NULL;
+ }
+
+ newobj = PyString_FromString(ret);
+
+ if (ret != buffer) {
+ encoder->free(ret);
+ }
+
+ PRINTMARK();
+
+ return newobj;
+}
+
+PyObject *objToJSONFile(PyObject *self, PyObject *args, PyObject *kwargs) {
+ PyObject *data;
+ PyObject *file;
+ PyObject *string;
+ PyObject *write;
+ PyObject *argtuple;
+
+ PRINTMARK();
+
+ if (!PyArg_ParseTuple(args, "OO", &data, &file)) {
+ return NULL;
+ }
+
+ if (!PyObject_HasAttrString(file, "write")) {
+ PyErr_Format(PyExc_TypeError, "expected file");
+ return NULL;
+ }
+
+ write = PyObject_GetAttrString(file, "write");
+
+ if (!PyCallable_Check(write)) {
+ Py_XDECREF(write);
+ PyErr_Format(PyExc_TypeError, "expected file");
+ return NULL;
+ }
+
+ argtuple = PyTuple_Pack(1, data);
+
+ string = objToJSON(self, argtuple, kwargs);
+
+ if (string == NULL) {
+ Py_XDECREF(write);
+ Py_XDECREF(argtuple);
+ return NULL;
+ }
+
+ Py_XDECREF(argtuple);
+
+ argtuple = PyTuple_Pack(1, string);
+ if (argtuple == NULL) {
+ Py_XDECREF(write);
+ return NULL;
+ }
+ if (PyObject_CallObject(write, argtuple) == NULL) {
+ Py_XDECREF(write);
+ Py_XDECREF(argtuple);
+ return NULL;
+ }
+
+ Py_XDECREF(write);
+ Py_DECREF(argtuple);
+ Py_XDECREF(string);
+
+ PRINTMARK();
+
+ Py_RETURN_NONE;
+}
diff --git a/contrib/python/pandas/py2/pandas/_libs/src/ujson/python/py_defines.h b/contrib/python/pandas/py2/pandas/_libs/src/ujson/python/py_defines.h
new file mode 100644
index 00000000000..82385fdd48a
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/_libs/src/ujson/python/py_defines.h
@@ -0,0 +1,58 @@
+/*
+Copyright (c) 2011-2013, ESN Social Software AB and Jonas Tarnstrom
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ * Neither the name of the ESN Social Software AB nor the
+ names of its contributors may be used to endorse or promote products
+ derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL ESN SOCIAL SOFTWARE AB OR JONAS TARNSTROM BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+
+Portions of code from MODP_ASCII - Ascii transformations (upper/lower, etc)
+https://github.com/client9/stringencoders
+Copyright (c) 2007 Nick Galbreath -- nickg [at] modp [dot] com. All rights reserved.
+
+Numeric decoder derived from the TCL library
+http://www.opensource.apple.com/source/tcl/tcl-14/tcl/license.terms
+ * Copyright (c) 1988-1993 The Regents of the University of California.
+ * Copyright (c) 1994 Sun Microsystems, Inc.
+*/
+
+#ifndef PANDAS__LIBS_SRC_UJSON_PYTHON_PY_DEFINES_H_
+#define PANDAS__LIBS_SRC_UJSON_PYTHON_PY_DEFINES_H_
+
+#include <Python.h>
+
+#if PY_MAJOR_VERSION >= 3
+
+#define PyInt_Check PyLong_Check
+#define PyInt_AS_LONG PyLong_AsLong
+#define PyInt_FromLong PyLong_FromLong
+#define PyInt_FromSsize_t PyLong_FromSsize_t
+
+#define PyString_Check PyBytes_Check
+#define PyString_GET_SIZE PyBytes_GET_SIZE
+#define PyString_AS_STRING PyBytes_AS_STRING
+
+#define PyString_FromString PyUnicode_FromString
+
+#endif
+
+#endif // PANDAS__LIBS_SRC_UJSON_PYTHON_PY_DEFINES_H_
diff --git a/contrib/python/pandas/py2/pandas/_libs/src/ujson/python/ujson.c b/contrib/python/pandas/py2/pandas/_libs/src/ujson/python/ujson.c
new file mode 100644
index 00000000000..da19afab030
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/_libs/src/ujson/python/ujson.c
@@ -0,0 +1,122 @@
+/*
+Copyright (c) 2011-2013, ESN Social Software AB and Jonas Tarnstrom
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+* Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+* Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+* Neither the name of the ESN Social Software AB nor the
+names of its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL ESN SOCIAL SOFTWARE AB OR JONAS TARNSTROM BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+
+Portions of code from MODP_ASCII - Ascii transformations (upper/lower, etc)
+https://github.com/client9/stringencoders
+Copyright (c) 2007 Nick Galbreath -- nickg [at] modp [dot] com. All rights reserved.
+
+Numeric decoder derived from the TCL library
+http://www.opensource.apple.com/source/tcl/tcl-14/tcl/license.terms
+* Copyright (c) 1988-1993 The Regents of the University of California.
+* Copyright (c) 1994 Sun Microsystems, Inc.
+*/
+
+#include "py_defines.h"
+#include "version.h"
+
+/* objToJSON */
+PyObject *objToJSON(PyObject *self, PyObject *args, PyObject *kwargs);
+void initObjToJSON(void);
+
+/* JSONToObj */
+PyObject *JSONToObj(PyObject *self, PyObject *args, PyObject *kwargs);
+
+/* objToJSONFile */
+PyObject *objToJSONFile(PyObject *self, PyObject *args, PyObject *kwargs);
+
+/* JSONFileToObj */
+PyObject *JSONFileToObj(PyObject *self, PyObject *args, PyObject *kwargs);
+
+#define ENCODER_HELP_TEXT \
+ "Use ensure_ascii=false to output UTF-8. Pass in double_precision to " \
+ "alter the maximum digit precision of doubles. Set " \
+ "encode_html_chars=True to encode < > & as unicode escape sequences."
+
+static PyMethodDef ujsonMethods[] = {
+ {"encode", (PyCFunction)objToJSON, METH_VARARGS | METH_KEYWORDS,
+ "Converts arbitrary object recursively into JSON. " ENCODER_HELP_TEXT},
+ {"decode", (PyCFunction)JSONToObj, METH_VARARGS | METH_KEYWORDS,
+ "Converts JSON as string to dict object structure. Use precise_float=True "
+ "to use high precision float decoder."},
+ {"dumps", (PyCFunction)objToJSON, METH_VARARGS | METH_KEYWORDS,
+ "Converts arbitrary object recursively into JSON. " ENCODER_HELP_TEXT},
+ {"loads", (PyCFunction)JSONToObj, METH_VARARGS | METH_KEYWORDS,
+ "Converts JSON as string to dict object structure. Use precise_float=True "
+ "to use high precision float decoder."},
+ {"dump", (PyCFunction)objToJSONFile, METH_VARARGS | METH_KEYWORDS,
+ "Converts arbitrary object recursively into JSON "
+ "file. " ENCODER_HELP_TEXT},
+ {"load", (PyCFunction)JSONFileToObj, METH_VARARGS | METH_KEYWORDS,
+ "Converts JSON as file to dict object structure. Use precise_float=True "
+ "to use high precision float decoder."},
+ {NULL, NULL, 0, NULL} /* Sentinel */
+};
+
+#if PY_MAJOR_VERSION >= 3
+
+static struct PyModuleDef moduledef = {
+ PyModuleDef_HEAD_INIT,
+ "_libjson",
+ 0, /* m_doc */
+ -1, /* m_size */
+ ujsonMethods, /* m_methods */
+ NULL, /* m_reload */
+ NULL, /* m_traverse */
+ NULL, /* m_clear */
+ NULL /* m_free */
+};
+
+#define PYMODINITFUNC PyMODINIT_FUNC PyInit_json(void)
+#define PYMODULE_CREATE() PyModule_Create(&moduledef)
+#define MODINITERROR return NULL
+
+#else
+
+#define PYMODINITFUNC PyMODINIT_FUNC initjson(void)
+#define PYMODULE_CREATE() Py_InitModule("json", ujsonMethods)
+#define MODINITERROR return
+
+#endif
+
+PYMODINITFUNC {
+ PyObject *module;
+ PyObject *version_string;
+
+ initObjToJSON();
+ module = PYMODULE_CREATE();
+
+ if (module == NULL) {
+ MODINITERROR;
+ }
+
+ version_string = PyString_FromString(UJSON_VERSION);
+ PyModule_AddObject(module, "__version__", version_string);
+
+#if PY_MAJOR_VERSION >= 3
+ return module;
+#endif
+}
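+
+/* Illustrative usage sketch, assuming the built extension is importable as
+ * pandas._libs.json (not guaranteed by this file alone):
+ *
+ *     import pandas._libs.json as ujson
+ *     ujson.dumps({"a": [1, 2, 3]})           # -> '{"a":[1,2,3]}'
+ *     ujson.dumps(frame, orient="records")    # `frame`: any DataFrame
+ */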
diff --git a/contrib/python/pandas/py2/pandas/_libs/src/ujson/python/version.h b/contrib/python/pandas/py2/pandas/_libs/src/ujson/python/version.h
new file mode 100644
index 00000000000..ef6d28bf3a1
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/_libs/src/ujson/python/version.h
@@ -0,0 +1,43 @@
+/*
+Copyright (c) 2011-2013, ESN Social Software AB and Jonas Tarnstrom
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ * Neither the name of the ESN Social Software AB nor the
+ names of its contributors may be used to endorse or promote products
+ derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL ESN SOCIAL SOFTWARE AB OR JONAS TARNSTROM BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+
+Portions of code from MODP_ASCII - Ascii transformations (upper/lower, etc)
+https://github.com/client9/stringencoders
+Copyright (c) 2007 Nick Galbreath -- nickg [at] modp [dot] com. All rights reserved.
+
+Numeric decoder derived from the TCL library
+http://www.opensource.apple.com/source/tcl/tcl-14/tcl/license.terms
+ * Copyright (c) 1988-1993 The Regents of the University of California.
+ * Copyright (c) 1994 Sun Microsystems, Inc.
+*/
+
+#ifndef PANDAS__LIBS_SRC_UJSON_PYTHON_VERSION_H_
+#define PANDAS__LIBS_SRC_UJSON_PYTHON_VERSION_H_
+
+#define UJSON_VERSION "1.33"
+
+#endif // PANDAS__LIBS_SRC_UJSON_PYTHON_VERSION_H_
diff --git a/contrib/python/pandas/py2/pandas/_libs/testing.pyx b/contrib/python/pandas/py2/pandas/_libs/testing.pyx
new file mode 100644
index 00000000000..10f68187938
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/_libs/testing.pyx
@@ -0,0 +1,220 @@
+import numpy as np
+
+from pandas import compat
+from pandas.core.dtypes.missing import isna, array_equivalent
+from pandas.core.dtypes.common import is_dtype_equal
+
+cdef NUMERIC_TYPES = (
+ bool,
+ int,
+ float,
+ np.bool,
+ np.int8,
+ np.int16,
+ np.int32,
+ np.int64,
+ np.uint8,
+ np.uint16,
+ np.uint32,
+ np.uint64,
+ np.float16,
+ np.float32,
+ np.float64,
+)
+
+
+cdef bint is_comparable_as_number(obj):
+ return isinstance(obj, NUMERIC_TYPES)
+
+
+cdef bint isiterable(obj):
+ return hasattr(obj, '__iter__')
+
+
+cdef bint has_length(obj):
+ return hasattr(obj, '__len__')
+
+
+cdef bint is_dictlike(obj):
+ return hasattr(obj, 'keys') and hasattr(obj, '__getitem__')
+
+
+cdef bint decimal_almost_equal(double desired, double actual, int decimal):
+ # Code from
+ # http://docs.scipy.org/doc/numpy/reference/generated
+ # /numpy.testing.assert_almost_equal.html
+ return abs(desired - actual) < (0.5 * 10.0 ** -decimal)
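+    # For example, with decimal=5 the tolerance is 0.5 * 10**-5 = 5e-6, so a
+    # desired/actual gap of 1e-6 passes while a gap of 1e-5 does not.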
+
+
+cpdef assert_dict_equal(a, b, bint compare_keys=True):
+ assert is_dictlike(a) and is_dictlike(b), (
+ "Cannot compare dict objects, one or both is not dict-like"
+ )
+
+ a_keys = frozenset(a.keys())
+ b_keys = frozenset(b.keys())
+
+ if compare_keys:
+ assert a_keys == b_keys
+
+ for k in a_keys:
+ assert_almost_equal(a[k], b[k])
+
+ return True
+
+
+cpdef assert_almost_equal(a, b,
+ check_less_precise=False,
+ bint check_dtype=True,
+ obj=None, lobj=None, robj=None):
+ """Check that left and right objects are almost equal.
+
+ Parameters
+ ----------
+ a : object
+ b : object
+ check_less_precise : bool or int, default False
+ Specify comparison precision.
+ 5 digits (False) or 3 digits (True) after decimal points are
+ compared. If an integer, then this will be the number of decimal
+ points to compare
+ check_dtype: bool, default True
+ check dtype if both a and b are np.ndarray
+ obj : str, default None
+ Specify object name being compared, internally used to show
+ appropriate assertion message
+ lobj : str, default None
+ Specify left object name being compared, internally used to show
+ appropriate assertion message
+ robj : str, default None
+ Specify right object name being compared, internally used to show
+ appropriate assertion message
+ """
+
+ cdef:
+ int decimal
+ double diff = 0.0
+ Py_ssize_t i, na, nb
+ double fa, fb
+ bint is_unequal = False, a_is_ndarray, b_is_ndarray
+
+ if lobj is None:
+ lobj = a
+ if robj is None:
+ robj = b
+
+ assert isinstance(check_less_precise, (int, bool))
+
+ if isinstance(a, dict) or isinstance(b, dict):
+ return assert_dict_equal(a, b)
+
+ if (isinstance(a, compat.string_types) or
+ isinstance(b, compat.string_types)):
+ assert a == b, "%r != %r" % (a, b)
+ return True
+
+ a_is_ndarray = isinstance(a, np.ndarray)
+ b_is_ndarray = isinstance(b, np.ndarray)
+
+ if obj is None:
+ if a_is_ndarray or b_is_ndarray:
+ obj = 'numpy array'
+ else:
+ obj = 'Iterable'
+
+ if isiterable(a):
+
+ if not isiterable(b):
+ from pandas.util.testing import assert_class_equal
+            # the classes cannot match here, so this raises an informative error
+ assert_class_equal(a, b, obj=obj)
+
+ assert has_length(a) and has_length(b), (
+ "Can't compare objects without length, one or both is invalid: "
+ "(%r, %r)" % (a, b))
+
+ if a_is_ndarray and b_is_ndarray:
+ na, nb = a.size, b.size
+ if a.shape != b.shape:
+ from pandas.util.testing import raise_assert_detail
+ raise_assert_detail(
+ obj, '{0} shapes are different'.format(obj),
+ a.shape, b.shape)
+
+ if check_dtype and not is_dtype_equal(a, b):
+ from pandas.util.testing import assert_attr_equal
+ assert_attr_equal('dtype', a, b, obj=obj)
+
+ try:
+ if array_equivalent(a, b, strict_nan=True):
+ return True
+ except:
+ pass
+ else:
+ na, nb = len(a), len(b)
+
+ if na != nb:
+ from pandas.util.testing import raise_assert_detail
+
+ # if we have a small diff set, print it
+ if abs(na - nb) < 10:
+ r = list(set(a) ^ set(b))
+ else:
+ r = None
+
+ raise_assert_detail(obj, '{0} length are different'.format(obj),
+ na, nb, r)
+
+ for i in xrange(len(a)):
+ try:
+ assert_almost_equal(a[i], b[i],
+ check_less_precise=check_less_precise)
+ except AssertionError:
+ is_unequal = True
+ diff += 1
+
+ if is_unequal:
+ from pandas.util.testing import raise_assert_detail
+ msg = '{0} values are different ({1} %)'.format(
+ obj, np.round(diff * 100.0 / na, 5))
+ raise_assert_detail(obj, msg, lobj, robj)
+
+ return True
+
+ elif isiterable(b):
+ from pandas.util.testing import assert_class_equal
+        # the classes cannot match here, so this raises an informative error
+ assert_class_equal(a, b, obj=obj)
+
+ if a == b:
+ # object comparison
+ return True
+ if isna(a) and isna(b):
+ # nan / None comparison
+ return True
+ if is_comparable_as_number(a) and is_comparable_as_number(b):
+ if array_equivalent(a, b, strict_nan=True):
+ # inf comparison
+ return True
+
+ if check_less_precise is True:
+ decimal = 3
+ elif check_less_precise is False:
+ decimal = 5
+ else:
+ decimal = check_less_precise
+
+ fa, fb = a, b
+
+ # case for zero
+ if abs(fa) < 1e-5:
+ if not decimal_almost_equal(fa, fb, decimal):
+ assert False, ('(very low values) expected %.5f but '
+ 'got %.5f, with decimal %d' % (fb, fa, decimal))
+ else:
+ if not decimal_almost_equal(1, fb / fa, decimal):
+ assert False, ('expected %.5f but got %.5f, '
+ 'with decimal %d' % (fb, fa, decimal))
+ return True
+
+ raise AssertionError("{0} != {1}".format(a, b))
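+
+
+# Illustrative usage sketch (these helpers back pandas.util.testing but can
+# also be called directly):
+#
+#   from pandas._libs.testing import assert_almost_equal
+#   assert_almost_equal(0.123456, 0.1234562)                        # 5 decimals
+#   assert_almost_equal(0.12340, 0.12343, check_less_precise=True)  # 3 decimals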
diff --git a/contrib/python/pandas/py2/pandas/_libs/tslib.pyx b/contrib/python/pandas/py2/pandas/_libs/tslib.pyx
new file mode 100644
index 00000000000..798e338d558
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/_libs/tslib.pyx
@@ -0,0 +1,828 @@
+# -*- coding: utf-8 -*-
+import cython
+
+from cpython.datetime cimport (PyDateTime_Check, PyDate_Check,
+ PyDateTime_CheckExact,
+ PyDateTime_IMPORT,
+ timedelta, datetime, date, time)
+# import datetime C API
+PyDateTime_IMPORT
+
+
+cimport numpy as cnp
+from numpy cimport int64_t, ndarray, float64_t
+import numpy as np
+cnp.import_array()
+
+import pytz
+
+from pandas._libs.util cimport (
+ is_integer_object, is_float_object, is_string_object, is_datetime64_object)
+
+
+from pandas._libs.tslibs.np_datetime cimport (
+ check_dts_bounds, npy_datetimestruct, _string_to_dts, dt64_to_dtstruct,
+ dtstruct_to_dt64, pydatetime_to_dt64, pydate_to_dt64, get_datetime64_value)
+from pandas._libs.tslibs.np_datetime import OutOfBoundsDatetime
+
+from pandas._libs.tslibs.parsing import parse_datetime_string
+
+from pandas._libs.tslibs.timedeltas cimport cast_from_unit
+from pandas._libs.tslibs.timezones cimport is_utc, is_tzlocal, get_dst_info
+from pandas._libs.tslibs.timezones import UTC
+from pandas._libs.tslibs.conversion cimport (
+ tz_convert_single, _TSObject, convert_datetime_to_tsobject,
+ get_datetime64_nanos, tz_convert_utc_to_tzlocal)
+
+# many modules still look for NaT and iNaT here despite them not being needed
+from pandas._libs.tslibs.nattype import nat_strings, iNaT # noqa:F821
+from pandas._libs.tslibs.nattype cimport (
+ checknull_with_nat, NPY_NAT, c_NaT as NaT)
+
+from pandas._libs.tslibs.offsets cimport to_offset
+
+from pandas._libs.tslibs.timestamps cimport create_timestamp_from_ts
+from pandas._libs.tslibs.timestamps import Timestamp
+
+
+cdef bint PY2 = str == bytes
+
+
+cdef inline object create_datetime_from_ts(
+ int64_t value, npy_datetimestruct dts,
+ object tz, object freq):
+ """ convenience routine to construct a datetime.datetime from its parts """
+ return datetime(dts.year, dts.month, dts.day, dts.hour,
+ dts.min, dts.sec, dts.us, tz)
+
+
+cdef inline object create_date_from_ts(
+ int64_t value, npy_datetimestruct dts,
+ object tz, object freq):
+ """ convenience routine to construct a datetime.date from its parts """
+ return date(dts.year, dts.month, dts.day)
+
+
+cdef inline object create_time_from_ts(
+ int64_t value, npy_datetimestruct dts,
+ object tz, object freq):
+ """ convenience routine to construct a datetime.time from its parts """
+ return time(dts.hour, dts.min, dts.sec, dts.us, tz)
+
+
+def ints_to_pydatetime(int64_t[:] arr, object tz=None, object freq=None,
+ str box="datetime"):
+ """
+ Convert an i8 repr to an ndarray of datetimes, date, time or Timestamp
+
+ Parameters
+ ----------
+ arr : array of i8
+ tz : str, default None
+ convert to this timezone
+ freq : str/Offset, default None
+ freq to convert
+ box : {'datetime', 'timestamp', 'date', 'time'}, default 'datetime'
+ If datetime, convert to datetime.datetime
+ If date, convert to datetime.date
+ If time, convert to datetime.time
+ If Timestamp, convert to pandas.Timestamp
+
+ Returns
+ -------
+ result : array of dtype specified by box
+ """
+
+ cdef:
+ Py_ssize_t i, n = len(arr)
+ ndarray[int64_t] trans
+ int64_t[:] deltas
+ Py_ssize_t pos
+ npy_datetimestruct dts
+ object dt, new_tz
+ str typ
+ int64_t value, delta, local_value
+ ndarray[object] result = np.empty(n, dtype=object)
+ object (*func_create)(int64_t, npy_datetimestruct, object, object)
+
+ if box == "date":
+ assert (tz is None), "tz should be None when converting to date"
+
+ func_create = create_date_from_ts
+ elif box == "timestamp":
+ func_create = create_timestamp_from_ts
+
+ if is_string_object(freq):
+ freq = to_offset(freq)
+ elif box == "time":
+ func_create = create_time_from_ts
+ elif box == "datetime":
+ func_create = create_datetime_from_ts
+ else:
+ raise ValueError("box must be one of 'datetime', 'date', 'time' or"
+ " 'timestamp'")
+
+ if is_utc(tz) or tz is None:
+ for i in range(n):
+ value = arr[i]
+ if value == NPY_NAT:
+ result[i] = NaT
+ else:
+ dt64_to_dtstruct(value, &dts)
+ result[i] = func_create(value, dts, tz, freq)
+ elif is_tzlocal(tz):
+ for i in range(n):
+ value = arr[i]
+ if value == NPY_NAT:
+ result[i] = NaT
+ else:
+ # Python datetime objects do not support nanosecond
+ # resolution (yet, PEP 564). Need to compute new value
+ # using the i8 representation.
+ local_value = tz_convert_utc_to_tzlocal(value, tz)
+ dt64_to_dtstruct(local_value, &dts)
+ result[i] = func_create(value, dts, tz, freq)
+ else:
+ trans, deltas, typ = get_dst_info(tz)
+
+ if typ not in ['pytz', 'dateutil']:
+            # static/fixed; in this case we know that len(deltas) == 1
+ delta = deltas[0]
+ for i in range(n):
+ value = arr[i]
+ if value == NPY_NAT:
+ result[i] = NaT
+ else:
+ # Adjust datetime64 timestamp, recompute datetimestruct
+ dt64_to_dtstruct(value + delta, &dts)
+ result[i] = func_create(value, dts, tz, freq)
+
+ elif typ == 'dateutil':
+ # no zone-name change for dateutil tzs - dst etc
+ # represented in single object.
+ for i in range(n):
+ value = arr[i]
+ if value == NPY_NAT:
+ result[i] = NaT
+ else:
+ # Adjust datetime64 timestamp, recompute datetimestruct
+ pos = trans.searchsorted(value, side='right') - 1
+ dt64_to_dtstruct(value + deltas[pos], &dts)
+ result[i] = func_create(value, dts, tz, freq)
+ else:
+ # pytz
+ for i in range(n):
+ value = arr[i]
+ if value == NPY_NAT:
+ result[i] = NaT
+ else:
+ # Adjust datetime64 timestamp, recompute datetimestruct
+ pos = trans.searchsorted(value, side='right') - 1
+ # find right representation of dst etc in pytz timezone
+ new_tz = tz._tzinfos[tz._transition_info[pos]]
+
+ dt64_to_dtstruct(value + deltas[pos], &dts)
+ result[i] = func_create(value, dts, new_tz, freq)
+
+ return result
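+    # Illustrative usage sketch:
+    #
+    #   stamps = np.array([0, 86400 * 10**9], dtype='i8')
+    #   ints_to_pydatetime(stamps)
+    #   # -> array([datetime(1970, 1, 1, 0, 0),
+    #   #           datetime(1970, 1, 2, 0, 0)], dtype=object)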
+
+
+def _test_parse_iso8601(object ts):
+ """
+ TESTING ONLY: Parse string into Timestamp using iso8601 parser. Used
+    only for testing; actual construction uses `convert_str_to_tsobject`.
+ """
+ cdef:
+ _TSObject obj
+ int out_local = 0, out_tzoffset = 0
+
+ obj = _TSObject()
+
+ if ts == 'now':
+ return Timestamp.utcnow()
+ elif ts == 'today':
+ return Timestamp.now().normalize()
+
+ _string_to_dts(ts, &obj.dts, &out_local, &out_tzoffset)
+ obj.value = dtstruct_to_dt64(&obj.dts)
+ check_dts_bounds(&obj.dts)
+ if out_local == 1:
+ obj.tzinfo = pytz.FixedOffset(out_tzoffset)
+ obj.value = tz_convert_single(obj.value, obj.tzinfo, UTC)
+ return Timestamp(obj.value, tz=obj.tzinfo)
+ else:
+ return Timestamp(obj.value)
+
+
+def format_array_from_datetime(ndarray[int64_t] values, object tz=None,
+ object format=None, object na_rep=None):
+ """
+ return a np object array of the string formatted values
+
+ Parameters
+ ----------
+ values : a 1-d i8 array
+ tz : the timezone (or None)
+ format : optional, default is None
+ a strftime capable string
+ na_rep : optional, default is None
+ a nat format
+
+ """
+ cdef:
+ int64_t val, ns, N = len(values)
+ ndarray[int64_t] consider_values
+ bint show_ms = 0, show_us = 0, show_ns = 0, basic_format = 0
+ ndarray[object] result = np.empty(N, dtype=object)
+ object ts, res
+ npy_datetimestruct dts
+
+ if na_rep is None:
+ na_rep = 'NaT'
+
+    # if we have neither a format nor a tz, then choose
+    # a format based on precision
+ basic_format = format is None and tz is None
+ if basic_format:
+ consider_values = values[values != NPY_NAT]
+ show_ns = (consider_values % 1000).any()
+
+ if not show_ns:
+ consider_values //= 1000
+ show_us = (consider_values % 1000).any()
+
+ if not show_ms:
+ consider_values //= 1000
+ show_ms = (consider_values % 1000).any()
+
+ for i in range(N):
+ val = values[i]
+
+ if val == NPY_NAT:
+ result[i] = na_rep
+ elif basic_format:
+
+ dt64_to_dtstruct(val, &dts)
+ res = '%d-%.2d-%.2d %.2d:%.2d:%.2d' % (dts.year,
+ dts.month,
+ dts.day,
+ dts.hour,
+ dts.min,
+ dts.sec)
+
+ if show_ns:
+ ns = dts.ps / 1000
+ res += '.%.9d' % (ns + 1000 * dts.us)
+ elif show_us:
+ res += '.%.6d' % dts.us
+ elif show_ms:
+                res += '.%.3d' % (dts.us / 1000)
+
+ result[i] = res
+
+ else:
+
+ ts = Timestamp(val, tz=tz)
+ if format is None:
+ result[i] = str(ts)
+ else:
+
+ # invalid format string
+ # requires dates > 1900
+ try:
+ result[i] = ts.strftime(format)
+ except ValueError:
+ result[i] = str(ts)
+
+ return result
+
+
+def array_with_unit_to_datetime(ndarray values, object unit,
+ str errors='coerce'):
+ """
+ convert the ndarray according to the unit
+ if errors:
+ - raise: return converted values or raise OutOfBoundsDatetime
+ if out of range on the conversion or
+ ValueError for other conversions (e.g. a string)
+ - ignore: return non-convertible values as the same unit
+ - coerce: NaT for non-convertibles
+
+ """
+ cdef:
+ Py_ssize_t i, j, n=len(values)
+ int64_t m
+ ndarray[float64_t] fvalues
+ ndarray mask
+ bint is_ignore = errors=='ignore'
+ bint is_coerce = errors=='coerce'
+ bint is_raise = errors=='raise'
+ bint need_to_iterate = True
+ ndarray[int64_t] iresult
+ ndarray[object] oresult
+
+ assert is_ignore or is_coerce or is_raise
+
+ if unit == 'ns':
+ if issubclass(values.dtype.type, np.integer):
+ return values.astype('M8[ns]')
+ return array_to_datetime(values.astype(object), errors=errors)[0]
+
+ m = cast_from_unit(None, unit)
+
+ if is_raise:
+
+ # try a quick conversion to i8
+ # if we have nulls that are not type-compat
+ # then need to iterate
+ try:
+ iresult = values.astype('i8', casting='same_kind', copy=False)
+ mask = iresult == NPY_NAT
+ iresult[mask] = 0
+ fvalues = iresult.astype('f8') * m
+ need_to_iterate = False
+ except:
+ pass
+
+ # check the bounds
+ if not need_to_iterate:
+
+ if ((fvalues < Timestamp.min.value).any()
+ or (fvalues > Timestamp.max.value).any()):
+ raise OutOfBoundsDatetime("cannot convert input with unit "
+ "'{unit}'".format(unit=unit))
+ result = (iresult * m).astype('M8[ns]')
+ iresult = result.view('i8')
+ iresult[mask] = NPY_NAT
+ return result
+
+ result = np.empty(n, dtype='M8[ns]')
+ iresult = result.view('i8')
+
+ try:
+ for i in range(n):
+ val = values[i]
+
+ if checknull_with_nat(val):
+ iresult[i] = NPY_NAT
+
+ elif is_integer_object(val) or is_float_object(val):
+
+ if val != val or val == NPY_NAT:
+ iresult[i] = NPY_NAT
+ else:
+ try:
+ iresult[i] = cast_from_unit(val, unit)
+ except OverflowError:
+ if is_raise:
+ raise OutOfBoundsDatetime(
+ "cannot convert input {val} with the unit "
+ "'{unit}'".format(val=val, unit=unit))
+ elif is_ignore:
+ raise AssertionError
+ iresult[i] = NPY_NAT
+
+ elif is_string_object(val):
+ if len(val) == 0 or val in nat_strings:
+ iresult[i] = NPY_NAT
+
+ else:
+ try:
+ iresult[i] = cast_from_unit(float(val), unit)
+ except ValueError:
+ if is_raise:
+ raise ValueError(
+ "non convertible value {val} with the unit "
+ "'{unit}'".format(val=val, unit=unit))
+ elif is_ignore:
+ raise AssertionError
+ iresult[i] = NPY_NAT
+ except:
+ if is_raise:
+ raise OutOfBoundsDatetime(
+ "cannot convert input {val} with the unit "
+ "'{unit}'".format(val=val, unit=unit))
+ elif is_ignore:
+ raise AssertionError
+ iresult[i] = NPY_NAT
+
+ else:
+
+ if is_raise:
+ raise ValueError("unit='{0}' not valid with non-numerical "
+ "val='{1}'".format(unit, val))
+ if is_ignore:
+ raise AssertionError
+
+ iresult[i] = NPY_NAT
+
+ return result
+
+ except AssertionError:
+ pass
+
+ # we have hit an exception
+ # and are in ignore mode
+ # redo as object
+
+ oresult = np.empty(n, dtype=object)
+ for i in range(n):
+ val = values[i]
+
+ if checknull_with_nat(val):
+ oresult[i] = NaT
+ elif is_integer_object(val) or is_float_object(val):
+
+ if val != val or val == NPY_NAT:
+ oresult[i] = NaT
+ else:
+ try:
+ oresult[i] = Timestamp(cast_from_unit(val, unit))
+ except:
+ oresult[i] = val
+
+ elif is_string_object(val):
+ if len(val) == 0 or val in nat_strings:
+ oresult[i] = NaT
+
+ else:
+ oresult[i] = val
+
+ return oresult
+
+
+cpdef array_to_datetime(ndarray[object] values, str errors='raise',
+ bint dayfirst=False, bint yearfirst=False,
+ object utc=None, bint require_iso8601=False):
+ """
+ Converts a 1D array of date-like values to a numpy array of either:
+ 1) datetime64[ns] data
+ 2) datetime.datetime objects, if OutOfBoundsDatetime or TypeError
+ is encountered
+
+ Also returns a pytz.FixedOffset if an array of strings with the same
+ timezone offset is passed and utc=True is not passed. Otherwise, None
+ is returned
+
+ Handles datetime.date, datetime.datetime, np.datetime64 objects, numeric,
+ strings
+
+ Parameters
+ ----------
+ values : ndarray of object
+ date-like objects to convert
+ errors : str, default 'raise'
+ error behavior when parsing
+ dayfirst : bool, default False
+ dayfirst parsing behavior when encountering datetime strings
+ yearfirst : bool, default False
+ yearfirst parsing behavior when encountering datetime strings
+ utc : bool, default None
+ indicator whether the dates should be UTC
+ require_iso8601 : bool, default False
+ indicator whether the datetime string should be iso8601
+
+ Returns
+ -------
+ tuple (ndarray, tzoffset)
+ """
+ cdef:
+ Py_ssize_t i, n = len(values)
+ object val, py_dt, tz, tz_out = None
+ ndarray[int64_t] iresult
+ ndarray[object] oresult
+ npy_datetimestruct dts
+ bint utc_convert = bool(utc)
+ bint seen_integer = 0
+ bint seen_string = 0
+ bint seen_datetime = 0
+ bint seen_datetime_offset = 0
+ bint is_raise = errors=='raise'
+ bint is_ignore = errors=='ignore'
+ bint is_coerce = errors=='coerce'
+ bint is_same_offsets
+ _TSObject _ts
+ int64_t value
+ int out_local=0, out_tzoffset=0
+ float offset_seconds, tz_offset
+ set out_tzoffset_vals = set()
+
+ # specify error conditions
+ assert is_raise or is_ignore or is_coerce
+
+ result = np.empty(n, dtype='M8[ns]')
+ iresult = result.view('i8')
+
+ try:
+ for i in range(n):
+ val = values[i]
+
+ try:
+ if checknull_with_nat(val):
+ iresult[i] = NPY_NAT
+
+ elif PyDateTime_Check(val):
+ seen_datetime = 1
+ if val.tzinfo is not None:
+ if utc_convert:
+ _ts = convert_datetime_to_tsobject(val, None)
+ iresult[i] = _ts.value
+ else:
+ raise ValueError('Tz-aware datetime.datetime '
+ 'cannot be converted to '
+ 'datetime64 unless utc=True')
+ else:
+ iresult[i] = pydatetime_to_dt64(val, &dts)
+ if not PyDateTime_CheckExact(val):
+ # i.e. a Timestamp object
+ iresult[i] += val.nanosecond
+ check_dts_bounds(&dts)
+
+ elif PyDate_Check(val):
+ seen_datetime = 1
+ iresult[i] = pydate_to_dt64(val, &dts)
+ check_dts_bounds(&dts)
+
+ elif is_datetime64_object(val):
+ seen_datetime = 1
+ iresult[i] = get_datetime64_nanos(val)
+
+ elif is_integer_object(val) or is_float_object(val):
+                    # these must be ns unit by definition
+ seen_integer = 1
+
+ if val != val or val == NPY_NAT:
+ iresult[i] = NPY_NAT
+ elif is_raise or is_ignore:
+ iresult[i] = val
+ else:
+ # coerce
+ # we now need to parse this as if unit='ns'
+ # we can ONLY accept integers at this point
+                        # if we have previously accepted (or in future
+                        # accept) datetimes/strings, then we must coerce
+ try:
+ iresult[i] = cast_from_unit(val, 'ns')
+ except:
+ iresult[i] = NPY_NAT
+
+ elif is_string_object(val):
+ # string
+ seen_string = 1
+
+ if len(val) == 0 or val in nat_strings:
+ iresult[i] = NPY_NAT
+ continue
+ if isinstance(val, unicode) and PY2:
+ val = val.encode('utf-8')
+
+ try:
+ _string_to_dts(val, &dts, &out_local, &out_tzoffset)
+ except ValueError:
+ # A ValueError at this point is a _parsing_ error
+ # specifically _not_ OutOfBoundsDatetime
+ if _parse_today_now(val, &iresult[i]):
+ continue
+ elif require_iso8601:
+ # if requiring iso8601 strings, skip trying
+ # other formats
+ if is_coerce:
+ iresult[i] = NPY_NAT
+ continue
+ elif is_raise:
+ raise ValueError("time data {val} doesn't "
+ "match format specified"
+ .format(val=val))
+ return values, tz_out
+
+ try:
+ py_dt = parse_datetime_string(val,
+ dayfirst=dayfirst,
+ yearfirst=yearfirst)
+ except Exception:
+ if is_coerce:
+ iresult[i] = NPY_NAT
+ continue
+ raise TypeError("invalid string coercion to "
+ "datetime")
+
+ # If the dateutil parser returned tzinfo, capture it
+ # to check if all arguments have the same tzinfo
+ tz = py_dt.utcoffset()
+ if tz is not None:
+ seen_datetime_offset = 1
+ # dateutil timezone objects cannot be hashed, so
+ # store the UTC offsets in seconds instead
+ out_tzoffset_vals.add(tz.total_seconds())
+ else:
+ # Add a marker for naive string, to track if we are
+ # parsing mixed naive and aware strings
+ out_tzoffset_vals.add('naive')
+
+ _ts = convert_datetime_to_tsobject(py_dt, None)
+ iresult[i] = _ts.value
+ except:
+ # TODO: What exception are we concerned with here?
+ if is_coerce:
+ iresult[i] = NPY_NAT
+ continue
+ raise
+ else:
+ # No error raised by string_to_dts, pick back up
+ # where we left off
+ value = dtstruct_to_dt64(&dts)
+ if out_local == 1:
+ seen_datetime_offset = 1
+ # Store the out_tzoffset in seconds
+ # since we store the total_seconds of
+ # dateutil.tz.tzoffset objects
+ out_tzoffset_vals.add(out_tzoffset * 60.)
+ tz = pytz.FixedOffset(out_tzoffset)
+ value = tz_convert_single(value, tz, UTC)
+ else:
+ # Add a marker for naive string, to track if we are
+ # parsing mixed naive and aware strings
+ out_tzoffset_vals.add('naive')
+ iresult[i] = value
+ check_dts_bounds(&dts)
+
+ else:
+ if is_coerce:
+ iresult[i] = NPY_NAT
+ else:
+ raise TypeError("{typ} is not convertible to datetime"
+ .format(typ=type(val)))
+
+ except OutOfBoundsDatetime:
+ if is_coerce:
+ iresult[i] = NPY_NAT
+ continue
+ elif require_iso8601 and is_string_object(val):
+ # GH#19382 for just-barely-OutOfBounds falling back to
+ # dateutil parser will return incorrect result because
+ # it will ignore nanoseconds
+ if is_raise:
+ raise ValueError("time data {val} doesn't "
+ "match format specified"
+ .format(val=val))
+ assert is_ignore
+ return values, tz_out
+ raise
+
+ except OutOfBoundsDatetime:
+ if is_raise:
+ raise
+
+ return ignore_errors_out_of_bounds_fallback(values), tz_out
+
+ except TypeError:
+ return array_to_datetime_object(values, is_raise, dayfirst, yearfirst)
+
+ if seen_datetime and seen_integer:
+ # we have mixed datetimes & integers
+
+ if is_coerce:
+ # coerce all of the integers/floats to NaT, preserve
+ # the datetimes and other convertibles
+ for i in range(n):
+ val = values[i]
+ if is_integer_object(val) or is_float_object(val):
+ result[i] = NPY_NAT
+ elif is_raise:
+ raise ValueError("mixed datetimes and integers in passed array")
+ else:
+ return array_to_datetime_object(values, is_raise,
+ dayfirst, yearfirst)
+
+ if seen_datetime_offset and not utc_convert:
+ # GH#17697
+ # 1) If all the offsets are equal, return one offset for
+ # the parsed dates to (maybe) pass to DatetimeIndex
+ # 2) If the offsets are different, then force the parsing down the
+ # object path where an array of datetimes
+ # (with individual dateutil.tzoffsets) are returned
+ is_same_offsets = len(out_tzoffset_vals) == 1
+ if not is_same_offsets:
+ return array_to_datetime_object(values, is_raise,
+ dayfirst, yearfirst)
+ else:
+ tz_offset = out_tzoffset_vals.pop()
+ tz_out = pytz.FixedOffset(tz_offset / 60.)
+ return result, tz_out
+
+
+cdef inline ignore_errors_out_of_bounds_fallback(ndarray[object] values):
+ """
+ Fallback for array_to_datetime if an OutOfBoundsDatetime is raised
+ and errors == "ignore"
+
+ Parameters
+ ----------
+ values : ndarray[object]
+
+ Returns
+ -------
+ ndarray[object]
+ """
+ cdef:
+ Py_ssize_t i, n = len(values)
+ object val
+
+ oresult = np.empty(n, dtype=object)
+
+ for i in range(n):
+ val = values[i]
+
+        # set as NaN except if it's a NaT
+ if checknull_with_nat(val):
+ if isinstance(val, float):
+ oresult[i] = np.nan
+ else:
+ oresult[i] = NaT
+ elif is_datetime64_object(val):
+ if get_datetime64_value(val) == NPY_NAT:
+ oresult[i] = NaT
+ else:
+ oresult[i] = val.item()
+ else:
+ oresult[i] = val
+ return oresult
+
+
+cdef array_to_datetime_object(ndarray[object] values, bint is_raise,
+ bint dayfirst=False, bint yearfirst=False):
+ """
+ Fall back function for array_to_datetime
+
+ Attempts to parse datetime strings with dateutil to return an array
+ of datetime objects
+
+ Parameters
+ ----------
+ values : ndarray of object
+ date-like objects to convert
+ is_raise : bool
+ error behavior when parsing
+ dayfirst : bool, default False
+ dayfirst parsing behavior when encountering datetime strings
+ yearfirst : bool, default False
+ yearfirst parsing behavior when encountering datetime strings
+
+ Returns
+ -------
+ tuple (ndarray, None)
+ """
+ cdef:
+ Py_ssize_t i, n = len(values)
+ object val,
+ ndarray[object] oresult
+ npy_datetimestruct dts
+
+ oresult = np.empty(n, dtype=object)
+
+ # We return an object array and only attempt to parse:
+ # 1) NaT or NaT-like values
+ # 2) datetime strings, which we return as datetime.datetime
+ for i in range(n):
+ val = values[i]
+ if checknull_with_nat(val):
+ oresult[i] = val
+ elif is_string_object(val):
+ if len(val) == 0 or val in nat_strings:
+ oresult[i] = 'NaT'
+ continue
+ try:
+ oresult[i] = parse_datetime_string(val, dayfirst=dayfirst,
+ yearfirst=yearfirst)
+ pydatetime_to_dt64(oresult[i], &dts)
+ check_dts_bounds(&dts)
+ except (ValueError, OverflowError):
+ if is_raise:
+ raise
+ return values, None
+ else:
+ if is_raise:
+ raise
+ return values, None
+ return oresult, None
+
+
+cdef inline bint _parse_today_now(str val, int64_t* iresult):
+ # We delay this check for as long as possible
+ # because it catches relatively rare cases
+ if val == 'now':
+ # Note: this is *not* the same as Timestamp('now')
+ iresult[0] = Timestamp.utcnow().value
+ return True
+ elif val == 'today':
+ iresult[0] = Timestamp.today().value
+ return True
+ return False
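+
+
+# Illustrative usage sketch (array_to_datetime is normally reached through
+# pandas.to_datetime):
+#
+#   values = np.array(['2019-01-01', '2019-01-02 03:04:05'], dtype=object)
+#   result, tz = array_to_datetime(values)
+#   # result is a datetime64[ns] array; tz is None for naive inputs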
diff --git a/contrib/python/pandas/py2/pandas/_libs/tslibs/__init__.py b/contrib/python/pandas/py2/pandas/_libs/tslibs/__init__.py
new file mode 100644
index 00000000000..38401cab57f
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/_libs/tslibs/__init__.py
@@ -0,0 +1,9 @@
+# -*- coding: utf-8 -*-
+# flake8: noqa
+
+from .conversion import normalize_date, localize_pydatetime, tz_convert_single
+from .nattype import NaT, iNaT, is_null_datetimelike
+from .np_datetime import OutOfBoundsDatetime
+from .period import Period, IncompatibleFrequency
+from .timestamps import Timestamp
+from .timedeltas import delta_to_nanoseconds, ints_to_pytimedelta, Timedelta
diff --git a/contrib/python/pandas/py2/pandas/_libs/tslibs/ccalendar.pxd b/contrib/python/pandas/py2/pandas/_libs/tslibs/ccalendar.pxd
new file mode 100644
index 00000000000..08f539a70a7
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/_libs/tslibs/ccalendar.pxd
@@ -0,0 +1,12 @@
+# -*- coding: utf-8 -*-
+
+from cython cimport Py_ssize_t
+
+from numpy cimport int64_t, int32_t
+
+
+cdef int dayofweek(int y, int m, int d) nogil
+cdef bint is_leapyear(int64_t year) nogil
+cpdef int32_t get_days_in_month(int year, Py_ssize_t month) nogil
+cpdef int32_t get_week_of_year(int year, int month, int day) nogil
+cpdef int32_t get_day_of_year(int year, int month, int day) nogil
diff --git a/contrib/python/pandas/py2/pandas/_libs/tslibs/ccalendar.pyx b/contrib/python/pandas/py2/pandas/_libs/tslibs/ccalendar.pyx
new file mode 100644
index 00000000000..c48812acd3d
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/_libs/tslibs/ccalendar.pyx
@@ -0,0 +1,226 @@
+# -*- coding: utf-8 -*-
+# cython: boundscheck=False
+"""
+Cython implementations of functions resembling the stdlib calendar module
+"""
+
+import cython
+
+from numpy cimport int64_t, int32_t
+
+from locale import LC_TIME
+from pandas._libs.tslibs.strptime import LocaleTime
+
+# ----------------------------------------------------------------------
+# Constants
+
+# Slightly more performant cython lookups than a 2D table
+# The first 12 entries correspond to month lengths for non-leap years.
+# The remaining 12 entries give month lengths for leap years
+cdef int32_t* days_per_month_array = [
+ 31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31,
+ 31, 29, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31]
+
+cdef int* sakamoto_arr = [0, 3, 2, 5, 0, 3, 5, 1, 4, 6, 2, 4]
+
+# The first 13 entries give the month days elapsed as of the first of month N
+# (or the total number of days in the year for N=13) in non-leap years.
+# The remaining 13 entries give the days elapsed in leap years.
+cdef int32_t* _month_offset = [
+ 0, 31, 59, 90, 120, 151, 181, 212, 243, 273, 304, 334, 365,
+ 0, 31, 60, 91, 121, 152, 182, 213, 244, 274, 305, 335, 366]
+
+# Canonical location for other modules to find name constants
+MONTHS = ['JAN', 'FEB', 'MAR', 'APR', 'MAY', 'JUN', 'JUL',
+ 'AUG', 'SEP', 'OCT', 'NOV', 'DEC']
+# The first blank line is consistent with calendar.month_name in the calendar
+# standard library
+MONTHS_FULL = ['', 'January', 'February', 'March', 'April', 'May', 'June',
+ 'July', 'August', 'September', 'October', 'November',
+ 'December']
+MONTH_NUMBERS = {name: num for num, name in enumerate(MONTHS)}
+MONTH_ALIASES = {(num + 1): name for num, name in enumerate(MONTHS)}
+MONTH_TO_CAL_NUM = {name: num + 1 for num, name in enumerate(MONTHS)}
+
+DAYS = ['MON', 'TUE', 'WED', 'THU', 'FRI', 'SAT', 'SUN']
+DAYS_FULL = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday',
+ 'Saturday', 'Sunday']
+int_to_weekday = {num: name for num, name in enumerate(DAYS)}
+weekday_to_int = {int_to_weekday[key]: key for key in int_to_weekday}
+
+DAY_SECONDS = 86400
+HOUR_SECONDS = 3600
+
+# ----------------------------------------------------------------------
+
+
+cpdef int32_t get_days_in_month(int year, Py_ssize_t month) nogil:
+ """Return the number of days in the given month of the given year.
+
+ Parameters
+ ----------
+ year : int
+ month : int
+
+ Returns
+ -------
+ days_in_month : int
+
+ Notes
+ -----
+ Assumes that the arguments are valid. Passing a month not between 1 and 12
+ risks a segfault.
+ """
+ return days_per_month_array[12 * is_leapyear(year) + month - 1]
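+    # For example, 12 * is_leapyear(year) selects the leap-year half of the
+    # table, so get_days_in_month(2019, 2) == 28 and
+    # get_days_in_month(2020, 2) == 29.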
+
+
+cdef int dayofweek(int y, int m, int d) nogil:
+ """Find the day of week for the date described by the Y/M/D triple y, m, d
+ using Sakamoto's method, from wikipedia.
+
+ 0 represents Monday. See [1]_.
+
+ Parameters
+ ----------
+ y : int
+ m : int
+ d : int
+
+ Returns
+ -------
+ weekday : int
+
+ Notes
+ -----
+ Assumes that y, m, d, represents a valid date.
+
+ See Also
+ --------
+ [1] https://docs.python.org/3/library/calendar.html#calendar.weekday
+
+ [2] https://en.wikipedia.org/wiki/\
+ Determination_of_the_day_of_the_week#Sakamoto.27s_methods
+ """
+ cdef:
+ int day
+
+ y -= m < 3
+ day = (y + y / 4 - y / 100 + y / 400 + sakamoto_arr[m - 1] + d) % 7
+ # convert to python day
+ return (day + 6) % 7
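+    # For example, dayofweek(2019, 11, 10) == 6, i.e. Sunday under the
+    # Monday=0 convention used here.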
+
+
+cdef bint is_leapyear(int64_t year) nogil:
+ """Returns 1 if the given year is a leap year, 0 otherwise.
+
+ Parameters
+ ----------
+ year : int
+
+ Returns
+ -------
+ is_leap : bool
+ """
+ return ((year & 0x3) == 0 and # year % 4 == 0
+ ((year % 100) != 0 or (year % 400) == 0))
+
+
+cpdef int32_t get_week_of_year(int year, int month, int day) nogil:
+ """Return the ordinal week-of-year for the given day.
+
+ Parameters
+ ----------
+ year : int
+ month : int
+ day : int
+
+ Returns
+ -------
+ week_of_year : int32_t
+
+ Notes
+ -----
+ Assumes the inputs describe a valid date.
+ """
+ cdef:
+ int32_t doy, dow
+ int woy
+
+ doy = get_day_of_year(year, month, day)
+ dow = dayofweek(year, month, day)
+
+ # estimate
+ woy = (doy - 1) - dow + 3
+ if woy >= 0:
+ woy = woy / 7 + 1
+
+ # verify
+ if woy < 0:
+ if (woy > -2) or (woy == -2 and is_leapyear(year - 1)):
+ woy = 53
+ else:
+ woy = 52
+ elif woy == 53:
+ if 31 - day + dow < 3:
+ woy = 1
+
+ return woy
+
+
+cpdef int32_t get_day_of_year(int year, int month, int day) nogil:
+ """Return the ordinal day-of-year for the given day.
+
+ Parameters
+ ----------
+ year : int
+ month : int
+ day : int
+
+ Returns
+ -------
+ day_of_year : int32_t
+
+ Notes
+ -----
+ Assumes the inputs describe a valid date.
+ """
+ cdef:
+ bint isleap
+ int32_t mo_off
+ int day_of_year
+
+ isleap = is_leapyear(year)
+
+ mo_off = _month_offset[isleap * 13 + month - 1]
+
+ day_of_year = mo_off + day
+ return day_of_year
+
+
+cpdef get_locale_names(object name_type, object locale=None):
+ """Returns an array of localized day or month names
+
+ Parameters
+ ----------
+ name_type : string, attribute of LocaleTime() in which to return localized
+ names
+ locale : string
+
+ Returns
+ -------
+ list of locale names
+
+ """
+ from pandas.util.testing import set_locale
+
+ with set_locale(locale, LC_TIME):
+ return getattr(LocaleTime(), name_type)
diff --git a/contrib/python/pandas/py2/pandas/_libs/tslibs/conversion.pxd b/contrib/python/pandas/py2/pandas/_libs/tslibs/conversion.pxd
new file mode 100644
index 00000000000..8aca9ca1852
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/_libs/tslibs/conversion.pxd
@@ -0,0 +1,34 @@
+# -*- coding: utf-8 -*-
+
+from cpython.datetime cimport datetime, tzinfo
+
+from numpy cimport int64_t, int32_t
+
+from pandas._libs.tslibs.np_datetime cimport npy_datetimestruct
+
+
+cdef class _TSObject:
+ cdef:
+ npy_datetimestruct dts # npy_datetimestruct
+ int64_t value # numpy dt64
+ object tzinfo
+
+
+cdef convert_to_tsobject(object ts, object tz, object unit,
+ bint dayfirst, bint yearfirst,
+ int32_t nanos=*)
+
+cdef _TSObject convert_datetime_to_tsobject(datetime ts, object tz,
+ int32_t nanos=*)
+
+cpdef int64_t tz_convert_single(int64_t val, object tz1, object tz2)
+
+cdef int64_t get_datetime64_nanos(object val) except? -1
+
+cpdef int64_t pydt_to_i8(object pydt) except? -1
+
+cdef maybe_datetimelike_to_i8(object val)
+
+cdef int64_t tz_convert_utc_to_tzlocal(int64_t utc_val, tzinfo tz)
+
+cpdef datetime localize_pydatetime(datetime dt, object tz)
diff --git a/contrib/python/pandas/py2/pandas/_libs/tslibs/conversion.pyx b/contrib/python/pandas/py2/pandas/_libs/tslibs/conversion.pyx
new file mode 100644
index 00000000000..6c8b732928b
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/_libs/tslibs/conversion.pyx
@@ -0,0 +1,1335 @@
+# -*- coding: utf-8 -*-
+import cython
+
+import numpy as np
+cimport numpy as cnp
+from numpy cimport uint8_t, int64_t, int32_t, intp_t, ndarray
+cnp.import_array()
+
+import pytz
+from dateutil.tz import tzutc
+
+# stdlib datetime imports
+from datetime import time as datetime_time
+from cpython.datetime cimport (datetime, tzinfo,
+ PyDateTime_Check, PyDate_Check,
+ PyDateTime_CheckExact, PyDateTime_IMPORT,
+ PyDelta_Check)
+PyDateTime_IMPORT
+
+from pandas._libs.tslibs.ccalendar import DAY_SECONDS, HOUR_SECONDS
+
+from pandas._libs.tslibs.np_datetime cimport (
+ check_dts_bounds, npy_datetimestruct, pandas_datetime_to_datetimestruct,
+ _string_to_dts, npy_datetime, dt64_to_dtstruct, dtstruct_to_dt64,
+ get_datetime64_unit, get_datetime64_value, pydatetime_to_dt64,
+ NPY_DATETIMEUNIT, NPY_FR_ns)
+from pandas._libs.tslibs.np_datetime import OutOfBoundsDatetime
+
+from pandas._libs.tslibs.util cimport (
+ is_string_object, is_datetime64_object, is_integer_object, is_float_object)
+
+from pandas._libs.tslibs.timedeltas cimport (cast_from_unit,
+ delta_to_nanoseconds)
+from pandas._libs.tslibs.timezones cimport (
+ is_utc, is_tzlocal, is_fixed_offset, get_utcoffset, get_dst_info,
+ get_timezone, maybe_get_tz, tz_compare)
+from pandas._libs.tslibs.timezones import UTC
+from pandas._libs.tslibs.parsing import parse_datetime_string
+
+from pandas._libs.tslibs.nattype import nat_strings
+from pandas._libs.tslibs.nattype cimport (
+ NPY_NAT, checknull_with_nat, c_NaT as NaT)
+
+# ----------------------------------------------------------------------
+# Constants
+
+NS_DTYPE = np.dtype('M8[ns]')
+TD_DTYPE = np.dtype('m8[ns]')
+
+
+# ----------------------------------------------------------------------
+# Misc Helpers
+
+cdef inline int64_t get_datetime64_nanos(object val) except? -1:
+ """
+ Extract the value and unit from a np.datetime64 object, then convert the
+ value to nanoseconds if necessary.
+ """
+ cdef:
+ npy_datetimestruct dts
+ NPY_DATETIMEUNIT unit
+ npy_datetime ival
+
+ ival = get_datetime64_value(val)
+ if ival == NPY_NAT:
+ return NPY_NAT
+
+ unit = get_datetime64_unit(val)
+
+ if unit != NPY_FR_ns:
+ pandas_datetime_to_datetimestruct(ival, unit, &dts)
+ check_dts_bounds(&dts)
+ ival = dtstruct_to_dt64(&dts)
+
+ return ival
+
+
+def ensure_datetime64ns(arr: ndarray, copy: bool=True):
+ """
+ Ensure a np.datetime64 array has dtype specifically 'datetime64[ns]'
+
+ Parameters
+ ----------
+ arr : ndarray
+ copy : boolean, default True
+
+ Returns
+ -------
+ result : ndarray with dtype datetime64[ns]
+
+ """
+ cdef:
+ Py_ssize_t i, n = arr.size
+ int64_t[:] ivalues, iresult
+ NPY_DATETIMEUNIT unit
+ npy_datetimestruct dts
+
+ shape = (<object>arr).shape
+
+ ivalues = arr.view(np.int64).ravel()
+
+ result = np.empty(shape, dtype=NS_DTYPE)
+ iresult = result.ravel().view(np.int64)
+
+ if len(iresult) == 0:
+ result = arr.view(NS_DTYPE)
+ if copy:
+ result = result.copy()
+ return result
+
+ unit = get_datetime64_unit(arr.flat[0])
+ if unit == NPY_FR_ns:
+ if copy:
+ arr = arr.copy()
+ result = arr
+ else:
+ for i in range(n):
+ if ivalues[i] != NPY_NAT:
+ pandas_datetime_to_datetimestruct(ivalues[i], unit, &dts)
+ iresult[i] = dtstruct_to_dt64(&dts)
+ check_dts_bounds(&dts)
+ else:
+ iresult[i] = NPY_NAT
+
+ return result
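+
+# Illustrative usage of ensure_datetime64ns (a sketch, assuming this vendored
+# extension builds and is importable as pandas._libs.tslibs.conversion):
+#
+#     >>> import numpy as np
+#     >>> from pandas._libs.tslibs.conversion import ensure_datetime64ns
+#     >>> arr = np.array(['2000-01-01', '2000-01-02'], dtype='datetime64[s]')
+#     >>> ensure_datetime64ns(arr).dtype
+#     dtype('<M8[ns]')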
+
+
+def ensure_timedelta64ns(arr: ndarray, copy: bool=True):
+ """
+ Ensure a np.timedelta64 array has dtype specifically 'timedelta64[ns]'
+
+ Parameters
+ ----------
+ arr : ndarray
+ copy : boolean, default True
+
+ Returns
+ -------
+ result : ndarray with dtype timedelta64[ns]
+
+ """
+ return arr.astype(TD_DTYPE, copy=copy)
+ # TODO: check for overflows when going from a lower-resolution to nanos
+
+
+def datetime_to_datetime64(values: object[:]):
+ """
+ Convert ndarray of datetime-like objects to int64 array representing
+ nanosecond timestamps.
+
+ Parameters
+ ----------
+ values : ndarray[object]
+
+ Returns
+ -------
+ result : ndarray[int64_t]
+ inferred_tz : tzinfo or None
+ """
+ cdef:
+ Py_ssize_t i, n = len(values)
+ object val, inferred_tz = None
+ int64_t[:] iresult
+ npy_datetimestruct dts
+ _TSObject _ts
+ bint found_naive = False
+
+ result = np.empty(n, dtype='M8[ns]')
+ iresult = result.view('i8')
+ for i in range(n):
+ val = values[i]
+ if checknull_with_nat(val):
+ iresult[i] = NPY_NAT
+ elif PyDateTime_Check(val):
+ if val.tzinfo is not None:
+ if found_naive:
+ raise ValueError('Cannot mix tz-aware with '
+ 'tz-naive values')
+ if inferred_tz is not None:
+ if not tz_compare(val.tzinfo, inferred_tz):
+ raise ValueError('Array must be all same time zone')
+ else:
+ inferred_tz = get_timezone(val.tzinfo)
+
+ _ts = convert_datetime_to_tsobject(val, None)
+ iresult[i] = _ts.value
+ check_dts_bounds(&_ts.dts)
+ else:
+ found_naive = True
+ if inferred_tz is not None:
+ raise ValueError('Cannot mix tz-aware with '
+ 'tz-naive values')
+ iresult[i] = pydatetime_to_dt64(val, &dts)
+ check_dts_bounds(&dts)
+ else:
+ raise TypeError('Unrecognized value type: %s' % type(val))
+
+ return result, inferred_tz
+
+
+cdef inline maybe_datetimelike_to_i8(object val):
+ """
+ Try to convert to a nanosecond timestamp. Fall back to returning the
+ input value.
+
+ Parameters
+ ----------
+ val : object
+
+ Returns
+ -------
+ val : int64 timestamp or original input
+ """
+ cdef:
+ npy_datetimestruct dts
+ try:
+ return val.value
+ except AttributeError:
+ if is_datetime64_object(val):
+ return get_datetime64_value(val)
+ elif PyDateTime_Check(val):
+ return convert_datetime_to_tsobject(val, None).value
+ return val
+
+
+# ----------------------------------------------------------------------
+# _TSObject Conversion
+
+# lightweight C object to hold datetime & int64 pair
+cdef class _TSObject:
+ # cdef:
+ # npy_datetimestruct dts # npy_datetimestruct
+ # int64_t value # numpy dt64
+ # object tzinfo
+
+ @property
+ def value(self):
+ # This is needed in order for `value` to be accessible in lib.pyx
+ return self.value
+
+
+cpdef int64_t pydt_to_i8(object pydt) except? -1:
+ """
+ Convert to int64 representation compatible with numpy datetime64; converts
+ to UTC
+
+ Parameters
+ ----------
+ pydt : object
+
+ Returns
+ -------
+ i8value : np.int64
+ """
+ cdef:
+ _TSObject ts
+
+ ts = convert_to_tsobject(pydt, None, None, 0, 0)
+
+ return ts.value
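+
+# Illustrative usage of pydt_to_i8 (a sketch; naive datetimes are treated as
+# wall times, so the Unix epoch maps to 0):
+#
+#     >>> from datetime import datetime
+#     >>> from pandas._libs.tslibs.conversion import pydt_to_i8
+#     >>> pydt_to_i8(datetime(1970, 1, 1))
+#     0
+#     >>> pydt_to_i8(datetime(1970, 1, 2))
+#     86400000000000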
+
+
+cdef convert_to_tsobject(object ts, object tz, object unit,
+ bint dayfirst, bint yearfirst, int32_t nanos=0):
+ """
+ Extract datetime and int64 from any of:
+ - np.int64 (with unit providing a possible modifier)
+ - np.datetime64
+ - a float (with unit providing a possible modifier)
+ - python int or long object (with unit providing a possible modifier)
+ - iso8601 string object
+ - python datetime object
+ - another timestamp object
+ """
+ cdef:
+ _TSObject obj
+
+ if tz is not None:
+ tz = maybe_get_tz(tz)
+
+ obj = _TSObject()
+
+ if is_string_object(ts):
+ return convert_str_to_tsobject(ts, tz, unit, dayfirst, yearfirst)
+
+ if ts is None or ts is NaT:
+ obj.value = NPY_NAT
+ elif is_datetime64_object(ts):
+ obj.value = get_datetime64_nanos(ts)
+ if obj.value != NPY_NAT:
+ dt64_to_dtstruct(obj.value, &obj.dts)
+ elif is_integer_object(ts):
+ if ts == NPY_NAT:
+ obj.value = NPY_NAT
+ else:
+ ts = ts * cast_from_unit(None, unit)
+ obj.value = ts
+ dt64_to_dtstruct(ts, &obj.dts)
+ elif is_float_object(ts):
+ if ts != ts or ts == NPY_NAT:
+ obj.value = NPY_NAT
+ else:
+ ts = cast_from_unit(ts, unit)
+ obj.value = ts
+ dt64_to_dtstruct(ts, &obj.dts)
+ elif PyDateTime_Check(ts):
+ return convert_datetime_to_tsobject(ts, tz, nanos)
+ elif PyDate_Check(ts):
+ # Keep the converter same as PyDateTime's
+ ts = datetime.combine(ts, datetime_time())
+ return convert_datetime_to_tsobject(ts, tz)
+ elif getattr(ts, '_typ', None) == 'period':
+ raise ValueError("Cannot convert Period to Timestamp "
+ "unambiguously. Use to_timestamp")
+ else:
+ raise TypeError('Cannot convert input [{}] of type {} to '
+ 'Timestamp'.format(ts, type(ts)))
+
+ if tz is not None:
+ localize_tso(obj, tz)
+
+ if obj.value != NPY_NAT:
+ # check_overflows needs to run after localize_tso
+ check_dts_bounds(&obj.dts)
+ check_overflows(obj)
+ return obj
+
+
+cdef _TSObject convert_datetime_to_tsobject(datetime ts, object tz,
+ int32_t nanos=0):
+ """
+ Convert a datetime (or Timestamp) input `ts`, along with optional timezone
+ object `tz` to a _TSObject.
+
+ The optional argument `nanos` allows for cases where datetime input
+ needs to be supplemented with higher-precision information.
+
+ Parameters
+ ----------
+ ts : datetime or Timestamp
+ Value to be converted to _TSObject
+ tz : tzinfo or None
+ timezone for the timezone-aware output
+ nanos : int32_t, default is 0
+        nanoseconds that supplement the precision of the datetime input ts
+
+ Returns
+ -------
+ obj : _TSObject
+ """
+ cdef:
+ _TSObject obj = _TSObject()
+
+ if tz is not None:
+ tz = maybe_get_tz(tz)
+
+ if ts.tzinfo is not None:
+ # Convert the current timezone to the passed timezone
+ ts = ts.astimezone(tz)
+ obj.value = pydatetime_to_dt64(ts, &obj.dts)
+ obj.tzinfo = ts.tzinfo
+ elif not is_utc(tz):
+ ts = _localize_pydatetime(ts, tz)
+ obj.value = pydatetime_to_dt64(ts, &obj.dts)
+ obj.tzinfo = ts.tzinfo
+ else:
+ # UTC
+ obj.value = pydatetime_to_dt64(ts, &obj.dts)
+ obj.tzinfo = tz
+ else:
+ obj.value = pydatetime_to_dt64(ts, &obj.dts)
+ obj.tzinfo = ts.tzinfo
+
+ if obj.tzinfo is not None and not is_utc(obj.tzinfo):
+ offset = get_utcoffset(obj.tzinfo, ts)
+ obj.value -= int(offset.total_seconds() * 1e9)
+
+ if not PyDateTime_CheckExact(ts):
+ # datetime instance but not datetime type --> Timestamp
+ obj.value += ts.nanosecond
+ obj.dts.ps = ts.nanosecond * 1000
+
+ if nanos:
+ obj.value += nanos
+ obj.dts.ps = nanos * 1000
+
+ check_dts_bounds(&obj.dts)
+ check_overflows(obj)
+ return obj
+
+
+cdef _TSObject convert_str_to_tsobject(object ts, object tz, object unit,
+ bint dayfirst=False,
+ bint yearfirst=False):
+ """
+ Convert a string-like (bytes or unicode) input `ts`, along with optional
+ timezone object `tz` to a _TSObject.
+
+ The optional arguments `dayfirst` and `yearfirst` are passed to the
+ dateutil parser.
+
+ Parameters
+ ----------
+ ts : bytes or unicode
+ Value to be converted to _TSObject
+ tz : tzinfo or None
+ timezone for the timezone-aware output
+ dayfirst : bool, default False
+ When parsing an ambiguous date string, interpret e.g. "3/4/1975" as
+ April 3, as opposed to the standard US interpretation March 4.
+ yearfirst : bool, default False
+ When parsing an ambiguous date string, interpret e.g. "01/05/09"
+ as "May 9, 2001", as opposed to the default "Jan 5, 2009"
+
+ Returns
+ -------
+ obj : _TSObject
+ """
+ cdef:
+ _TSObject obj
+ int out_local = 0, out_tzoffset = 0
+ datetime dt
+
+ if tz is not None:
+ tz = maybe_get_tz(tz)
+
+ obj = _TSObject()
+
+ assert is_string_object(ts)
+
+ if len(ts) == 0 or ts in nat_strings:
+ ts = NaT
+ elif ts == 'now':
+ # Issue 9000, we short-circuit rather than going
+ # into np_datetime_strings which returns utc
+ ts = datetime.now(tz)
+ elif ts == 'today':
+ # Issue 9000, we short-circuit rather than going
+ # into np_datetime_strings which returns a normalized datetime
+ ts = datetime.now(tz)
+ # equiv: datetime.today().replace(tzinfo=tz)
+ else:
+ try:
+ _string_to_dts(ts, &obj.dts, &out_local, &out_tzoffset)
+ obj.value = dtstruct_to_dt64(&obj.dts)
+ check_dts_bounds(&obj.dts)
+ if out_local == 1:
+ obj.tzinfo = pytz.FixedOffset(out_tzoffset)
+ obj.value = tz_convert_single(obj.value, obj.tzinfo, UTC)
+ if tz is None:
+ check_dts_bounds(&obj.dts)
+ check_overflows(obj)
+ return obj
+ else:
+ # Keep the converter same as PyDateTime's
+ obj = convert_to_tsobject(obj.value, obj.tzinfo,
+ None, 0, 0)
+ dt = datetime(obj.dts.year, obj.dts.month, obj.dts.day,
+ obj.dts.hour, obj.dts.min, obj.dts.sec,
+ obj.dts.us, obj.tzinfo)
+ obj = convert_datetime_to_tsobject(dt, tz,
+ nanos=obj.dts.ps / 1000)
+ return obj
+
+ else:
+ ts = obj.value
+ if tz is not None:
+ # shift for localize_tso
+ ts = tz_localize_to_utc(np.array([ts], dtype='i8'), tz,
+ ambiguous='raise')[0]
+
+ except OutOfBoundsDatetime:
+ # GH#19382 for just-barely-OutOfBounds falling back to dateutil
+ # parser will return incorrect result because it will ignore
+ # nanoseconds
+ raise
+
+ except ValueError:
+ try:
+ ts = parse_datetime_string(ts, dayfirst=dayfirst,
+ yearfirst=yearfirst)
+ except Exception:
+ raise ValueError("could not convert string to Timestamp")
+
+ return convert_to_tsobject(ts, tz, unit, dayfirst, yearfirst)
+
+
+cdef inline check_overflows(_TSObject obj):
+ """
+ Check that we haven't silently overflowed in timezone conversion
+
+ Parameters
+ ----------
+ obj : _TSObject
+
+ Returns
+ -------
+ None
+
+ Raises
+ ------
+ OutOfBoundsDatetime
+ """
+ # GH#12677
+ if obj.dts.year == 1677:
+ if not (obj.value < 0):
+ raise OutOfBoundsDatetime
+ elif obj.dts.year == 2262:
+ if not (obj.value > 0):
+ raise OutOfBoundsDatetime
+
+
+# ----------------------------------------------------------------------
+# Localization
+
+cdef inline void localize_tso(_TSObject obj, tzinfo tz):
+ """
+ Given the UTC nanosecond timestamp in obj.value, find the wall-clock
+ representation of that timestamp in the given timezone.
+
+ Parameters
+ ----------
+ obj : _TSObject
+ tz : tzinfo
+
+ Returns
+ -------
+ None
+
+ Notes
+ -----
+ Sets obj.tzinfo inplace, alters obj.dts inplace.
+ """
+ cdef:
+ ndarray[int64_t] trans
+ int64_t[:] deltas
+ int64_t local_val
+ Py_ssize_t pos
+ str typ
+
+ assert obj.tzinfo is None
+
+ if is_utc(tz):
+ pass
+ elif obj.value == NPY_NAT:
+ pass
+ elif is_tzlocal(tz):
+ local_val = _tz_convert_tzlocal_utc(obj.value, tz, to_utc=False)
+ dt64_to_dtstruct(local_val, &obj.dts)
+ else:
+ # Adjust datetime64 timestamp, recompute datetimestruct
+ trans, deltas, typ = get_dst_info(tz)
+
+ if is_fixed_offset(tz):
+ # static/fixed tzinfo; in this case we know len(deltas) == 1
+ # This can come back with `typ` of either "fixed" or None
+ dt64_to_dtstruct(obj.value + deltas[0], &obj.dts)
+ elif typ == 'pytz':
+ # i.e. treat_tz_as_pytz(tz)
+ pos = trans.searchsorted(obj.value, side='right') - 1
+ tz = tz._tzinfos[tz._transition_info[pos]]
+ dt64_to_dtstruct(obj.value + deltas[pos], &obj.dts)
+ elif typ == 'dateutil':
+ # i.e. treat_tz_as_dateutil(tz)
+ pos = trans.searchsorted(obj.value, side='right') - 1
+ dt64_to_dtstruct(obj.value + deltas[pos], &obj.dts)
+ else:
+ # Note: as of 2018-07-17 all tzinfo objects that are _not_
+ # either pytz or dateutil have is_fixed_offset(tz) == True,
+ # so this branch will never be reached.
+ pass
+
+ obj.tzinfo = tz
+
+
+cdef inline datetime _localize_pydatetime(datetime dt, tzinfo tz):
+ """
+ Take a datetime/Timestamp in UTC and localizes to timezone tz.
+
+ NB: Unlike the public version, this treats datetime and Timestamp objects
+ identically, i.e. discards nanos from Timestamps.
+ It also assumes that the `tz` input is not None.
+ """
+ try:
+        # datetime.replace with pytz may give an incorrect result
+ return tz.localize(dt)
+ except AttributeError:
+ return dt.replace(tzinfo=tz)
+
+
+cpdef inline datetime localize_pydatetime(datetime dt, object tz):
+ """
+ Take a datetime/Timestamp in UTC and localizes to timezone tz.
+
+ Parameters
+ ----------
+ dt : datetime or Timestamp
+ tz : tzinfo, "UTC", or None
+
+ Returns
+ -------
+ localized : datetime or Timestamp
+ """
+ if tz is None:
+ return dt
+ elif not PyDateTime_CheckExact(dt):
+ # i.e. is a Timestamp
+ return dt.tz_localize(tz)
+ elif is_utc(tz):
+ return _localize_pydatetime(dt, tz)
+ try:
+        # datetime.replace with pytz may give an incorrect result
+ return tz.localize(dt)
+ except AttributeError:
+ return dt.replace(tzinfo=tz)
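+
+# Illustrative usage of localize_pydatetime (a sketch using pytz, which is
+# already a dependency of this module):
+#
+#     >>> from datetime import datetime
+#     >>> import pytz
+#     >>> from pandas._libs.tslibs.conversion import localize_pydatetime
+#     >>> dt = localize_pydatetime(datetime(2019, 1, 1),
+#     ...                          pytz.timezone('US/Eastern'))
+#     >>> dt.tzinfo is not None
+#     True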
+
+
+# ----------------------------------------------------------------------
+# Timezone Conversion
+
+cdef inline int64_t[:] _tz_convert_dst(int64_t[:] values, tzinfo tz,
+ bint to_utc=True):
+ """
+ tz_convert for non-UTC non-tzlocal cases where we have to check
+ DST transitions pointwise.
+
+ Parameters
+ ----------
+ values : ndarray[int64_t]
+ tz : tzinfo
+ to_utc : bool
+        True if converting _to_ UTC, False if converting _from_ UTC
+
+ Returns
+ -------
+ result : ndarray[int64_t]
+ """
+ cdef:
+ Py_ssize_t n = len(values)
+ Py_ssize_t i
+ intp_t[:] pos
+ int64_t[:] result = np.empty(n, dtype=np.int64)
+ ndarray[int64_t] trans
+ int64_t[:] deltas
+ int64_t v
+ bint tz_is_local
+
+ tz_is_local = is_tzlocal(tz)
+
+ if not tz_is_local:
+        # get_dst_info cannot extract offsets from tzlocal because it
+        # depends on a datetime
+ trans, deltas, _ = get_dst_info(tz)
+ if not to_utc:
+ # We add `offset` below instead of subtracting it
+ deltas = -1 * np.array(deltas, dtype='i8')
+
+ # Previously, this search was done pointwise to try and benefit
+ # from getting to skip searches for iNaTs. However, it seems call
+ # overhead dominates the search time so doing it once in bulk
+ # is substantially faster (GH#24603)
+ pos = trans.searchsorted(values, side='right') - 1
+
+ for i in range(n):
+ v = values[i]
+ if v == NPY_NAT:
+ result[i] = v
+ elif tz_is_local:
+ result[i] = _tz_convert_tzlocal_utc(v, tz, to_utc=to_utc)
+ else:
+ if pos[i] < 0:
+ raise ValueError('First time before start of DST info')
+ result[i] = v - deltas[pos[i]]
+
+ return result
+
+
+cdef inline int64_t _tz_convert_tzlocal_utc(int64_t val, tzinfo tz,
+ bint to_utc=True):
+ """
+ Convert the i8 representation of a datetime from a tzlocal timezone to
+ UTC, or vice-versa.
+
+ Private, not intended for use outside of tslibs.conversion
+
+ Parameters
+ ----------
+ val : int64_t
+ tz : tzinfo
+ to_utc : bint
+ True if converting tzlocal _to_ UTC, False if going the other direction
+
+ Returns
+ -------
+ result : int64_t
+ """
+ cdef:
+ npy_datetimestruct dts
+ int64_t delta
+ datetime dt
+
+ dt64_to_dtstruct(val, &dts)
+ dt = datetime(dts.year, dts.month, dts.day, dts.hour,
+ dts.min, dts.sec, dts.us)
+ # get_utcoffset (tz.utcoffset under the hood) only makes sense if datetime
+ # is _wall time_, so if val is a UTC timestamp convert to wall time
+ if not to_utc:
+ dt = dt.replace(tzinfo=tzutc())
+ dt = dt.astimezone(tz)
+ delta = int(get_utcoffset(tz, dt).total_seconds()) * 1000000000
+
+ if not to_utc:
+ return val + delta
+ return val - delta
+
+
+cdef inline int64_t tz_convert_utc_to_tzlocal(int64_t utc_val, tzinfo tz):
+ """
+ Parameters
+ ----------
+ utc_val : int64_t
+ tz : tzinfo
+
+ Returns
+ -------
+ local_val : int64_t
+ """
+ return _tz_convert_tzlocal_utc(utc_val, tz, to_utc=False)
+
+
+cpdef int64_t tz_convert_single(int64_t val, object tz1, object tz2):
+ """
+ Convert the val (in i8) from timezone1 to timezone2
+
+ This is a single timezone version of tz_convert
+
+ Parameters
+ ----------
+ val : int64
+ tz1 : string / timezone object
+ tz2 : string / timezone object
+
+ Returns
+ -------
+ converted: int64
+ """
+ cdef:
+ int64_t[:] deltas
+ Py_ssize_t pos
+ int64_t v, offset, utc_date
+ npy_datetimestruct dts
+ int64_t arr[1]
+
+ # See GH#17734 We should always be converting either from UTC or to UTC
+ assert is_utc(tz1) or is_utc(tz2)
+
+ if val == NPY_NAT:
+ return val
+
+ # Convert to UTC
+ if is_tzlocal(tz1):
+ utc_date = _tz_convert_tzlocal_utc(val, tz1, to_utc=True)
+ elif not is_utc(get_timezone(tz1)):
+ arr[0] = val
+ utc_date = _tz_convert_dst(arr, tz1, to_utc=True)[0]
+ else:
+ utc_date = val
+
+ if is_utc(get_timezone(tz2)):
+ return utc_date
+ elif is_tzlocal(tz2):
+ return _tz_convert_tzlocal_utc(utc_date, tz2, to_utc=False)
+ else:
+ # Convert UTC to other timezone
+ arr[0] = utc_date
+ # Note: at least with cython 0.28.3, doing a lookup `[0]` in the next
+ # line is sensitive to the declared return type of _tz_convert_dst;
+ # if it is declared as returning ndarray[int64_t], a compile-time error
+ # is raised.
+ return _tz_convert_dst(arr, tz2, to_utc=False)[0]
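+
+# Illustrative usage of tz_convert_single (a sketch; one side must be UTC, and
+# US/Eastern is UTC-5 on the date used here):
+#
+#     >>> import numpy as np
+#     >>> import pytz
+#     >>> from pandas._libs.tslibs.conversion import tz_convert_single
+#     >>> val = np.datetime64('2019-01-01T00:00:00', 'ns').view('i8')
+#     >>> wall = tz_convert_single(val, pytz.UTC, pytz.timezone('US/Eastern'))
+#     >>> int(val - wall) // 3600000000000
+#     5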
+
+
+cdef inline int64_t[:] _tz_convert_one_way(int64_t[:] vals, object tz,
+ bint to_utc):
+ """
+ Convert the given values (in i8) either to UTC or from UTC.
+
+ Parameters
+ ----------
+ vals : int64 ndarray
+    tz : string / timezone object
+ to_utc : bint
+
+ Returns
+ -------
+ converted : ndarray[int64_t]
+ """
+ cdef:
+ int64_t[:] converted, result
+ Py_ssize_t i, n = len(vals)
+ int64_t val
+
+ if not is_utc(get_timezone(tz)):
+ converted = np.empty(n, dtype=np.int64)
+ if is_tzlocal(tz):
+ for i in range(n):
+ val = vals[i]
+ if val == NPY_NAT:
+ converted[i] = NPY_NAT
+ else:
+ converted[i] = _tz_convert_tzlocal_utc(val, tz, to_utc)
+ else:
+ converted = _tz_convert_dst(vals, tz, to_utc)
+ else:
+ converted = vals
+
+ return converted
+
+
+def tz_convert(int64_t[:] vals, object tz1, object tz2):
+ """
+ Convert the values (in i8) from timezone1 to timezone2
+
+ Parameters
+ ----------
+ vals : int64 ndarray
+ tz1 : string / timezone object
+ tz2 : string / timezone object
+
+ Returns
+ -------
+ int64 ndarray of converted
+ """
+ cdef:
+ int64_t[:] utc_dates, converted
+
+ if len(vals) == 0:
+ return np.array([], dtype=np.int64)
+
+ # Convert to UTC
+ utc_dates = _tz_convert_one_way(vals, tz1, to_utc=True)
+ converted = _tz_convert_one_way(utc_dates, tz2, to_utc=False)
+ return np.array(converted, dtype=np.int64)
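+
+# Illustrative usage of tz_convert (a sketch; converts i8 UTC timestamps to
+# US/Eastern wall times, which are 5 hours behind in winter and 4 in summer):
+#
+#     >>> import numpy as np
+#     >>> import pytz
+#     >>> from pandas._libs.tslibs.conversion import tz_convert
+#     >>> vals = np.array(['2019-01-01', '2019-07-01'], dtype='M8[ns]').view('i8')
+#     >>> wall = tz_convert(vals, pytz.UTC, pytz.timezone('US/Eastern'))
+#     >>> (vals - wall) // 3600000000000
+#     array([5, 4])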
+
+
+# TODO: cdef scalar version to call from convert_str_to_tsobject
+def tz_localize_to_utc(ndarray[int64_t] vals, object tz, object ambiguous=None,
+ object nonexistent=None):
+ """
+ Localize tzinfo-naive i8 to given time zone (using pytz). If
+ there are ambiguities in the values, raise AmbiguousTimeError.
+
+ Parameters
+ ----------
+ vals : ndarray[int64_t]
+ tz : tzinfo or None
+ ambiguous : str, bool, or arraylike
+ When clocks moved backward due to DST, ambiguous times may arise.
+ For example in Central European Time (UTC+01), when going from 03:00
+ DST to 02:00 non-DST, 02:30:00 local time occurs both at 00:30:00 UTC
+ and at 01:30:00 UTC. In such a situation, the `ambiguous` parameter
+ dictates how ambiguous times should be handled.
+
+ - 'infer' will attempt to infer fall dst-transition hours based on
+ order
+ - bool-ndarray where True signifies a DST time, False signifies a
+ non-DST time (note that this flag is only applicable for ambiguous
+ times, but the array must have the same length as vals)
+ - bool if True, treat all vals as DST. If False, treat them as non-DST
+ - 'NaT' will return NaT where there are ambiguous times
+
+ nonexistent : {None, "NaT", "shift_forward", "shift_backward", "raise",
+ timedelta-like}
+ How to handle non-existent times when converting wall times to UTC
+
+ .. versionadded:: 0.24.0
+
+ Returns
+ -------
+ localized : ndarray[int64_t]
+ """
+ cdef:
+ int64_t[:] deltas, idx_shifted, idx_shifted_left, idx_shifted_right
+ ndarray[uint8_t, cast=True] ambiguous_array, both_nat, both_eq
+ Py_ssize_t i, idx, pos, ntrans, n = len(vals)
+ Py_ssize_t delta_idx_offset, delta_idx, pos_left, pos_right
+ int64_t *tdata
+ int64_t v, left, right, val, v_left, v_right, new_local, remaining_mins
+ int64_t first_delta
+ int64_t HOURS_NS = HOUR_SECONDS * 1000000000, shift_delta = 0
+ ndarray[int64_t] trans, result, result_a, result_b, dst_hours, delta
+ ndarray trans_idx, grp, a_idx, b_idx, one_diff
+ npy_datetimestruct dts
+ bint infer_dst = False, is_dst = False, fill = False
+ bint shift_forward = False, shift_backward = False
+ bint fill_nonexist = False
+ list trans_grp
+ str stamp
+
+ # Vectorized version of DstTzInfo.localize
+ if is_utc(tz) or tz is None:
+ return vals
+
+ result = np.empty(n, dtype=np.int64)
+
+ if is_tzlocal(tz):
+ for i in range(n):
+ v = vals[i]
+ if v == NPY_NAT:
+ result[i] = NPY_NAT
+ else:
+ result[i] = _tz_convert_tzlocal_utc(v, tz, to_utc=True)
+ return result
+
+ if is_string_object(ambiguous):
+ if ambiguous == 'infer':
+ infer_dst = True
+ elif ambiguous == 'NaT':
+ fill = True
+ elif isinstance(ambiguous, bool):
+ is_dst = True
+ if ambiguous:
+ ambiguous_array = np.ones(len(vals), dtype=bool)
+ else:
+ ambiguous_array = np.zeros(len(vals), dtype=bool)
+ elif hasattr(ambiguous, '__iter__'):
+ is_dst = True
+ if len(ambiguous) != len(vals):
+ raise ValueError("Length of ambiguous bool-array must be "
+ "the same size as vals")
+ ambiguous_array = np.asarray(ambiguous, dtype=bool)
+
+ if nonexistent == 'NaT':
+ fill_nonexist = True
+ elif nonexistent == 'shift_forward':
+ shift_forward = True
+ elif nonexistent == 'shift_backward':
+ shift_backward = True
+ elif PyDelta_Check(nonexistent):
+ shift_delta = delta_to_nanoseconds(nonexistent)
+ elif nonexistent not in ('raise', None):
+ msg = ("nonexistent must be one of {'NaT', 'raise', 'shift_forward', "
+               "'shift_backward'} or a timedelta object")
+ raise ValueError(msg)
+
+ trans, deltas, _ = get_dst_info(tz)
+
+ tdata = <int64_t*>cnp.PyArray_DATA(trans)
+ ntrans = len(trans)
+
+ # Determine whether each date lies left of the DST transition (store in
+ # result_a) or right of the DST transition (store in result_b)
+ result_a = np.empty(n, dtype=np.int64)
+ result_b = np.empty(n, dtype=np.int64)
+ result_a[:] = NPY_NAT
+ result_b[:] = NPY_NAT
+
+ idx_shifted_left = (np.maximum(0, trans.searchsorted(
+ vals - DAY_SECONDS * 1000000000, side='right') - 1)).astype(np.int64)
+
+ idx_shifted_right = (np.maximum(0, trans.searchsorted(
+ vals + DAY_SECONDS * 1000000000, side='right') - 1)).astype(np.int64)
+
+ for i in range(n):
+ val = vals[i]
+ v_left = val - deltas[idx_shifted_left[i]]
+ pos_left = bisect_right_i8(tdata, v_left, ntrans) - 1
+ # timestamp falls to the left side of the DST transition
+ if v_left + deltas[pos_left] == val:
+ result_a[i] = v_left
+
+ v_right = val - deltas[idx_shifted_right[i]]
+ pos_right = bisect_right_i8(tdata, v_right, ntrans) - 1
+ # timestamp falls to the right side of the DST transition
+ if v_right + deltas[pos_right] == val:
+ result_b[i] = v_right
+
+ if infer_dst:
+ dst_hours = np.empty(n, dtype=np.int64)
+ dst_hours[:] = NPY_NAT
+
+ # Get the ambiguous hours (given the above, these are the hours
+ # where result_a != result_b and neither of them are NAT)
+ both_nat = np.logical_and(result_a != NPY_NAT, result_b != NPY_NAT)
+ both_eq = result_a == result_b
+ trans_idx = np.squeeze(np.nonzero(np.logical_and(both_nat, ~both_eq)))
+ if trans_idx.size == 1:
+ stamp = _render_tstamp(vals[trans_idx])
+ raise pytz.AmbiguousTimeError(
+                "Cannot infer dst time from {} as there "
+ "are no repeated times".format(stamp))
+ # Split the array into contiguous chunks (where the difference between
+ # indices is 1). These are effectively dst transitions in different
+ # years which is useful for checking that there is not an ambiguous
+ # transition in an individual year.
+ if trans_idx.size > 0:
+ one_diff = np.where(np.diff(trans_idx) != 1)[0] + 1
+ trans_grp = np.array_split(trans_idx, one_diff)
+
+ # Iterate through each day, if there are no hours where the
+ # delta is negative (indicates a repeat of hour) the switch
+ # cannot be inferred
+ for grp in trans_grp:
+
+ delta = np.diff(result_a[grp])
+ if grp.size == 1 or np.all(delta > 0):
+ stamp = _render_tstamp(vals[grp[0]])
+ raise pytz.AmbiguousTimeError(stamp)
+
+ # Find the index for the switch and pull from a for dst and b
+ # for standard
+ switch_idx = (delta <= 0).nonzero()[0]
+ if switch_idx.size > 1:
+ raise pytz.AmbiguousTimeError(
+                        "There are {} dst switches when "
+ "there should only be 1.".format(switch_idx.size))
+ switch_idx = switch_idx[0] + 1
+ # Pull the only index and adjust
+ a_idx = grp[:switch_idx]
+ b_idx = grp[switch_idx:]
+ dst_hours[grp] = np.hstack((result_a[a_idx], result_b[b_idx]))
+
+ for i in range(n):
+ val = vals[i]
+ left = result_a[i]
+ right = result_b[i]
+ if val == NPY_NAT:
+ result[i] = val
+ elif left != NPY_NAT and right != NPY_NAT:
+ if left == right:
+ result[i] = left
+ else:
+ if infer_dst and dst_hours[i] != NPY_NAT:
+ result[i] = dst_hours[i]
+ elif is_dst:
+ if ambiguous_array[i]:
+ result[i] = left
+ else:
+ result[i] = right
+ elif fill:
+ result[i] = NPY_NAT
+ else:
+ stamp = _render_tstamp(val)
+ raise pytz.AmbiguousTimeError(
+                        "Cannot infer dst time from {!r}, try using the "
+ "'ambiguous' argument".format(stamp))
+ elif left != NPY_NAT:
+ result[i] = left
+ elif right != NPY_NAT:
+ result[i] = right
+ else:
+ # Handle nonexistent times
+ if shift_forward or shift_backward or shift_delta != 0:
+ # Shift the nonexistent time to the closest existing time
+ remaining_mins = val % HOURS_NS
+ if shift_delta != 0:
+ # Validate that we don't relocalize on another nonexistent
+ # time
+ if -1 < shift_delta + remaining_mins < HOURS_NS:
+ raise ValueError(
+ "The provided timedelta will relocalize on a "
+ "nonexistent time: {}".format(nonexistent)
+ )
+ new_local = val + shift_delta
+ elif shift_forward:
+ new_local = val + (HOURS_NS - remaining_mins)
+ else:
+ # Subtract 1 since the beginning hour is _inclusive_ of
+ # nonexistent times
+ new_local = val - remaining_mins - 1
+ delta_idx = trans.searchsorted(new_local, side='right')
+ # Shift the delta_idx by if the UTC offset of
+ # the target tz is greater than 0 and we're moving forward
+ # or vice versa
+ first_delta = deltas[0]
+ if (shift_forward or shift_delta > 0) and first_delta > 0:
+ delta_idx_offset = 1
+ elif (shift_backward or shift_delta < 0) and first_delta < 0:
+ delta_idx_offset = 1
+ else:
+ delta_idx_offset = 0
+ delta_idx = delta_idx - delta_idx_offset
+ result[i] = new_local - deltas[delta_idx]
+ elif fill_nonexist:
+ result[i] = NPY_NAT
+ else:
+ stamp = _render_tstamp(val)
+ raise pytz.NonExistentTimeError(stamp)
+
+ return result
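+
+# Illustrative usage of tz_localize_to_utc (a sketch; the wall time used here
+# is neither ambiguous nor nonexistent in US/Eastern):
+#
+#     >>> import numpy as np
+#     >>> import pytz
+#     >>> from pandas._libs.tslibs.conversion import tz_localize_to_utc
+#     >>> wall = np.array(['2019-01-01T00:00'], dtype='M8[ns]').view('i8')
+#     >>> utc = tz_localize_to_utc(wall, pytz.timezone('US/Eastern'))
+#     >>> (utc - wall) // 3600000000000   # EST wall clock lags UTC by 5 hours
+#     array([5])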
+
+
+cdef inline Py_ssize_t bisect_right_i8(int64_t *data,
+ int64_t val, Py_ssize_t n):
+ cdef:
+ Py_ssize_t pivot, left = 0, right = n
+
+ assert n >= 1
+
+ # edge cases
+ if val > data[n - 1]:
+ return n
+
+ if val < data[0]:
+ return 0
+
+ while left < right:
+ pivot = left + (right - left) // 2
+
+ if data[pivot] <= val:
+ left = pivot + 1
+ else:
+ right = pivot
+
+ return left
+
+
+cdef inline str _render_tstamp(int64_t val):
+ """ Helper function to render exception messages"""
+ from pandas._libs.tslibs.timestamps import Timestamp
+ return str(Timestamp(val))
+
+
+# ----------------------------------------------------------------------
+# Normalization
+
+
+def normalize_date(dt: object) -> datetime:
+ """
+    Normalize a datetime.datetime value to midnight. A datetime.date input is
+    returned as a datetime.datetime at midnight.
+
+ Parameters
+ ----------
+ dt : date, datetime, or Timestamp
+
+ Returns
+ -------
+ normalized : datetime.datetime or Timestamp
+
+ Raises
+ ------
+ TypeError : if input is not datetime.date, datetime.datetime, or Timestamp
+ """
+ if PyDateTime_Check(dt):
+ if not PyDateTime_CheckExact(dt):
+ # i.e. a Timestamp object
+ return dt.replace(hour=0, minute=0, second=0, microsecond=0,
+ nanosecond=0)
+ else:
+ # regular datetime object
+ return dt.replace(hour=0, minute=0, second=0, microsecond=0)
+ # TODO: Make sure DST crossing is handled correctly here
+ elif PyDate_Check(dt):
+ return datetime(dt.year, dt.month, dt.day)
+ else:
+ raise TypeError('Unrecognized type: %s' % type(dt))
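+
+# Illustrative usage of normalize_date (a sketch):
+#
+#     >>> from datetime import date, datetime
+#     >>> from pandas._libs.tslibs.conversion import normalize_date
+#     >>> normalize_date(datetime(2019, 3, 14, 15, 9, 26))
+#     datetime.datetime(2019, 3, 14, 0, 0)
+#     >>> normalize_date(date(2019, 3, 14))
+#     datetime.datetime(2019, 3, 14, 0, 0)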
+
+
+def normalize_i8_timestamps(int64_t[:] stamps, object tz):
+ """
+    Normalize each of the (nanosecond) timezone-aware timestamps in the given
+    array by rounding down to the beginning of the day (i.e. midnight).
+    This is midnight for the timezone `tz`.
+
+ Parameters
+ ----------
+ stamps : int64 ndarray
+ tz : tzinfo or None
+
+ Returns
+ -------
+    result : int64 ndarray of normalized nanosecond timestamps
+ """
+ cdef:
+ Py_ssize_t n = len(stamps)
+ int64_t[:] result = np.empty(n, dtype=np.int64)
+
+ result = _normalize_local(stamps, tz)
+
+ return result.base # .base to access underlying np.ndarray
+
+
+cdef int64_t[:] _normalize_local(int64_t[:] stamps, tzinfo tz):
+ """
+ Normalize each of the (nanosecond) timestamps in the given array by
+ rounding down to the beginning of the day (i.e. midnight) for the
+ given timezone `tz`.
+
+ Parameters
+ ----------
+ stamps : int64 ndarray
+ tz : tzinfo
+
+ Returns
+ -------
+    result : int64 ndarray of normalized nanosecond timestamps
+ """
+ cdef:
+ Py_ssize_t i, n = len(stamps)
+ int64_t[:] result = np.empty(n, dtype=np.int64)
+ ndarray[int64_t] trans
+ int64_t[:] deltas
+ str typ
+ Py_ssize_t[:] pos
+ npy_datetimestruct dts
+ int64_t delta, local_val
+
+ if is_tzlocal(tz):
+ for i in range(n):
+ if stamps[i] == NPY_NAT:
+ result[i] = NPY_NAT
+ continue
+ local_val = _tz_convert_tzlocal_utc(stamps[i], tz, to_utc=False)
+ dt64_to_dtstruct(local_val, &dts)
+ result[i] = _normalized_stamp(&dts)
+ else:
+ # Adjust datetime64 timestamp, recompute datetimestruct
+ trans, deltas, typ = get_dst_info(tz)
+
+ if typ not in ['pytz', 'dateutil']:
+ # static/fixed; in this case we know that len(delta) == 1
+ delta = deltas[0]
+ for i in range(n):
+ if stamps[i] == NPY_NAT:
+ result[i] = NPY_NAT
+ continue
+ dt64_to_dtstruct(stamps[i] + delta, &dts)
+ result[i] = _normalized_stamp(&dts)
+ else:
+ pos = trans.searchsorted(stamps, side='right') - 1
+ for i in range(n):
+ if stamps[i] == NPY_NAT:
+ result[i] = NPY_NAT
+ continue
+ dt64_to_dtstruct(stamps[i] + deltas[pos[i]], &dts)
+ result[i] = _normalized_stamp(&dts)
+
+ return result
+
+
+cdef inline int64_t _normalized_stamp(npy_datetimestruct *dts) nogil:
+ """
+ Normalize the given datetimestruct to midnight, then convert to int64_t.
+
+ Parameters
+ ----------
+ *dts : pointer to npy_datetimestruct
+
+ Returns
+ -------
+ stamp : int64
+ """
+ dts.hour = 0
+ dts.min = 0
+ dts.sec = 0
+ dts.us = 0
+ dts.ps = 0
+ return dtstruct_to_dt64(dts)
+
+
+def is_date_array_normalized(int64_t[:] stamps, object tz=None):
+ """
+ Check if all of the given (nanosecond) timestamps are normalized to
+ midnight, i.e. hour == minute == second == 0. If the optional timezone
+ `tz` is not None, then this is midnight for this timezone.
+
+ Parameters
+ ----------
+ stamps : int64 ndarray
+ tz : tzinfo or None
+
+ Returns
+ -------
+    is_normalized : bool
+        True if all stamps are normalized
+ """
+ cdef:
+ Py_ssize_t i, n = len(stamps)
+ ndarray[int64_t] trans
+ int64_t[:] deltas
+ intp_t[:] pos
+ npy_datetimestruct dts
+ int64_t local_val, delta
+ str typ
+
+ if tz is None or is_utc(tz):
+ for i in range(n):
+ dt64_to_dtstruct(stamps[i], &dts)
+ if (dts.hour + dts.min + dts.sec + dts.us) > 0:
+ return False
+ elif is_tzlocal(tz):
+ for i in range(n):
+ local_val = _tz_convert_tzlocal_utc(stamps[i], tz, to_utc=False)
+ dt64_to_dtstruct(local_val, &dts)
+ if (dts.hour + dts.min + dts.sec + dts.us) > 0:
+ return False
+ else:
+ trans, deltas, typ = get_dst_info(tz)
+
+ if typ not in ['pytz', 'dateutil']:
+ # static/fixed; in this case we know that len(delta) == 1
+ delta = deltas[0]
+ for i in range(n):
+ # Adjust datetime64 timestamp, recompute datetimestruct
+ dt64_to_dtstruct(stamps[i] + delta, &dts)
+ if (dts.hour + dts.min + dts.sec + dts.us) > 0:
+ return False
+
+ else:
+ pos = trans.searchsorted(stamps) - 1
+ for i in range(n):
+ # Adjust datetime64 timestamp, recompute datetimestruct
+ dt64_to_dtstruct(stamps[i] + deltas[pos[i]], &dts)
+ if (dts.hour + dts.min + dts.sec + dts.us) > 0:
+ return False
+
+ return True
diff --git a/contrib/python/pandas/py2/pandas/_libs/tslibs/fields.pyx b/contrib/python/pandas/py2/pandas/_libs/tslibs/fields.pyx
new file mode 100644
index 00000000000..5cda7992369
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/_libs/tslibs/fields.pyx
@@ -0,0 +1,669 @@
+# -*- coding: utf-8 -*-
+"""
+Functions for accessing attributes of Timestamp/datetime64/datetime-like
+objects and arrays
+"""
+
+import cython
+from cython import Py_ssize_t
+
+import numpy as np
+cimport numpy as cnp
+from numpy cimport ndarray, int64_t, int32_t, int8_t
+cnp.import_array()
+
+from pandas._libs.tslibs.ccalendar import (
+ get_locale_names, MONTHS_FULL, DAYS_FULL, DAY_SECONDS)
+from pandas._libs.tslibs.ccalendar cimport (
+ get_days_in_month, is_leapyear, dayofweek, get_week_of_year,
+ get_day_of_year)
+from pandas._libs.tslibs.np_datetime cimport (
+ npy_datetimestruct, pandas_timedeltastruct, dt64_to_dtstruct,
+ td64_to_tdstruct)
+from pandas._libs.tslibs.nattype cimport NPY_NAT
+
+
+def get_time_micros(ndarray[int64_t] dtindex):
+ """
+ Return the number of microseconds in the time component of a
+ nanosecond timestamp.
+
+ Parameters
+ ----------
+ dtindex : ndarray[int64_t]
+
+ Returns
+ -------
+ micros : ndarray[int64_t]
+ """
+ cdef:
+ ndarray[int64_t] micros
+
+ micros = np.mod(dtindex, DAY_SECONDS * 1000000000, dtype=np.int64)
+ micros //= 1000
+ return micros
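+
+# Illustrative usage of get_time_micros (a sketch, assuming this vendored
+# extension builds and is importable as pandas._libs.tslibs.fields):
+#
+#     >>> import numpy as np
+#     >>> from pandas._libs.tslibs.fields import get_time_micros
+#     >>> stamps = np.array(['1970-01-01T00:00:01.500'], dtype='M8[ns]').view('i8')
+#     >>> get_time_micros(stamps)
+#     array([1500000])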
+
+
+def build_field_sarray(int64_t[:] dtindex):
+ """
+    Convert a datetime-as-int64 representation to a structured array of fields
+ """
+ cdef:
+ Py_ssize_t i, count = len(dtindex)
+ npy_datetimestruct dts
+ ndarray[int32_t] years, months, days, hours, minutes, seconds, mus
+
+ sa_dtype = [('Y', 'i4'), # year
+ ('M', 'i4'), # month
+ ('D', 'i4'), # day
+ ('h', 'i4'), # hour
+ ('m', 'i4'), # min
+ ('s', 'i4'), # second
+ ('u', 'i4')] # microsecond
+
+ out = np.empty(count, dtype=sa_dtype)
+
+ years = out['Y']
+ months = out['M']
+ days = out['D']
+ hours = out['h']
+ minutes = out['m']
+ seconds = out['s']
+ mus = out['u']
+
+ for i in range(count):
+ dt64_to_dtstruct(dtindex[i], &dts)
+ years[i] = dts.year
+ months[i] = dts.month
+ days[i] = dts.day
+ hours[i] = dts.hour
+ minutes[i] = dts.min
+ seconds[i] = dts.sec
+ mus[i] = dts.us
+
+ return out
+
+
+def get_date_name_field(int64_t[:] dtindex, object field, object locale=None):
+ """
+    Given an int64-based datetime index, return an array of date-name
+    strings based on the requested field (e.g. weekday_name)
+ """
+ cdef:
+ Py_ssize_t i, count = len(dtindex)
+ ndarray[object] out, names
+ npy_datetimestruct dts
+ int dow
+
+ out = np.empty(count, dtype=object)
+
+ if field == 'day_name' or field == 'weekday_name':
+ if locale is None:
+ names = np.array(DAYS_FULL, dtype=np.object_)
+ else:
+ names = np.array(get_locale_names('f_weekday', locale),
+ dtype=np.object_)
+ for i in range(count):
+ if dtindex[i] == NPY_NAT:
+ out[i] = np.nan
+ continue
+
+ dt64_to_dtstruct(dtindex[i], &dts)
+ dow = dayofweek(dts.year, dts.month, dts.day)
+ out[i] = names[dow].capitalize()
+
+ elif field == 'month_name':
+ if locale is None:
+ names = np.array(MONTHS_FULL, dtype=np.object_)
+ else:
+ names = np.array(get_locale_names('f_month', locale),
+ dtype=np.object_)
+ for i in range(count):
+ if dtindex[i] == NPY_NAT:
+ out[i] = np.nan
+ continue
+
+ dt64_to_dtstruct(dtindex[i], &dts)
+ out[i] = names[dts.month].capitalize()
+
+ else:
+ raise ValueError("Field {field} not supported".format(field=field))
+
+ return out
+
+
+def get_start_end_field(int64_t[:] dtindex, object field,
+ object freqstr=None, int month_kw=12):
+ """
+    Given an int64-based datetime index, return an array of indicators
+    of whether the timestamps fall at the start/end of the month/quarter/year
+    (as defined by the frequency).
+ """
+ cdef:
+ Py_ssize_t i
+ int count = len(dtindex)
+ bint is_business = 0
+ int end_month = 12
+ int start_month = 1
+ ndarray[int8_t] out
+ ndarray[int32_t, ndim=2] _month_offset
+ bint isleap
+ npy_datetimestruct dts
+ int mo_off, dom, doy, dow, ldom
+
+ _month_offset = np.array(
+ [[0, 31, 59, 90, 120, 151, 181, 212, 243, 273, 304, 334, 365],
+ [0, 31, 60, 91, 121, 152, 182, 213, 244, 274, 305, 335, 366]],
+ dtype=np.int32)
+
+ out = np.zeros(count, dtype='int8')
+
+ if freqstr:
+ if freqstr == 'C':
+ raise ValueError("Custom business days is not supported by {field}"
+ .format(field=field))
+ is_business = freqstr[0] == 'B'
+
+ # YearBegin(), BYearBegin() use month = starting month of year.
+ # QuarterBegin(), BQuarterBegin() use startingMonth = starting
+        # month of year. Other offsets use month, startingMonth as ending
+ # month of year.
+
+ if (freqstr[0:2] in ['MS', 'QS', 'AS']) or (
+ freqstr[1:3] in ['MS', 'QS', 'AS']):
+ end_month = 12 if month_kw == 1 else month_kw - 1
+ start_month = month_kw
+ else:
+ end_month = month_kw
+ start_month = (end_month % 12) + 1
+ else:
+ end_month = 12
+ start_month = 1
+
+ if field == 'is_month_start':
+ if is_business:
+ for i in range(count):
+ if dtindex[i] == NPY_NAT:
+ out[i] = 0
+ continue
+
+ dt64_to_dtstruct(dtindex[i], &dts)
+ dom = dts.day
+ dow = dayofweek(dts.year, dts.month, dts.day)
+
+ if (dom == 1 and dow < 5) or (dom <= 3 and dow == 0):
+ out[i] = 1
+
+ else:
+ for i in range(count):
+ if dtindex[i] == NPY_NAT:
+ out[i] = 0
+ continue
+
+ dt64_to_dtstruct(dtindex[i], &dts)
+ dom = dts.day
+
+ if dom == 1:
+ out[i] = 1
+
+ elif field == 'is_month_end':
+ if is_business:
+ for i in range(count):
+ if dtindex[i] == NPY_NAT:
+ out[i] = 0
+ continue
+
+ dt64_to_dtstruct(dtindex[i], &dts)
+ isleap = is_leapyear(dts.year)
+ mo_off = _month_offset[isleap, dts.month - 1]
+ dom = dts.day
+ doy = mo_off + dom
+ ldom = _month_offset[isleap, dts.month]
+ dow = dayofweek(dts.year, dts.month, dts.day)
+
+ if (ldom == doy and dow < 5) or (
+ dow == 4 and (ldom - doy <= 2)):
+ out[i] = 1
+
+ else:
+ for i in range(count):
+ if dtindex[i] == NPY_NAT:
+ out[i] = 0
+ continue
+
+ dt64_to_dtstruct(dtindex[i], &dts)
+ isleap = is_leapyear(dts.year)
+ mo_off = _month_offset[isleap, dts.month - 1]
+ dom = dts.day
+ doy = mo_off + dom
+ ldom = _month_offset[isleap, dts.month]
+
+ if ldom == doy:
+ out[i] = 1
+
+ elif field == 'is_quarter_start':
+ if is_business:
+ for i in range(count):
+ if dtindex[i] == NPY_NAT:
+ out[i] = 0
+ continue
+
+ dt64_to_dtstruct(dtindex[i], &dts)
+ dom = dts.day
+ dow = dayofweek(dts.year, dts.month, dts.day)
+
+ if ((dts.month - start_month) % 3 == 0) and (
+ (dom == 1 and dow < 5) or (dom <= 3 and dow == 0)):
+ out[i] = 1
+
+ else:
+ for i in range(count):
+ if dtindex[i] == NPY_NAT:
+ out[i] = 0
+ continue
+
+ dt64_to_dtstruct(dtindex[i], &dts)
+ dom = dts.day
+
+ if ((dts.month - start_month) % 3 == 0) and dom == 1:
+ out[i] = 1
+
+ elif field == 'is_quarter_end':
+ if is_business:
+ for i in range(count):
+ if dtindex[i] == NPY_NAT:
+ out[i] = 0
+ continue
+
+ dt64_to_dtstruct(dtindex[i], &dts)
+ isleap = is_leapyear(dts.year)
+ mo_off = _month_offset[isleap, dts.month - 1]
+ dom = dts.day
+ doy = mo_off + dom
+ ldom = _month_offset[isleap, dts.month]
+ dow = dayofweek(dts.year, dts.month, dts.day)
+
+ if ((dts.month - end_month) % 3 == 0) and (
+ (ldom == doy and dow < 5) or (
+ dow == 4 and (ldom - doy <= 2))):
+ out[i] = 1
+
+ else:
+ for i in range(count):
+ if dtindex[i] == NPY_NAT:
+ out[i] = 0
+ continue
+
+ dt64_to_dtstruct(dtindex[i], &dts)
+ isleap = is_leapyear(dts.year)
+ mo_off = _month_offset[isleap, dts.month - 1]
+ dom = dts.day
+ doy = mo_off + dom
+ ldom = _month_offset[isleap, dts.month]
+
+ if ((dts.month - end_month) % 3 == 0) and (ldom == doy):
+ out[i] = 1
+
+ elif field == 'is_year_start':
+ if is_business:
+ for i in range(count):
+ if dtindex[i] == NPY_NAT:
+ out[i] = 0
+ continue
+
+ dt64_to_dtstruct(dtindex[i], &dts)
+ dom = dts.day
+ dow = dayofweek(dts.year, dts.month, dts.day)
+
+ if (dts.month == start_month) and (
+ (dom == 1 and dow < 5) or (dom <= 3 and dow == 0)):
+ out[i] = 1
+
+ else:
+ for i in range(count):
+ if dtindex[i] == NPY_NAT:
+ out[i] = 0
+ continue
+
+ dt64_to_dtstruct(dtindex[i], &dts)
+ dom = dts.day
+
+ if (dts.month == start_month) and dom == 1:
+ out[i] = 1
+
+ elif field == 'is_year_end':
+ if is_business:
+ for i in range(count):
+ if dtindex[i] == NPY_NAT:
+ out[i] = 0
+ continue
+
+ dt64_to_dtstruct(dtindex[i], &dts)
+ isleap = is_leapyear(dts.year)
+ dom = dts.day
+ mo_off = _month_offset[isleap, dts.month - 1]
+ doy = mo_off + dom
+ dow = dayofweek(dts.year, dts.month, dts.day)
+ ldom = _month_offset[isleap, dts.month]
+
+ if (dts.month == end_month) and (
+ (ldom == doy and dow < 5) or (
+ dow == 4 and (ldom - doy <= 2))):
+ out[i] = 1
+
+ else:
+ for i in range(count):
+ if dtindex[i] == NPY_NAT:
+ out[i] = 0
+ continue
+
+ dt64_to_dtstruct(dtindex[i], &dts)
+ isleap = is_leapyear(dts.year)
+ mo_off = _month_offset[isleap, dts.month - 1]
+ dom = dts.day
+ doy = mo_off + dom
+ ldom = _month_offset[isleap, dts.month]
+
+ if (dts.month == end_month) and (ldom == doy):
+ out[i] = 1
+
+ else:
+ raise ValueError("Field {field} not supported".format(field=field))
+
+ return out.view(bool)
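+
+# Illustrative usage of get_start_end_field (a sketch; with no freqstr the
+# plain calendar month/quarter/year boundaries are used):
+#
+#     >>> import numpy as np
+#     >>> from pandas._libs.tslibs.fields import get_start_end_field
+#     >>> dtindex = np.array(['2019-01-01', '2019-01-15'],
+#     ...                    dtype='M8[ns]').view('i8')
+#     >>> get_start_end_field(dtindex, 'is_month_start')
+#     array([ True, False])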
+
+
+def get_date_field(ndarray[int64_t] dtindex, object field):
+ """
+    Given an int64-based datetime index, extract the year, month, etc.
+    field and return an array of these values.
+ """
+ cdef:
+ Py_ssize_t i, count = len(dtindex)
+ ndarray[int32_t] out
+ npy_datetimestruct dts
+
+ out = np.empty(count, dtype='i4')
+
+ if field == 'Y':
+ with nogil:
+ for i in range(count):
+ if dtindex[i] == NPY_NAT:
+ out[i] = -1
+ continue
+
+ dt64_to_dtstruct(dtindex[i], &dts)
+ out[i] = dts.year
+ return out
+
+ elif field == 'M':
+ with nogil:
+ for i in range(count):
+ if dtindex[i] == NPY_NAT:
+ out[i] = -1
+ continue
+
+ dt64_to_dtstruct(dtindex[i], &dts)
+ out[i] = dts.month
+ return out
+
+ elif field == 'D':
+ with nogil:
+ for i in range(count):
+ if dtindex[i] == NPY_NAT:
+ out[i] = -1
+ continue
+
+ dt64_to_dtstruct(dtindex[i], &dts)
+ out[i] = dts.day
+ return out
+
+ elif field == 'h':
+ with nogil:
+ for i in range(count):
+ if dtindex[i] == NPY_NAT:
+ out[i] = -1
+ continue
+
+ dt64_to_dtstruct(dtindex[i], &dts)
+ out[i] = dts.hour
+ return out
+
+ elif field == 'm':
+ with nogil:
+ for i in range(count):
+ if dtindex[i] == NPY_NAT:
+ out[i] = -1
+ continue
+
+ dt64_to_dtstruct(dtindex[i], &dts)
+ out[i] = dts.min
+ return out
+
+ elif field == 's':
+ with nogil:
+ for i in range(count):
+ if dtindex[i] == NPY_NAT:
+ out[i] = -1
+ continue
+
+ dt64_to_dtstruct(dtindex[i], &dts)
+ out[i] = dts.sec
+ return out
+
+ elif field == 'us':
+ with nogil:
+ for i in range(count):
+ if dtindex[i] == NPY_NAT:
+ out[i] = -1
+ continue
+
+ dt64_to_dtstruct(dtindex[i], &dts)
+ out[i] = dts.us
+ return out
+
+ elif field == 'ns':
+ with nogil:
+ for i in range(count):
+ if dtindex[i] == NPY_NAT:
+ out[i] = -1
+ continue
+
+ dt64_to_dtstruct(dtindex[i], &dts)
+ out[i] = dts.ps / 1000
+ return out
+ elif field == 'doy':
+ with nogil:
+ for i in range(count):
+ if dtindex[i] == NPY_NAT:
+ out[i] = -1
+ continue
+
+ dt64_to_dtstruct(dtindex[i], &dts)
+ out[i] = get_day_of_year(dts.year, dts.month, dts.day)
+ return out
+
+ elif field == 'dow':
+ with nogil:
+ for i in range(count):
+ if dtindex[i] == NPY_NAT:
+ out[i] = -1
+ continue
+
+ dt64_to_dtstruct(dtindex[i], &dts)
+ out[i] = dayofweek(dts.year, dts.month, dts.day)
+ return out
+
+ elif field == 'woy':
+ with nogil:
+ for i in range(count):
+ if dtindex[i] == NPY_NAT:
+ out[i] = -1
+ continue
+
+ dt64_to_dtstruct(dtindex[i], &dts)
+ out[i] = get_week_of_year(dts.year, dts.month, dts.day)
+ return out
+
+ elif field == 'q':
+    converted : int64 ndarray
+ for i in range(count):
+ if dtindex[i] == NPY_NAT:
+ out[i] = -1
+ continue
+
+ dt64_to_dtstruct(dtindex[i], &dts)
+ out[i] = dts.month
+ out[i] = ((out[i] - 1) / 3) + 1
+ return out
+
+ elif field == 'dim':
+ with nogil:
+ for i in range(count):
+ if dtindex[i] == NPY_NAT:
+ out[i] = -1
+ continue
+
+ dt64_to_dtstruct(dtindex[i], &dts)
+ out[i] = get_days_in_month(dts.year, dts.month)
+ return out
+ elif field == 'is_leap_year':
+ return isleapyear_arr(get_date_field(dtindex, 'Y'))
+
+ raise ValueError("Field %s not supported" % field)
+
+
+def get_timedelta_field(int64_t[:] tdindex, object field):
+ """
+    Given an int64-based timedelta index, extract the days, hours, seconds,
+    etc. field and return an array of these values.
+ """
+ cdef:
+ Py_ssize_t i, count = len(tdindex)
+ ndarray[int32_t] out
+ pandas_timedeltastruct tds
+
+ out = np.empty(count, dtype='i4')
+
+ if field == 'days':
+ with nogil:
+ for i in range(count):
+ if tdindex[i] == NPY_NAT:
+ out[i] = -1
+ continue
+
+ td64_to_tdstruct(tdindex[i], &tds)
+ out[i] = tds.days
+ return out
+
+ elif field == 'h':
+ with nogil:
+ for i in range(count):
+ if tdindex[i] == NPY_NAT:
+ out[i] = -1
+ continue
+
+ td64_to_tdstruct(tdindex[i], &tds)
+ out[i] = tds.hrs
+ return out
+
+ elif field == 's':
+ with nogil:
+ for i in range(count):
+ if tdindex[i] == NPY_NAT:
+ out[i] = -1
+ continue
+
+ td64_to_tdstruct(tdindex[i], &tds)
+ out[i] = tds.sec
+ return out
+
+ elif field == 'seconds':
+ with nogil:
+ for i in range(count):
+ if tdindex[i] == NPY_NAT:
+ out[i] = -1
+ continue
+
+ td64_to_tdstruct(tdindex[i], &tds)
+ out[i] = tds.seconds
+ return out
+
+ elif field == 'ms':
+ with nogil:
+ for i in range(count):
+ if tdindex[i] == NPY_NAT:
+ out[i] = -1
+ continue
+
+ td64_to_tdstruct(tdindex[i], &tds)
+ out[i] = tds.ms
+ return out
+
+ elif field == 'microseconds':
+ with nogil:
+ for i in range(count):
+ if tdindex[i] == NPY_NAT:
+ out[i] = -1
+ continue
+
+ td64_to_tdstruct(tdindex[i], &tds)
+ out[i] = tds.microseconds
+ return out
+
+ elif field == 'us':
+ with nogil:
+ for i in range(count):
+ if tdindex[i] == NPY_NAT:
+ out[i] = -1
+ continue
+
+ td64_to_tdstruct(tdindex[i], &tds)
+ out[i] = tds.us
+ return out
+
+ elif field == 'ns':
+ with nogil:
+ for i in range(count):
+ if tdindex[i] == NPY_NAT:
+ out[i] = -1
+ continue
+
+ td64_to_tdstruct(tdindex[i], &tds)
+ out[i] = tds.ns
+ return out
+
+ elif field == 'nanoseconds':
+ with nogil:
+ for i in range(count):
+ if tdindex[i] == NPY_NAT:
+ out[i] = -1
+ continue
+
+ td64_to_tdstruct(tdindex[i], &tds)
+ out[i] = tds.nanoseconds
+ return out
+
+ raise ValueError("Field %s not supported" % field)
+
+
+cpdef isleapyear_arr(ndarray years):
+ """vectorized version of isleapyear; NaT evaluates as False"""
+ cdef:
+ ndarray[int8_t] out
+
+ out = np.zeros(len(years), dtype='int8')
+ out[np.logical_or(years % 400 == 0,
+ np.logical_and(years % 4 == 0,
+ years % 100 > 0))] = 1
+ return out.view(bool)
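+
+# Illustrative usage of isleapyear_arr (a sketch):
+#
+#     >>> import numpy as np
+#     >>> from pandas._libs.tslibs.fields import isleapyear_arr
+#     >>> isleapyear_arr(np.array([1900, 2000, 2019, 2020]))
+#     array([False,  True, False,  True])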
diff --git a/contrib/python/pandas/py2/pandas/_libs/tslibs/frequencies.pxd b/contrib/python/pandas/py2/pandas/_libs/tslibs/frequencies.pxd
new file mode 100644
index 00000000000..4e7949e55c8
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/_libs/tslibs/frequencies.pxd
@@ -0,0 +1,9 @@
+# -*- coding: utf-8 -*-
+
+cpdef object get_rule_month(object source, object default=*)
+
+cpdef get_freq_code(freqstr)
+cpdef object get_freq(object freq)
+cpdef str get_base_alias(freqstr)
+cpdef int get_to_timestamp_base(int base)
+cpdef str get_freq_str(base, mult=*)
diff --git a/contrib/python/pandas/py2/pandas/_libs/tslibs/frequencies.pyx b/contrib/python/pandas/py2/pandas/_libs/tslibs/frequencies.pyx
new file mode 100644
index 00000000000..bd9e68e1344
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/_libs/tslibs/frequencies.pyx
@@ -0,0 +1,512 @@
+# -*- coding: utf-8 -*-
+import re
+
+cimport numpy as cnp
+cnp.import_array()
+
+from pandas._libs.tslibs.util cimport is_integer_object, is_string_object
+
+from pandas._libs.tslibs.ccalendar import MONTH_NUMBERS
+
+# ----------------------------------------------------------------------
+# Constants
+
+# hack to handle WOM-1MON
+opattern = re.compile(
+ r'([+\-]?\d*|[+\-]?\d*\.\d*)\s*([A-Za-z]+([\-][\dA-Za-z\-]+)?)'
+)
+
+INVALID_FREQ_ERR_MSG = "Invalid frequency: {0}"
+
+# ---------------------------------------------------------------------
+# Period codes
+
+
+class FreqGroup(object):
+ FR_ANN = 1000
+ FR_QTR = 2000
+ FR_MTH = 3000
+ FR_WK = 4000
+ FR_BUS = 5000
+ FR_DAY = 6000
+ FR_HR = 7000
+ FR_MIN = 8000
+ FR_SEC = 9000
+ FR_MS = 10000
+ FR_US = 11000
+ FR_NS = 12000
+
+
+# period frequency constants corresponding to scikits timeseries
+# originals
+_period_code_map = {
+ # Annual freqs with various fiscal year ends.
+ # eg, 2005 for A-FEB runs Mar 1, 2004 to Feb 28, 2005
+ "A-DEC": 1000, # Annual - December year end
+ "A-JAN": 1001, # Annual - January year end
+ "A-FEB": 1002, # Annual - February year end
+ "A-MAR": 1003, # Annual - March year end
+ "A-APR": 1004, # Annual - April year end
+ "A-MAY": 1005, # Annual - May year end
+ "A-JUN": 1006, # Annual - June year end
+ "A-JUL": 1007, # Annual - July year end
+ "A-AUG": 1008, # Annual - August year end
+ "A-SEP": 1009, # Annual - September year end
+ "A-OCT": 1010, # Annual - October year end
+ "A-NOV": 1011, # Annual - November year end
+
+ # Quarterly frequencies with various fiscal year ends.
+ # eg, Q42005 for Q-OCT runs Aug 1, 2005 to Oct 31, 2005
+ "Q-DEC": 2000, # Quarterly - December year end
+ "Q-JAN": 2001, # Quarterly - January year end
+ "Q-FEB": 2002, # Quarterly - February year end
+ "Q-MAR": 2003, # Quarterly - March year end
+ "Q-APR": 2004, # Quarterly - April year end
+ "Q-MAY": 2005, # Quarterly - May year end
+ "Q-JUN": 2006, # Quarterly - June year end
+ "Q-JUL": 2007, # Quarterly - July year end
+ "Q-AUG": 2008, # Quarterly - August year end
+ "Q-SEP": 2009, # Quarterly - September year end
+ "Q-OCT": 2010, # Quarterly - October year end
+ "Q-NOV": 2011, # Quarterly - November year end
+
+ "M": 3000, # Monthly
+
+ "W-SUN": 4000, # Weekly - Sunday end of week
+ "W-MON": 4001, # Weekly - Monday end of week
+ "W-TUE": 4002, # Weekly - Tuesday end of week
+ "W-WED": 4003, # Weekly - Wednesday end of week
+ "W-THU": 4004, # Weekly - Thursday end of week
+ "W-FRI": 4005, # Weekly - Friday end of week
+ "W-SAT": 4006, # Weekly - Saturday end of week
+
+ "B": 5000, # Business days
+ "D": 6000, # Daily
+ "H": 7000, # Hourly
+ "T": 8000, # Minutely
+ "S": 9000, # Secondly
+ "L": 10000, # Millisecondly
+ "U": 11000, # Microsecondly
+ "N": 12000} # Nanosecondly
+
+
+_reverse_period_code_map = {
+ _period_code_map[key]: key for key in _period_code_map}
+
+# Yearly aliases; careful not to put these in _reverse_period_code_map
+_period_code_map.update({'Y' + key[1:]: _period_code_map[key]
+ for key in _period_code_map
+ if key.startswith('A-')})
+
+_period_code_map.update({
+ "Q": 2000, # Quarterly - December year end (default quarterly)
+ "A": 1000, # Annual
+ "W": 4000, # Weekly
+ "C": 5000}) # Custom Business Day
+
+_lite_rule_alias = {
+ 'W': 'W-SUN',
+ 'Q': 'Q-DEC',
+
+ 'A': 'A-DEC', # YearEnd(month=12),
+ 'Y': 'A-DEC',
+ 'AS': 'AS-JAN', # YearBegin(month=1),
+ 'YS': 'AS-JAN',
+ 'BA': 'BA-DEC', # BYearEnd(month=12),
+ 'BY': 'BA-DEC',
+ 'BAS': 'BAS-JAN', # BYearBegin(month=1),
+ 'BYS': 'BAS-JAN',
+
+ 'Min': 'T',
+ 'min': 'T',
+ 'ms': 'L',
+ 'us': 'U',
+ 'ns': 'N'}
+
+_dont_uppercase = {'MS', 'ms'}
+
+# ----------------------------------------------------------------------
+
+cpdef get_freq_code(freqstr):
+ """
+    Convert a freq str or tuple to a freq code and stride (mult)
+
+ Parameters
+ ----------
+ freqstr : str or tuple
+
+ Returns
+ -------
+ return : tuple of base frequency code and stride (mult)
+
+ Examples
+ --------
+ >>> get_freq_code('3D')
+ (6000, 3)
+
+ >>> get_freq_code('D')
+ (6000, 1)
+
+ >>> get_freq_code(('D', 3))
+ (6000, 3)
+ """
+ if getattr(freqstr, '_typ', None) == 'dateoffset':
+ freqstr = (freqstr.rule_code, freqstr.n)
+
+ if isinstance(freqstr, tuple):
+ if is_integer_object(freqstr[0]) and is_integer_object(freqstr[1]):
+ # e.g., freqstr = (2000, 1)
+ return freqstr
+ else:
+ # e.g., freqstr = ('T', 5)
+ try:
+ code = _period_str_to_code(freqstr[0])
+ stride = freqstr[1]
+            except Exception:
+ if is_integer_object(freqstr[1]):
+ raise
+ code = _period_str_to_code(freqstr[1])
+ stride = freqstr[0]
+ return code, stride
+
+ if is_integer_object(freqstr):
+ return freqstr, 1
+
+ base, stride = _base_and_stride(freqstr)
+ code = _period_str_to_code(base)
+
+ return code, stride
+
+
+cpdef _base_and_stride(freqstr):
+ """
+ Return base freq and stride info from string representation
+
+ Returns
+ -------
+ base : str
+ stride : int
+
+ Examples
+ --------
+    _base_and_stride('5Min') -> 'Min', 5
+ """
+ groups = opattern.match(freqstr)
+
+ if not groups:
+ raise ValueError("Could not evaluate {freq}".format(freq=freqstr))
+
+ stride = groups.group(1)
+
+ if len(stride):
+ stride = int(stride)
+ else:
+ stride = 1
+
+ base = groups.group(2)
+
+ return base, stride
+
+
+cpdef _period_str_to_code(freqstr):
+ freqstr = _lite_rule_alias.get(freqstr, freqstr)
+
+ if freqstr not in _dont_uppercase:
+ lower = freqstr.lower()
+ freqstr = _lite_rule_alias.get(lower, freqstr)
+
+ if freqstr not in _dont_uppercase:
+ freqstr = freqstr.upper()
+ try:
+ return _period_code_map[freqstr]
+ except KeyError:
+ raise ValueError(INVALID_FREQ_ERR_MSG.format(freqstr))
+
+
+cpdef str get_freq_str(base, mult=1):
+ """
+ Return the summary string associated with this offset code, possibly
+ adjusted by a multiplier.
+
+ Parameters
+ ----------
+ base : int (member of FreqGroup)
+
+ Returns
+ -------
+ freq_str : str
+
+ Examples
+ --------
+ >>> get_freq_str(1000)
+ 'A-DEC'
+
+ >>> get_freq_str(2000, 2)
+ '2Q-DEC'
+
+ >>> get_freq_str("foo")
+ """
+ code = _reverse_period_code_map.get(base)
+ if mult == 1:
+ return code
+ return str(mult) + code
+
+
+cpdef str get_base_alias(freqstr):
+ """
+ Returns the base frequency alias, e.g., '5D' -> 'D'
+
+ Parameters
+ ----------
+ freqstr : str
+
+ Returns
+ -------
+ base_alias : str
+ """
+ return _base_and_stride(freqstr)[0]
+
+
+cpdef int get_to_timestamp_base(int base):
+ """
+ Return frequency code group used for base of to_timestamp against
+ frequency code.
+
+ Parameters
+ ----------
+ base : int (member of FreqGroup)
+
+ Returns
+ -------
+ base : int
+
+ Examples
+ --------
+ # Return day freq code against longer freq than day
+ >>> get_to_timestamp_base(get_freq_code('D')[0])
+ 6000
+ >>> get_to_timestamp_base(get_freq_code('W')[0])
+ 6000
+ >>> get_to_timestamp_base(get_freq_code('M')[0])
+ 6000
+
+ # Return second freq code against hour between second
+ >>> get_to_timestamp_base(get_freq_code('H')[0])
+ 9000
+ >>> get_to_timestamp_base(get_freq_code('S')[0])
+ 9000
+ """
+ if base < FreqGroup.FR_BUS:
+ return FreqGroup.FR_DAY
+ elif FreqGroup.FR_HR <= base <= FreqGroup.FR_SEC:
+ return FreqGroup.FR_SEC
+ return base
+
+
+cpdef object get_freq(object freq):
+ """
+ Return frequency code of given frequency str.
+ If input is not string, return input as it is.
+
+ Examples
+ --------
+ >>> get_freq('A')
+ 1000
+
+ >>> get_freq('3A')
+ 1000
+ """
+ if is_string_object(freq):
+ base, mult = get_freq_code(freq)
+ freq = base
+ return freq
+
+
+# ----------------------------------------------------------------------
+# Frequency comparison
+
+def is_subperiod(source, target) -> bint:
+ """
+ Returns True if downsampling is possible between source and target
+ frequencies
+
+ Parameters
+ ----------
+ source : string or DateOffset
+ Frequency converting from
+ target : string or DateOffset
+ Frequency converting to
+
+ Returns
+ -------
+ is_subperiod : boolean
+ """
+
+ if target is None or source is None:
+ return False
+ source = _maybe_coerce_freq(source)
+ target = _maybe_coerce_freq(target)
+
+ if _is_annual(target):
+ if _is_quarterly(source):
+ return _quarter_months_conform(get_rule_month(source),
+ get_rule_month(target))
+ return source in {'D', 'C', 'B', 'M', 'H', 'T', 'S', 'L', 'U', 'N'}
+ elif _is_quarterly(target):
+ return source in {'D', 'C', 'B', 'M', 'H', 'T', 'S', 'L', 'U', 'N'}
+ elif _is_monthly(target):
+ return source in {'D', 'C', 'B', 'H', 'T', 'S', 'L', 'U', 'N'}
+ elif _is_weekly(target):
+ return source in {target, 'D', 'C', 'B', 'H', 'T', 'S', 'L', 'U', 'N'}
+ elif target == 'B':
+ return source in {'B', 'H', 'T', 'S', 'L', 'U', 'N'}
+ elif target == 'C':
+ return source in {'C', 'H', 'T', 'S', 'L', 'U', 'N'}
+ elif target == 'D':
+ return source in {'D', 'H', 'T', 'S', 'L', 'U', 'N'}
+ elif target == 'H':
+ return source in {'H', 'T', 'S', 'L', 'U', 'N'}
+ elif target == 'T':
+ return source in {'T', 'S', 'L', 'U', 'N'}
+ elif target == 'S':
+ return source in {'S', 'L', 'U', 'N'}
+ elif target == 'L':
+ return source in {'L', 'U', 'N'}
+ elif target == 'U':
+ return source in {'U', 'N'}
+ elif target == 'N':
+ return source in {'N'}
+
+
+def is_superperiod(source, target) -> bint:
+ """
+ Returns True if upsampling is possible between source and target
+ frequencies
+
+ Parameters
+ ----------
+ source : string
+ Frequency converting from
+ target : string
+ Frequency converting to
+
+ Returns
+ -------
+ is_superperiod : boolean
+ """
+ if target is None or source is None:
+ return False
+ source = _maybe_coerce_freq(source)
+ target = _maybe_coerce_freq(target)
+
+ if _is_annual(source):
+ if _is_annual(target):
+ return get_rule_month(source) == get_rule_month(target)
+
+ if _is_quarterly(target):
+ smonth = get_rule_month(source)
+ tmonth = get_rule_month(target)
+ return _quarter_months_conform(smonth, tmonth)
+ return target in {'D', 'C', 'B', 'M', 'H', 'T', 'S', 'L', 'U', 'N'}
+ elif _is_quarterly(source):
+ return target in {'D', 'C', 'B', 'M', 'H', 'T', 'S', 'L', 'U', 'N'}
+ elif _is_monthly(source):
+ return target in {'D', 'C', 'B', 'H', 'T', 'S', 'L', 'U', 'N'}
+ elif _is_weekly(source):
+ return target in {source, 'D', 'C', 'B', 'H', 'T', 'S', 'L', 'U', 'N'}
+ elif source == 'B':
+ return target in {'D', 'C', 'B', 'H', 'T', 'S', 'L', 'U', 'N'}
+ elif source == 'C':
+ return target in {'D', 'C', 'B', 'H', 'T', 'S', 'L', 'U', 'N'}
+ elif source == 'D':
+ return target in {'D', 'C', 'B', 'H', 'T', 'S', 'L', 'U', 'N'}
+ elif source == 'H':
+ return target in {'H', 'T', 'S', 'L', 'U', 'N'}
+ elif source == 'T':
+ return target in {'T', 'S', 'L', 'U', 'N'}
+ elif source == 'S':
+ return target in {'S', 'L', 'U', 'N'}
+ elif source == 'L':
+ return target in {'L', 'U', 'N'}
+ elif source == 'U':
+ return target in {'U', 'N'}
+ elif source == 'N':
+ return target in {'N'}
+
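+# Illustrative behaviour (sketch):
+#   is_subperiod('D', 'M')   -> True   (daily can be downsampled to monthly)
+#   is_subperiod('M', 'D')   -> False
+#   is_superperiod('M', 'D') -> True   (monthly can be upsampled to daily)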
+
+cdef str _maybe_coerce_freq(code):
+ """ we might need to coerce a code to a rule_code
+ and uppercase it
+
+ Parameters
+ ----------
+    code : string or DateOffset
+ Frequency converting from
+
+ Returns
+ -------
+ code : string
+ """
+ assert code is not None
+ if getattr(code, '_typ', None) == 'dateoffset':
+ # i.e. isinstance(code, ABCDateOffset):
+ code = code.rule_code
+ return code.upper()
+
+
+cdef bint _quarter_months_conform(str source, str target):
+ snum = MONTH_NUMBERS[source]
+ tnum = MONTH_NUMBERS[target]
+ return snum % 3 == tnum % 3
+
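+# e.g. (sketch) _quarter_months_conform('MAR', 'DEC') -> True (same quarterly
+# cycle), while _quarter_months_conform('JAN', 'DEC') -> False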
+
+cdef bint _is_annual(str rule):
+ rule = rule.upper()
+ return rule == 'A' or rule.startswith('A-')
+
+
+cdef bint _is_quarterly(str rule):
+ rule = rule.upper()
+ return rule == 'Q' or rule.startswith('Q-') or rule.startswith('BQ')
+
+
+cdef bint _is_monthly(str rule):
+ rule = rule.upper()
+ return rule == 'M' or rule == 'BM'
+
+
+cdef bint _is_weekly(str rule):
+ rule = rule.upper()
+ return rule == 'W' or rule.startswith('W-')
+
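+# e.g. (sketch) _is_annual('A-JAN') -> True, _is_quarterly('BQ-MAR') -> True,
+# _is_monthly('BM') -> True, _is_weekly('W-SUN') -> True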
+
+# ----------------------------------------------------------------------
+
+cpdef object get_rule_month(object source, object default='DEC'):
+ """
+ Return starting month of given freq, default is December.
+
+ Parameters
+ ----------
+ source : object
+ default : object (default "DEC")
+
+ Returns
+ -------
+ rule_month: object (usually string)
+
+ Examples
+ --------
+ >>> get_rule_month('D')
+ 'DEC'
+
+ >>> get_rule_month('A-JAN')
+ 'JAN'
+ """
+ if hasattr(source, 'freqstr'):
+ source = source.freqstr
+ source = source.upper()
+ if '-' not in source:
+ return default
+ else:
+ return source.split('-')[1]
diff --git a/contrib/python/pandas/py2/pandas/_libs/tslibs/nattype.pxd b/contrib/python/pandas/py2/pandas/_libs/tslibs/nattype.pxd
new file mode 100644
index 00000000000..dae5bdc3f93
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/_libs/tslibs/nattype.pxd
@@ -0,0 +1,20 @@
+# -*- coding: utf-8 -*-
+
+from cpython.datetime cimport datetime
+
+from numpy cimport int64_t
+cdef int64_t NPY_NAT
+
+cdef bint _nat_scalar_rules[6]
+
+
+cdef class _NaT(datetime):
+ cdef readonly:
+ int64_t value
+ object freq
+
+cdef _NaT c_NaT
+
+
+cdef bint checknull_with_nat(object val)
+cpdef bint is_null_datetimelike(object val, bint inat_is_null=*)
diff --git a/contrib/python/pandas/py2/pandas/_libs/tslibs/nattype.pyx b/contrib/python/pandas/py2/pandas/_libs/tslibs/nattype.pyx
new file mode 100644
index 00000000000..a55d15a7c4e
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/_libs/tslibs/nattype.pyx
@@ -0,0 +1,717 @@
+# -*- coding: utf-8 -*-
+
+from cpython cimport (
+ PyObject_RichCompare,
+ Py_GT, Py_GE, Py_EQ, Py_NE, Py_LT, Py_LE)
+
+from cpython.datetime cimport (datetime,
+ PyDateTime_Check, PyDelta_Check,
+ PyDateTime_IMPORT)
+PyDateTime_IMPORT
+
+import numpy as np
+cimport numpy as cnp
+from numpy cimport int64_t
+cnp.import_array()
+
+from pandas._libs.tslibs.np_datetime cimport (
+ get_datetime64_value, get_timedelta64_value)
+cimport pandas._libs.tslibs.util as util
+from pandas._libs.tslibs.util cimport (
+ get_nat, is_integer_object, is_float_object, is_datetime64_object,
+ is_timedelta64_object)
+
+# ----------------------------------------------------------------------
+# Constants
+nat_strings = {'NaT', 'nat', 'NAT', 'nan', 'NaN', 'NAN'}
+
+cdef int64_t NPY_NAT = get_nat()
+iNaT = NPY_NAT # python-visible constant
+
+cdef bint _nat_scalar_rules[6]
+_nat_scalar_rules[Py_EQ] = False
+_nat_scalar_rules[Py_NE] = True
+_nat_scalar_rules[Py_LT] = False
+_nat_scalar_rules[Py_LE] = False
+_nat_scalar_rules[Py_GT] = False
+_nat_scalar_rules[Py_GE] = False
+
+# ----------------------------------------------------------------------
+
+
+def _make_nan_func(func_name, doc):
+ def f(*args, **kwargs):
+ return np.nan
+ f.__name__ = func_name
+ f.__doc__ = doc
+ return f
+
+
+def _make_nat_func(func_name, doc):
+ def f(*args, **kwargs):
+ return c_NaT
+ f.__name__ = func_name
+ f.__doc__ = doc
+ return f
+
+
+def _make_error_func(func_name, cls):
+ def f(*args, **kwargs):
+ raise ValueError("NaTType does not support " + func_name)
+
+ f.__name__ = func_name
+ if isinstance(cls, str):
+ # passed the literal docstring directly
+ f.__doc__ = cls
+ elif cls is not None:
+ f.__doc__ = getattr(cls, func_name).__doc__
+ return f
+
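+# Sketch of how the factories above are used when filling out NaTType below:
+#   NaT.weekday()   -> nan               (_make_nan_func)
+#   NaT.date()      -> NaT               (_make_nat_func)
+#   NaT.toordinal() -> raises ValueError (_make_error_func)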
+
+cdef _nat_divide_op(self, other):
+ if PyDelta_Check(other) or is_timedelta64_object(other) or other is c_NaT:
+ return np.nan
+ if is_integer_object(other) or is_float_object(other):
+ return c_NaT
+ return NotImplemented
+
+
+cdef _nat_rdivide_op(self, other):
+ if PyDelta_Check(other):
+ return np.nan
+ return NotImplemented
+
+
+def __nat_unpickle(*args):
+ # return constant defined in the module
+ return c_NaT
+
+# ----------------------------------------------------------------------
+
+
+cdef class _NaT(datetime):
+ # cdef readonly:
+ # int64_t value
+ # object freq
+
+ def __hash__(_NaT self):
+ # py3k needs this defined here
+ return hash(self.value)
+
+ def __richcmp__(_NaT self, object other, int op):
+ cdef:
+ int ndim = getattr(other, 'ndim', -1)
+
+ if ndim == -1:
+ return _nat_scalar_rules[op]
+
+ if ndim == 0:
+ if is_datetime64_object(other):
+ return _nat_scalar_rules[op]
+ else:
+ raise TypeError('Cannot compare type %r with type %r' %
+ (type(self).__name__, type(other).__name__))
+ # Note: instead of passing "other, self, _reverse_ops[op]", we observe
+ # that `_nat_scalar_rules` is invariant under `_reverse_ops`,
+ # rendering it unnecessary.
+ return PyObject_RichCompare(other, self, op)
+
+ def __add__(self, other):
+ if PyDateTime_Check(other):
+ return c_NaT
+
+ elif hasattr(other, 'delta'):
+ # Timedelta, offsets.Tick, offsets.Week
+ return c_NaT
+ elif getattr(other, '_typ', None) in ['dateoffset', 'series',
+ 'period', 'datetimeindex',
+ 'timedeltaindex']:
+ # Duplicate logic in _Timestamp.__add__ to avoid needing
+ # to subclass; allows us to @final(_Timestamp.__add__)
+ return NotImplemented
+ return c_NaT
+
+ def __sub__(self, other):
+ # Duplicate some logic from _Timestamp.__sub__ to avoid needing
+ # to subclass; allows us to @final(_Timestamp.__sub__)
+ if PyDateTime_Check(other):
+ return NaT
+ elif PyDelta_Check(other):
+ return NaT
+
+ elif getattr(other, '_typ', None) == 'datetimeindex':
+ # a Timestamp-DatetimeIndex -> yields a negative TimedeltaIndex
+ return -other.__sub__(self)
+
+ elif getattr(other, '_typ', None) == 'timedeltaindex':
+ # a Timestamp-TimedeltaIndex -> yields a negative TimedeltaIndex
+ return (-other).__add__(self)
+
+ elif hasattr(other, 'delta'):
+ # offsets.Tick, offsets.Week
+ neg_other = -other
+ return self + neg_other
+
+ elif getattr(other, '_typ', None) in ['period', 'series',
+ 'periodindex', 'dateoffset']:
+ return NotImplemented
+
+ return NaT
+
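+    # Illustrative arithmetic (sketch): NaT + a timedelta is NaT,
+    # NaT - datetime(2019, 1, 1) is NaT, and NaT * 2 is NaT.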
+ def __pos__(self):
+ return NaT
+
+ def __neg__(self):
+ return NaT
+
+ def __div__(self, other):
+ return _nat_divide_op(self, other)
+
+ def __truediv__(self, other):
+ return _nat_divide_op(self, other)
+
+ def __floordiv__(self, other):
+ return _nat_divide_op(self, other)
+
+ def __mul__(self, other):
+ if is_integer_object(other) or is_float_object(other):
+ return NaT
+ return NotImplemented
+
+ @property
+ def asm8(self):
+ return np.datetime64(NPY_NAT, 'ns')
+
+ def to_datetime64(self):
+ """ Returns a numpy.datetime64 object with 'ns' precision """
+ return np.datetime64('NaT', 'ns')
+
+ def __repr__(self):
+ return 'NaT'
+
+ def __str__(self):
+ return 'NaT'
+
+ def isoformat(self, sep='T'):
+ # This allows Timestamp(ts.isoformat()) to always correctly roundtrip.
+ return 'NaT'
+
+ def __hash__(self):
+ return NPY_NAT
+
+ def __int__(self):
+ return NPY_NAT
+
+ def __long__(self):
+ return NPY_NAT
+
+ def total_seconds(self):
+ """
+ Total duration of timedelta in seconds (to ns precision)
+ """
+ # GH#10939
+ return np.nan
+
+ @property
+ def is_leap_year(self):
+ return False
+
+ @property
+ def is_month_start(self):
+ return False
+
+ @property
+ def is_quarter_start(self):
+ return False
+
+ @property
+ def is_year_start(self):
+ return False
+
+ @property
+ def is_month_end(self):
+ return False
+
+ @property
+ def is_quarter_end(self):
+ return False
+
+ @property
+ def is_year_end(self):
+ return False
+
+
+class NaTType(_NaT):
+ """(N)ot-(A)-(T)ime, the time equivalent of NaN"""
+
+ def __new__(cls):
+ cdef _NaT base
+
+ base = _NaT.__new__(cls, 1, 1, 1)
+ base.value = NPY_NAT
+ base.freq = None
+
+ return base
+
+ def __reduce_ex__(self, protocol):
+ # python 3.6 compat
+ # http://bugs.python.org/issue28730
+ # now __reduce_ex__ is defined and higher priority than __reduce__
+ return self.__reduce__()
+
+ def __reduce__(self):
+ return (__nat_unpickle, (None, ))
+
+ def __rdiv__(self, other):
+ return _nat_rdivide_op(self, other)
+
+ def __rtruediv__(self, other):
+ return _nat_rdivide_op(self, other)
+
+ def __rfloordiv__(self, other):
+ return _nat_rdivide_op(self, other)
+
+ def __rmul__(self, other):
+ if is_integer_object(other) or is_float_object(other):
+ return c_NaT
+ return NotImplemented
+
+ # ----------------------------------------------------------------------
+ # inject the Timestamp field properties
+ # these by definition return np.nan
+
+ year = property(fget=lambda self: np.nan)
+ quarter = property(fget=lambda self: np.nan)
+ month = property(fget=lambda self: np.nan)
+ day = property(fget=lambda self: np.nan)
+ hour = property(fget=lambda self: np.nan)
+ minute = property(fget=lambda self: np.nan)
+ second = property(fget=lambda self: np.nan)
+ millisecond = property(fget=lambda self: np.nan)
+ microsecond = property(fget=lambda self: np.nan)
+ nanosecond = property(fget=lambda self: np.nan)
+
+ week = property(fget=lambda self: np.nan)
+ dayofyear = property(fget=lambda self: np.nan)
+ weekofyear = property(fget=lambda self: np.nan)
+ days_in_month = property(fget=lambda self: np.nan)
+ daysinmonth = property(fget=lambda self: np.nan)
+ dayofweek = property(fget=lambda self: np.nan)
+ weekday_name = property(fget=lambda self: np.nan)
+
+ # inject Timedelta properties
+ days = property(fget=lambda self: np.nan)
+ seconds = property(fget=lambda self: np.nan)
+ microseconds = property(fget=lambda self: np.nan)
+ nanoseconds = property(fget=lambda self: np.nan)
+
+ # inject pd.Period properties
+ qyear = property(fget=lambda self: np.nan)
+
+ # ----------------------------------------------------------------------
+    # GH9513 NaT methods (except to_datetime64) should raise, return np.nan,
+    # or return NaT; create the corresponding functions for binding to NaTType.
+ # These are the ones that can get their docstrings from datetime.
+
+ # nan methods
+ weekday = _make_nan_func('weekday', datetime.weekday.__doc__)
+ isoweekday = _make_nan_func('isoweekday', datetime.isoweekday.__doc__)
+ month_name = _make_nan_func('month_name', # noqa:E128
+ """
+ Return the month name of the Timestamp with specified locale.
+
+ Parameters
+ ----------
+ locale : string, default None (English locale)
+ locale determining the language in which to return the month name
+
+ Returns
+ -------
+ month_name : string
+
+ .. versionadded:: 0.23.0
+ """)
+ day_name = _make_nan_func('day_name', # noqa:E128
+ """
+ Return the day name of the Timestamp with specified locale.
+
+ Parameters
+ ----------
+ locale : string, default None (English locale)
+ locale determining the language in which to return the day name
+
+ Returns
+ -------
+ day_name : string
+
+ .. versionadded:: 0.23.0
+ """)
+ # _nat_methods
+ date = _make_nat_func('date', datetime.date.__doc__)
+
+ utctimetuple = _make_error_func('utctimetuple', datetime)
+ timetz = _make_error_func('timetz', datetime)
+ timetuple = _make_error_func('timetuple', datetime)
+ strptime = _make_error_func('strptime', datetime)
+ strftime = _make_error_func('strftime', datetime)
+ isocalendar = _make_error_func('isocalendar', datetime)
+ dst = _make_error_func('dst', datetime)
+ ctime = _make_error_func('ctime', datetime)
+ time = _make_error_func('time', datetime)
+ toordinal = _make_error_func('toordinal', datetime)
+ tzname = _make_error_func('tzname', datetime)
+ utcoffset = _make_error_func('utcoffset', datetime)
+
+ # ----------------------------------------------------------------------
+ # The remaining methods have docstrings copy/pasted from the analogous
+ # Timestamp methods.
+
+ utcfromtimestamp = _make_error_func('utcfromtimestamp', # noqa:E128
+ """
+ Timestamp.utcfromtimestamp(ts)
+
+ Construct a naive UTC datetime from a POSIX timestamp.
+ """
+ )
+ fromtimestamp = _make_error_func('fromtimestamp', # noqa:E128
+ """
+ Timestamp.fromtimestamp(ts)
+
+ timestamp[, tz] -> tz's local time from POSIX timestamp.
+ """
+ )
+ combine = _make_error_func('combine', # noqa:E128
+ """
+        Timestamp.combine(date, time)
+
+ date, time -> datetime with same date and time fields
+ """
+ )
+ utcnow = _make_error_func('utcnow', # noqa:E128
+ """
+ Timestamp.utcnow()
+
+ Return a new Timestamp representing UTC day and time.
+ """
+ )
+
+ timestamp = _make_error_func('timestamp', # noqa:E128
+ """Return POSIX timestamp as float.""")
+
+    # GH9513 NaT methods (except to_datetime64) should raise, return np.nan,
+    # or return NaT; create the corresponding functions for binding to NaTType.
+ astimezone = _make_error_func('astimezone', # noqa:E128
+ """
+ Convert tz-aware Timestamp to another time zone.
+
+ Parameters
+ ----------
+ tz : str, pytz.timezone, dateutil.tz.tzfile or None
+ Time zone for time which Timestamp will be converted to.
+ None will remove timezone holding UTC time.
+
+ Returns
+ -------
+ converted : Timestamp
+
+ Raises
+ ------
+ TypeError
+ If Timestamp is tz-naive.
+ """)
+ fromordinal = _make_error_func('fromordinal', # noqa:E128
+ """
+ Timestamp.fromordinal(ordinal, freq=None, tz=None)
+
+        Passed an ordinal, translate and convert to a Timestamp.
+        Note: by definition there cannot be any tz info on the ordinal itself.
+
+ Parameters
+ ----------
+ ordinal : int
+ date corresponding to a proleptic Gregorian ordinal
+ freq : str, DateOffset
+ Offset which Timestamp will have
+ tz : str, pytz.timezone, dateutil.tz.tzfile or None
+ Time zone for time which Timestamp will have.
+ """)
+
+ # _nat_methods
+ to_pydatetime = _make_nat_func('to_pydatetime', # noqa:E128
+ """
+ Convert a Timestamp object to a native Python datetime object.
+
+ If warn=True, issue a warning if nanoseconds is nonzero.
+ """)
+
+ now = _make_nat_func('now', # noqa:E128
+ """
+ Timestamp.now(tz=None)
+
+ Returns new Timestamp object representing current time local to
+ tz.
+
+ Parameters
+ ----------
+ tz : str or timezone object, default None
+ Timezone to localize to
+ """)
+ today = _make_nat_func('today', # noqa:E128
+ """
+ Timestamp.today(cls, tz=None)
+
+ Return the current time in the local timezone. This differs
+ from datetime.today() in that it can be localized to a
+ passed timezone.
+
+ Parameters
+ ----------
+ tz : str or timezone object, default None
+ Timezone to localize to
+ """)
+ round = _make_nat_func('round', # noqa:E128
+ """
+ Round the Timestamp to the specified resolution
+
+ Parameters
+ ----------
+ freq : a freq string indicating the rounding resolution
+ ambiguous : bool, 'NaT', default 'raise'
+ - bool contains flags to determine if time is dst or not (note
+ that this flag is only applicable for ambiguous fall dst dates)
+ - 'NaT' will return NaT for an ambiguous time
+ - 'raise' will raise an AmbiguousTimeError for an ambiguous time
+
+ .. versionadded:: 0.24.0
+    nonexistent : 'shift_forward', 'shift_backward', 'NaT', timedelta,
+ default 'raise'
+ A nonexistent time does not exist in a particular timezone
+ where clocks moved forward due to DST.
+
+ - 'shift_forward' will shift the nonexistent time forward to the
+ closest existing time
+ - 'shift_backward' will shift the nonexistent time backward to the
+ closest existing time
+ - 'NaT' will return NaT where there are nonexistent times
+ - timedelta objects will shift nonexistent times by the timedelta
+        - 'raise' will raise a NonExistentTimeError if there are
+ nonexistent times
+
+ .. versionadded:: 0.24.0
+
+ Returns
+ -------
+ a new Timestamp rounded to the given resolution of `freq`
+
+ Raises
+ ------
+ ValueError if the freq cannot be converted
+ """)
+ floor = _make_nat_func('floor', # noqa:E128
+ """
+ return a new Timestamp floored to this resolution
+
+ Parameters
+ ----------
+ freq : a freq string indicating the flooring resolution
+ ambiguous : bool, 'NaT', default 'raise'
+ - bool contains flags to determine if time is dst or not (note
+ that this flag is only applicable for ambiguous fall dst dates)
+ - 'NaT' will return NaT for an ambiguous time
+ - 'raise' will raise an AmbiguousTimeError for an ambiguous time
+
+ .. versionadded:: 0.24.0
+    nonexistent : 'shift_forward', 'shift_backward', 'NaT', timedelta,
+ default 'raise'
+ A nonexistent time does not exist in a particular timezone
+ where clocks moved forward due to DST.
+
+ - 'shift_forward' will shift the nonexistent time forward to the
+ closest existing time
+ - 'shift_backward' will shift the nonexistent time backward to the
+ closest existing time
+ - 'NaT' will return NaT where there are nonexistent times
+ - timedelta objects will shift nonexistent times by the timedelta
+        - 'raise' will raise a NonExistentTimeError if there are
+ nonexistent times
+
+ .. versionadded:: 0.24.0
+
+ Raises
+ ------
+ ValueError if the freq cannot be converted
+ """)
+ ceil = _make_nat_func('ceil', # noqa:E128
+ """
+ return a new Timestamp ceiled to this resolution
+
+ Parameters
+ ----------
+ freq : a freq string indicating the ceiling resolution
+ ambiguous : bool, 'NaT', default 'raise'
+ - bool contains flags to determine if time is dst or not (note
+ that this flag is only applicable for ambiguous fall dst dates)
+ - 'NaT' will return NaT for an ambiguous time
+ - 'raise' will raise an AmbiguousTimeError for an ambiguous time
+
+ .. versionadded:: 0.24.0
+    nonexistent : 'shift_forward', 'shift_backward', 'NaT', timedelta,
+ default 'raise'
+ A nonexistent time does not exist in a particular timezone
+ where clocks moved forward due to DST.
+
+ - 'shift_forward' will shift the nonexistent time forward to the
+ closest existing time
+ - 'shift_backward' will shift the nonexistent time backward to the
+ closest existing time
+ - 'NaT' will return NaT where there are nonexistent times
+ - timedelta objects will shift nonexistent times by the timedelta
+        - 'raise' will raise a NonExistentTimeError if there are
+ nonexistent times
+
+ .. versionadded:: 0.24.0
+
+ Raises
+ ------
+ ValueError if the freq cannot be converted
+ """)
+
+ tz_convert = _make_nat_func('tz_convert', # noqa:E128
+ """
+ Convert tz-aware Timestamp to another time zone.
+
+ Parameters
+ ----------
+ tz : str, pytz.timezone, dateutil.tz.tzfile or None
+ Time zone for time which Timestamp will be converted to.
+ None will remove timezone holding UTC time.
+
+ Returns
+ -------
+ converted : Timestamp
+
+ Raises
+ ------
+ TypeError
+ If Timestamp is tz-naive.
+ """)
+ tz_localize = _make_nat_func('tz_localize', # noqa:E128
+ """
+ Convert naive Timestamp to local time zone, or remove
+ timezone from tz-aware Timestamp.
+
+ Parameters
+ ----------
+ tz : str, pytz.timezone, dateutil.tz.tzfile or None
+ Time zone for time which Timestamp will be converted to.
+ None will remove timezone holding local time.
+
+ ambiguous : bool, 'NaT', default 'raise'
+ When clocks moved backward due to DST, ambiguous times may arise.
+ For example in Central European Time (UTC+01), when going from
+ 03:00 DST to 02:00 non-DST, 02:30:00 local time occurs both at
+ 00:30:00 UTC and at 01:30:00 UTC. In such a situation, the
+ `ambiguous` parameter dictates how ambiguous times should be
+ handled.
+
+ - bool contains flags to determine if time is dst or not (note
+ that this flag is only applicable for ambiguous fall dst dates)
+ - 'NaT' will return NaT for an ambiguous time
+ - 'raise' will raise an AmbiguousTimeError for an ambiguous time
+
+    nonexistent : 'shift_forward', 'shift_backward', 'NaT', timedelta,
+ default 'raise'
+ A nonexistent time does not exist in a particular timezone
+ where clocks moved forward due to DST.
+
+ - 'shift_forward' will shift the nonexistent time forward to the
+ closest existing time
+ - 'shift_backward' will shift the nonexistent time backward to the
+ closest existing time
+ - 'NaT' will return NaT where there are nonexistent times
+ - timedelta objects will shift nonexistent times by the timedelta
+        - 'raise' will raise a NonExistentTimeError if there are
+ nonexistent times
+
+ .. versionadded:: 0.24.0
+
+ errors : 'raise', 'coerce', default None
+ - 'raise' will raise a NonExistentTimeError if a timestamp is not
+ valid in the specified timezone (e.g. due to a transition from
+ or to DST time). Use ``nonexistent='raise'`` instead.
+ - 'coerce' will return NaT if the timestamp can not be converted
+ into the specified timezone. Use ``nonexistent='NaT'`` instead.
+
+ .. deprecated:: 0.24.0
+
+ Returns
+ -------
+ localized : Timestamp
+
+ Raises
+ ------
+ TypeError
+ If the Timestamp is tz-aware and tz is not None.
+ """)
+ replace = _make_nat_func('replace', # noqa:E128
+ """
+ implements datetime.replace, handles nanoseconds
+
+ Parameters
+ ----------
+ year : int, optional
+ month : int, optional
+ day : int, optional
+ hour : int, optional
+ minute : int, optional
+ second : int, optional
+ microsecond : int, optional
+ nanosecond : int, optional
+ tzinfo : tz-convertible, optional
+ fold : int, optional, default is 0
+ added in 3.6, NotImplemented
+
+ Returns
+ -------
+ Timestamp with fields replaced
+ """)
+
+
+c_NaT = NaTType() # C-visible
+NaT = c_NaT # Python-visible
+
+
+# ----------------------------------------------------------------------
+
+cdef inline bint checknull_with_nat(object val):
+ """ utility to check if a value is a nat or not """
+ return val is None or util.is_nan(val) or val is c_NaT
+
+
+cpdef bint is_null_datetimelike(object val, bint inat_is_null=True):
+ """
+ Determine if we have a null for a timedelta/datetime (or integer versions)
+
+ Parameters
+ ----------
+ val : object
+ inat_is_null : bool, default True
+ Whether to treat integer iNaT value as null
+
+ Returns
+ -------
+ null_datetimelike : bool
+ """
+ if val is None:
+ return True
+ elif val is c_NaT:
+ return True
+ elif util.is_float_object(val) or util.is_complex_object(val):
+ return val != val
+ elif util.is_timedelta64_object(val):
+ return get_timedelta64_value(val) == NPY_NAT
+ elif util.is_datetime64_object(val):
+ return get_datetime64_value(val) == NPY_NAT
+ elif inat_is_null and util.is_integer_object(val):
+ return val == NPY_NAT
+ return False
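+# Illustrative checks (sketch):
+#   is_null_datetimelike(None)                     -> True
+#   is_null_datetimelike(np.datetime64('NaT'))     -> True
+#   is_null_datetimelike(iNaT)                     -> True
+#   is_null_datetimelike(iNaT, inat_is_null=False) -> False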
diff --git a/contrib/python/pandas/py2/pandas/_libs/tslibs/np_datetime.pxd b/contrib/python/pandas/py2/pandas/_libs/tslibs/np_datetime.pxd
new file mode 100644
index 00000000000..803c8cb18e3
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/_libs/tslibs/np_datetime.pxd
@@ -0,0 +1,76 @@
+# -*- coding: utf-8 -*-
+
+from cpython.datetime cimport date, datetime
+
+from numpy cimport int64_t, int32_t
+
+cdef extern from "numpy/ndarrayobject.h":
+ ctypedef int64_t npy_timedelta
+ ctypedef int64_t npy_datetime
+
+cdef extern from "numpy/ndarraytypes.h":
+ ctypedef struct PyArray_DatetimeMetaData:
+ NPY_DATETIMEUNIT base
+ int64_t num
+
+cdef extern from "numpy/arrayscalars.h":
+ ctypedef struct PyDatetimeScalarObject:
+ # PyObject_HEAD
+ npy_datetime obval
+ PyArray_DatetimeMetaData obmeta
+
+ ctypedef struct PyTimedeltaScalarObject:
+ # PyObject_HEAD
+ npy_timedelta obval
+ PyArray_DatetimeMetaData obmeta
+
+cdef extern from "numpy/ndarraytypes.h":
+ ctypedef struct npy_datetimestruct:
+ int64_t year
+ int32_t month, day, hour, min, sec, us, ps, as
+
+ ctypedef enum NPY_DATETIMEUNIT:
+ NPY_FR_Y
+ NPY_FR_M
+ NPY_FR_W
+ NPY_FR_D
+ NPY_FR_B
+ NPY_FR_h
+ NPY_FR_m
+ NPY_FR_s
+ NPY_FR_ms
+ NPY_FR_us
+ NPY_FR_ns
+ NPY_FR_ps
+ NPY_FR_fs
+ NPY_FR_as
+
+cdef extern from "src/datetime/np_datetime.h":
+ ctypedef struct pandas_timedeltastruct:
+ int64_t days
+ int32_t hrs, min, sec, ms, us, ns, seconds, microseconds, nanoseconds
+
+ void pandas_datetime_to_datetimestruct(npy_datetime val,
+ NPY_DATETIMEUNIT fr,
+ npy_datetimestruct *result) nogil
+
+
+cdef int reverse_ops[6]
+
+cdef bint cmp_scalar(int64_t lhs, int64_t rhs, int op) except -1
+
+cdef check_dts_bounds(npy_datetimestruct *dts)
+
+cdef int64_t dtstruct_to_dt64(npy_datetimestruct* dts) nogil
+cdef void dt64_to_dtstruct(int64_t dt64, npy_datetimestruct* out) nogil
+cdef void td64_to_tdstruct(int64_t td64, pandas_timedeltastruct* out) nogil
+
+cdef int64_t pydatetime_to_dt64(datetime val, npy_datetimestruct *dts)
+cdef int64_t pydate_to_dt64(date val, npy_datetimestruct *dts)
+
+cdef npy_datetime get_datetime64_value(object obj) nogil
+cdef npy_timedelta get_timedelta64_value(object obj) nogil
+cdef NPY_DATETIMEUNIT get_datetime64_unit(object obj) nogil
+
+cdef int _string_to_dts(object val, npy_datetimestruct* dts,
+ int* out_local, int* out_tzoffset) except? -1
diff --git a/contrib/python/pandas/py2/pandas/_libs/tslibs/np_datetime.pyx b/contrib/python/pandas/py2/pandas/_libs/tslibs/np_datetime.pyx
new file mode 100644
index 00000000000..dbbe9da381f
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/_libs/tslibs/np_datetime.pyx
@@ -0,0 +1,203 @@
+# -*- coding: utf-8 -*-
+
+from cpython cimport (Py_EQ, Py_NE, Py_GE, Py_GT, Py_LT, Py_LE,
+ PyUnicode_AsASCIIString)
+
+from cpython.datetime cimport (datetime, date,
+ PyDateTime_IMPORT,
+ PyDateTime_GET_YEAR, PyDateTime_GET_MONTH,
+ PyDateTime_GET_DAY, PyDateTime_DATE_GET_HOUR,
+ PyDateTime_DATE_GET_MINUTE,
+ PyDateTime_DATE_GET_SECOND,
+ PyDateTime_DATE_GET_MICROSECOND)
+PyDateTime_IMPORT
+
+from numpy cimport int64_t
+
+cdef extern from "src/datetime/np_datetime.h":
+ int cmp_npy_datetimestruct(npy_datetimestruct *a,
+ npy_datetimestruct *b)
+
+ npy_datetime npy_datetimestruct_to_datetime(NPY_DATETIMEUNIT fr,
+ npy_datetimestruct *d) nogil
+
+ void pandas_datetime_to_datetimestruct(npy_datetime val,
+ NPY_DATETIMEUNIT fr,
+ npy_datetimestruct *result) nogil
+
+ void pandas_timedelta_to_timedeltastruct(npy_timedelta val,
+ NPY_DATETIMEUNIT fr,
+ pandas_timedeltastruct *result
+ ) nogil
+
+ npy_datetimestruct _NS_MIN_DTS, _NS_MAX_DTS
+
+cdef extern from "src/datetime/np_datetime_strings.h":
+ int parse_iso_8601_datetime(char *str, int len,
+ npy_datetimestruct *out,
+ int *out_local, int *out_tzoffset)
+
+
+# ----------------------------------------------------------------------
+# numpy object inspection
+
+cdef inline npy_datetime get_datetime64_value(object obj) nogil:
+ """
+ returns the int64 value underlying scalar numpy datetime64 object
+
+ Note that to interpret this as a datetime, the corresponding unit is
+ also needed. That can be found using `get_datetime64_unit`.
+ """
+ return (<PyDatetimeScalarObject*>obj).obval
+
+
+cdef inline npy_timedelta get_timedelta64_value(object obj) nogil:
+ """
+ returns the int64 value underlying scalar numpy timedelta64 object
+ """
+ return (<PyTimedeltaScalarObject*>obj).obval
+
+
+cdef inline NPY_DATETIMEUNIT get_datetime64_unit(object obj) nogil:
+ """
+ returns the unit part of the dtype for a numpy datetime64 object.
+ """
+ return <NPY_DATETIMEUNIT>(<PyDatetimeScalarObject*>obj).obmeta.base
+
+# ----------------------------------------------------------------------
+# Comparison
+
+cdef int reverse_ops[6]
+
+reverse_ops[Py_LT] = Py_GT
+reverse_ops[Py_LE] = Py_GE
+reverse_ops[Py_EQ] = Py_EQ
+reverse_ops[Py_NE] = Py_NE
+reverse_ops[Py_GT] = Py_LT
+reverse_ops[Py_GE] = Py_LE
+
+
+cdef inline bint cmp_scalar(int64_t lhs, int64_t rhs, int op) except -1:
+ """
+ cmp_scalar is a more performant version of PyObject_RichCompare
+ typed for int64_t arguments.
+ """
+ if op == Py_EQ:
+ return lhs == rhs
+ elif op == Py_NE:
+ return lhs != rhs
+ elif op == Py_LT:
+ return lhs < rhs
+ elif op == Py_LE:
+ return lhs <= rhs
+ elif op == Py_GT:
+ return lhs > rhs
+ elif op == Py_GE:
+ return lhs >= rhs
+
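+# e.g. (sketch) cmp_scalar(5, 7, Py_LT) -> True; this mirrors
+# PyObject_RichCompare without boxing the int64 operands.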
+
+class OutOfBoundsDatetime(ValueError):
+ pass
+
+
+cdef inline check_dts_bounds(npy_datetimestruct *dts):
+ """Raises OutOfBoundsDatetime if the given date is outside the range that
+ can be represented by nanosecond-resolution 64-bit integers."""
+ cdef:
+ bint error = False
+
+ if (dts.year <= 1677 and
+ cmp_npy_datetimestruct(dts, &_NS_MIN_DTS) == -1):
+ error = True
+ elif (dts.year >= 2262 and
+ cmp_npy_datetimestruct(dts, &_NS_MAX_DTS) == 1):
+ error = True
+
+ if error:
+ fmt = '%d-%.2d-%.2d %.2d:%.2d:%.2d' % (dts.year, dts.month,
+ dts.day, dts.hour,
+ dts.min, dts.sec)
+ raise OutOfBoundsDatetime(
+ 'Out of bounds nanosecond timestamp: {fmt}'.format(fmt=fmt))
+
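+# e.g. (sketch) a dts for the year 1500 raises
+# OutOfBoundsDatetime('Out of bounds nanosecond timestamp: 1500-01-01 00:00:00')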
+
+# ----------------------------------------------------------------------
+# Conversion
+
+cdef inline int64_t dtstruct_to_dt64(npy_datetimestruct* dts) nogil:
+ """Convenience function to call npy_datetimestruct_to_datetime
+ with the by-far-most-common frequency NPY_FR_ns"""
+ return npy_datetimestruct_to_datetime(NPY_FR_ns, dts)
+
+
+cdef inline void dt64_to_dtstruct(int64_t dt64,
+ npy_datetimestruct* out) nogil:
+ """Convenience function to call pandas_datetime_to_datetimestruct
+ with the by-far-most-common frequency NPY_FR_ns"""
+ pandas_datetime_to_datetimestruct(dt64, NPY_FR_ns, out)
+ return
+
+
+cdef inline void td64_to_tdstruct(int64_t td64,
+ pandas_timedeltastruct* out) nogil:
+ """Convenience function to call pandas_timedelta_to_timedeltastruct
+ with the by-far-most-common frequency NPY_FR_ns"""
+ pandas_timedelta_to_timedeltastruct(td64, NPY_FR_ns, out)
+ return
+
+
+cdef inline int64_t pydatetime_to_dt64(datetime val,
+ npy_datetimestruct *dts):
+ """
+ Note we are assuming that the datetime object is timezone-naive.
+ """
+ dts.year = PyDateTime_GET_YEAR(val)
+ dts.month = PyDateTime_GET_MONTH(val)
+ dts.day = PyDateTime_GET_DAY(val)
+ dts.hour = PyDateTime_DATE_GET_HOUR(val)
+ dts.min = PyDateTime_DATE_GET_MINUTE(val)
+ dts.sec = PyDateTime_DATE_GET_SECOND(val)
+ dts.us = PyDateTime_DATE_GET_MICROSECOND(val)
+ dts.ps = dts.as = 0
+ return dtstruct_to_dt64(dts)
+
+
+cdef inline int64_t pydate_to_dt64(date val, npy_datetimestruct *dts):
+ dts.year = PyDateTime_GET_YEAR(val)
+ dts.month = PyDateTime_GET_MONTH(val)
+ dts.day = PyDateTime_GET_DAY(val)
+ dts.hour = dts.min = dts.sec = dts.us = 0
+ dts.ps = dts.as = 0
+ return dtstruct_to_dt64(dts)
+
+
+cdef inline int _string_to_dts(object val, npy_datetimestruct* dts,
+ int* out_local, int* out_tzoffset) except? -1:
+ cdef:
+ int result
+ char *tmp
+
+ if isinstance(val, unicode):
+ val = PyUnicode_AsASCIIString(val)
+
+ tmp = val
+ result = _cstring_to_dts(tmp, len(val), dts, out_local, out_tzoffset)
+
+ if result == -1:
+ raise ValueError('Unable to parse %s' % str(val))
+ return result
+
+
+cdef inline int _cstring_to_dts(char *val, int length,
+ npy_datetimestruct* dts,
+ int* out_local, int* out_tzoffset) except? -1:
+ # Note: without this "extra layer" between _string_to_dts
+ # and parse_iso_8601_datetime, calling _string_to_dts raises
+ # `SystemError: <class 'str'> returned a result with an error set`
+ # in Python3
+ cdef:
+ int result
+
+ result = parse_iso_8601_datetime(val, length,
+ dts, out_local, out_tzoffset)
+ return result
diff --git a/contrib/python/pandas/py2/pandas/_libs/tslibs/offsets.pxd b/contrib/python/pandas/py2/pandas/_libs/tslibs/offsets.pxd
new file mode 100644
index 00000000000..2829a27b990
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/_libs/tslibs/offsets.pxd
@@ -0,0 +1,3 @@
+# -*- coding: utf-8 -*-
+
+cdef to_offset(object obj)
diff --git a/contrib/python/pandas/py2/pandas/_libs/tslibs/offsets.pyx b/contrib/python/pandas/py2/pandas/_libs/tslibs/offsets.pyx
new file mode 100644
index 00000000000..856aa52f82c
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/_libs/tslibs/offsets.pyx
@@ -0,0 +1,1127 @@
+# -*- coding: utf-8 -*-
+
+import cython
+
+import time
+from cpython.datetime cimport (PyDateTime_IMPORT,
+ PyDateTime_Check,
+ PyDelta_Check,
+ datetime, timedelta,
+ time as dt_time)
+PyDateTime_IMPORT
+
+from dateutil.relativedelta import relativedelta
+
+import numpy as np
+cimport numpy as cnp
+from numpy cimport int64_t
+cnp.import_array()
+
+
+from pandas._libs.tslibs.util cimport is_string_object, is_integer_object
+
+from pandas._libs.tslibs.ccalendar import MONTHS, DAYS
+from pandas._libs.tslibs.ccalendar cimport get_days_in_month, dayofweek
+from pandas._libs.tslibs.conversion cimport (
+ tz_convert_single, pydt_to_i8, localize_pydatetime)
+from pandas._libs.tslibs.nattype cimport NPY_NAT
+from pandas._libs.tslibs.np_datetime cimport (
+ npy_datetimestruct, dtstruct_to_dt64, dt64_to_dtstruct)
+from pandas._libs.tslibs.timezones import UTC
+
+
+PY2 = bytes == str
+
+# ---------------------------------------------------------------------
+# Constants
+
+
+_offset_to_period_map = {
+ 'WEEKDAY': 'D',
+ 'EOM': 'M',
+ 'BM': 'M',
+ 'BQS': 'Q',
+ 'QS': 'Q',
+ 'BQ': 'Q',
+ 'BA': 'A',
+ 'AS': 'A',
+ 'BAS': 'A',
+ 'MS': 'M',
+ 'D': 'D',
+ 'C': 'C',
+ 'B': 'B',
+ 'T': 'T',
+ 'S': 'S',
+ 'L': 'L',
+ 'U': 'U',
+ 'N': 'N',
+ 'H': 'H',
+ 'Q': 'Q',
+ 'A': 'A',
+ 'W': 'W',
+ 'M': 'M',
+ 'Y': 'A',
+ 'BY': 'A',
+ 'YS': 'A',
+ 'BYS': 'A'}
+
+need_suffix = ['QS', 'BQ', 'BQS', 'YS', 'AS', 'BY', 'BA', 'BYS', 'BAS']
+
+for __prefix in need_suffix:
+ for _m in MONTHS:
+ key = '%s-%s' % (__prefix, _m)
+ _offset_to_period_map[key] = _offset_to_period_map[__prefix]
+
+for __prefix in ['A', 'Q']:
+ for _m in MONTHS:
+ _alias = '%s-%s' % (__prefix, _m)
+ _offset_to_period_map[_alias] = _alias
+
+for _d in DAYS:
+ _offset_to_period_map['W-%s' % _d] = 'W-%s' % _d
+
+
+# ---------------------------------------------------------------------
+# Misc Helpers
+
+cdef to_offset(object obj):
+ """
+    Wrap pandas.tseries.frequencies.to_offset to centralize runtime
+    imports.
+ """
+ if isinstance(obj, _BaseOffset):
+ return obj
+ from pandas.tseries.frequencies import to_offset
+ return to_offset(obj)
+
+
+def as_datetime(obj):
+ f = getattr(obj, 'to_pydatetime', None)
+ if f is not None:
+ obj = f()
+ return obj
+
+
+cpdef bint _is_normalized(dt):
+ if (dt.hour != 0 or dt.minute != 0 or dt.second != 0 or
+ dt.microsecond != 0 or getattr(dt, 'nanosecond', 0) != 0):
+ return False
+ return True
+
+
+def apply_index_wraps(func):
+ # Note: normally we would use `@functools.wraps(func)`, but this does
+ # not play nicely with cython class methods
+ def wrapper(self, other):
+ result = func(self, other)
+ if self.normalize:
+ result = result.to_period('D').to_timestamp()
+ return result
+
+ # do @functools.wraps(func) manually since it doesn't work on cdef funcs
+ wrapper.__name__ = func.__name__
+ wrapper.__doc__ = func.__doc__
+ try:
+ wrapper.__module__ = func.__module__
+ except AttributeError:
+ # AttributeError: 'method_descriptor' object has no
+ # attribute '__module__'
+ pass
+ return wrapper
+
+
+cdef _wrap_timedelta_result(result):
+ """
+ Tick operations dispatch to their Timedelta counterparts. Wrap the result
+ of these operations in a Tick if possible.
+
+ Parameters
+ ----------
+ result : object
+
+ Returns
+ -------
+ object
+ """
+ if PyDelta_Check(result):
+ # convert Timedelta back to a Tick
+ from pandas.tseries.offsets import _delta_to_tick
+ return _delta_to_tick(result)
+
+ return result
+
+# ---------------------------------------------------------------------
+# Business Helpers
+
+cpdef int get_lastbday(int year, int month) nogil:
+ """
+ Find the last day of the month that is a business day.
+
+ Parameters
+ ----------
+ year : int
+ month : int
+
+ Returns
+ -------
+ last_bday : int
+ """
+ cdef:
+ int wkday, days_in_month
+
+ wkday = dayofweek(year, month, 1)
+ days_in_month = get_days_in_month(year, month)
+ return days_in_month - max(((wkday + days_in_month - 1) % 7) - 4, 0)
+
+
+cpdef int get_firstbday(int year, int month) nogil:
+ """
+ Find the first day of the month that is a business day.
+
+ Parameters
+ ----------
+ year : int
+ month : int
+
+ Returns
+ -------
+ first_bday : int
+ """
+ cdef:
+ int first, wkday
+
+ wkday = dayofweek(year, month, 1)
+ first = 1
+ if wkday == 5: # on Saturday
+ first = 3
+ elif wkday == 6: # on Sunday
+ first = 2
+ return first
+
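+# Example (sketch): November 2017 begins on a Wednesday, so
+#   get_firstbday(2017, 11) -> 1
+#   get_lastbday(2017, 11)  -> 30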
+
+def _get_calendar(weekmask, holidays, calendar):
+ """Generate busdaycalendar"""
+ if isinstance(calendar, np.busdaycalendar):
+ if not holidays:
+ holidays = tuple(calendar.holidays)
+ elif not isinstance(holidays, tuple):
+ holidays = tuple(holidays)
+ else:
+ # trust that calendar.holidays and holidays are
+ # consistent
+ pass
+ return calendar, holidays
+
+ if holidays is None:
+ holidays = []
+ try:
+ holidays = holidays + calendar.holidays().tolist()
+ except AttributeError:
+ pass
+ holidays = [_to_dt64(dt, dtype='datetime64[D]') for dt in holidays]
+ holidays = tuple(sorted(holidays))
+
+ kwargs = {'weekmask': weekmask}
+ if holidays:
+ kwargs['holidays'] = holidays
+
+ busdaycalendar = np.busdaycalendar(**kwargs)
+ return busdaycalendar, holidays
+
+
+def _to_dt64(dt, dtype='datetime64'):
+ # Currently
+ # > np.datetime64(dt.datetime(2013,5,1),dtype='datetime64[D]')
+ # numpy.datetime64('2013-05-01T02:00:00.000000+0200')
+ # Thus astype is needed to cast datetime to datetime64[D]
+ if getattr(dt, 'tzinfo', None) is not None:
+ i8 = pydt_to_i8(dt)
+ dt = tz_convert_single(i8, UTC, dt.tzinfo)
+ dt = np.int64(dt).astype('datetime64[ns]')
+ else:
+ dt = np.datetime64(dt)
+ if dt.dtype.name != dtype:
+ dt = dt.astype(dtype)
+ return dt
+
+
+# ---------------------------------------------------------------------
+# Validation
+
+
+def _validate_business_time(t_input):
+ if is_string_object(t_input):
+ try:
+ t = time.strptime(t_input, '%H:%M')
+ return dt_time(hour=t.tm_hour, minute=t.tm_min)
+ except ValueError:
+ raise ValueError("time data must match '%H:%M' format")
+ elif isinstance(t_input, dt_time):
+ if t_input.second != 0 or t_input.microsecond != 0:
+ raise ValueError(
+ "time data must be specified only with hour and minute")
+ return t_input
+ else:
+ raise ValueError("time data must be string or datetime.time")
+
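+# Example (sketch): _validate_business_time('09:30') -> datetime.time(9, 30);
+# a time with a nonzero second or microsecond component raises ValueError.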
+
+# ---------------------------------------------------------------------
+# Constructor Helpers
+
+relativedelta_kwds = {'years', 'months', 'weeks', 'days', 'year', 'month',
+ 'day', 'weekday', 'hour', 'minute', 'second',
+ 'microsecond', 'nanosecond', 'nanoseconds', 'hours',
+ 'minutes', 'seconds', 'microseconds'}
+
+
+def _determine_offset(kwds):
+    # timedelta is used for sub-daily plural offsets and all singular
+    # offsets; relativedelta is used for plural offsets of daily length or
+    # more; nanosecond(s) are handled by apply_wraps
+ kwds_no_nanos = dict(
+ (k, v) for k, v in kwds.items()
+ if k not in ('nanosecond', 'nanoseconds')
+ )
+ # TODO: Are nanosecond and nanoseconds allowed somewhere?
+
+ _kwds_use_relativedelta = ('years', 'months', 'weeks', 'days',
+ 'year', 'month', 'week', 'day', 'weekday',
+ 'hour', 'minute', 'second', 'microsecond')
+
+ use_relativedelta = False
+ if len(kwds_no_nanos) > 0:
+ if any(k in _kwds_use_relativedelta for k in kwds_no_nanos):
+ offset = relativedelta(**kwds_no_nanos)
+ use_relativedelta = True
+ else:
+ # sub-daily offset - use timedelta (tz-aware)
+ offset = timedelta(**kwds_no_nanos)
+ else:
+ offset = timedelta(1)
+ return offset, use_relativedelta
+
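+# Example (sketch):
+#   _determine_offset({'months': 2}) -> (relativedelta(months=+2), True)
+#   _determine_offset({'hours': 5})  -> (timedelta(hours=5), False)
+#   _determine_offset({})            -> (timedelta(1), False)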
+
+# ---------------------------------------------------------------------
+# Mixins & Singletons
+
+
+class ApplyTypeError(TypeError):
+ # sentinel class for catching the apply error to return NotImplemented
+ pass
+
+
+# ---------------------------------------------------------------------
+# Base Classes
+
+class _BaseOffset(object):
+ """
+ Base class for DateOffset methods that are not overridden by subclasses
+ and will (after pickle errors are resolved) go into a cdef class.
+ """
+ _typ = "dateoffset"
+ _day_opt = None
+ _attributes = frozenset(['n', 'normalize'])
+
+ def __init__(self, n=1, normalize=False):
+ n = self._validate_n(n)
+ object.__setattr__(self, "n", n)
+ object.__setattr__(self, "normalize", normalize)
+ object.__setattr__(self, "_cache", {})
+
+ def __setattr__(self, name, value):
+ raise AttributeError("DateOffset objects are immutable.")
+
+ def __eq__(self, other):
+ if is_string_object(other):
+ try:
+ # GH#23524 if to_offset fails, we are dealing with an
+ # incomparable type so == is False and != is True
+ other = to_offset(other)
+ except ValueError:
+ # e.g. "infer"
+ return False
+ try:
+ return self._params == other._params
+ except AttributeError:
+ # other is not a DateOffset object
+ return False
+
+ def __ne__(self, other):
+ return not self == other
+
+ def __hash__(self):
+ return hash(self._params)
+
+ @property
+ def _params(self):
+ """
+ Returns a tuple containing all of the attributes needed to evaluate
+ equality between two DateOffset objects.
+ """
+ # NB: non-cython subclasses override property with cache_readonly
+ all_paras = self.__dict__.copy()
+ if 'holidays' in all_paras and not all_paras['holidays']:
+ all_paras.pop('holidays')
+ exclude = ['kwds', 'name', 'calendar']
+ attrs = [(k, v) for k, v in all_paras.items()
+ if (k not in exclude) and (k[0] != '_')]
+ attrs = sorted(set(attrs))
+ params = tuple([str(self.__class__)] + attrs)
+ return params
+
+ @property
+ def kwds(self):
+ # for backwards-compatibility
+ kwds = {name: getattr(self, name, None) for name in self._attributes
+ if name not in ['n', 'normalize']}
+ return {name: kwds[name] for name in kwds if kwds[name] is not None}
+
+ @property
+ def base(self):
+ """
+ Returns a copy of the calling offset object with n=1 and all other
+ attributes equal.
+ """
+ return type(self)(n=1, normalize=self.normalize, **self.kwds)
+
+ def __add__(self, other):
+ if getattr(other, "_typ", None) in ["datetimeindex", "periodindex",
+ "datetimearray", "periodarray",
+ "series", "period", "dataframe"]:
+ # defer to the other class's implementation
+ return other + self
+ try:
+ return self.apply(other)
+ except ApplyTypeError:
+ return NotImplemented
+
+ def __sub__(self, other):
+ if PyDateTime_Check(other):
+ raise TypeError('Cannot subtract datetime from offset.')
+ elif type(other) == type(self):
+ return type(self)(self.n - other.n, normalize=self.normalize,
+ **self.kwds)
+ else: # pragma: no cover
+ return NotImplemented
+
+ def __call__(self, other):
+ return self.apply(other)
+
+ def __mul__(self, other):
+ return type(self)(n=other * self.n, normalize=self.normalize,
+ **self.kwds)
+
+ def __neg__(self):
+ # Note: we are deferring directly to __mul__ instead of __rmul__, as
+ # that allows us to use methods that can go in a `cdef class`
+ return self * -1
+
+ def copy(self):
+ # Note: we are deferring directly to __mul__ instead of __rmul__, as
+ # that allows us to use methods that can go in a `cdef class`
+ return self * 1
+
+ def __repr__(self):
+ className = getattr(self, '_outputName', type(self).__name__)
+
+ if abs(self.n) != 1:
+ plural = 's'
+ else:
+ plural = ''
+
+ n_str = ""
+ if self.n != 1:
+ n_str = "%s * " % self.n
+
+ out = '<%s' % n_str + className + plural + self._repr_attrs() + '>'
+ return out
+
+ def _get_offset_day(self, datetime other):
+ # subclass must implement `_day_opt`; calling from the base class
+ # will raise NotImplementedError.
+ return get_day_of_month(other, self._day_opt)
+
+ def _validate_n(self, n):
+ """
+ Require that `n` be a nonzero integer.
+
+ Parameters
+ ----------
+ n : int
+
+ Returns
+ -------
+ nint : int
+
+ Raises
+ ------
+ TypeError if `int(n)` raises
+ ValueError if n != int(n)
+ """
+ try:
+ nint = int(n)
+ except (ValueError, TypeError):
+ raise TypeError('`n` argument must be an integer, '
+ 'got {ntype}'.format(ntype=type(n)))
+ if n != nint:
+ raise ValueError('`n` argument must be an integer, '
+ 'got {n}'.format(n=n))
+ return nint
+
+ def __setstate__(self, state):
+ """Reconstruct an instance from a pickled state"""
+ if 'offset' in state:
+ # Older (<0.22.0) versions have offset attribute instead of _offset
+ if '_offset' in state: # pragma: no cover
+ raise AssertionError('Unexpected key `_offset`')
+ state['_offset'] = state.pop('offset')
+ state['kwds']['offset'] = state['_offset']
+
+ if '_offset' in state and not isinstance(state['_offset'], timedelta):
+ # relativedelta, we need to populate using its kwds
+ offset = state['_offset']
+ odict = offset.__dict__
+ kwds = {key: odict[key] for key in odict if odict[key]}
+ state.update(kwds)
+
+ if '_cache' not in state:
+ state['_cache'] = {}
+
+ self.__dict__.update(state)
+
+ if 'weekmask' in state and 'holidays' in state:
+ calendar, holidays = _get_calendar(weekmask=self.weekmask,
+ holidays=self.holidays,
+ calendar=None)
+ object.__setattr__(self, "calendar", calendar)
+ object.__setattr__(self, "holidays", holidays)
+
+ def __getstate__(self):
+ """Return a pickleable state"""
+ state = self.__dict__.copy()
+
+ # we don't want to actually pickle the calendar object
+        # as it's a np.busdaycalendar; we recreate it on deserialization
+ if 'calendar' in state:
+ del state['calendar']
+ try:
+ state['kwds'].pop('calendar')
+ except KeyError:
+ pass
+
+ return state
+
+
+class BaseOffset(_BaseOffset):
+ # Here we add __rfoo__ methods that don't play well with cdef classes
+ def __rmul__(self, other):
+ return self.__mul__(other)
+
+ def __radd__(self, other):
+ return self.__add__(other)
+
+ def __rsub__(self, other):
+ if getattr(other, '_typ', None) in ['datetimeindex', 'series']:
+ # i.e. isinstance(other, (ABCDatetimeIndex, ABCSeries))
+ return other - self
+ return -self + other
+
+
+class _Tick(object):
+ """
+ dummy class to mix into tseries.offsets.Tick so that in tslibs.period we
+ can do isinstance checks on _Tick and avoid importing tseries.offsets
+ """
+
+ def __truediv__(self, other):
+ result = self.delta.__truediv__(other)
+ return _wrap_timedelta_result(result)
+
+ if PY2:
+ __div__ = __truediv__
+
+
+# ----------------------------------------------------------------------
+# RelativeDelta Arithmetic
+
+def shift_day(other: datetime, days: int) -> datetime:
+ """
+ Increment the datetime `other` by the given number of days, retaining
+ the time-portion of the datetime. For tz-naive datetimes this is
+ equivalent to adding a timedelta. For tz-aware datetimes it is similar to
+ dateutil's relativedelta.__add__, but handles pytz tzinfo objects.
+
+ Parameters
+ ----------
+ other : datetime or Timestamp
+ days : int
+
+ Returns
+ -------
+ shifted: datetime or Timestamp
+ """
+ if other.tzinfo is None:
+ return other + timedelta(days=days)
+
+ tz = other.tzinfo
+ naive = other.replace(tzinfo=None)
+ shifted = naive + timedelta(days=days)
+ return localize_pydatetime(shifted, tz)
+
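+# Example (sketch, tz-naive input): shift_day(datetime(2019, 3, 9, 12), 1)
+# -> datetime(2019, 3, 10, 12, 0); for tz-aware inputs the wall time is kept
+# across the shift.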
+
+cdef inline int year_add_months(npy_datetimestruct dts, int months) nogil:
+ """new year number after shifting npy_datetimestruct number of months"""
+ return dts.year + (dts.month + months - 1) / 12
+
+
+cdef inline int month_add_months(npy_datetimestruct dts, int months) nogil:
+ """
+ New month number after shifting npy_datetimestruct
+ number of months.
+ """
+ cdef:
+ int new_month = (dts.month + months) % 12
+ return 12 if new_month == 0 else new_month
+
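+# e.g. (sketch) with dts.year = 2017 and dts.month = 11:
+#   year_add_months(dts, 3)  -> 2018
+#   month_add_months(dts, 3) -> 2     (2017-11 shifted by 3 months is 2018-02)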
+
+def shift_quarters(int64_t[:] dtindex, int quarters,
+ int q1start_month, object day, int modby=3):
+ """
+ Given an int64 array representing nanosecond timestamps, shift all elements
+ by the specified number of quarters using DateOffset semantics.
+
+ Parameters
+ ----------
+ dtindex : int64_t[:] timestamps for input dates
+ quarters : int number of quarters to shift
+ q1start_month : int month in which Q1 begins by convention
+ day : {'start', 'end', 'business_start', 'business_end'}
+ modby : int (3 for quarters, 12 for years)
+
+ Returns
+ -------
+ out : ndarray[int64_t]
+ """
+ cdef:
+ Py_ssize_t i
+ npy_datetimestruct dts
+ int count = len(dtindex)
+ int months_to_roll, months_since, n, compare_day
+ bint roll_check
+ int64_t[:] out = np.empty(count, dtype='int64')
+
+ if day == 'start':
+ with nogil:
+ for i in range(count):
+ if dtindex[i] == NPY_NAT:
+ out[i] = NPY_NAT
+ continue
+
+ dt64_to_dtstruct(dtindex[i], &dts)
+ n = quarters
+
+ months_since = (dts.month - q1start_month) % modby
+
+ # offset semantics - if on the anchor point and going backwards
+ # shift to next
+ if n <= 0 and (months_since != 0 or
+ (months_since == 0 and dts.day > 1)):
+ n += 1
+
+ dts.year = year_add_months(dts, modby * n - months_since)
+ dts.month = month_add_months(dts, modby * n - months_since)
+ dts.day = 1
+
+ out[i] = dtstruct_to_dt64(&dts)
+
+ elif day == 'end':
+ with nogil:
+ for i in range(count):
+ if dtindex[i] == NPY_NAT:
+ out[i] = NPY_NAT
+ continue
+
+ dt64_to_dtstruct(dtindex[i], &dts)
+ n = quarters
+
+ months_since = (dts.month - q1start_month) % modby
+
+ if n <= 0 and months_since != 0:
+ # The general case of this condition would be
+ # `months_since != 0 or (months_since == 0 and
+ # dts.day > get_days_in_month(dts.year, dts.month))`
+ # but the get_days_in_month inequality would never hold.
+ n += 1
+ elif n > 0 and (months_since == 0 and
+ dts.day < get_days_in_month(dts.year,
+ dts.month)):
+ n -= 1
+
+ dts.year = year_add_months(dts, modby * n - months_since)
+ dts.month = month_add_months(dts, modby * n - months_since)
+ dts.day = get_days_in_month(dts.year, dts.month)
+
+ out[i] = dtstruct_to_dt64(&dts)
+
+ elif day == 'business_start':
+ with nogil:
+ for i in range(count):
+ if dtindex[i] == NPY_NAT:
+ out[i] = NPY_NAT
+ continue
+
+ dt64_to_dtstruct(dtindex[i], &dts)
+ n = quarters
+
+ months_since = (dts.month - q1start_month) % modby
+ # compare_day is only relevant for comparison in the case
+ # where months_since == 0.
+ compare_day = get_firstbday(dts.year, dts.month)
+
+ if n <= 0 and (months_since != 0 or
+ (months_since == 0 and dts.day > compare_day)):
+ # make sure to roll forward, so negate
+ n += 1
+ elif n > 0 and (months_since == 0 and dts.day < compare_day):
+ # pretend to roll back if on same month but
+ # before compare_day
+ n -= 1
+
+ dts.year = year_add_months(dts, modby * n - months_since)
+ dts.month = month_add_months(dts, modby * n - months_since)
+
+ dts.day = get_firstbday(dts.year, dts.month)
+
+ out[i] = dtstruct_to_dt64(&dts)
+
+ elif day == 'business_end':
+ with nogil:
+ for i in range(count):
+ if dtindex[i] == NPY_NAT:
+ out[i] = NPY_NAT
+ continue
+
+ dt64_to_dtstruct(dtindex[i], &dts)
+ n = quarters
+
+ months_since = (dts.month - q1start_month) % modby
+ # compare_day is only relevant for comparison in the case
+ # where months_since == 0.
+ compare_day = get_lastbday(dts.year, dts.month)
+
+ if n <= 0 and (months_since != 0 or
+ (months_since == 0 and dts.day > compare_day)):
+ # make sure to roll forward, so negate
+ n += 1
+ elif n > 0 and (months_since == 0 and dts.day < compare_day):
+ # pretend to roll back if on same month but
+ # before compare_day
+ n -= 1
+
+ dts.year = year_add_months(dts, modby * n - months_since)
+ dts.month = month_add_months(dts, modby * n - months_since)
+
+ dts.day = get_lastbday(dts.year, dts.month)
+
+ out[i] = dtstruct_to_dt64(&dts)
+
+ else:
+ raise ValueError("day must be None, 'start', 'end', "
+ "'business_start', or 'business_end'")
+
+ return np.asarray(out)
+
+
+def shift_months(int64_t[:] dtindex, int months, object day=None):
+ """
+    Given an int64-based datetime index, shift all elements by the
+    specified number of months using DateOffset semantics.
+
+ day: {None, 'start', 'end'}
+ * None: day of month
+ * 'start' 1st day of month
+ * 'end' last day of month
+ """
+ cdef:
+ Py_ssize_t i
+ npy_datetimestruct dts
+ int count = len(dtindex)
+ int months_to_roll
+ bint roll_check
+ int64_t[:] out = np.empty(count, dtype='int64')
+
+ if day is None:
+ with nogil:
+ for i in range(count):
+ if dtindex[i] == NPY_NAT:
+ out[i] = NPY_NAT
+ continue
+
+ dt64_to_dtstruct(dtindex[i], &dts)
+ dts.year = year_add_months(dts, months)
+ dts.month = month_add_months(dts, months)
+
+ dts.day = min(dts.day, get_days_in_month(dts.year, dts.month))
+ out[i] = dtstruct_to_dt64(&dts)
+ elif day == 'start':
+ roll_check = False
+ if months <= 0:
+ months += 1
+ roll_check = True
+ with nogil:
+ for i in range(count):
+ if dtindex[i] == NPY_NAT:
+ out[i] = NPY_NAT
+ continue
+
+ dt64_to_dtstruct(dtindex[i], &dts)
+ months_to_roll = months
+
+ # offset semantics - if on the anchor point and going backwards
+ # shift to next
+ if roll_check and dts.day == 1:
+ months_to_roll -= 1
+
+ dts.year = year_add_months(dts, months_to_roll)
+ dts.month = month_add_months(dts, months_to_roll)
+ dts.day = 1
+
+ out[i] = dtstruct_to_dt64(&dts)
+ elif day == 'end':
+ roll_check = False
+ if months > 0:
+ months -= 1
+ roll_check = True
+ with nogil:
+ for i in range(count):
+ if dtindex[i] == NPY_NAT:
+ out[i] = NPY_NAT
+ continue
+
+ dt64_to_dtstruct(dtindex[i], &dts)
+ months_to_roll = months
+
+ # similar semantics - when adding shift forward by one
+ # month if already at an end of month
+ if roll_check and dts.day == get_days_in_month(dts.year,
+ dts.month):
+ months_to_roll += 1
+
+ dts.year = year_add_months(dts, months_to_roll)
+ dts.month = month_add_months(dts, months_to_roll)
+
+ dts.day = get_days_in_month(dts.year, dts.month)
+ out[i] = dtstruct_to_dt64(&dts)
+
+ elif day == 'business_start':
+ with nogil:
+ for i in range(count):
+ if dtindex[i] == NPY_NAT:
+ out[i] = NPY_NAT
+ continue
+
+ dt64_to_dtstruct(dtindex[i], &dts)
+ months_to_roll = months
+ compare_day = get_firstbday(dts.year, dts.month)
+
+ months_to_roll = roll_convention(dts.day, months_to_roll,
+ compare_day)
+
+ dts.year = year_add_months(dts, months_to_roll)
+ dts.month = month_add_months(dts, months_to_roll)
+
+ dts.day = get_firstbday(dts.year, dts.month)
+ out[i] = dtstruct_to_dt64(&dts)
+
+ elif day == 'business_end':
+ with nogil:
+ for i in range(count):
+ if dtindex[i] == NPY_NAT:
+ out[i] = NPY_NAT
+ continue
+
+ dt64_to_dtstruct(dtindex[i], &dts)
+ months_to_roll = months
+ compare_day = get_lastbday(dts.year, dts.month)
+
+ months_to_roll = roll_convention(dts.day, months_to_roll,
+ compare_day)
+
+ dts.year = year_add_months(dts, months_to_roll)
+ dts.month = month_add_months(dts, months_to_roll)
+
+ dts.day = get_lastbday(dts.year, dts.month)
+ out[i] = dtstruct_to_dt64(&dts)
+
+ else:
+ raise ValueError("day must be None, 'start', 'end', "
+ "'business_start', or 'business_end'")
+
+ return np.asarray(out)
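+
+# Illustrative sketch of the day=None branch above, assuming a datetime64[ns]
+# array viewed as i8 (expected values derived from the clipping logic, not a
+# doctest):
+#   >>> arr = np.array(['2017-01-31', '2017-03-15'], dtype='datetime64[ns]')
+#   >>> shift_months(arr.view('i8'), 1).view('M8[ns]')
+#   expected: ['2017-02-28', '2017-04-15']  (day clipped to the new month)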
+
+
+def shift_month(stamp: datetime, months: int,
+ day_opt: object=None) -> datetime:
+ """
+ Given a datetime (or Timestamp) `stamp`, an integer `months` and an
+ option `day_opt`, return a new datetimelike that many months later,
+ with day determined by `day_opt` using relativedelta semantics.
+
+ Scalar analogue of shift_months
+
+ Parameters
+ ----------
+ stamp : datetime or Timestamp
+ months : int
+ day_opt : None, 'start', 'end', 'business_start', 'business_end', or int
+ None: returned datetimelike has the same day as the input, or the
+ last day of the month if the new month is too short
+ 'start': returned datetimelike has day=1
+ 'end': returned datetimelike has day on the last day of the month
+ 'business_start': returned datetimelike has day on the first
+ business day of the month
+ 'business_end': returned datetimelike has day on the last
+ business day of the month
+ int: returned datetimelike has day equal to day_opt
+
+ Returns
+ -------
+ shifted : datetime or Timestamp (same as input `stamp`)
+ """
+ cdef:
+ int year, month, day
+ int days_in_month, dy
+
+ dy = (stamp.month + months) // 12
+ month = (stamp.month + months) % 12
+
+ if month == 0:
+ month = 12
+ dy -= 1
+ year = stamp.year + dy
+
+ if day_opt is None:
+ days_in_month = get_days_in_month(year, month)
+ day = min(stamp.day, days_in_month)
+ elif day_opt == 'start':
+ day = 1
+ elif day_opt == 'end':
+ day = get_days_in_month(year, month)
+ elif day_opt == 'business_start':
+ # first business day of month
+ day = get_firstbday(year, month)
+ elif day_opt == 'business_end':
+ # last business day of month
+ day = get_lastbday(year, month)
+ elif is_integer_object(day_opt):
+ days_in_month = get_days_in_month(year, month)
+ day = min(day_opt, days_in_month)
+ else:
+ raise ValueError(day_opt)
+ return stamp.replace(year=year, month=month, day=day)
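+
+# A couple of illustrative doctest-style cases for the options above
+# (expected values; `datetime` is the stdlib datetime.datetime):
+#   >>> shift_month(datetime(2017, 1, 31), 1)          # day clipped
+#   datetime.datetime(2017, 2, 28, 0, 0)
+#   >>> shift_month(datetime(2017, 1, 15), -1, 'end')  # prior month end
+#   datetime.datetime(2016, 12, 31, 0, 0)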
+
+
+cpdef int get_day_of_month(datetime other, day_opt) except? -1:
+ """
+ Find the day in `other`'s month that satisfies a DateOffset's onOffset
+ policy, as described by the `day_opt` argument.
+
+ Parameters
+ ----------
+ other : datetime or Timestamp
+ day_opt : 'start', 'end', 'business_start', 'business_end', or int
+ 'start': returns 1
+ 'end': returns last day of the month
+ 'business_start': returns the first business day of the month
+ 'business_end': returns the last business day of the month
+ int: returns the given day of `other`'s month, or the last day of
+ the month if the value exceeds that month's number of days.
+
+ Returns
+ -------
+ day_of_month : int
+
+ Examples
+ --------
+ >>> other = datetime(2017, 11, 14)
+ >>> get_day_of_month(other, 'start')
+ 1
+ >>> get_day_of_month(other, 'end')
+ 30
+
+ """
+ cdef:
+ int days_in_month
+
+ if day_opt == 'start':
+ return 1
+ elif day_opt == 'end':
+ days_in_month = get_days_in_month(other.year, other.month)
+ return days_in_month
+ elif day_opt == 'business_start':
+ # first business day of month
+ return get_firstbday(other.year, other.month)
+ elif day_opt == 'business_end':
+ # last business day of month
+ return get_lastbday(other.year, other.month)
+ elif is_integer_object(day_opt):
+ days_in_month = get_days_in_month(other.year, other.month)
+ return min(day_opt, days_in_month)
+ elif day_opt is None:
+ # Note: unlike `shift_month`, get_day_of_month does not
+ # allow day_opt = None
+ raise NotImplementedError
+ else:
+ raise ValueError(day_opt)
+
+
+cpdef int roll_convention(int other, int n, int compare) nogil:
+ """
+ Possibly increment or decrement the number of periods to shift
+ based on rollforward/rollbackward conventions.
+
+ Parameters
+ ----------
+ other : int, generally the day component of a datetime
+ n : number of periods to increment, before adjusting for rolling
+ compare : int, generally the day component of a datetime, in the same
+ month as the datetime from which `other` was taken.
+
+ Returns
+ -------
+ n : int number of periods to increment
+ """
+ if n > 0 and other < compare:
+ n -= 1
+ elif n <= 0 and other > compare:
+ # as if rolled forward already
+ n += 1
+ return n
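+
+# Illustrative expected values for the convention above: a positive shift
+# from a day before the anchor gives up one period, and a non-positive
+# shift from a day after the anchor gains one.
+#   >>> roll_convention(15, 1, 31)
+#   0
+#   >>> roll_convention(20, 0, 15)
+#   1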
+
+
+def roll_qtrday(other: datetime, n: int, month: int,
+ day_opt: object, modby: int=3) -> int:
+ """
+ Possibly increment or decrement the number of periods to shift
+ based on rollforward/rollbackward conventions.
+
+ Parameters
+ ----------
+ other : datetime or Timestamp
+ n : number of periods to increment, before adjusting for rolling
+ month : int reference month giving the first month of the year
+ day_opt : 'start', 'end', 'business_start', 'business_end', or int
+ The convention to use in finding the day in a given month against
+ which to compare for rollforward/rollbackward decisions.
+ modby : int 3 for quarters, 12 for years
+
+ Returns
+ -------
+ n : int number of periods to increment
+
+ See Also
+ --------
+ get_day_of_month : Find the day in a month provided an offset.
+ """
+ cdef:
+ int months_since
+ # TODO: Merge this with roll_yearday by setting modby=12 there?
+ # code de-duplication versus perf hit?
+ # TODO: with small adjustments this could be used in shift_quarters
+ months_since = other.month % modby - month % modby
+
+ if n > 0:
+ if months_since < 0 or (months_since == 0 and
+ other.day < get_day_of_month(other,
+ day_opt)):
+ # pretend to roll back if on same month but
+ # before compare_day
+ n -= 1
+ else:
+ if months_since > 0 or (months_since == 0 and
+ other.day > get_day_of_month(other,
+ day_opt)):
+ # make sure to roll forward, so negate
+ n += 1
+ return n
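+
+# Illustrative expected values: with month=3 (anchor months 3/6/9/12 for
+# modby=3) and day_opt='end', 2017-06-10 falls in an anchor month but before
+# that month's last day, so a positive n is reduced by one while a
+# non-positive n is unchanged.
+#   >>> roll_qtrday(datetime(2017, 6, 10), 1, 3, 'end')
+#   0
+#   >>> roll_qtrday(datetime(2017, 6, 10), -2, 3, 'end')
+#   -2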
+
+
+def roll_yearday(other: datetime, n: int, month: int, day_opt: object) -> int:
+ """
+ Possibly increment or decrement the number of periods to shift
+ based on rollforward/rollbackward conventions.
+
+ Parameters
+ ----------
+ other : datetime or Timestamp
+ n : number of periods to increment, before adjusting for rolling
+ month : reference month giving the first month of the year
+ day_opt : 'start', 'end', 'business_start', 'business_end', or int
+ The day of the month to compare against that of `other` when
+ incrementing or decrementing the number of periods:
+
+ 'start': 1
+ 'end': last day of the month
+ 'business_start': first business day of the month
+ 'business_end': last business day of the month
+ int: the given day of `other`'s month, or the last day of the
+ month if the value exceeds that month's number of days.
+
+ Returns
+ -------
+ n : int number of periods to increment
+
+ Notes
+ -----
+ * Mirrors `roll_check` in shift_months
+
+ Examples
+ --------
+ >>> month = 3
+ >>> day_opt = 'start' # `other` will be compared to March 1
+ >>> other = datetime(2017, 2, 10) # before March 1
+ >>> roll_yearday(other, 2, month, day_opt)
+ 1
+ >>> roll_yearday(other, -7, month, day_opt)
+ -7
+ >>>
+ >>> other = Timestamp('2014-03-15', tz='US/Eastern') # after March 1
+ >>> roll_yearday(other, 2, month, day_opt)
+ 2
+ >>> roll_yearday(other, -7, month, day_opt)
+ -6
+
+ >>> month = 6
+ >>> day_opt = 'end' # `other` will be compared to June 30
+ >>> other = datetime(1999, 6, 29) # before June 30
+ >>> roll_yearday(other, 5, month, day_opt)
+ 4
+ >>> roll_yearday(other, -7, month, day_opt)
+ -7
+ >>>
+ >>> other = Timestamp(2072, 8, 24, 6, 17, 18) # after June 30
+ >>> roll_yearday(other, 5, month, day_opt)
+ 5
+ >>> roll_yearday(other, -7, month, day_opt)
+ -6
+
+ """
+ # Note: The other.day < ... condition will never hold when day_opt=='start'
+ # and the other.day > ... condition will never hold when day_opt=='end'.
+ # At some point these extra checks may need to be optimized away.
+ # But that point isn't today.
+ if n > 0:
+ if other.month < month or (other.month == month and
+ other.day < get_day_of_month(other,
+ day_opt)):
+ n -= 1
+ else:
+ if other.month > month or (other.month == month and
+ other.day > get_day_of_month(other,
+ day_opt)):
+ n += 1
+ return n
diff --git a/contrib/python/pandas/py2/pandas/_libs/tslibs/parsing.pyx b/contrib/python/pandas/py2/pandas/_libs/tslibs/parsing.pyx
new file mode 100644
index 00000000000..82719de2dbd
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/_libs/tslibs/parsing.pyx
@@ -0,0 +1,749 @@
+# -*- coding: utf-8 -*-
+"""
+Parsing functions for datetime and datetime-like strings.
+"""
+import sys
+import re
+import time
+
+from cpython.datetime cimport datetime
+
+
+import numpy as np
+
+import six
+from six import binary_type, text_type
+
+# Avoid import from outside _libs
+if sys.version_info.major == 2:
+ from StringIO import StringIO
+else:
+ from io import StringIO
+
+
+# dateutil compat
+from dateutil.tz import (tzoffset,
+ tzlocal as _dateutil_tzlocal,
+ tzutc as _dateutil_tzutc,
+ tzstr as _dateutil_tzstr)
+from dateutil.relativedelta import relativedelta
+from dateutil.parser import DEFAULTPARSER
+from dateutil.parser import parse as du_parse
+
+from pandas._libs.tslibs.ccalendar import MONTH_NUMBERS
+from pandas._libs.tslibs.nattype import nat_strings, NaT
+
+# ----------------------------------------------------------------------
+# Constants
+
+
+class DateParseError(ValueError):
+ pass
+
+
+_DEFAULT_DATETIME = datetime(1, 1, 1).replace(hour=0, minute=0,
+ second=0, microsecond=0)
+
+cdef object _TIMEPAT = re.compile(r'^([01]?[0-9]|2[0-3]):([0-5][0-9])')
+
+cdef set _not_datelike_strings = {'a', 'A', 'm', 'M', 'p', 'P', 't', 'T'}
+
+# ----------------------------------------------------------------------
+
+_get_option = None
+
+
+def get_option(param):
+ """ Defer import of get_option to break an import cycle that caused
+ significant performance degradation in Period construction. See
+ GH#24118 for details
+ """
+ global _get_option
+ if _get_option is None:
+ from pandas.core.config import get_option
+ _get_option = get_option
+ return _get_option(param)
+
+
+def parse_datetime_string(date_string, freq=None, dayfirst=False,
+ yearfirst=False, **kwargs):
+ """parse datetime string, only returns datetime.
+ Strings that match a time pattern also get special handling.
+
+ Returns
+ -------
+ datetime
+ """
+
+ cdef:
+ object dt
+
+ if not _does_string_look_like_datetime(date_string):
+ raise ValueError('Given date string not likely a datetime.')
+
+ if _TIMEPAT.match(date_string):
+ # use current datetime as default, not pass _DEFAULT_DATETIME
+ dt = du_parse(date_string, dayfirst=dayfirst,
+ yearfirst=yearfirst, **kwargs)
+ return dt
+
+ try:
+ dt, _, _ = _parse_dateabbr_string(date_string, _DEFAULT_DATETIME, freq)
+ return dt
+ except DateParseError:
+ raise
+ except ValueError:
+ pass
+
+ try:
+ dt = du_parse(date_string, default=_DEFAULT_DATETIME,
+ dayfirst=dayfirst, yearfirst=yearfirst, **kwargs)
+ except TypeError:
+ # following may be raised from dateutil
+ # TypeError: 'NoneType' object is not iterable
+ raise ValueError('Given date string not likely a datetime.')
+
+ return dt
+
+
+def parse_time_string(arg, freq=None, dayfirst=None, yearfirst=None):
+ """
+ Try hard to parse datetime string, leveraging dateutil plus some extra
+ goodies like quarter recognition.
+
+ Parameters
+ ----------
+ arg : compat.string_types
+ freq : str or DateOffset, default None
+ Helps with interpreting time string if supplied
+ dayfirst : bool, default None
+ If None uses default from print_config
+ yearfirst : bool, default None
+ If None uses default from print_config
+
+ Returns
+ -------
+ datetime, datetime/dateutil.parser._result, str
+ """
+ if not isinstance(arg, (str, unicode)):
+ # Note: cython recognizes `unicode` in both py2/py3, optimizes
+ # this check into a C call.
+ return arg
+
+ if getattr(freq, "_typ", None) == "dateoffset":
+ freq = freq.rule_code
+
+ if dayfirst is None or yearfirst is None:
+ if dayfirst is None:
+ dayfirst = get_option("display.date_dayfirst")
+ if yearfirst is None:
+ yearfirst = get_option("display.date_yearfirst")
+
+ res = parse_datetime_string_with_reso(arg, freq=freq,
+ dayfirst=dayfirst,
+ yearfirst=yearfirst)
+ return res
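+
+# Illustrative sketch of the quarter recognition performed via
+# _parse_dateabbr_string below (expected result, not a doctest): a string
+# such as '4Q2005' resolves to the first month of that quarter,
+#   >>> parse_time_string('4Q2005')
+#   (datetime.datetime(2005, 10, 1, 0, 0), datetime.datetime(2005, 10, 1, 0, 0), 'quarter')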
+
+
+cdef parse_datetime_string_with_reso(date_string, freq=None, dayfirst=False,
+ yearfirst=False):
+ """parse datetime string, only returns datetime
+
+ Returns
+ -------
+ parsed : datetime
+ parsed2 : datetime/dateutil.parser._result
+ reso : str
+ inferred resolution
+
+ Raises
+ ------
+ ValueError : preliminary check suggests string is not datetime
+ DateParseError : error within dateutil
+ """
+ cdef:
+ object parsed, reso
+
+ if not _does_string_look_like_datetime(date_string):
+ raise ValueError('Given date string not likely a datetime.')
+
+ try:
+ return _parse_dateabbr_string(date_string, _DEFAULT_DATETIME, freq)
+ except DateParseError:
+ raise
+ except ValueError:
+ pass
+
+ try:
+ parsed, reso = dateutil_parse(date_string, _DEFAULT_DATETIME,
+ dayfirst=dayfirst, yearfirst=yearfirst,
+ ignoretz=False, tzinfos=None)
+ except Exception as e:
+ # TODO: allow raise of errors within instead
+ raise DateParseError(e)
+ if parsed is None:
+ raise DateParseError("Could not parse {dstr}".format(dstr=date_string))
+ return parsed, parsed, reso
+
+
+cpdef bint _does_string_look_like_datetime(object date_string):
+ if date_string.startswith('0'):
+ # Strings starting with 0 are more consistent with a
+ # date-like string than a number
+ return True
+
+ try:
+ if float(date_string) < 1000:
+ return False
+ except ValueError:
+ pass
+
+ if date_string in _not_datelike_strings:
+ return False
+
+ return True
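+
+# Expected behaviour of the heuristic above (illustrative): strings with a
+# leading zero are treated as date-like, while small bare numbers and the
+# single period-format letters are not.
+#   >>> _does_string_look_like_datetime('01-01-2000')
+#   True
+#   >>> _does_string_look_like_datetime('999')
+#   False
+#   >>> _does_string_look_like_datetime('M')
+#   False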
+
+
+cdef inline object _parse_dateabbr_string(object date_string, object default,
+ object freq):
+ cdef:
+ object ret
+ int year, quarter = -1, month, mnum, date_len
+
+ # special handling for possibilities eg, 2Q2005, 2Q05, 2005Q1, 05Q1
+ assert isinstance(date_string, (str, unicode))
+
+ # len(date_string) == 0
+ # should be NaT???
+
+ if date_string in nat_strings:
+ return NaT, NaT, ''
+
+ date_string = date_string.upper()
+ date_len = len(date_string)
+
+ if date_len == 4:
+ # parse year only like 2000
+ try:
+ ret = default.replace(year=int(date_string))
+ return ret, ret, 'year'
+ except ValueError:
+ pass
+
+ try:
+ if 4 <= date_len <= 7:
+ i = date_string.index('Q', 1, 6)
+ if i == 1:
+ quarter = int(date_string[0])
+ if date_len == 4 or (date_len == 5
+ and date_string[i + 1] == '-'):
+ # r'(\d)Q-?(\d\d)')
+ year = 2000 + int(date_string[-2:])
+ elif date_len == 6 or (date_len == 7
+ and date_string[i + 1] == '-'):
+ # r'(\d)Q-?(\d\d\d\d)')
+ year = int(date_string[-4:])
+ else:
+ raise ValueError
+ elif i == 2 or i == 3:
+ # r'(\d\d)-?Q(\d)'
+ if date_len == 4 or (date_len == 5
+ and date_string[i - 1] == '-'):
+ quarter = int(date_string[-1])
+ year = 2000 + int(date_string[:2])
+ else:
+ raise ValueError
+ elif i == 4 or i == 5:
+ if date_len == 6 or (date_len == 7
+ and date_string[i - 1] == '-'):
+ # r'(\d\d\d\d)-?Q(\d)'
+ quarter = int(date_string[-1])
+ year = int(date_string[:4])
+ else:
+ raise ValueError
+
+ if not (1 <= quarter <= 4):
+ msg = ('Incorrect quarterly string is given, quarter must be '
+ 'between 1 and 4: {dstr}')
+ raise DateParseError(msg.format(dstr=date_string))
+
+ if freq is not None:
+ # hack attack, #1228
+ try:
+ mnum = MONTH_NUMBERS[_get_rule_month(freq)] + 1
+ except (KeyError, ValueError):
+ msg = ('Unable to retrieve month information from given '
+ 'freq: {freq}'.format(freq=freq))
+ raise DateParseError(msg)
+
+ month = (mnum + (quarter - 1) * 3) % 12 + 1
+ if month > mnum:
+ year -= 1
+ else:
+ month = (quarter - 1) * 3 + 1
+
+ ret = default.replace(year=year, month=month)
+ return ret, ret, 'quarter'
+
+ except DateParseError:
+ raise
+ except ValueError:
+ pass
+
+ if date_len == 6 and (freq == 'M' or
+ getattr(freq, 'rule_code', None) == 'M'):
+ year = int(date_string[:4])
+ month = int(date_string[4:6])
+ try:
+ ret = default.replace(year=year, month=month)
+ return ret, ret, 'month'
+ except ValueError:
+ pass
+
+ for pat in ['%Y-%m', '%m-%Y', '%b %Y', '%b-%Y']:
+ try:
+ ret = datetime.strptime(date_string, pat)
+ return ret, ret, 'month'
+ except ValueError:
+ pass
+
+ raise ValueError('Unable to parse {0}'.format(date_string))
+
+
+cdef dateutil_parse(object timestr, object default, ignoretz=False,
+ tzinfos=None, dayfirst=None, yearfirst=None):
+ """ lifted from dateutil to get resolution"""
+
+ cdef:
+ object fobj, res, attr, ret, tzdata
+ object reso = None
+ dict repl = {}
+
+ fobj = StringIO(str(timestr))
+ res = DEFAULTPARSER._parse(fobj, dayfirst=dayfirst, yearfirst=yearfirst)
+
+ # dateutil 2.2 compat
+ if isinstance(res, tuple): # PyTuple_Check
+ res, _ = res
+
+ if res is None:
+ msg = "Unknown datetime string format, unable to parse: {timestr}"
+ raise ValueError(msg.format(timestr=timestr))
+
+ for attr in ["year", "month", "day", "hour",
+ "minute", "second", "microsecond"]:
+ value = getattr(res, attr)
+ if value is not None:
+ repl[attr] = value
+ reso = attr
+
+ if reso is None:
+ msg = "Unable to parse datetime string: {timestr}"
+ raise ValueError(msg.format(timestr=timestr))
+
+ if reso == 'microsecond':
+ if repl['microsecond'] == 0:
+ reso = 'second'
+ elif repl['microsecond'] % 1000 == 0:
+ reso = 'millisecond'
+
+ ret = default.replace(**repl)
+ if res.weekday is not None and not res.day:
+ ret = ret + relativedelta(weekday=res.weekday)
+ if not ignoretz:
+ if callable(tzinfos) or tzinfos and res.tzname in tzinfos:
+ if callable(tzinfos):
+ tzdata = tzinfos(res.tzname, res.tzoffset)
+ else:
+ tzdata = tzinfos.get(res.tzname)
+ if isinstance(tzdata, datetime.tzinfo):
+ tzinfo = tzdata
+ elif isinstance(tzdata, (str, unicode)):
+ tzinfo = _dateutil_tzstr(tzdata)
+ elif isinstance(tzdata, int):
+ tzinfo = tzoffset(res.tzname, tzdata)
+ else:
+ raise ValueError("offset must be tzinfo subclass, "
+ "tz string, or int offset")
+ ret = ret.replace(tzinfo=tzinfo)
+ elif res.tzname and res.tzname in time.tzname:
+ ret = ret.replace(tzinfo=_dateutil_tzlocal())
+ elif res.tzoffset == 0:
+ ret = ret.replace(tzinfo=_dateutil_tzutc())
+ elif res.tzoffset:
+ ret = ret.replace(tzinfo=tzoffset(res.tzname, res.tzoffset))
+ return ret, reso
+
+
+cdef object _get_rule_month(object source, object default='DEC'):
+ """
+ Return starting month of given freq, default is December.
+
+ Example
+ -------
+ >>> _get_rule_month('D')
+ 'DEC'
+
+ >>> _get_rule_month('A-JAN')
+ 'JAN'
+ """
+ if hasattr(source, 'freqstr'):
+ source = source.freqstr
+ source = source.upper()
+ if '-' not in source:
+ return default
+ else:
+ return source.split('-')[1]
+
+
+# ----------------------------------------------------------------------
+# Parsing for type-inference
+
+
+def try_parse_dates(object[:] values, parser=None,
+ dayfirst=False, default=None):
+ cdef:
+ Py_ssize_t i, n
+ object[:] result
+
+ n = len(values)
+ result = np.empty(n, dtype='O')
+
+ if parser is None:
+ if default is None: # GH2618
+ date = datetime.now()
+ default = datetime(date.year, date.month, 1)
+
+ parse_date = lambda x: du_parse(x, dayfirst=dayfirst, default=default)
+
+ # EAFP here
+ try:
+ for i in range(n):
+ if values[i] == '':
+ result[i] = np.nan
+ else:
+ result[i] = parse_date(values[i])
+ except Exception:
+ # failed
+ return values
+ else:
+ parse_date = parser
+
+ try:
+ for i in range(n):
+ if values[i] == '':
+ result[i] = np.nan
+ else:
+ result[i] = parse_date(values[i])
+ except Exception:
+ # raise if passed parser and it failed
+ raise
+
+ return result.base # .base to access underlying ndarray
+
+
+def try_parse_date_and_time(object[:] dates, object[:] times,
+ date_parser=None, time_parser=None,
+ dayfirst=False, default=None):
+ cdef:
+ Py_ssize_t i, n
+ object[:] result
+
+ n = len(dates)
+ if len(times) != n:
+ raise ValueError('Length of dates and times must be equal')
+ result = np.empty(n, dtype='O')
+
+ if date_parser is None:
+ if default is None: # GH2618
+ date = datetime.now()
+ default = datetime(date.year, date.month, 1)
+
+ parse_date = lambda x: du_parse(x, dayfirst=dayfirst, default=default)
+
+ else:
+ parse_date = date_parser
+
+ if time_parser is None:
+ parse_time = lambda x: du_parse(x)
+
+ else:
+ parse_time = time_parser
+
+ for i in range(n):
+ d = parse_date(str(dates[i]))
+ t = parse_time(str(times[i]))
+ result[i] = datetime(d.year, d.month, d.day,
+ t.hour, t.minute, t.second)
+
+ return result.base # .base to access underlying ndarray
+
+
+def try_parse_year_month_day(object[:] years, object[:] months,
+ object[:] days):
+ cdef:
+ Py_ssize_t i, n
+ object[:] result
+
+ n = len(years)
+ if len(months) != n or len(days) != n:
+ raise ValueError('Length of years/months/days must all be equal')
+ result = np.empty(n, dtype='O')
+
+ for i in range(n):
+ result[i] = datetime(int(years[i]), int(months[i]), int(days[i]))
+
+ return result.base # .base to access underlying ndarray
+
+
+def try_parse_datetime_components(object[:] years,
+ object[:] months,
+ object[:] days,
+ object[:] hours,
+ object[:] minutes,
+ object[:] seconds):
+
+ cdef:
+ Py_ssize_t i, n
+ object[:] result
+ int secs
+ double float_secs
+ double micros
+
+ n = len(years)
+ if (len(months) != n or len(days) != n or len(hours) != n or
+ len(minutes) != n or len(seconds) != n):
+ raise ValueError('Length of all datetime components must be equal')
+ result = np.empty(n, dtype='O')
+
+ for i in range(n):
+ float_secs = float(seconds[i])
+ secs = int(float_secs)
+
+ micros = float_secs - secs
+ if micros > 0:
+ micros = micros * 1000000
+
+ result[i] = datetime(int(years[i]), int(months[i]), int(days[i]),
+ int(hours[i]), int(minutes[i]), secs,
+ int(micros))
+
+ return result.base # .base to access underlying ndarray
+
+
+# ----------------------------------------------------------------------
+# Miscellaneous
+
+
+# Class copied verbatim from https://github.com/dateutil/dateutil/pull/732
+#
+# We use this class to parse and tokenize date strings. However, as it is
+# a private class in the dateutil library, relying on backwards compatibility
+# is not practical. In fact, using this class issues warnings (xref gh-21322).
+# Thus, we port the class over so that both issues are resolved.
+#
+# Copyright (c) 2017 - dateutil contributors
+class _timelex(object):
+ def __init__(self, instream):
+ if six.PY2:
+ # In Python 2, we can't duck type properly because unicode has
+ # a 'decode' function, and we'd be double-decoding
+ if isinstance(instream, (binary_type, bytearray)):
+ instream = instream.decode()
+ else:
+ if getattr(instream, 'decode', None) is not None:
+ instream = instream.decode()
+
+ if isinstance(instream, text_type):
+ self.stream = instream
+ elif getattr(instream, 'read', None) is None:
+ raise TypeError(
+ 'Parser must be a string or character stream, not '
+ '{itype}'.format(itype=instream.__class__.__name__))
+ else:
+ self.stream = instream.read()
+
+ def get_tokens(self):
+ """
+ This function breaks the time string into lexical units (tokens), which
+ can be parsed by the parser. Lexical units are demarcated by changes in
+ the character set, so any continuous string of letters is considered
+ one unit, any continuous string of numbers is considered one unit.
+ The main complication arises from the fact that dots ('.') can be used
+ both as separators (e.g. "Sep.20.2009") or decimal points (e.g.
+ "4:30:21.447"). As such, it is necessary to read the full context of
+ any dot-separated strings before breaking it into tokens; as such, this
+ function maintains a "token stack", for when the ambiguous context
+ demands that multiple tokens be parsed at once.
+ """
+ stream = self.stream.replace('\x00', '')
+
+ # TODO: Change \s --> \s+ (this doesn't match existing behavior)
+ # TODO: change the punctuation block to punc+ (doesnt match existing)
+ # TODO: can we merge the two digit patterns?
+ tokens = re.findall('\s|'
+ '(?<![\.\d])\d+\.\d+(?![\.\d])'
+ '|\d+'
+ '|[a-zA-Z]+'
+ '|[\./:]+'
+ '|[^\da-zA-Z\./:\s]+', stream)
+
+ # Re-combine token tuples of the form ["59", ",", "456"] because
+ # in this context the "," is treated as a decimal
+ # (e.g. in python's default logging format)
+ for n, token in enumerate(tokens[:-2]):
+ # Kludge to match ,-decimal behavior; it'd be better to do this
+ # later in the process and have a simpler tokenization
+ if (token is not None and token.isdigit() and
+ tokens[n + 1] == ',' and tokens[n + 2].isdigit()):
+ # Have to check None b/c it might be replaced during the loop
+ # TODO: I _really_ don't like faking the value here
+ tokens[n] = token + '.' + tokens[n + 2]
+ tokens[n + 1] = None
+ tokens[n + 2] = None
+
+ tokens = [x for x in tokens if x is not None]
+ return tokens
+
+ @classmethod
+ def split(cls, s):
+ return cls(s).get_tokens()
+
+
+_DATEUTIL_LEXER_SPLIT = _timelex.split
+
+
+def _format_is_iso(f) -> bint:
+ """
+ Does format match the iso8601 set that can be handled by the C parser?
+ Generally of form YYYY-MM-DDTHH:MM:SS - date separator can be different
+ but must be consistent. Leading 0s in dates and times are optional.
+ """
+ iso_template = '%Y{date_sep}%m{date_sep}%d{time_sep}%H:%M:%S.%f'.format
+ excluded_formats = ['%Y%m%d', '%Y%m', '%Y']
+
+ for date_sep in [' ', '/', '\\', '-', '.', '']:
+ for time_sep in [' ', 'T']:
+ if (iso_template(date_sep=date_sep,
+ time_sep=time_sep
+ ).startswith(f) and f not in excluded_formats):
+ return True
+ return False
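+
+# Illustrative expected classifications for the check above:
+#   >>> _format_is_iso('%Y-%m-%dT%H:%M:%S')
+#   True
+#   >>> _format_is_iso('%Y-%m-%d')
+#   True
+#   >>> _format_is_iso('%Y%m%d')      # explicitly excluded
+#   False
+#   >>> _format_is_iso('%d/%m/%Y')
+#   False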
+
+
+def _guess_datetime_format(dt_str, dayfirst=False, dt_str_parse=du_parse,
+ dt_str_split=_DATEUTIL_LEXER_SPLIT):
+ """
+ Guess the datetime format of a given datetime string.
+
+ Parameters
+ ----------
+ dt_str : string, datetime string to guess the format of
+ dayfirst : boolean, default False
+ If True parses dates with the day first, eg 20/01/2005
+ Warning: dayfirst=True is not strict, but will prefer to parse
+ with day first (this is a known bug).
+ dt_str_parse : function, defaults to `dateutil.parser.parse` (du_parse)
+ This function should take in a datetime string and return
+ a `datetime.datetime` guess that the datetime string represents
+ dt_str_split : function, defaults to `_DATEUTIL_LEXER_SPLIT` (dateutil)
+ This function should take in a datetime string and return
+ a list of strings, the guess of the various specific parts
+ e.g. '2011/12/30' -> ['2011', '/', '12', '/', '30']
+
+ Returns
+ -------
+ ret : datetime format string (for `strftime` or `strptime`)
+ """
+ if dt_str_parse is None or dt_str_split is None:
+ return None
+
+ if not isinstance(dt_str, (str, unicode)):
+ return None
+
+ day_attribute_and_format = (('day',), '%d', 2)
+
+ # attr name, format, padding (if any)
+ datetime_attrs_to_format = [
+ (('year', 'month', 'day'), '%Y%m%d', 0),
+ (('year',), '%Y', 0),
+ (('month',), '%B', 0),
+ (('month',), '%b', 0),
+ (('month',), '%m', 2),
+ day_attribute_and_format,
+ (('hour',), '%H', 2),
+ (('minute',), '%M', 2),
+ (('second',), '%S', 2),
+ (('microsecond',), '%f', 6),
+ (('second', 'microsecond'), '%S.%f', 0),
+ ]
+
+ if dayfirst:
+ datetime_attrs_to_format.remove(day_attribute_and_format)
+ datetime_attrs_to_format.insert(0, day_attribute_and_format)
+
+ try:
+ parsed_datetime = dt_str_parse(dt_str, dayfirst=dayfirst)
+ except:
+ # In case the datetime can't be parsed, its format cannot be guessed
+ return None
+
+ if parsed_datetime is None:
+ return None
+
+ try:
+ tokens = dt_str_split(dt_str)
+ except:
+ # In case the datetime string can't be split, its format cannot
+ # be guessed
+ return None
+
+ format_guess = [None] * len(tokens)
+ found_attrs = set()
+
+ for attrs, attr_format, padding in datetime_attrs_to_format:
+ # If a given attribute has been placed in the format string, skip
+ # over other formats for that same underlying attribute (IE, month
+ # can be represented in multiple different ways)
+ if set(attrs) & found_attrs:
+ continue
+
+ if all(getattr(parsed_datetime, attr) is not None for attr in attrs):
+ for i, token_format in enumerate(format_guess):
+ token_filled = tokens[i].zfill(padding)
+ if (token_format is None and
+ token_filled == parsed_datetime.strftime(attr_format)):
+ format_guess[i] = attr_format
+ tokens[i] = token_filled
+ found_attrs.update(attrs)
+ break
+
+ # Only consider it a valid guess if we have a year, month and day
+ if len({'year', 'month', 'day'} & found_attrs) != 3:
+ return None
+
+ output_format = []
+ for i, guess in enumerate(format_guess):
+ if guess is not None:
+ # Either fill in the format placeholder (like %Y)
+ output_format.append(guess)
+ else:
+ # Or just keep the token separator (i.e. the dashes in "01-01-2013")
+ try:
+ # If the token is numeric, then we likely didn't parse it
+ # properly, so our guess is wrong
+ float(tokens[i])
+ return None
+ except ValueError:
+ pass
+
+ output_format.append(tokens[i])
+
+ guessed_format = ''.join(output_format)
+
+ # rebuild string, capturing any inferred padding
+ dt_str = ''.join(tokens)
+ if parsed_datetime.strftime(guessed_format) == dt_str:
+ return guessed_format
+ else:
+ return None
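+
+# Illustrative expected guesses from the token matching above (a sketch,
+# not an exhaustive specification):
+#   >>> _guess_datetime_format('2011-12-30')
+#   '%Y-%m-%d'
+#   >>> _guess_datetime_format('30/12/2011', dayfirst=True)
+#   '%d/%m/%Y'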
diff --git a/contrib/python/pandas/py2/pandas/_libs/tslibs/period.pyx b/contrib/python/pandas/py2/pandas/_libs/tslibs/period.pyx
new file mode 100644
index 00000000000..02fae1d09fd
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/_libs/tslibs/period.pyx
@@ -0,0 +1,2553 @@
+# -*- coding: utf-8 -*-
+from datetime import datetime
+
+from cpython cimport (
+ PyObject_RichCompareBool,
+ Py_EQ, Py_NE)
+
+from numpy cimport int64_t, import_array, ndarray
+import numpy as np
+import_array()
+
+from libc.stdlib cimport free, malloc
+from libc.time cimport strftime, tm
+from libc.string cimport strlen, memset
+
+import cython
+
+from cpython.datetime cimport (PyDateTime_Check, PyDelta_Check, PyDate_Check,
+ PyDateTime_IMPORT)
+# import datetime C API
+PyDateTime_IMPORT
+
+from pandas._libs.tslibs.np_datetime cimport (
+ npy_datetimestruct, dtstruct_to_dt64, dt64_to_dtstruct,
+ pandas_datetime_to_datetimestruct, NPY_DATETIMEUNIT, NPY_FR_D)
+
+cdef extern from "src/datetime/np_datetime.h":
+ int64_t npy_datetimestruct_to_datetime(NPY_DATETIMEUNIT fr,
+ npy_datetimestruct *d) nogil
+
+cimport pandas._libs.tslibs.util as util
+from pandas._libs.tslibs.util cimport is_period_object, is_string_object
+
+from pandas._libs.tslibs.timestamps import Timestamp
+from pandas._libs.tslibs.timezones cimport is_utc, is_tzlocal, get_dst_info
+from pandas._libs.tslibs.timedeltas import Timedelta
+from pandas._libs.tslibs.timedeltas cimport delta_to_nanoseconds
+
+cimport pandas._libs.tslibs.ccalendar as ccalendar
+from pandas._libs.tslibs.ccalendar cimport (
+ dayofweek, get_day_of_year, is_leapyear)
+from pandas._libs.tslibs.ccalendar import MONTH_NUMBERS
+from pandas._libs.tslibs.conversion cimport tz_convert_utc_to_tzlocal
+from pandas._libs.tslibs.frequencies cimport (
+ get_freq_code, get_base_alias, get_to_timestamp_base, get_freq_str,
+ get_rule_month)
+from pandas._libs.tslibs.parsing import parse_time_string
+from pandas._libs.tslibs.resolution import Resolution
+from pandas._libs.tslibs.nattype import nat_strings
+from pandas._libs.tslibs.nattype cimport (
+ _nat_scalar_rules, NPY_NAT, is_null_datetimelike, c_NaT as NaT)
+from pandas._libs.tslibs.offsets cimport to_offset
+from pandas._libs.tslibs.offsets import _Tick
+
+cdef bint PY2 = str == bytes
+cdef enum:
+ INT32_MIN = -2147483648
+
+
+ctypedef struct asfreq_info:
+ int64_t intraday_conversion_factor
+ int is_end
+ int to_end
+ int from_end
+
+ctypedef int64_t (*freq_conv_func)(int64_t, asfreq_info*) nogil
+
+
+cdef extern from *:
+ """
+ /*** FREQUENCY CONSTANTS ***/
+ // See frequencies.pyx for more detailed variants
+
+ #define FR_ANN 1000 /* Annual */
+ #define FR_QTR 2000 /* Quarterly - December year end (default Q) */
+ #define FR_MTH 3000 /* Monthly */
+ #define FR_WK 4000 /* Weekly */
+ #define FR_BUS 5000 /* Business days */
+ #define FR_DAY 6000 /* Daily */
+ #define FR_HR 7000 /* Hourly */
+ #define FR_MIN 8000 /* Minutely */
+ #define FR_SEC 9000 /* Secondly */
+ #define FR_MS 10000 /* Millisecondly */
+ #define FR_US 11000 /* Microsecondly */
+ #define FR_NS 12000 /* Nanosecondly */
+ #define FR_UND -10000 /* Undefined */
+
+ // must use npy typedef b/c int64_t is aliased in cython-generated c
+ static npy_int64 daytime_conversion_factor_matrix[7][7] = {
+ {1, 24, 1440, 86400, 86400000, 86400000000, 86400000000000},
+ {0, 1, 60, 3600, 3600000, 3600000000, 3600000000000},
+ {0, 0, 1, 60, 60000, 60000000, 60000000000},
+ {0, 0, 0, 1, 1000, 1000000, 1000000000},
+ {0, 0, 0, 0, 1, 1000, 1000000},
+ {0, 0, 0, 0, 0, 1, 1000},
+ {0, 0, 0, 0, 0, 0, 1}};
+ """
+ int64_t daytime_conversion_factor_matrix[7][7]
+ # TODO: Can we get these frequencies from frequencies.FreqGroup?
+ int FR_ANN
+ int FR_QTR
+ int FR_MTH
+ int FR_WK
+ int FR_DAY
+ int FR_HR
+ int FR_MIN
+ int FR_SEC
+ int FR_MS
+ int FR_US
+ int FR_NS
+ int FR_BUS
+ int FR_UND
+
+
+cdef int max_value(int left, int right) nogil:
+ if left > right:
+ return left
+ return right
+
+
+cdef int min_value(int left, int right) nogil:
+ if left < right:
+ return left
+ return right
+
+
+cdef int64_t get_daytime_conversion_factor(int from_index, int to_index) nogil:
+ cdef:
+ int row = min_value(from_index, to_index)
+ int col = max_value(from_index, to_index)
+ # row or col < 6 means a frequency strictly lower than daily, which
+ # does not use daytime_conversion_factors
+ if row < 6:
+ return 0
+ elif col < 6:
+ return 0
+ return daytime_conversion_factor_matrix[row - 6][col - 6]
+
+
+cdef int64_t nofunc(int64_t ordinal, asfreq_info *af_info):
+ return np.iinfo(np.int32).min
+
+
+cdef int64_t no_op(int64_t ordinal, asfreq_info *af_info):
+ return ordinal
+
+
+cdef freq_conv_func get_asfreq_func(int from_freq, int to_freq) nogil:
+ cdef:
+ int from_group = get_freq_group(from_freq)
+ int to_group = get_freq_group(to_freq)
+
+ if from_group == FR_UND:
+ from_group = FR_DAY
+
+ if from_group == FR_BUS:
+ if to_group == FR_ANN:
+ return <freq_conv_func>asfreq_BtoA
+ elif to_group == FR_QTR:
+ return <freq_conv_func>asfreq_BtoQ
+ elif to_group == FR_MTH:
+ return <freq_conv_func>asfreq_BtoM
+ elif to_group == FR_WK:
+ return <freq_conv_func>asfreq_BtoW
+ elif to_group == FR_BUS:
+ return <freq_conv_func>no_op
+ elif to_group in [FR_DAY, FR_HR, FR_MIN, FR_SEC, FR_MS, FR_US, FR_NS]:
+ return <freq_conv_func>asfreq_BtoDT
+ else:
+ return <freq_conv_func>nofunc
+
+ elif to_group == FR_BUS:
+ if from_group == FR_ANN:
+ return <freq_conv_func>asfreq_AtoB
+ elif from_group == FR_QTR:
+ return <freq_conv_func>asfreq_QtoB
+ elif from_group == FR_MTH:
+ return <freq_conv_func>asfreq_MtoB
+ elif from_group == FR_WK:
+ return <freq_conv_func>asfreq_WtoB
+ elif from_group in [FR_DAY, FR_HR, FR_MIN, FR_SEC,
+ FR_MS, FR_US, FR_NS]:
+ return <freq_conv_func>asfreq_DTtoB
+ else:
+ return <freq_conv_func>nofunc
+
+ elif from_group == FR_ANN:
+ if to_group == FR_ANN:
+ return <freq_conv_func>asfreq_AtoA
+ elif to_group == FR_QTR:
+ return <freq_conv_func>asfreq_AtoQ
+ elif to_group == FR_MTH:
+ return <freq_conv_func>asfreq_AtoM
+ elif to_group == FR_WK:
+ return <freq_conv_func>asfreq_AtoW
+ elif to_group in [FR_DAY, FR_HR, FR_MIN, FR_SEC, FR_MS, FR_US, FR_NS]:
+ return <freq_conv_func>asfreq_AtoDT
+ else:
+ return <freq_conv_func>nofunc
+
+ elif from_group == FR_QTR:
+ if to_group == FR_ANN:
+ return <freq_conv_func>asfreq_QtoA
+ elif to_group == FR_QTR:
+ return <freq_conv_func>asfreq_QtoQ
+ elif to_group == FR_MTH:
+ return <freq_conv_func>asfreq_QtoM
+ elif to_group == FR_WK:
+ return <freq_conv_func>asfreq_QtoW
+ elif to_group in [FR_DAY, FR_HR, FR_MIN, FR_SEC, FR_MS, FR_US, FR_NS]:
+ return <freq_conv_func>asfreq_QtoDT
+ else:
+ return <freq_conv_func>nofunc
+
+ elif from_group == FR_MTH:
+ if to_group == FR_ANN:
+ return <freq_conv_func>asfreq_MtoA
+ elif to_group == FR_QTR:
+ return <freq_conv_func>asfreq_MtoQ
+ elif to_group == FR_MTH:
+ return <freq_conv_func>no_op
+ elif to_group == FR_WK:
+ return <freq_conv_func>asfreq_MtoW
+ elif to_group in [FR_DAY, FR_HR, FR_MIN, FR_SEC, FR_MS, FR_US, FR_NS]:
+ return <freq_conv_func>asfreq_MtoDT
+ else:
+ return <freq_conv_func>nofunc
+
+ elif from_group == FR_WK:
+ if to_group == FR_ANN:
+ return <freq_conv_func>asfreq_WtoA
+ elif to_group == FR_QTR:
+ return <freq_conv_func>asfreq_WtoQ
+ elif to_group == FR_MTH:
+ return <freq_conv_func>asfreq_WtoM
+ elif to_group == FR_WK:
+ return <freq_conv_func>asfreq_WtoW
+ elif to_group in [FR_DAY, FR_HR, FR_MIN, FR_SEC, FR_MS, FR_US, FR_NS]:
+ return <freq_conv_func>asfreq_WtoDT
+ else:
+ return <freq_conv_func>nofunc
+
+ elif from_group in [FR_DAY, FR_HR, FR_MIN, FR_SEC, FR_MS, FR_US, FR_NS]:
+ if to_group == FR_ANN:
+ return <freq_conv_func>asfreq_DTtoA
+ elif to_group == FR_QTR:
+ return <freq_conv_func>asfreq_DTtoQ
+ elif to_group == FR_MTH:
+ return <freq_conv_func>asfreq_DTtoM
+ elif to_group == FR_WK:
+ return <freq_conv_func>asfreq_DTtoW
+ elif to_group in [FR_DAY, FR_HR, FR_MIN, FR_SEC, FR_MS, FR_US, FR_NS]:
+ if from_group > to_group:
+ return <freq_conv_func>downsample_daytime
+ else:
+ return <freq_conv_func>upsample_daytime
+
+ else:
+ return <freq_conv_func>nofunc
+
+ else:
+ return <freq_conv_func>nofunc
+
+
+# --------------------------------------------------------------------
+# Frequency Conversion Helpers
+
+cdef int64_t DtoB_weekday(int64_t unix_date) nogil:
+ return ((unix_date + 4) // 7) * 5 + ((unix_date + 4) % 7) - 4
+
+
+cdef int64_t DtoB(npy_datetimestruct *dts, int roll_back, int64_t unix_date):
+ cdef:
+ int day_of_week = dayofweek(dts.year, dts.month, dts.day)
+
+ if roll_back == 1:
+ if day_of_week > 4:
+ # change to friday before weekend
+ unix_date -= (day_of_week - 4)
+ else:
+ if day_of_week > 4:
+ # change to Monday after weekend
+ unix_date += (7 - day_of_week)
+
+ return DtoB_weekday(unix_date)
+
+
+cdef inline int64_t upsample_daytime(int64_t ordinal, asfreq_info *af_info):
+ if (af_info.is_end):
+ return (ordinal + 1) * af_info.intraday_conversion_factor - 1
+ else:
+ return ordinal * af_info.intraday_conversion_factor
+
+
+cdef inline int64_t downsample_daytime(int64_t ordinal, asfreq_info *af_info):
+ return ordinal // (af_info.intraday_conversion_factor)
+
+
+cdef inline int64_t transform_via_day(int64_t ordinal,
+ asfreq_info *af_info,
+ freq_conv_func first_func,
+ freq_conv_func second_func):
+ cdef:
+ int64_t result
+
+ result = first_func(ordinal, af_info)
+ result = second_func(result, af_info)
+ return result
+
+
+# --------------------------------------------------------------------
+# Conversion _to_ Daily Freq
+
+cdef void AtoD_ym(int64_t ordinal, int64_t *year,
+ int *month, asfreq_info *af_info):
+ year[0] = ordinal + 1970
+ month[0] = 1
+
+ if af_info.from_end != 12:
+ month[0] += af_info.from_end
+ if month[0] > 12:
+ # This case is never reached, but is kept for symmetry
+ # with QtoD_ym
+ month[0] -= 12
+ else:
+ year[0] -= 1
+
+
+cdef int64_t asfreq_AtoDT(int64_t ordinal, asfreq_info *af_info):
+ cdef:
+ int64_t unix_date, year
+ int month
+
+ ordinal += af_info.is_end
+ AtoD_ym(ordinal, &year, &month, af_info)
+
+ unix_date = unix_date_from_ymd(year, month, 1)
+ unix_date -= af_info.is_end
+ return upsample_daytime(unix_date, af_info)
+
+
+cdef void QtoD_ym(int64_t ordinal, int *year,
+ int *month, asfreq_info *af_info):
+ year[0] = ordinal // 4 + 1970
+ month[0] = (ordinal % 4) * 3 + 1
+
+ if af_info.from_end != 12:
+ month[0] += af_info.from_end
+ if month[0] > 12:
+ month[0] -= 12
+ else:
+ year[0] -= 1
+
+
+cdef int64_t asfreq_QtoDT(int64_t ordinal, asfreq_info *af_info):
+ cdef:
+ int64_t unix_date
+ int year, month
+
+ ordinal += af_info.is_end
+ QtoD_ym(ordinal, &year, &month, af_info)
+
+ unix_date = unix_date_from_ymd(year, month, 1)
+ unix_date -= af_info.is_end
+ return upsample_daytime(unix_date, af_info)
+
+
+cdef void MtoD_ym(int64_t ordinal, int *year, int *month):
+ year[0] = ordinal // 12 + 1970
+ month[0] = ordinal % 12 + 1
+
+
+cdef int64_t asfreq_MtoDT(int64_t ordinal, asfreq_info *af_info):
+ cdef:
+ int64_t unix_date
+ int year, month
+
+ ordinal += af_info.is_end
+ MtoD_ym(ordinal, &year, &month)
+
+ unix_date = unix_date_from_ymd(year, month, 1)
+ unix_date -= af_info.is_end
+ return upsample_daytime(unix_date, af_info)
+
+
+cdef int64_t asfreq_WtoDT(int64_t ordinal, asfreq_info *af_info):
+ ordinal = (ordinal * 7 + af_info.from_end - 4 +
+ (7 - 1) * (af_info.is_end - 1))
+ return upsample_daytime(ordinal, af_info)
+
+
+# --------------------------------------------------------------------
+# Conversion _to_ BusinessDay Freq
+
+cdef int64_t asfreq_AtoB(int64_t ordinal, asfreq_info *af_info):
+ cdef:
+ int roll_back
+ npy_datetimestruct dts
+ int64_t unix_date = asfreq_AtoDT(ordinal, af_info)
+
+ pandas_datetime_to_datetimestruct(unix_date, NPY_FR_D, &dts)
+ roll_back = af_info.is_end
+ return DtoB(&dts, roll_back, unix_date)
+
+
+cdef int64_t asfreq_QtoB(int64_t ordinal, asfreq_info *af_info):
+ cdef:
+ int roll_back
+ npy_datetimestruct dts
+ int64_t unix_date = asfreq_QtoDT(ordinal, af_info)
+
+ pandas_datetime_to_datetimestruct(unix_date, NPY_FR_D, &dts)
+ roll_back = af_info.is_end
+ return DtoB(&dts, roll_back, unix_date)
+
+
+cdef int64_t asfreq_MtoB(int64_t ordinal, asfreq_info *af_info):
+ cdef:
+ int roll_back
+ npy_datetimestruct dts
+ int64_t unix_date = asfreq_MtoDT(ordinal, af_info)
+
+ pandas_datetime_to_datetimestruct(unix_date, NPY_FR_D, &dts)
+ roll_back = af_info.is_end
+ return DtoB(&dts, roll_back, unix_date)
+
+
+cdef int64_t asfreq_WtoB(int64_t ordinal, asfreq_info *af_info):
+ cdef:
+ int roll_back
+ npy_datetimestruct dts
+ int64_t unix_date = asfreq_WtoDT(ordinal, af_info)
+
+ pandas_datetime_to_datetimestruct(unix_date, NPY_FR_D, &dts)
+ roll_back = af_info.is_end
+ return DtoB(&dts, roll_back, unix_date)
+
+
+cdef int64_t asfreq_DTtoB(int64_t ordinal, asfreq_info *af_info):
+ cdef:
+ int roll_back
+ npy_datetimestruct dts
+ int64_t unix_date = downsample_daytime(ordinal, af_info)
+
+ pandas_datetime_to_datetimestruct(unix_date, NPY_FR_D, &dts)
+ # This usage defines roll_back the opposite way from the others
+ roll_back = 1 - af_info.is_end
+ return DtoB(&dts, roll_back, unix_date)
+
+
+# ----------------------------------------------------------------------
+# Conversion _from_ Daily Freq
+
+cdef int64_t asfreq_DTtoA(int64_t ordinal, asfreq_info *af_info):
+ cdef:
+ npy_datetimestruct dts
+
+ ordinal = downsample_daytime(ordinal, af_info)
+ pandas_datetime_to_datetimestruct(ordinal, NPY_FR_D, &dts)
+ if dts.month > af_info.to_end:
+ return <int64_t>(dts.year + 1 - 1970)
+ else:
+ return <int64_t>(dts.year - 1970)
+
+
+cdef int DtoQ_yq(int64_t ordinal, asfreq_info *af_info, int *year):
+ cdef:
+ npy_datetimestruct dts
+ int quarter
+
+ pandas_datetime_to_datetimestruct(ordinal, NPY_FR_D, &dts)
+ # TODO: Another version of this function used
+ # date_info_from_days_and_time(&dts, unix_date, 0)
+ # instead of pandas_datetime_to_datetimestruct; is one more performant?
+ if af_info.to_end != 12:
+ dts.month -= af_info.to_end
+ if dts.month <= 0:
+ dts.month += 12
+ else:
+ dts.year += 1
+
+ year[0] = dts.year
+ quarter = month_to_quarter(dts.month)
+ return quarter
+
+
+cdef int64_t asfreq_DTtoQ(int64_t ordinal, asfreq_info *af_info):
+ cdef:
+ int year, quarter
+
+ ordinal = downsample_daytime(ordinal, af_info)
+
+ quarter = DtoQ_yq(ordinal, af_info, &year)
+ return <int64_t>((year - 1970) * 4 + quarter - 1)
+
+
+cdef int64_t asfreq_DTtoM(int64_t ordinal, asfreq_info *af_info):
+ cdef:
+ npy_datetimestruct dts
+
+ ordinal = downsample_daytime(ordinal, af_info)
+ pandas_datetime_to_datetimestruct(ordinal, NPY_FR_D, &dts)
+ return <int64_t>((dts.year - 1970) * 12 + dts.month - 1)
+
+
+cdef int64_t asfreq_DTtoW(int64_t ordinal, asfreq_info *af_info):
+ ordinal = downsample_daytime(ordinal, af_info)
+ return (ordinal + 3 - af_info.to_end) // 7 + 1
+
+
+# --------------------------------------------------------------------
+# Conversion _from_ BusinessDay Freq
+
+cdef int64_t asfreq_BtoDT(int64_t ordinal, asfreq_info *af_info):
+ ordinal = ((ordinal + 3) // 5) * 7 + (ordinal + 3) % 5 - 3
+ return upsample_daytime(ordinal, af_info)
+
+
+cdef int64_t asfreq_BtoA(int64_t ordinal, asfreq_info *af_info):
+ return transform_via_day(ordinal, af_info,
+ <freq_conv_func>asfreq_BtoDT,
+ <freq_conv_func>asfreq_DTtoA)
+
+
+cdef int64_t asfreq_BtoQ(int64_t ordinal, asfreq_info *af_info):
+ return transform_via_day(ordinal, af_info,
+ <freq_conv_func>asfreq_BtoDT,
+ <freq_conv_func>asfreq_DTtoQ)
+
+
+cdef int64_t asfreq_BtoM(int64_t ordinal, asfreq_info *af_info):
+ return transform_via_day(ordinal, af_info,
+ <freq_conv_func>asfreq_BtoDT,
+ <freq_conv_func>asfreq_DTtoM)
+
+
+cdef int64_t asfreq_BtoW(int64_t ordinal, asfreq_info *af_info):
+ return transform_via_day(ordinal, af_info,
+ <freq_conv_func>asfreq_BtoDT,
+ <freq_conv_func>asfreq_DTtoW)
+
+
+# ----------------------------------------------------------------------
+# Conversion _from_ Annual Freq
+
+cdef int64_t asfreq_AtoA(int64_t ordinal, asfreq_info *af_info):
+ return transform_via_day(ordinal, af_info,
+ <freq_conv_func>asfreq_AtoDT,
+ <freq_conv_func>asfreq_DTtoA)
+
+
+cdef int64_t asfreq_AtoQ(int64_t ordinal, asfreq_info *af_info):
+ return transform_via_day(ordinal, af_info,
+ <freq_conv_func>asfreq_AtoDT,
+ <freq_conv_func>asfreq_DTtoQ)
+
+
+cdef int64_t asfreq_AtoM(int64_t ordinal, asfreq_info *af_info):
+ return transform_via_day(ordinal, af_info,
+ <freq_conv_func>asfreq_AtoDT,
+ <freq_conv_func>asfreq_DTtoM)
+
+
+cdef int64_t asfreq_AtoW(int64_t ordinal, asfreq_info *af_info):
+ return transform_via_day(ordinal, af_info,
+ <freq_conv_func>asfreq_AtoDT,
+ <freq_conv_func>asfreq_DTtoW)
+
+
+# ----------------------------------------------------------------------
+# Conversion _from_ Quarterly Freq
+
+cdef int64_t asfreq_QtoQ(int64_t ordinal, asfreq_info *af_info):
+ return transform_via_day(ordinal, af_info,
+ <freq_conv_func>asfreq_QtoDT,
+ <freq_conv_func>asfreq_DTtoQ)
+
+
+cdef int64_t asfreq_QtoA(int64_t ordinal, asfreq_info *af_info):
+ return transform_via_day(ordinal, af_info,
+ <freq_conv_func>asfreq_QtoDT,
+ <freq_conv_func>asfreq_DTtoA)
+
+
+cdef int64_t asfreq_QtoM(int64_t ordinal, asfreq_info *af_info):
+ return transform_via_day(ordinal, af_info,
+ <freq_conv_func>asfreq_QtoDT,
+ <freq_conv_func>asfreq_DTtoM)
+
+
+cdef int64_t asfreq_QtoW(int64_t ordinal, asfreq_info *af_info):
+ return transform_via_day(ordinal, af_info,
+ <freq_conv_func>asfreq_QtoDT,
+ <freq_conv_func>asfreq_DTtoW)
+
+
+# ----------------------------------------------------------------------
+# Conversion _from_ Monthly Freq
+
+cdef int64_t asfreq_MtoA(int64_t ordinal, asfreq_info *af_info):
+ return transform_via_day(ordinal, af_info,
+ <freq_conv_func>asfreq_MtoDT,
+ <freq_conv_func>asfreq_DTtoA)
+
+
+cdef int64_t asfreq_MtoQ(int64_t ordinal, asfreq_info *af_info):
+ return transform_via_day(ordinal, af_info,
+ <freq_conv_func>asfreq_MtoDT,
+ <freq_conv_func>asfreq_DTtoQ)
+
+
+cdef int64_t asfreq_MtoW(int64_t ordinal, asfreq_info *af_info):
+ return transform_via_day(ordinal, af_info,
+ <freq_conv_func>asfreq_MtoDT,
+ <freq_conv_func>asfreq_DTtoW)
+
+
+# ----------------------------------------------------------------------
+# Conversion _from_ Weekly Freq
+
+cdef int64_t asfreq_WtoA(int64_t ordinal, asfreq_info *af_info):
+ return transform_via_day(ordinal, af_info,
+ <freq_conv_func>asfreq_WtoDT,
+ <freq_conv_func>asfreq_DTtoA)
+
+
+cdef int64_t asfreq_WtoQ(int64_t ordinal, asfreq_info *af_info):
+ return transform_via_day(ordinal, af_info,
+ <freq_conv_func>asfreq_WtoDT,
+ <freq_conv_func>asfreq_DTtoQ)
+
+
+cdef int64_t asfreq_WtoM(int64_t ordinal, asfreq_info *af_info):
+ return transform_via_day(ordinal, af_info,
+ <freq_conv_func>asfreq_WtoDT,
+ <freq_conv_func>asfreq_DTtoM)
+
+
+cdef int64_t asfreq_WtoW(int64_t ordinal, asfreq_info *af_info):
+ return transform_via_day(ordinal, af_info,
+ <freq_conv_func>asfreq_WtoDT,
+ <freq_conv_func>asfreq_DTtoW)
+
+
+# ----------------------------------------------------------------------
+
+cdef char* c_strftime(npy_datetimestruct *dts, char *fmt):
+ """
+ Generate a nice string representation of the period
+ object, originally from DateObject_strftime
+
+ Parameters
+ ----------
+ dts : npy_datetimestruct*
+ fmt : char*
+
+ Returns
+ -------
+ result : char*
+ """
+ cdef:
+ tm c_date
+ char *result
+ int result_len = strlen(fmt) + 50
+
+ c_date.tm_sec = dts.sec
+ c_date.tm_min = dts.min
+ c_date.tm_hour = dts.hour
+ c_date.tm_mday = dts.day
+ c_date.tm_mon = dts.month - 1
+ c_date.tm_year = dts.year - 1900
+ c_date.tm_wday = (dayofweek(dts.year, dts.month, dts.day) + 1) % 7
+ c_date.tm_yday = get_day_of_year(dts.year, dts.month, dts.day) - 1
+ c_date.tm_isdst = -1
+
+ result = <char*>malloc(result_len * sizeof(char))
+
+ strftime(result, result_len, fmt, &c_date)
+
+ return result
+
+
+# ----------------------------------------------------------------------
+# Conversion between date_info and npy_datetimestruct
+
+cdef inline int get_freq_group(int freq) nogil:
+ return (freq // 1000) * 1000
+
+
+cdef inline int get_freq_group_index(int freq) nogil:
+ return freq // 1000
+
+
+# Find the unix_date (days elapsed since datetime(1970, 1, 1))
+# for the given year/month/day.
+# Assumes GREGORIAN_CALENDAR.
+cdef int64_t unix_date_from_ymd(int year, int month, int day) nogil:
+ # Calculate the absolute date
+ cdef:
+ npy_datetimestruct dts
+ int64_t unix_date
+
+ memset(&dts, 0, sizeof(npy_datetimestruct))
+ dts.year = year
+ dts.month = month
+ dts.day = day
+ unix_date = npy_datetimestruct_to_datetime(NPY_FR_D, &dts)
+ return unix_date
+
+
+# specifically _don't_ use cdivision or else ordinals near -1 are assigned to
+# incorrect dates GH#19643
+cdef int64_t get_period_ordinal(npy_datetimestruct *dts, int freq) nogil:
+ """
+ Generate an ordinal in period space
+
+ Parameters
+ ----------
+ dts: npy_datetimestruct*
+ freq : int
+
+ Returns
+ -------
+ period_ordinal : int64_t
+ """
+ cdef:
+ int64_t unix_date, seconds, delta
+ int64_t weeks
+ int64_t day_adj
+ int freq_group, fmonth, mdiff
+
+ freq_group = get_freq_group(freq)
+
+ if freq_group == FR_ANN:
+ fmonth = freq - FR_ANN
+ if fmonth == 0:
+ fmonth = 12
+
+ mdiff = dts.month - fmonth
+ if mdiff <= 0:
+ return dts.year - 1970
+ else:
+ return dts.year - 1970 + 1
+
+ elif freq_group == FR_QTR:
+ fmonth = freq - FR_QTR
+ if fmonth == 0:
+ fmonth = 12
+
+ mdiff = dts.month - fmonth
+ # TODO: Aren't the next two conditions equivalent to
+ # unconditional incrementing?
+ if mdiff < 0:
+ mdiff += 12
+ if dts.month >= fmonth:
+ mdiff += 12
+
+ return (dts.year - 1970) * 4 + (mdiff - 1) // 3
+
+ elif freq == FR_MTH:
+ return (dts.year - 1970) * 12 + dts.month - 1
+
+ unix_date = npy_datetimestruct_to_datetime(NPY_FR_D, dts)
+
+ if freq >= FR_SEC:
+ seconds = unix_date * 86400 + dts.hour * 3600 + dts.min * 60 + dts.sec
+
+ if freq == FR_MS:
+ return seconds * 1000 + dts.us // 1000
+
+ elif freq == FR_US:
+ return seconds * 1000000 + dts.us
+
+ elif freq == FR_NS:
+ return (seconds * 1000000000 +
+ dts.us * 1000 + dts.ps // 1000)
+
+ else:
+ return seconds
+
+ elif freq == FR_MIN:
+ return unix_date * 1440 + dts.hour * 60 + dts.min
+
+ elif freq == FR_HR:
+ return unix_date * 24 + dts.hour
+
+ elif freq == FR_DAY:
+ return unix_date
+
+ elif freq == FR_UND:
+ return unix_date
+
+ elif freq == FR_BUS:
+ # calculate the current week (counting from 1970-01-01) treating
+ # sunday as last day of a week
+ weeks = (unix_date + 3) // 7
+ # calculate the current weekday (in range 1 .. 7)
+ delta = (unix_date + 3) % 7 + 1
+ # return the number of business days in full weeks plus the business
+ # days in the last - possible partial - week
+ if delta <= 5:
+ return (5 * weeks) + delta - 4
+ else:
+ return (5 * weeks) + (5 + 1) - 4
+
+ elif freq_group == FR_WK:
+ day_adj = freq - FR_WK
+ return (unix_date + 3 - day_adj) // 7 + 1
+
+ # raise ValueError
+
+
+cdef void get_date_info(int64_t ordinal, int freq,
+ npy_datetimestruct *dts) nogil:
+ cdef:
+ int64_t unix_date
+ double abstime
+
+ unix_date = get_unix_date(ordinal, freq)
+ abstime = get_abs_time(freq, unix_date, ordinal)
+
+ while abstime < 0:
+ abstime += 86400
+ unix_date -= 1
+
+ while abstime >= 86400:
+ abstime -= 86400
+ unix_date += 1
+
+ date_info_from_days_and_time(dts, unix_date, abstime)
+
+
+cdef int64_t get_unix_date(int64_t period_ordinal, int freq) nogil:
+ """
+ Returns the unix date of the period, i.e. the number of days elapsed
+ since January 1, 1970, as an integer.
+ When the instance has a frequency less than daily, the unix date
+ is calculated for the last day of the period.
+
+ Parameters
+ ----------
+ period_ordinal : int64_t
+ freq : int
+
+ Returns
+ -------
+ unix_date : int64_t number of days since datetime(1970, 1, 1)
+ """
+ cdef:
+ asfreq_info af_info
+ freq_conv_func toDaily = NULL
+
+ if freq == FR_DAY:
+ return period_ordinal
+
+ toDaily = get_asfreq_func(freq, FR_DAY)
+ get_asfreq_info(freq, FR_DAY, True, &af_info)
+ return toDaily(period_ordinal, &af_info)
+
+
+cdef void date_info_from_days_and_time(npy_datetimestruct *dts,
+ int64_t unix_date,
+ double abstime) nogil:
+ """
+ Set the instance's value using the given date and time.
+
+ Parameters
+ ----------
+ dts : npy_datetimestruct*
+ unix_date : int64_t
+ days elapsed since datetime(1970, 1, 1)
+ abstime : double
+ seconds elapsed since beginning of day described by unix_date
+
+ Notes
+ -----
+ Updates dts inplace
+ """
+ cdef:
+ int inttime
+ int hour, minute
+ double second, subsecond_fraction
+
+ # Bounds check
+ # The calling function is responsible for ensuring that
+ # abstime >= 0.0 and abstime <= 86400
+
+ # Calculate the date
+ pandas_datetime_to_datetimestruct(unix_date, NPY_FR_D, dts)
+
+ # Calculate the time
+ inttime = <int>abstime
+ hour = inttime / 3600
+ minute = (inttime % 3600) / 60
+ second = abstime - <double>(hour * 3600 + minute * 60)
+
+ dts.hour = hour
+ dts.min = minute
+ dts.sec = <int>second
+
+ subsecond_fraction = second - dts.sec
+ dts.us = int((subsecond_fraction) * 1e6)
+ dts.ps = int(((subsecond_fraction) * 1e6 - dts.us) * 1e6)
+
+
+cdef double get_abs_time(int freq, int64_t unix_date, int64_t ordinal) nogil:
+ cdef:
+ int freq_index, day_index, base_index
+ int64_t per_day, start_ord
+ double unit, result
+
+ if freq <= FR_DAY:
+ return 0
+
+ freq_index = freq // 1000
+ day_index = FR_DAY // 1000
+ base_index = FR_SEC // 1000
+
+ per_day = get_daytime_conversion_factor(day_index, freq_index)
+ unit = get_daytime_conversion_factor(freq_index, base_index)
+
+ if base_index < freq_index:
+ unit = 1 / unit
+
+ start_ord = unix_date * per_day
+ result = <double>(unit * (ordinal - start_ord))
+ return result
+
+
+cdef int get_yq(int64_t ordinal, int freq, int *quarter, int *year):
+ """
+ Find the year and quarter of a Period with the given ordinal and frequency
+
+ Parameters
+ ----------
+ ordinal : int64_t
+ freq : int
+ quarter : *int
+ year : *int
+
+ Returns
+ -------
+ qtr_freq : int
+ describes the implied quarterly frequency associated with `freq`
+
+ Notes
+ -----
+ Sets quarter and year inplace
+ """
+ cdef:
+ asfreq_info af_info
+ int qtr_freq
+ int64_t unix_date
+
+ unix_date = get_unix_date(ordinal, freq)
+
+ if get_freq_group(freq) == FR_QTR:
+ qtr_freq = freq
+ else:
+ qtr_freq = FR_QTR
+
+ assert (qtr_freq % 1000) <= 12
+ get_asfreq_info(FR_DAY, qtr_freq, True, &af_info)
+
+ quarter[0] = DtoQ_yq(unix_date, &af_info, year)
+ return qtr_freq
+
+
+cdef inline int month_to_quarter(int month):
+ return (month - 1) // 3 + 1
+
+
+# ----------------------------------------------------------------------
+# Period logic
+
+def dt64arr_to_periodarr(int64_t[:] dtarr, int freq, tz=None):
+ """
+ Convert array of datetime64 values (passed in as 'i8' dtype) to a set of
+ periods corresponding to desired frequency, per period convention.
+ """
+ cdef:
+ int64_t[:] out
+ Py_ssize_t i, l
+ npy_datetimestruct dts
+
+ l = len(dtarr)
+
+ out = np.empty(l, dtype='i8')
+
+ if tz is None:
+ with nogil:
+ for i in range(l):
+ if dtarr[i] == NPY_NAT:
+ out[i] = NPY_NAT
+ continue
+ dt64_to_dtstruct(dtarr[i], &dts)
+ out[i] = get_period_ordinal(&dts, freq)
+ else:
+ out = localize_dt64arr_to_period(dtarr, freq, tz)
+ return out.base # .base to access underlying np.ndarray
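+
+# Illustrative expected mapping (using the FR_MTH code 3000 defined above):
+# monthly ordinals count months elapsed since January 1970, so March 2018
+# maps to 48 * 12 + 2 = 578, and NaT passes through as NPY_NAT.
+#   >>> arr = np.array(['2018-03-15', 'NaT'], dtype='datetime64[ns]')
+#   >>> dt64arr_to_periodarr(arr.view('i8'), 3000)
+#   expected: array([578, NPY_NAT])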
+
+
+def periodarr_to_dt64arr(int64_t[:] periodarr, int freq):
+ """
+ Convert array to datetime64 values from a set of ordinals corresponding to
+ periods per period convention.
+ """
+ cdef:
+ int64_t[:] out
+ Py_ssize_t i, l
+
+ l = len(periodarr)
+
+ out = np.empty(l, dtype='i8')
+
+ with nogil:
+ for i in range(l):
+ if periodarr[i] == NPY_NAT:
+ out[i] = NPY_NAT
+ continue
+ out[i] = period_ordinal_to_dt64(periodarr[i], freq)
+
+ return out.base # .base to access underlying np.ndarray
+
+
+cpdef int64_t period_asfreq(int64_t ordinal, int freq1, int freq2, bint end):
+ """
+ Convert period ordinal from one frequency to another, and if upsampling,
+ choose to use start ('S') or end ('E') of period.
+ """
+ cdef:
+ int64_t retval
+ freq_conv_func func
+ asfreq_info af_info
+
+ if ordinal == NPY_NAT:
+ return NPY_NAT
+
+ func = get_asfreq_func(freq1, freq2)
+ get_asfreq_info(freq1, freq2, end, &af_info)
+ retval = func(ordinal, &af_info)
+
+ if retval == INT32_MIN:
+ raise ValueError('Frequency conversion failed')
+
+ return retval
+
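+# Illustrative use of period_asfreq, assuming the standard ordinal conventions
+# (monthly ordinals count months since 1970-01, annual ordinals count years
+# since 1970, FR_MTH being the monthly code from the frequencies module):
+# converting ordinal 360 (2000-01 at FR_MTH) to FR_ANN gives 30, i.e. the
+# year 2000.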
+
+cdef void get_asfreq_info(int from_freq, int to_freq,
+ bint is_end, asfreq_info *af_info) nogil:
+ """
+ Construct the `asfreq_info` object used to convert an ordinal from
+ `from_freq` to `to_freq`.
+
+ Parameters
+ ----------
+ from_freq : int
+    to_freq : int
+ is_end : bool
+ af_info : *asfreq_info
+ """
+ cdef:
+ int from_group = get_freq_group(from_freq)
+ int to_group = get_freq_group(to_freq)
+
+ af_info.is_end = is_end
+
+ af_info.intraday_conversion_factor = get_daytime_conversion_factor(
+ get_freq_group_index(max_value(from_group, FR_DAY)),
+ get_freq_group_index(max_value(to_group, FR_DAY)))
+
+ if from_group == FR_WK:
+ af_info.from_end = calc_week_end(from_freq, from_group)
+ elif from_group == FR_ANN:
+ af_info.from_end = calc_a_year_end(from_freq, from_group)
+ elif from_group == FR_QTR:
+ af_info.from_end = calc_a_year_end(from_freq, from_group)
+
+ if to_group == FR_WK:
+ af_info.to_end = calc_week_end(to_freq, to_group)
+ elif to_group == FR_ANN:
+ af_info.to_end = calc_a_year_end(to_freq, to_group)
+ elif to_group == FR_QTR:
+ af_info.to_end = calc_a_year_end(to_freq, to_group)
+
+
+cdef int calc_a_year_end(int freq, int group) nogil:
+ cdef:
+ int result = (freq - group) % 12
+ if result == 0:
+ return 12
+ else:
+ return result
+
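+# Sketch of calc_a_year_end, assuming the usual annual codes (A-DEC = 1000,
+# A-JAN = 1001, ...): (freq - group) % 12 is 0 for A-DEC, which maps to 12
+# (December), while A-JAN maps to 1 (January).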
+
+cdef inline int calc_week_end(int freq, int group) nogil:
+ return freq - group
+
+
+def period_asfreq_arr(ndarray[int64_t] arr, int freq1, int freq2, bint end):
+ """
+ Convert int64-array of period ordinals from one frequency to another, and
+ if upsampling, choose to use start ('S') or end ('E') of period.
+ """
+ cdef:
+ int64_t[:] result
+ Py_ssize_t i, n
+ freq_conv_func func
+ asfreq_info af_info
+ int64_t val
+
+ n = len(arr)
+ result = np.empty(n, dtype=np.int64)
+
+ func = get_asfreq_func(freq1, freq2)
+ get_asfreq_info(freq1, freq2, end, &af_info)
+
+ mask = arr == NPY_NAT
+ if mask.any(): # NaT process
+ for i in range(n):
+ val = arr[i]
+ if val != NPY_NAT:
+ val = func(val, &af_info)
+ if val == INT32_MIN:
+ raise ValueError("Unable to convert to desired frequency.")
+ result[i] = val
+ else:
+ for i in range(n):
+ val = func(arr[i], &af_info)
+ if val == INT32_MIN:
+ raise ValueError("Unable to convert to desired frequency.")
+ result[i] = val
+
+ return result.base # .base to access underlying np.ndarray
+
+
+cpdef int64_t period_ordinal(int y, int m, int d, int h, int min,
+ int s, int us, int ps, int freq):
+ """
+ Find the ordinal representation of the given datetime components at the
+ frequency `freq`.
+
+ Parameters
+ ----------
+ y : int
+ m : int
+ d : int
+ h : int
+ min : int
+ s : int
+ us : int
+    ps : int
+    freq : int
+
+ Returns
+ -------
+ ordinal : int64_t
+ """
+ cdef:
+ npy_datetimestruct dts
+ dts.year = y
+ dts.month = m
+ dts.day = d
+ dts.hour = h
+ dts.min = min
+ dts.sec = s
+ dts.us = us
+ dts.ps = ps
+ return get_period_ordinal(&dts, freq)
+
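+# Hypothetical spot check, assuming FR_DAY ordinals count days since the epoch:
+# period_ordinal(1970, 1, 1, 0, 0, 0, 0, 0, FR_DAY) should be 0 and
+# period_ordinal(1970, 1, 2, 0, 0, 0, 0, 0, FR_DAY) should be 1.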
+
+cpdef int64_t period_ordinal_to_dt64(int64_t ordinal, int freq) nogil:
+ cdef:
+ npy_datetimestruct dts
+
+ if ordinal == NPY_NAT:
+ return NPY_NAT
+
+ get_date_info(ordinal, freq, &dts)
+ return dtstruct_to_dt64(&dts)
+
+
+def period_format(int64_t value, int freq, object fmt=None):
+ cdef:
+ int freq_group
+
+ if value == NPY_NAT:
+ return repr(NaT)
+
+ if fmt is None:
+ freq_group = get_freq_group(freq)
+ if freq_group == 1000: # FR_ANN
+ fmt = b'%Y'
+ elif freq_group == 2000: # FR_QTR
+ fmt = b'%FQ%q'
+ elif freq_group == 3000: # FR_MTH
+ fmt = b'%Y-%m'
+ elif freq_group == 4000: # WK
+ left = period_asfreq(value, freq, 6000, 0)
+ right = period_asfreq(value, freq, 6000, 1)
+ return '%s/%s' % (period_format(left, 6000),
+ period_format(right, 6000))
+ elif (freq_group == 5000 # BUS
+ or freq_group == 6000): # DAY
+ fmt = b'%Y-%m-%d'
+ elif freq_group == 7000: # HR
+ fmt = b'%Y-%m-%d %H:00'
+ elif freq_group == 8000: # MIN
+ fmt = b'%Y-%m-%d %H:%M'
+ elif freq_group == 9000: # SEC
+ fmt = b'%Y-%m-%d %H:%M:%S'
+ elif freq_group == 10000: # MILLISEC
+ fmt = b'%Y-%m-%d %H:%M:%S.%l'
+ elif freq_group == 11000: # MICROSEC
+ fmt = b'%Y-%m-%d %H:%M:%S.%u'
+ elif freq_group == 12000: # NANOSEC
+ fmt = b'%Y-%m-%d %H:%M:%S.%n'
+ else:
+ raise ValueError('Unknown freq: {freq}'.format(freq=freq))
+
+ return _period_strftime(value, freq, fmt)
+
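+# A small usage sketch, assuming daily ordinal 0 is the epoch day:
+# period_format(0, FR_DAY) picks the '%Y-%m-%d' branch above and returns
+# '1970-01-01'.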
+
+cdef list extra_fmts = [(b"%q", b"^`AB`^"),
+ (b"%f", b"^`CD`^"),
+ (b"%F", b"^`EF`^"),
+ (b"%l", b"^`GH`^"),
+ (b"%u", b"^`IJ`^"),
+ (b"%n", b"^`KL`^")]
+
+cdef list str_extra_fmts = ["^`AB`^", "^`CD`^", "^`EF`^",
+ "^`GH`^", "^`IJ`^", "^`KL`^"]
+
+cdef object _period_strftime(int64_t value, int freq, object fmt):
+ cdef:
+ Py_ssize_t i
+ npy_datetimestruct dts
+ char *formatted
+ object pat, repl, result
+ list found_pat = [False] * len(extra_fmts)
+ int year, quarter
+
+ if isinstance(fmt, unicode):
+ fmt = fmt.encode('utf-8')
+
+ get_date_info(value, freq, &dts)
+ for i in range(len(extra_fmts)):
+ pat = extra_fmts[i][0]
+ repl = extra_fmts[i][1]
+ if pat in fmt:
+ fmt = fmt.replace(pat, repl)
+ found_pat[i] = True
+
+ formatted = c_strftime(&dts, <char*>fmt)
+
+ result = util.char_to_string(formatted)
+ free(formatted)
+
+ for i in range(len(extra_fmts)):
+ if found_pat[i]:
+ if get_yq(value, freq, &quarter, &year) < 0:
+ raise ValueError('Unable to get quarter and year')
+
+ if i == 0:
+ repl = '%d' % quarter
+ elif i == 1: # %f, 2-digit year
+ repl = '%.2d' % (year % 100)
+ elif i == 2:
+ repl = '%d' % year
+ elif i == 3:
+ repl = '%03d' % (value % 1000)
+ elif i == 4:
+ repl = '%06d' % (value % 1000000)
+ elif i == 5:
+ repl = '%09d' % (value % 1000000000)
+
+ result = result.replace(str_extra_fmts[i], repl)
+
+ if PY2:
+ result = result.decode('utf-8', 'ignore')
+
+ return result
+
+
+# ----------------------------------------------------------------------
+# period accessors
+
+ctypedef int (*accessor)(int64_t ordinal, int freq) except INT32_MIN
+
+
+cdef int pyear(int64_t ordinal, int freq):
+ cdef:
+ npy_datetimestruct dts
+ get_date_info(ordinal, freq, &dts)
+ return dts.year
+
+
+cdef int pqyear(int64_t ordinal, int freq):
+ cdef:
+ int year, quarter
+ get_yq(ordinal, freq, &quarter, &year)
+ return year
+
+
+cdef int pquarter(int64_t ordinal, int freq):
+ cdef:
+ int year, quarter
+ get_yq(ordinal, freq, &quarter, &year)
+ return quarter
+
+
+cdef int pmonth(int64_t ordinal, int freq):
+ cdef:
+ npy_datetimestruct dts
+ get_date_info(ordinal, freq, &dts)
+ return dts.month
+
+
+cdef int pday(int64_t ordinal, int freq):
+ cdef:
+ npy_datetimestruct dts
+ get_date_info(ordinal, freq, &dts)
+ return dts.day
+
+
+cdef int pweekday(int64_t ordinal, int freq):
+ cdef:
+ npy_datetimestruct dts
+ get_date_info(ordinal, freq, &dts)
+ return dayofweek(dts.year, dts.month, dts.day)
+
+
+cdef int pday_of_year(int64_t ordinal, int freq):
+ cdef:
+ npy_datetimestruct dts
+ get_date_info(ordinal, freq, &dts)
+ return get_day_of_year(dts.year, dts.month, dts.day)
+
+
+cdef int pweek(int64_t ordinal, int freq):
+ cdef:
+ npy_datetimestruct dts
+ get_date_info(ordinal, freq, &dts)
+ return ccalendar.get_week_of_year(dts.year, dts.month, dts.day)
+
+
+cdef int phour(int64_t ordinal, int freq):
+ cdef:
+ npy_datetimestruct dts
+ get_date_info(ordinal, freq, &dts)
+ return dts.hour
+
+
+cdef int pminute(int64_t ordinal, int freq):
+ cdef:
+ npy_datetimestruct dts
+ get_date_info(ordinal, freq, &dts)
+ return dts.min
+
+
+cdef int psecond(int64_t ordinal, int freq):
+ cdef:
+ npy_datetimestruct dts
+ get_date_info(ordinal, freq, &dts)
+ return <int>dts.sec
+
+
+cdef int pdays_in_month(int64_t ordinal, int freq):
+ cdef:
+ npy_datetimestruct dts
+ get_date_info(ordinal, freq, &dts)
+ return ccalendar.get_days_in_month(dts.year, dts.month)
+
+
+def get_period_field_arr(int code, int64_t[:] arr, int freq):
+ cdef:
+ Py_ssize_t i, sz
+ int64_t[:] out
+ accessor f
+
+ func = _get_accessor_func(code)
+ if func is NULL:
+ raise ValueError('Unrecognized period code: {code}'.format(code=code))
+
+ sz = len(arr)
+ out = np.empty(sz, dtype=np.int64)
+
+ for i in range(sz):
+ if arr[i] == NPY_NAT:
+ out[i] = -1
+ continue
+ out[i] = func(arr[i], freq)
+
+ return out.base # .base to access underlying np.ndarray
+
+
+cdef accessor _get_accessor_func(int code):
+ if code == 0:
+ return <accessor>pyear
+ elif code == 1:
+ return <accessor>pqyear
+ elif code == 2:
+ return <accessor>pquarter
+ elif code == 3:
+ return <accessor>pmonth
+ elif code == 4:
+ return <accessor>pday
+ elif code == 5:
+ return <accessor>phour
+ elif code == 6:
+ return <accessor>pminute
+ elif code == 7:
+ return <accessor>psecond
+ elif code == 8:
+ return <accessor>pweek
+ elif code == 9:
+ return <accessor>pday_of_year
+ elif code == 10:
+ return <accessor>pweekday
+ elif code == 11:
+ return <accessor>pdays_in_month
+ return NULL
+
+
+def extract_ordinals(ndarray[object] values, freq):
+ # TODO: Change type to const object[:] when Cython supports that.
+
+ cdef:
+ Py_ssize_t i, n = len(values)
+ int64_t[:] ordinals = np.empty(n, dtype=np.int64)
+ object p
+
+ freqstr = Period._maybe_convert_freq(freq).freqstr
+
+ for i in range(n):
+ p = values[i]
+
+ if is_null_datetimelike(p):
+ ordinals[i] = NPY_NAT
+ else:
+ try:
+ ordinals[i] = p.ordinal
+
+ if p.freqstr != freqstr:
+ msg = DIFFERENT_FREQ.format(cls="PeriodIndex",
+ own_freq=freqstr,
+ other_freq=p.freqstr)
+ raise IncompatibleFrequency(msg)
+
+ except AttributeError:
+ p = Period(p, freq=freq)
+ if p is NaT:
+ # input may contain NaT-like string
+ ordinals[i] = NPY_NAT
+ else:
+ ordinals[i] = p.ordinal
+
+ return ordinals.base # .base to access underlying np.ndarray
+
+
+def extract_freq(ndarray[object] values):
+ # TODO: Change type to const object[:] when Cython supports that.
+
+ cdef:
+ Py_ssize_t i, n = len(values)
+ object p
+
+ for i in range(n):
+ p = values[i]
+
+ try:
+ # now Timestamp / NaT has freq attr
+ if is_period_object(p):
+ return p.freq
+ except AttributeError:
+ pass
+
+ raise ValueError('freq not specified and cannot be inferred')
+
+
+# -----------------------------------------------------------------------
+# period helpers
+
+cdef int64_t[:] localize_dt64arr_to_period(int64_t[:] stamps,
+ int freq, object tz):
+ cdef:
+ Py_ssize_t n = len(stamps)
+ int64_t[:] result = np.empty(n, dtype=np.int64)
+ ndarray[int64_t] trans
+ int64_t[:] deltas
+ Py_ssize_t[:] pos
+ npy_datetimestruct dts
+ int64_t local_val
+
+ if is_utc(tz) or tz is None:
+ with nogil:
+ for i in range(n):
+ if stamps[i] == NPY_NAT:
+ result[i] = NPY_NAT
+ continue
+ dt64_to_dtstruct(stamps[i], &dts)
+ result[i] = get_period_ordinal(&dts, freq)
+
+ elif is_tzlocal(tz):
+ for i in range(n):
+ if stamps[i] == NPY_NAT:
+ result[i] = NPY_NAT
+ continue
+ local_val = tz_convert_utc_to_tzlocal(stamps[i], tz)
+ dt64_to_dtstruct(local_val, &dts)
+ result[i] = get_period_ordinal(&dts, freq)
+ else:
+ # Adjust datetime64 timestamp, recompute datetimestruct
+ trans, deltas, typ = get_dst_info(tz)
+
+ if typ not in ['pytz', 'dateutil']:
+ # static/fixed; in this case we know that len(delta) == 1
+ for i in range(n):
+ if stamps[i] == NPY_NAT:
+ result[i] = NPY_NAT
+ continue
+ dt64_to_dtstruct(stamps[i] + deltas[0], &dts)
+ result[i] = get_period_ordinal(&dts, freq)
+ else:
+ pos = trans.searchsorted(stamps, side='right') - 1
+
+ for i in range(n):
+ if stamps[i] == NPY_NAT:
+ result[i] = NPY_NAT
+ continue
+ dt64_to_dtstruct(stamps[i] + deltas[pos[i]], &dts)
+ result[i] = get_period_ordinal(&dts, freq)
+
+ return result
+
+
+DIFFERENT_FREQ = ("Input has different freq={other_freq} "
+ "from {cls}(freq={own_freq})")
+
+
+class IncompatibleFrequency(ValueError):
+ pass
+
+
+cdef class _Period(object):
+
+ cdef readonly:
+ int64_t ordinal
+ object freq
+
+ _typ = 'period'
+
+ def __cinit__(self, ordinal, freq):
+ self.ordinal = ordinal
+ self.freq = freq
+
+ @classmethod
+ def _maybe_convert_freq(cls, object freq):
+ if isinstance(freq, (int, tuple)):
+ code, stride = get_freq_code(freq)
+ freq = get_freq_str(code, stride)
+
+ freq = to_offset(freq)
+
+ if freq.n <= 0:
+ raise ValueError('Frequency must be positive, because it'
+ ' represents span: {freqstr}'
+ .format(freqstr=freq.freqstr))
+
+ return freq
+
+ @classmethod
+ def _from_ordinal(cls, ordinal, freq):
+ """
+ Fast creation from an ordinal and freq that are already validated!
+ """
+ if ordinal == NPY_NAT:
+ return NaT
+ else:
+ freq = cls._maybe_convert_freq(freq)
+ self = _Period.__new__(cls, ordinal, freq)
+ return self
+
+ def __richcmp__(self, other, op):
+ if is_period_object(other):
+ if other.freq != self.freq:
+ msg = DIFFERENT_FREQ.format(cls=type(self).__name__,
+ own_freq=self.freqstr,
+ other_freq=other.freqstr)
+ raise IncompatibleFrequency(msg)
+ return PyObject_RichCompareBool(self.ordinal, other.ordinal, op)
+ elif other is NaT:
+ return _nat_scalar_rules[op]
+ # index/series like
+ elif hasattr(other, '_typ'):
+ return NotImplemented
+ else:
+ if op == Py_EQ:
+ return NotImplemented
+ elif op == Py_NE:
+ return NotImplemented
+ raise TypeError('Cannot compare type {cls} with type {typ}'
+ .format(cls=type(self).__name__,
+ typ=type(other).__name__))
+
+ def __hash__(self):
+ return hash((self.ordinal, self.freqstr))
+
+ def _add_delta(self, other):
+ cdef:
+ int64_t nanos, offset_nanos
+
+ if (PyDelta_Check(other) or util.is_timedelta64_object(other) or
+ isinstance(other, _Tick)):
+ offset = to_offset(self.freq.rule_code)
+ if isinstance(offset, _Tick):
+ nanos = delta_to_nanoseconds(other)
+ offset_nanos = delta_to_nanoseconds(offset)
+ if nanos % offset_nanos == 0:
+ ordinal = self.ordinal + (nanos // offset_nanos)
+ return Period(ordinal=ordinal, freq=self.freq)
+ msg = 'Input cannot be converted to Period(freq={0})'
+ raise IncompatibleFrequency(msg.format(self.freqstr))
+ elif util.is_offset_object(other):
+ freqstr = other.rule_code
+ base = get_base_alias(freqstr)
+ if base == self.freq.rule_code:
+ ordinal = self.ordinal + other.n
+ return Period(ordinal=ordinal, freq=self.freq)
+ msg = DIFFERENT_FREQ.format(cls=type(self).__name__,
+ own_freq=self.freqstr,
+ other_freq=other.freqstr)
+ raise IncompatibleFrequency(msg)
+        else:  # pragma: no cover
+ return NotImplemented
+
+ def __add__(self, other):
+ if is_period_object(self):
+ if (PyDelta_Check(other) or util.is_timedelta64_object(other) or
+ util.is_offset_object(other)):
+ return self._add_delta(other)
+ elif other is NaT:
+ return NaT
+ elif util.is_integer_object(other):
+ ordinal = self.ordinal + other * self.freq.n
+ return Period(ordinal=ordinal, freq=self.freq)
+ elif (PyDateTime_Check(other) or
+ is_period_object(other) or util.is_datetime64_object(other)):
+ # can't add datetime-like
+ # GH#17983
+ sname = type(self).__name__
+ oname = type(other).__name__
+ raise TypeError("unsupported operand type(s) for +: '{self}' "
+ "and '{other}'".format(self=sname,
+ other=oname))
+ else: # pragma: no cover
+ return NotImplemented
+ elif is_period_object(other):
+ # this can be reached via __radd__ because of cython rules
+ return other + self
+ else:
+ return NotImplemented
+
+ def __sub__(self, other):
+ if is_period_object(self):
+ if (PyDelta_Check(other) or util.is_timedelta64_object(other) or
+ util.is_offset_object(other)):
+ neg_other = -other
+ return self + neg_other
+ elif util.is_integer_object(other):
+ ordinal = self.ordinal - other * self.freq.n
+ return Period(ordinal=ordinal, freq=self.freq)
+ elif is_period_object(other):
+ if other.freq != self.freq:
+ msg = DIFFERENT_FREQ.format(cls=type(self).__name__,
+ own_freq=self.freqstr,
+ other_freq=other.freqstr)
+ raise IncompatibleFrequency(msg)
+ # GH 23915 - mul by base freq since __add__ is agnostic of n
+ return (self.ordinal - other.ordinal) * self.freq.base
+ elif getattr(other, '_typ', None) == 'periodindex':
+ # GH#21314 PeriodIndex - Period returns an object-index
+ # of DateOffset objects, for which we cannot use __neg__
+ # directly, so we have to apply it pointwise
+ return other.__sub__(self).map(lambda x: -x)
+ else: # pragma: no cover
+ return NotImplemented
+ elif is_period_object(other):
+ if self is NaT:
+ return NaT
+ return NotImplemented
+ else:
+ return NotImplemented
+
+ def asfreq(self, freq, how='E'):
+ """
+        Convert Period to the desired frequency, at either the start or end of
+        the interval.
+
+ Parameters
+ ----------
+ freq : string
+ how : {'E', 'S', 'end', 'start'}, default 'end'
+ Start or end of the timespan
+
+ Returns
+ -------
+ resampled : Period
+ """
+ freq = self._maybe_convert_freq(freq)
+ how = _validate_end_alias(how)
+ base1, mult1 = get_freq_code(self.freq)
+ base2, mult2 = get_freq_code(freq)
+
+ # mult1 can't be negative or 0
+ end = how == 'E'
+ if end:
+ ordinal = self.ordinal + mult1 - 1
+ else:
+ ordinal = self.ordinal
+ ordinal = period_asfreq(ordinal, base1, base2, end)
+
+ return Period(ordinal=ordinal, freq=freq)
+
+ @property
+ def start_time(self):
+ """
+ Get the Timestamp for the start of the period.
+
+ Returns
+ -------
+ Timestamp
+
+ See Also
+ --------
+ Period.end_time : Return the end Timestamp.
+ Period.dayofyear : Return the day of year.
+ Period.daysinmonth : Return the days in that month.
+ Period.dayofweek : Return the day of the week.
+
+ Examples
+ --------
+ >>> period = pd.Period('2012-1-1', freq='D')
+ >>> period
+ Period('2012-01-01', 'D')
+
+ >>> period.start_time
+ Timestamp('2012-01-01 00:00:00')
+
+ >>> period.end_time
+ Timestamp('2012-01-01 23:59:59.999999999')
+ """
+ return self.to_timestamp(how='S')
+
+ @property
+ def end_time(self):
+ # freq.n can't be negative or 0
+ # ordinal = (self + self.freq.n).start_time.value - 1
+ ordinal = (self + self.freq).start_time.value - 1
+ return Timestamp(ordinal)
+
+ def to_timestamp(self, freq=None, how='start', tz=None):
+ """
+ Return the Timestamp representation of the Period at the target
+ frequency at the specified end (how) of the Period
+
+ Parameters
+ ----------
+ freq : string or DateOffset
+ Target frequency. Default is 'D' if self.freq is week or
+ longer and 'S' otherwise
+        how : str, default 'S' (start)
+            One of 'S' or 'E'. Case-insensitive aliases 'Start', 'Finish',
+            'Begin' and 'End' are also accepted.
+        tz : timezone, optional
+            Time zone for the returned Timestamp.
+
+ Returns
+ -------
+ Timestamp
+ """
+ if freq is not None:
+ freq = self._maybe_convert_freq(freq)
+ how = _validate_end_alias(how)
+
+ end = how == 'E'
+ if end:
+ endpoint = (self + self.freq).to_timestamp(how='start')
+ return endpoint - Timedelta(1, 'ns')
+
+ if freq is None:
+ base, mult = get_freq_code(self.freq)
+ freq = get_to_timestamp_base(base)
+
+ base, mult = get_freq_code(freq)
+ val = self.asfreq(freq, how)
+
+ dt64 = period_ordinal_to_dt64(val.ordinal, base)
+ return Timestamp(dt64, tz=tz)
+
+ @property
+ def year(self):
+ base, mult = get_freq_code(self.freq)
+ return pyear(self.ordinal, base)
+
+ @property
+ def month(self):
+ base, mult = get_freq_code(self.freq)
+ return pmonth(self.ordinal, base)
+
+ @property
+ def day(self):
+ """
+ Get day of the month that a Period falls on.
+
+ Returns
+ -------
+ int
+
+ See Also
+ --------
+ Period.dayofweek : Get the day of the week.
+ Period.dayofyear : Get the day of the year.
+
+ Examples
+ --------
+ >>> p = pd.Period("2018-03-11", freq='H')
+ >>> p.day
+ 11
+ """
+ base, mult = get_freq_code(self.freq)
+ return pday(self.ordinal, base)
+
+ @property
+ def hour(self):
+ """
+ Get the hour of the day component of the Period.
+
+ Returns
+ -------
+ int
+ The hour as an integer, between 0 and 23.
+
+ See Also
+ --------
+ Period.second : Get the second component of the Period.
+ Period.minute : Get the minute component of the Period.
+
+ Examples
+ --------
+ >>> p = pd.Period("2018-03-11 13:03:12.050000")
+ >>> p.hour
+ 13
+
+ Period longer than a day
+
+ >>> p = pd.Period("2018-03-11", freq="M")
+ >>> p.hour
+ 0
+ """
+ base, mult = get_freq_code(self.freq)
+ return phour(self.ordinal, base)
+
+ @property
+ def minute(self):
+ """
+ Get minute of the hour component of the Period.
+
+ Returns
+ -------
+ int
+ The minute as an integer, between 0 and 59.
+
+ See Also
+ --------
+ Period.hour : Get the hour component of the Period.
+ Period.second : Get the second component of the Period.
+
+ Examples
+ --------
+ >>> p = pd.Period("2018-03-11 13:03:12.050000")
+ >>> p.minute
+ 3
+ """
+ base, mult = get_freq_code(self.freq)
+ return pminute(self.ordinal, base)
+
+ @property
+ def second(self):
+ """
+ Get the second component of the Period.
+
+ Returns
+ -------
+ int
+ The second of the Period (ranges from 0 to 59).
+
+ See Also
+ --------
+ Period.hour : Get the hour component of the Period.
+ Period.minute : Get the minute component of the Period.
+
+ Examples
+ --------
+ >>> p = pd.Period("2018-03-11 13:03:12.050000")
+ >>> p.second
+ 12
+ """
+ base, mult = get_freq_code(self.freq)
+ return psecond(self.ordinal, base)
+
+ @property
+ def weekofyear(self):
+ base, mult = get_freq_code(self.freq)
+ return pweek(self.ordinal, base)
+
+ @property
+ def week(self):
+ """
+ Get the week of the year on the given Period.
+
+ Returns
+ -------
+ int
+
+ See Also
+ --------
+ Period.dayofweek : Get the day component of the Period.
+ Period.weekday : Get the day component of the Period.
+
+ Examples
+ --------
+ >>> p = pd.Period("2018-03-11", "H")
+ >>> p.week
+ 10
+
+ >>> p = pd.Period("2018-02-01", "D")
+ >>> p.week
+ 5
+
+ >>> p = pd.Period("2018-01-06", "D")
+ >>> p.week
+ 1
+ """
+ return self.weekofyear
+
+ @property
+ def dayofweek(self):
+ """
+ Day of the week the period lies in, with Monday=0 and Sunday=6.
+
+ If the period frequency is lower than daily (e.g. hourly), and the
+ period spans over multiple days, the day at the start of the period is
+ used.
+
+ If the frequency is higher than daily (e.g. monthly), the last day
+ of the period is used.
+
+ Returns
+ -------
+ int
+ Day of the week.
+
+ See Also
+ --------
+ Period.dayofweek : Day of the week the period lies in.
+ Period.weekday : Alias of Period.dayofweek.
+ Period.day : Day of the month.
+ Period.dayofyear : Day of the year.
+
+ Examples
+ --------
+ >>> per = pd.Period('2017-12-31 22:00', 'H')
+ >>> per.dayofweek
+ 6
+
+ For periods that span over multiple days, the day at the beginning of
+ the period is returned.
+
+ >>> per = pd.Period('2017-12-31 22:00', '4H')
+ >>> per.dayofweek
+ 6
+ >>> per.start_time.dayofweek
+ 6
+
+ For periods with a frequency higher than days, the last day of the
+ period is returned.
+
+ >>> per = pd.Period('2018-01', 'M')
+ >>> per.dayofweek
+ 2
+ >>> per.end_time.dayofweek
+ 2
+ """
+ base, mult = get_freq_code(self.freq)
+ return pweekday(self.ordinal, base)
+
+ @property
+ def weekday(self):
+ """
+ Day of the week the period lies in, with Monday=0 and Sunday=6.
+
+ If the period frequency is lower than daily (e.g. hourly), and the
+ period spans over multiple days, the day at the start of the period is
+ used.
+
+ If the frequency is higher than daily (e.g. monthly), the last day
+ of the period is used.
+
+ Returns
+ -------
+ int
+ Day of the week.
+
+ See Also
+ --------
+ Period.dayofweek : Day of the week the period lies in.
+ Period.weekday : Alias of Period.dayofweek.
+ Period.day : Day of the month.
+ Period.dayofyear : Day of the year.
+
+ Examples
+ --------
+ >>> per = pd.Period('2017-12-31 22:00', 'H')
+ >>> per.dayofweek
+ 6
+
+ For periods that span over multiple days, the day at the beginning of
+ the period is returned.
+
+ >>> per = pd.Period('2017-12-31 22:00', '4H')
+ >>> per.dayofweek
+ 6
+ >>> per.start_time.dayofweek
+ 6
+
+ For periods with a frequency higher than days, the last day of the
+ period is returned.
+
+ >>> per = pd.Period('2018-01', 'M')
+ >>> per.dayofweek
+ 2
+ >>> per.end_time.dayofweek
+ 2
+ """
+ # Docstring is a duplicate from dayofweek. Reusing docstrings with
+ # Appender doesn't work for properties in Cython files, and setting
+ # the __doc__ attribute is also not possible.
+ return self.dayofweek
+
+ @property
+ def dayofyear(self):
+ """
+ Return the day of the year.
+
+ This attribute returns the day of the year on which the particular
+ date occurs. The return value ranges between 1 to 365 for regular
+ years and 1 to 366 for leap years.
+
+ Returns
+ -------
+ int
+ The day of year.
+
+ See Also
+ --------
+ Period.day : Return the day of the month.
+ Period.dayofweek : Return the day of week.
+ PeriodIndex.dayofyear : Return the day of year of all indexes.
+
+ Examples
+ --------
+ >>> period = pd.Period("2015-10-23", freq='H')
+ >>> period.dayofyear
+ 296
+ >>> period = pd.Period("2012-12-31", freq='D')
+ >>> period.dayofyear
+ 366
+ >>> period = pd.Period("2013-01-01", freq='D')
+ >>> period.dayofyear
+ 1
+ """
+ base, mult = get_freq_code(self.freq)
+ return pday_of_year(self.ordinal, base)
+
+ @property
+ def quarter(self):
+ base, mult = get_freq_code(self.freq)
+ return pquarter(self.ordinal, base)
+
+ @property
+ def qyear(self):
+ """
+ Fiscal year the Period lies in according to its starting-quarter.
+
+ The `year` and the `qyear` of the period will be the same if the fiscal
+ and calendar years are the same. When they are not, the fiscal year
+ can be different from the calendar year of the period.
+
+ Returns
+ -------
+ int
+ The fiscal year of the period.
+
+ See Also
+ --------
+ Period.year : Return the calendar year of the period.
+
+ Examples
+ --------
+ If the natural and fiscal year are the same, `qyear` and `year` will
+ be the same.
+
+ >>> per = pd.Period('2018Q1', freq='Q')
+ >>> per.qyear
+ 2018
+ >>> per.year
+ 2018
+
+ If the fiscal year starts in April (`Q-MAR`), the first quarter of
+        2018 will start in April 2017. `year` will then be 2017, but `qyear`
+        will be the fiscal year, 2018.
+
+ >>> per = pd.Period('2018Q1', freq='Q-MAR')
+ >>> per.start_time
+ Timestamp('2017-04-01 00:00:00')
+ >>> per.qyear
+ 2018
+ >>> per.year
+ 2017
+ """
+ base, mult = get_freq_code(self.freq)
+ return pqyear(self.ordinal, base)
+
+ @property
+ def days_in_month(self):
+ """
+ Get the total number of days in the month that this period falls on.
+
+ Returns
+ -------
+ int
+
+ See Also
+ --------
+ Period.daysinmonth : Gets the number of days in the month.
+ DatetimeIndex.daysinmonth : Gets the number of days in the month.
+ calendar.monthrange : Returns a tuple containing weekday
+ (0-6 ~ Mon-Sun) and number of days (28-31).
+
+ Examples
+ --------
+ >>> p = pd.Period('2018-2-17')
+ >>> p.days_in_month
+ 28
+
+ >>> pd.Period('2018-03-01').days_in_month
+ 31
+
+ Handles the leap year case as well:
+
+ >>> p = pd.Period('2016-2-17')
+ >>> p.days_in_month
+ 29
+ """
+ base, mult = get_freq_code(self.freq)
+ return pdays_in_month(self.ordinal, base)
+
+ @property
+ def daysinmonth(self):
+ """
+ Get the total number of days of the month that the Period falls in.
+
+ Returns
+ -------
+ int
+
+ See Also
+ --------
+ Period.days_in_month : Return the days of the month.
+ Period.dayofyear : Return the day of the year.
+
+ Examples
+ --------
+ >>> p = pd.Period("2018-03-11", freq='H')
+ >>> p.daysinmonth
+ 31
+ """
+ return self.days_in_month
+
+ @property
+ def is_leap_year(self):
+ return bool(is_leapyear(self.year))
+
+ @classmethod
+ def now(cls, freq=None):
+ return Period(datetime.now(), freq=freq)
+
+ # HACK IT UP AND YOU BETTER FIX IT SOON
+ def __str__(self):
+ return self.__unicode__()
+
+ @property
+ def freqstr(self):
+ return self.freq.freqstr
+
+ def __repr__(self):
+ base, mult = get_freq_code(self.freq)
+ formatted = period_format(self.ordinal, base)
+ return "Period('%s', '%s')" % (formatted, self.freqstr)
+
+ def __unicode__(self):
+ """
+        Return a string representation for a particular Period.
+
+        Invoked by unicode(period) in py2 only. Yields a Unicode string in
+        both py2/py3.
+ """
+ base, mult = get_freq_code(self.freq)
+ formatted = period_format(self.ordinal, base)
+ value = ("%s" % formatted)
+ return value
+
+ def __setstate__(self, state):
+ self.freq = state[1]
+ self.ordinal = state[2]
+
+ def __reduce__(self):
+ object_state = None, self.freq, self.ordinal
+ return (Period, object_state)
+
+ def strftime(self, fmt):
+ """
+ Returns the string representation of the :class:`Period`, depending
+ on the selected ``fmt``. ``fmt`` must be a string
+ containing one or several directives. The method recognizes the same
+ directives as the :func:`time.strftime` function of the standard Python
+ distribution, as well as the specific additional directives ``%f``,
+        ``%F``, ``%q``. (formatting & docs originally from scikits.timeseries)
+
+ +-----------+--------------------------------+-------+
+ | Directive | Meaning | Notes |
+ +===========+================================+=======+
+ | ``%a`` | Locale's abbreviated weekday | |
+ | | name. | |
+ +-----------+--------------------------------+-------+
+ | ``%A`` | Locale's full weekday name. | |
+ +-----------+--------------------------------+-------+
+ | ``%b`` | Locale's abbreviated month | |
+ | | name. | |
+ +-----------+--------------------------------+-------+
+ | ``%B`` | Locale's full month name. | |
+ +-----------+--------------------------------+-------+
+ | ``%c`` | Locale's appropriate date and | |
+ | | time representation. | |
+ +-----------+--------------------------------+-------+
+ | ``%d`` | Day of the month as a decimal | |
+ | | number [01,31]. | |
+ +-----------+--------------------------------+-------+
+ | ``%f`` | 'Fiscal' year without a | \(1) |
+ | | century as a decimal number | |
+ | | [00,99] | |
+ +-----------+--------------------------------+-------+
+ | ``%F`` | 'Fiscal' year with a century | \(2) |
+ | | as a decimal number | |
+ +-----------+--------------------------------+-------+
+ | ``%H`` | Hour (24-hour clock) as a | |
+ | | decimal number [00,23]. | |
+ +-----------+--------------------------------+-------+
+ | ``%I`` | Hour (12-hour clock) as a | |
+ | | decimal number [01,12]. | |
+ +-----------+--------------------------------+-------+
+ | ``%j`` | Day of the year as a decimal | |
+ | | number [001,366]. | |
+ +-----------+--------------------------------+-------+
+ | ``%m`` | Month as a decimal number | |
+ | | [01,12]. | |
+ +-----------+--------------------------------+-------+
+ | ``%M`` | Minute as a decimal number | |
+ | | [00,59]. | |
+ +-----------+--------------------------------+-------+
+ | ``%p`` | Locale's equivalent of either | \(3) |
+ | | AM or PM. | |
+ +-----------+--------------------------------+-------+
+ | ``%q`` | Quarter as a decimal number | |
+ | | [01,04] | |
+ +-----------+--------------------------------+-------+
+ | ``%S`` | Second as a decimal number | \(4) |
+ | | [00,61]. | |
+ +-----------+--------------------------------+-------+
+ | ``%U`` | Week number of the year | \(5) |
+ | | (Sunday as the first day of | |
+ | | the week) as a decimal number | |
+ | | [00,53]. All days in a new | |
+ | | year preceding the first | |
+ | | Sunday are considered to be in | |
+ | | week 0. | |
+ +-----------+--------------------------------+-------+
+ | ``%w`` | Weekday as a decimal number | |
+ | | [0(Sunday),6]. | |
+ +-----------+--------------------------------+-------+
+ | ``%W`` | Week number of the year | \(5) |
+ | | (Monday as the first day of | |
+ | | the week) as a decimal number | |
+ | | [00,53]. All days in a new | |
+ | | year preceding the first | |
+ | | Monday are considered to be in | |
+ | | week 0. | |
+ +-----------+--------------------------------+-------+
+ | ``%x`` | Locale's appropriate date | |
+ | | representation. | |
+ +-----------+--------------------------------+-------+
+ | ``%X`` | Locale's appropriate time | |
+ | | representation. | |
+ +-----------+--------------------------------+-------+
+ | ``%y`` | Year without century as a | |
+ | | decimal number [00,99]. | |
+ +-----------+--------------------------------+-------+
+ | ``%Y`` | Year with century as a decimal | |
+ | | number. | |
+ +-----------+--------------------------------+-------+
+ | ``%Z`` | Time zone name (no characters | |
+ | | if no time zone exists). | |
+ +-----------+--------------------------------+-------+
+ | ``%%`` | A literal ``'%'`` character. | |
+ +-----------+--------------------------------+-------+
+
+ Notes
+ -----
+
+ (1)
+ The ``%f`` directive is the same as ``%y`` if the frequency is
+ not quarterly.
+ Otherwise, it corresponds to the 'fiscal' year, as defined by
+ the :attr:`qyear` attribute.
+
+ (2)
+ The ``%F`` directive is the same as ``%Y`` if the frequency is
+ not quarterly.
+ Otherwise, it corresponds to the 'fiscal' year, as defined by
+ the :attr:`qyear` attribute.
+
+ (3)
+ The ``%p`` directive only affects the output hour field
+ if the ``%I`` directive is used to parse the hour.
+
+ (4)
+ The range really is ``0`` to ``61``; this accounts for leap
+ seconds and the (very rare) double leap seconds.
+
+ (5)
+ The ``%U`` and ``%W`` directives are only used in calculations
+ when the day of the week and the year are specified.
+
+ Examples
+ --------
+
+ >>> a = Period(freq='Q-JUL', year=2006, quarter=1)
+ >>> a.strftime('%F-Q%q')
+ '2006-Q1'
+ >>> # Output the last month in the quarter of this date
+ >>> a.strftime('%b-%Y')
+ 'Oct-2005'
+ >>>
+ >>> a = Period(freq='D', year=2001, month=1, day=1)
+ >>> a.strftime('%d-%b-%Y')
+        '01-Jan-2001'
+ >>> a.strftime('%b. %d, %Y was a %A')
+ 'Jan. 01, 2001 was a Monday'
+ """
+ base, mult = get_freq_code(self.freq)
+ return period_format(self.ordinal, base, fmt)
+
+
+class Period(_Period):
+ """
+ Represents a period of time
+
+ Parameters
+ ----------
+ value : Period or compat.string_types, default None
+ The time period represented (e.g., '4Q2005')
+ freq : str, default None
+ One of pandas period strings or corresponding objects
+ year : int, default None
+ month : int, default 1
+ quarter : int, default None
+ day : int, default 1
+ hour : int, default 0
+ minute : int, default 0
+ second : int, default 0
+ """
+
+ def __new__(cls, value=None, freq=None, ordinal=None,
+ year=None, month=None, quarter=None, day=None,
+ hour=None, minute=None, second=None):
+ # freq points to a tuple (base, mult); base is one of the defined
+ # periods such as A, Q, etc. Every five minutes would be, e.g.,
+ # ('T', 5) but may be passed in as a string like '5T'
+
+ # ordinal is the period offset from the gregorian proleptic epoch
+ cdef _Period self
+
+ if freq is not None:
+ freq = cls._maybe_convert_freq(freq)
+
+ if ordinal is not None and value is not None:
+            raise ValueError("Only value or ordinal should be given, "
+                             "but not both")
+ elif ordinal is not None:
+ if not util.is_integer_object(ordinal):
+ raise ValueError("Ordinal must be an integer")
+ if freq is None:
+ raise ValueError('Must supply freq for ordinal value')
+
+ elif value is None:
+ if (year is None and month is None and
+ quarter is None and day is None and
+ hour is None and minute is None and second is None):
+ ordinal = NPY_NAT
+ else:
+ if freq is None:
+ raise ValueError("If value is None, freq cannot be None")
+
+ # set defaults
+ month = 1 if month is None else month
+ day = 1 if day is None else day
+ hour = 0 if hour is None else hour
+ minute = 0 if minute is None else minute
+ second = 0 if second is None else second
+
+ ordinal = _ordinal_from_fields(year, month, quarter, day,
+ hour, minute, second, freq)
+
+ elif is_period_object(value):
+ other = value
+ if freq is None or get_freq_code(
+ freq) == get_freq_code(other.freq):
+ ordinal = other.ordinal
+ freq = other.freq
+ else:
+ converted = other.asfreq(freq)
+ ordinal = converted.ordinal
+
+ elif is_null_datetimelike(value) or value in nat_strings:
+ ordinal = NPY_NAT
+
+ elif is_string_object(value) or util.is_integer_object(value):
+ if util.is_integer_object(value):
+ value = str(value)
+ value = value.upper()
+ dt, _, reso = parse_time_string(value, freq)
+ if dt is NaT:
+ ordinal = NPY_NAT
+
+ if freq is None:
+ try:
+ freq = Resolution.get_freq(reso)
+ except KeyError:
+ raise ValueError(
+ "Invalid frequency or could not infer: {reso}"
+ .format(reso=reso))
+
+ elif PyDateTime_Check(value):
+ dt = value
+ if freq is None:
+ raise ValueError('Must supply freq for datetime value')
+ elif util.is_datetime64_object(value):
+ dt = Timestamp(value)
+ if freq is None:
+ raise ValueError('Must supply freq for datetime value')
+ elif PyDate_Check(value):
+ dt = datetime(year=value.year, month=value.month, day=value.day)
+ if freq is None:
+ raise ValueError('Must supply freq for datetime value')
+ else:
+ msg = "Value must be Period, string, integer, or datetime"
+ raise ValueError(msg)
+
+ if ordinal is None:
+ base, mult = get_freq_code(freq)
+ ordinal = period_ordinal(dt.year, dt.month, dt.day,
+ dt.hour, dt.minute, dt.second,
+ dt.microsecond, 0, base)
+
+ return cls._from_ordinal(ordinal, freq)
+
+
+cdef int64_t _ordinal_from_fields(int year, int month, quarter, int day,
+ int hour, int minute, int second, freq):
+ base, mult = get_freq_code(freq)
+ if quarter is not None:
+ year, month = quarter_to_myear(year, quarter, freq)
+
+ return period_ordinal(year, month, day, hour,
+ minute, second, 0, 0, base)
+
+
+def quarter_to_myear(year: int, quarter: int, freq):
+ """
+ A quarterly frequency defines a "year" which may not coincide with
+ the calendar-year. Find the calendar-year and calendar-month associated
+ with the given year and quarter under the `freq`-derived calendar.
+
+ Parameters
+ ----------
+ year : int
+ quarter : int
+ freq : DateOffset
+
+ Returns
+ -------
+ year : int
+ month : int
+
+ See Also
+ --------
+ Period.qyear
+ """
+ if quarter <= 0 or quarter > 4:
+ raise ValueError('Quarter must be 1 <= q <= 4')
+
+ mnum = MONTH_NUMBERS[get_rule_month(freq)] + 1
+ month = (mnum + (quarter - 1) * 3) % 12 + 1
+ if month > mnum:
+ year -= 1
+
+ return year, month
+
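+# A worked example, assuming get_rule_month/MONTH_NUMBERS behave as for
+# frequency strings: quarter_to_myear(2018, 1, 'Q-MAR') gives (2017, 4), since
+# fiscal-2018 Q1 under Q-MAR starts in April 2017, while
+# quarter_to_myear(2018, 1, 'Q-DEC') gives (2018, 1).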
+
+def _validate_end_alias(how):
+ how_dict = {'S': 'S', 'E': 'E',
+ 'START': 'S', 'FINISH': 'E',
+ 'BEGIN': 'S', 'END': 'E'}
+ how = how_dict.get(str(how).upper())
+ if how not in {'S', 'E'}:
+ raise ValueError('How must be one of S or E')
+ return how
diff --git a/contrib/python/pandas/py2/pandas/_libs/tslibs/resolution.pyx b/contrib/python/pandas/py2/pandas/_libs/tslibs/resolution.pyx
new file mode 100644
index 00000000000..f80c1e9841a
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/_libs/tslibs/resolution.pyx
@@ -0,0 +1,354 @@
+# -*- coding: utf-8 -*-
+
+import numpy as np
+from numpy cimport ndarray, int64_t, int32_t
+
+from pandas._libs.tslibs.util cimport is_string_object, get_nat
+
+from pandas._libs.tslibs.np_datetime cimport (
+ npy_datetimestruct, dt64_to_dtstruct)
+from pandas._libs.tslibs.frequencies cimport get_freq_code
+from pandas._libs.tslibs.timezones cimport (
+ is_utc, is_tzlocal, maybe_get_tz, get_dst_info)
+from pandas._libs.tslibs.conversion cimport tz_convert_utc_to_tzlocal
+from pandas._libs.tslibs.ccalendar cimport get_days_in_month
+
+# ----------------------------------------------------------------------
+# Constants
+
+cdef int64_t NPY_NAT = get_nat()
+
+cdef int RESO_NS = 0
+cdef int RESO_US = 1
+cdef int RESO_MS = 2
+cdef int RESO_SEC = 3
+cdef int RESO_MIN = 4
+cdef int RESO_HR = 5
+cdef int RESO_DAY = 6
+
+# ----------------------------------------------------------------------
+
+cpdef resolution(int64_t[:] stamps, tz=None):
+ cdef:
+ Py_ssize_t i, n = len(stamps)
+ npy_datetimestruct dts
+ int reso = RESO_DAY, curr_reso
+
+ if tz is not None:
+ tz = maybe_get_tz(tz)
+ return _reso_local(stamps, tz)
+
+
+cdef _reso_local(int64_t[:] stamps, object tz):
+ cdef:
+ Py_ssize_t i, n = len(stamps)
+ int reso = RESO_DAY, curr_reso
+ ndarray[int64_t] trans
+ int64_t[:] deltas
+ Py_ssize_t[:] pos
+ npy_datetimestruct dts
+ int64_t local_val, delta
+
+ if is_utc(tz) or tz is None:
+ for i in range(n):
+ if stamps[i] == NPY_NAT:
+ continue
+ dt64_to_dtstruct(stamps[i], &dts)
+ curr_reso = _reso_stamp(&dts)
+ if curr_reso < reso:
+ reso = curr_reso
+ elif is_tzlocal(tz):
+ for i in range(n):
+ if stamps[i] == NPY_NAT:
+ continue
+ local_val = tz_convert_utc_to_tzlocal(stamps[i], tz)
+ dt64_to_dtstruct(local_val, &dts)
+ curr_reso = _reso_stamp(&dts)
+ if curr_reso < reso:
+ reso = curr_reso
+ else:
+ # Adjust datetime64 timestamp, recompute datetimestruct
+ trans, deltas, typ = get_dst_info(tz)
+
+ if typ not in ['pytz', 'dateutil']:
+ # static/fixed; in this case we know that len(delta) == 1
+ delta = deltas[0]
+ for i in range(n):
+ if stamps[i] == NPY_NAT:
+ continue
+ dt64_to_dtstruct(stamps[i] + delta, &dts)
+ curr_reso = _reso_stamp(&dts)
+ if curr_reso < reso:
+ reso = curr_reso
+ else:
+ pos = trans.searchsorted(stamps, side='right') - 1
+ for i in range(n):
+ if stamps[i] == NPY_NAT:
+ continue
+ dt64_to_dtstruct(stamps[i] + deltas[pos[i]], &dts)
+ curr_reso = _reso_stamp(&dts)
+ if curr_reso < reso:
+ reso = curr_reso
+
+ return reso
+
+
+cdef inline int _reso_stamp(npy_datetimestruct *dts):
+ if dts.us != 0:
+ if dts.us % 1000 == 0:
+ return RESO_MS
+ return RESO_US
+ elif dts.sec != 0:
+ return RESO_SEC
+ elif dts.min != 0:
+ return RESO_MIN
+ elif dts.hour != 0:
+ return RESO_HR
+ return RESO_DAY
+
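+# Reading of _reso_stamp on a few sample structs: 12:30:00.250000 has us != 0
+# and us % 1000 == 0, so it maps to RESO_MS; 12:30:00.250123 maps to RESO_US;
+# an exact midnight falls through every branch and maps to RESO_DAY.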
+
+def get_freq_group(freq):
+ """
+ Return frequency code group of given frequency str or offset.
+
+ Example
+ -------
+ >>> get_freq_group('W-MON')
+ 4000
+
+ >>> get_freq_group('W-FRI')
+ 4000
+ """
+ if getattr(freq, '_typ', None) == 'dateoffset':
+ freq = freq.rule_code
+
+ if is_string_object(freq):
+ base, mult = get_freq_code(freq)
+ freq = base
+ elif isinstance(freq, int):
+ pass
+ else:
+ raise ValueError('input must be str, offset or int')
+ return (freq // 1000) * 1000
+
+
+class Resolution(object):
+
+ # Note: cython won't allow us to reference the cdef versions at the
+ # module level
+ RESO_NS = 0
+ RESO_US = 1
+ RESO_MS = 2
+ RESO_SEC = 3
+ RESO_MIN = 4
+ RESO_HR = 5
+ RESO_DAY = 6
+
+ _reso_str_map = {
+ RESO_NS: 'nanosecond',
+ RESO_US: 'microsecond',
+ RESO_MS: 'millisecond',
+ RESO_SEC: 'second',
+ RESO_MIN: 'minute',
+ RESO_HR: 'hour',
+ RESO_DAY: 'day'}
+
+ # factor to multiply a value by to convert it to the next finer grained
+ # resolution
+ _reso_mult_map = {
+ RESO_NS: None,
+ RESO_US: 1000,
+ RESO_MS: 1000,
+ RESO_SEC: 1000,
+ RESO_MIN: 60,
+ RESO_HR: 60,
+ RESO_DAY: 24}
+
+ _reso_str_bump_map = {
+ 'D': 'H',
+ 'H': 'T',
+ 'T': 'S',
+ 'S': 'L',
+ 'L': 'U',
+ 'U': 'N',
+ 'N': None}
+
+ _str_reso_map = {v: k for k, v in _reso_str_map.items()}
+
+ _reso_freq_map = {
+ 'year': 'A',
+ 'quarter': 'Q',
+ 'month': 'M',
+ 'day': 'D',
+ 'hour': 'H',
+ 'minute': 'T',
+ 'second': 'S',
+ 'millisecond': 'L',
+ 'microsecond': 'U',
+ 'nanosecond': 'N'}
+
+ _freq_reso_map = {v: k for k, v in _reso_freq_map.items()}
+
+ @classmethod
+ def get_str(cls, reso):
+ """
+ Return resolution str against resolution code.
+
+ Example
+ -------
+ >>> Resolution.get_str(Resolution.RESO_SEC)
+ 'second'
+ """
+ return cls._reso_str_map.get(reso, 'day')
+
+ @classmethod
+ def get_reso(cls, resostr):
+ """
+        Return resolution code against resolution str.
+
+ Example
+ -------
+ >>> Resolution.get_reso('second')
+        3
+
+ >>> Resolution.get_reso('second') == Resolution.RESO_SEC
+ True
+ """
+ return cls._str_reso_map.get(resostr, cls.RESO_DAY)
+
+ @classmethod
+ def get_freq_group(cls, resostr):
+ """
+        Return frequency group code against resolution str.
+
+ Example
+ -------
+        >>> Resolution.get_freq_group('day')
+        6000
+ """
+ return get_freq_group(cls.get_freq(resostr))
+
+ @classmethod
+ def get_freq(cls, resostr):
+ """
+ Return frequency str against resolution str.
+
+ Example
+ -------
+        >>> Resolution.get_freq('day')
+ 'D'
+ """
+ return cls._reso_freq_map[resostr]
+
+ @classmethod
+ def get_str_from_freq(cls, freq):
+ """
+ Return resolution str against frequency str.
+
+ Example
+ -------
+ >>> Resolution.get_str_from_freq('H')
+ 'hour'
+ """
+ return cls._freq_reso_map.get(freq, 'day')
+
+ @classmethod
+ def get_reso_from_freq(cls, freq):
+ """
+ Return resolution code against frequency str.
+
+ Example
+ -------
+ >>> Resolution.get_reso_from_freq('H')
+ 4
+
+ >>> Resolution.get_reso_from_freq('H') == Resolution.RESO_HR
+ True
+ """
+ return cls.get_reso(cls.get_str_from_freq(freq))
+
+ @classmethod
+ def get_stride_from_decimal(cls, value, freq):
+ """
+ Convert freq with decimal stride into a higher freq with integer stride
+
+ Parameters
+ ----------
+ value : integer or float
+ freq : string
+ Frequency string
+
+ Raises
+ ------
+ ValueError
+ If the float cannot be converted to an integer at any resolution.
+
+ Example
+ -------
+ >>> Resolution.get_stride_from_decimal(1.5, 'T')
+ (90, 'S')
+
+ >>> Resolution.get_stride_from_decimal(1.04, 'H')
+ (3744, 'S')
+
+ >>> Resolution.get_stride_from_decimal(1, 'D')
+ (1, 'D')
+ """
+ if np.isclose(value % 1, 0):
+ return int(value), freq
+ else:
+ start_reso = cls.get_reso_from_freq(freq)
+ if start_reso == 0:
+ raise ValueError("Could not convert to integer offset "
+ "at any resolution")
+
+ next_value = cls._reso_mult_map[start_reso] * value
+ next_name = cls._reso_str_bump_map[freq]
+ return cls.get_stride_from_decimal(next_value, next_name)
+
+
+# ----------------------------------------------------------------------
+# Frequency Inference
+
+def month_position_check(fields, weekdays):
+ cdef:
+ int32_t daysinmonth, y, m, d
+ bint calendar_end = True
+ bint business_end = True
+ bint calendar_start = True
+ bint business_start = True
+ bint cal
+ int32_t[:] years
+ int32_t[:] months
+ int32_t[:] days
+
+ years = fields['Y']
+ months = fields['M']
+ days = fields['D']
+
+ for y, m, d, wd in zip(years, months, days, weekdays):
+ if calendar_start:
+ calendar_start &= d == 1
+ if business_start:
+ business_start &= d == 1 or (d <= 3 and wd == 0)
+
+ if calendar_end or business_end:
+ daysinmonth = get_days_in_month(y, m)
+ cal = d == daysinmonth
+ if calendar_end:
+ calendar_end &= cal
+ if business_end:
+ business_end &= cal or (daysinmonth - d < 3 and wd == 4)
+ elif not calendar_start and not business_start:
+ break
+
+ if calendar_end:
+ return 'ce'
+ elif business_end:
+ return 'be'
+ elif calendar_start:
+ return 'cs'
+ elif business_start:
+ return 'bs'
+ else:
+ return None
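+
+# Illustrative behaviour, assuming fields holds int32 arrays under 'Y', 'M' and
+# 'D': for dates 2021-01-31, 2021-02-28 and 2021-03-31 every day equals
+# get_days_in_month(y, m), so calendar_end stays True and 'ce' is returned;
+# a run of first-of-month dates would instead end with 'cs'.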
diff --git a/contrib/python/pandas/py2/pandas/_libs/tslibs/src/datetime/np_datetime.c b/contrib/python/pandas/py2/pandas/_libs/tslibs/src/datetime/np_datetime.c
new file mode 100644
index 00000000000..866c9ca9d3a
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/_libs/tslibs/src/datetime/np_datetime.c
@@ -0,0 +1,814 @@
+/*
+
+Copyright (c) 2016, PyData Development Team
+All rights reserved.
+
+Distributed under the terms of the BSD Simplified License.
+
+The full license is in the LICENSE file, distributed with this software.
+
+Copyright (c) 2005-2011, NumPy Developers
+All rights reserved.
+
+This file is derived from NumPy 1.7. See NUMPY_LICENSE.txt
+
+*/
+
+#define NO_IMPORT
+
+#ifndef NPY_NO_DEPRECATED_API
+#define NPY_NO_DEPRECATED_API NPY_1_7_API_VERSION
+#endif // NPY_NO_DEPRECATED_API
+
+#include <Python.h>
+#include <datetime.h>
+
+#include <numpy/arrayobject.h>
+#include <numpy/arrayscalars.h>
+#include <numpy/ndarraytypes.h>
+#include "np_datetime.h"
+
+#if PY_MAJOR_VERSION >= 3
+#define PyInt_AsLong PyLong_AsLong
+#endif
+
+const npy_datetimestruct _NS_MIN_DTS = {
+ 1677, 9, 21, 0, 12, 43, 145225, 0, 0};
+const npy_datetimestruct _NS_MAX_DTS = {
+ 2262, 4, 11, 23, 47, 16, 854775, 807000, 0};
+
+
+const int days_per_month_table[2][12] = {
+ {31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31},
+ {31, 29, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31}};
+
+/*
+ * Returns 1 if the given year is a leap year, 0 otherwise.
+ */
+int is_leapyear(npy_int64 year) {
+ return (year & 0x3) == 0 && /* year % 4 == 0 */
+ ((year % 100) != 0 || (year % 400) == 0);
+}
+
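+/*
+ * Spot checks of the rule above: 2016 and 2000 are leap years (2000 because it
+ * is divisible by 400), while 1900 and 2019 are not.
+ */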
+/*
+ * Adjusts a datetimestruct based on a minutes offset. Assumes
+ * the current values are valid.
+ */
+void add_minutes_to_datetimestruct(npy_datetimestruct *dts, int minutes) {
+ int isleap;
+
+ /* MINUTES */
+ dts->min += minutes;
+ while (dts->min < 0) {
+ dts->min += 60;
+ dts->hour--;
+ }
+ while (dts->min >= 60) {
+ dts->min -= 60;
+ dts->hour++;
+ }
+
+ /* HOURS */
+ while (dts->hour < 0) {
+ dts->hour += 24;
+ dts->day--;
+ }
+ while (dts->hour >= 24) {
+ dts->hour -= 24;
+ dts->day++;
+ }
+
+ /* DAYS */
+ if (dts->day < 1) {
+ dts->month--;
+ if (dts->month < 1) {
+ dts->year--;
+ dts->month = 12;
+ }
+ isleap = is_leapyear(dts->year);
+ dts->day += days_per_month_table[isleap][dts->month - 1];
+ } else if (dts->day > 28) {
+ isleap = is_leapyear(dts->year);
+ if (dts->day > days_per_month_table[isleap][dts->month - 1]) {
+ dts->day -= days_per_month_table[isleap][dts->month - 1];
+ dts->month++;
+ if (dts->month > 12) {
+ dts->year++;
+ dts->month = 1;
+ }
+ }
+ }
+}
+
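+/*
+ * A worked example of the rollover handling above: starting from
+ * 2021-03-01 00:30 and adding -90 minutes first drives min negative, then
+ * hour, then day, ending at 2021-02-28 23:00 (February length taken from
+ * days_per_month_table for a non-leap year).
+ */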
+/*
+ * Calculates the days offset from the 1970 epoch.
+ */
+npy_int64 get_datetimestruct_days(const npy_datetimestruct *dts) {
+ int i, month;
+ npy_int64 year, days = 0;
+ const int *month_lengths;
+
+ year = dts->year - 1970;
+ days = year * 365;
+
+ /* Adjust for leap years */
+ if (days >= 0) {
+ /*
+ * 1968 is the closest leap year before 1970.
+ * Exclude the current year, so add 1.
+ */
+ year += 1;
+ /* Add one day for each 4 years */
+ days += year / 4;
+ /* 1900 is the closest previous year divisible by 100 */
+ year += 68;
+ /* Subtract one day for each 100 years */
+ days -= year / 100;
+ /* 1600 is the closest previous year divisible by 400 */
+ year += 300;
+ /* Add one day for each 400 years */
+ days += year / 400;
+ } else {
+ /*
+         * 1972 is the closest leap year after 1970.
+ * Include the current year, so subtract 2.
+ */
+ year -= 2;
+ /* Subtract one day for each 4 years */
+ days += year / 4;
+ /* 2000 is the closest later year divisible by 100 */
+ year -= 28;
+ /* Add one day for each 100 years */
+ days -= year / 100;
+ /* 2000 is also the closest later year divisible by 400 */
+ /* Subtract one day for each 400 years */
+ days += year / 400;
+ }
+
+ month_lengths = days_per_month_table[is_leapyear(dts->year)];
+ month = dts->month - 1;
+
+ /* Add the months */
+ for (i = 0; i < month; ++i) {
+ days += month_lengths[i];
+ }
+
+ /* Add the days */
+ days += dts->day - 1;
+
+ return days;
+}
+
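+/*
+ * Spot checks of the epoch-day arithmetic above (C division truncates toward
+ * zero): 1970-01-01 -> 0, 1970-02-01 -> 31, and 1969-12-31 -> -1.
+ */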
+/*
+ * Modifies '*days_' to be the day offset within the year,
+ * and returns the year.
+ */
+static npy_int64 days_to_yearsdays(npy_int64 *days_) {
+ const npy_int64 days_per_400years = (400 * 365 + 100 - 4 + 1);
+ /* Adjust so it's relative to the year 2000 (divisible by 400) */
+ npy_int64 days = (*days_) - (365 * 30 + 7);
+ npy_int64 year;
+
+ /* Break down the 400 year cycle to get the year and day within the year */
+ if (days >= 0) {
+ year = 400 * (days / days_per_400years);
+ days = days % days_per_400years;
+ } else {
+ year = 400 * ((days - (days_per_400years - 1)) / days_per_400years);
+ days = days % days_per_400years;
+ if (days < 0) {
+ days += days_per_400years;
+ }
+ }
+
+ /* Work out the year/day within the 400 year cycle */
+ if (days >= 366) {
+ year += 100 * ((days - 1) / (100 * 365 + 25 - 1));
+ days = (days - 1) % (100 * 365 + 25 - 1);
+ if (days >= 365) {
+ year += 4 * ((days + 1) / (4 * 365 + 1));
+ days = (days + 1) % (4 * 365 + 1);
+ if (days >= 366) {
+ year += (days - 1) / 365;
+ days = (days - 1) % 365;
+ }
+ }
+ }
+
+ *days_ = days;
+ return year + 2000;
+}
+
+/*
+ * Adjusts a datetimestruct based on a seconds offset. Assumes
+ * the current values are valid.
+ */
+NPY_NO_EXPORT void add_seconds_to_datetimestruct(npy_datetimestruct *dts,
+ int seconds) {
+ int minutes;
+
+ dts->sec += seconds;
+ if (dts->sec < 0) {
+ minutes = dts->sec / 60;
+ dts->sec = dts->sec % 60;
+ if (dts->sec < 0) {
+ --minutes;
+ dts->sec += 60;
+ }
+ add_minutes_to_datetimestruct(dts, minutes);
+ } else if (dts->sec >= 60) {
+ minutes = dts->sec / 60;
+ dts->sec = dts->sec % 60;
+ add_minutes_to_datetimestruct(dts, minutes);
+ }
+}
+
+/*
+ * Fills in the year, month, day in 'dts' based on the days
+ * offset from 1970.
+ */
+static void set_datetimestruct_days(npy_int64 days, npy_datetimestruct *dts) {
+ const int *month_lengths;
+ int i;
+
+ dts->year = days_to_yearsdays(&days);
+ month_lengths = days_per_month_table[is_leapyear(dts->year)];
+
+ for (i = 0; i < 12; ++i) {
+ if (days < month_lengths[i]) {
+ dts->month = i + 1;
+ dts->day = days + 1;
+ return;
+ } else {
+ days -= month_lengths[i];
+ }
+ }
+}
+
+/*
+ * Compares two npy_datetimestruct objects chronologically
+ */
+int cmp_npy_datetimestruct(const npy_datetimestruct *a,
+ const npy_datetimestruct *b) {
+ if (a->year > b->year) {
+ return 1;
+ } else if (a->year < b->year) {
+ return -1;
+ }
+
+ if (a->month > b->month) {
+ return 1;
+ } else if (a->month < b->month) {
+ return -1;
+ }
+
+ if (a->day > b->day) {
+ return 1;
+ } else if (a->day < b->day) {
+ return -1;
+ }
+
+ if (a->hour > b->hour) {
+ return 1;
+ } else if (a->hour < b->hour) {
+ return -1;
+ }
+
+ if (a->min > b->min) {
+ return 1;
+ } else if (a->min < b->min) {
+ return -1;
+ }
+
+ if (a->sec > b->sec) {
+ return 1;
+ } else if (a->sec < b->sec) {
+ return -1;
+ }
+
+ if (a->us > b->us) {
+ return 1;
+ } else if (a->us < b->us) {
+ return -1;
+ }
+
+ if (a->ps > b->ps) {
+ return 1;
+ } else if (a->ps < b->ps) {
+ return -1;
+ }
+
+ if (a->as > b->as) {
+ return 1;
+ } else if (a->as < b->as) {
+ return -1;
+ }
+
+ return 0;
+}
+
+/*
+ *
+ * Converts a Python datetime.datetime or datetime.date
+ * object into a NumPy npy_datetimestruct. Uses tzinfo (if present)
+ * to convert to UTC time.
+ *
+ * While the C API has PyDate_* and PyDateTime_* functions, the following
+ * implementation just asks for attributes, and thus supports
+ * datetime duck typing. The tzinfo time zone conversion would require
+ * this style of access anyway.
+ *
+ * Returns -1 on error, 0 on success, and 1 (with no error set)
+ * if obj doesn't have the needed date or datetime attributes.
+ */
+int convert_pydatetime_to_datetimestruct(PyDateTime_Date *dtobj,
+ npy_datetimestruct *out) {
+ // Assumes that obj is a valid datetime object
+ PyObject *tmp;
+ PyObject *obj = (PyObject*)dtobj;
+
+ /* Initialize the output to all zeros */
+ memset(out, 0, sizeof(npy_datetimestruct));
+ out->month = 1;
+ out->day = 1;
+
+ out->year = PyInt_AsLong(PyObject_GetAttrString(obj, "year"));
+ out->month = PyInt_AsLong(PyObject_GetAttrString(obj, "month"));
+ out->day = PyInt_AsLong(PyObject_GetAttrString(obj, "day"));
+
+ // TODO(anyone): If we can get PyDateTime_IMPORT to work, we could use
+ // PyDateTime_Check here, and less verbose attribute lookups.
+
+ /* Check for time attributes (if not there, return success as a date) */
+ if (!PyObject_HasAttrString(obj, "hour") ||
+ !PyObject_HasAttrString(obj, "minute") ||
+ !PyObject_HasAttrString(obj, "second") ||
+ !PyObject_HasAttrString(obj, "microsecond")) {
+ return 0;
+ }
+
+ out->hour = PyInt_AsLong(PyObject_GetAttrString(obj, "hour"));
+ out->min = PyInt_AsLong(PyObject_GetAttrString(obj, "minute"));
+ out->sec = PyInt_AsLong(PyObject_GetAttrString(obj, "second"));
+ out->us = PyInt_AsLong(PyObject_GetAttrString(obj, "microsecond"));
+
+ /* Apply the time zone offset if datetime obj is tz-aware */
+ if (PyObject_HasAttrString((PyObject*)obj, "tzinfo")) {
+ tmp = PyObject_GetAttrString(obj, "tzinfo");
+ if (tmp == NULL) {
+ return -1;
+ }
+ if (tmp == Py_None) {
+ Py_DECREF(tmp);
+ } else {
+ PyObject *offset;
+ int seconds_offset, minutes_offset;
+
+ /* The utcoffset function should return a timedelta */
+ offset = PyObject_CallMethod(tmp, "utcoffset", "O", obj);
+ if (offset == NULL) {
+ Py_DECREF(tmp);
+ return -1;
+ }
+ Py_DECREF(tmp);
+
+ /*
+ * The timedelta should have a function "total_seconds"
+ * which contains the value we want.
+ */
+ tmp = PyObject_CallMethod(offset, "total_seconds", "");
+ if (tmp == NULL) {
+ return -1;
+ }
+ seconds_offset = PyInt_AsLong(tmp);
+ if (seconds_offset == -1 && PyErr_Occurred()) {
+ Py_DECREF(tmp);
+ return -1;
+ }
+ Py_DECREF(tmp);
+
+ /* Convert to a minutes offset and apply it */
+ minutes_offset = seconds_offset / 60;
+
+ add_minutes_to_datetimestruct(out, -minutes_offset);
+ }
+ }
+
+ return 0;
+}
+
+
+/*
+ * Converts a datetime from a datetimestruct to a datetime based
+ * on a metadata unit. The date is assumed to be valid.
+ */
+npy_datetime npy_datetimestruct_to_datetime(NPY_DATETIMEUNIT base,
+ const npy_datetimestruct *dts) {
+ npy_datetime ret;
+
+ if (base == NPY_FR_Y) {
+ /* Truncate to the year */
+ ret = dts->year - 1970;
+ } else if (base == NPY_FR_M) {
+ /* Truncate to the month */
+ ret = 12 * (dts->year - 1970) + (dts->month - 1);
+ } else {
+ /* Otherwise calculate the number of days to start */
+ npy_int64 days = get_datetimestruct_days(dts);
+
+ switch (base) {
+ case NPY_FR_W:
+ /* Truncate to weeks */
+ if (days >= 0) {
+ ret = days / 7;
+ } else {
+ ret = (days - 6) / 7;
+ }
+ break;
+ case NPY_FR_D:
+ ret = days;
+ break;
+ case NPY_FR_h:
+ ret = days * 24 + dts->hour;
+ break;
+ case NPY_FR_m:
+ ret = (days * 24 + dts->hour) * 60 + dts->min;
+ break;
+ case NPY_FR_s:
+ ret = ((days * 24 + dts->hour) * 60 + dts->min) * 60 + dts->sec;
+ break;
+ case NPY_FR_ms:
+ ret = (((days * 24 + dts->hour) * 60 + dts->min) * 60 +
+ dts->sec) *
+ 1000 +
+ dts->us / 1000;
+ break;
+ case NPY_FR_us:
+ ret = (((days * 24 + dts->hour) * 60 + dts->min) * 60 +
+ dts->sec) *
+ 1000000 +
+ dts->us;
+ break;
+ case NPY_FR_ns:
+ ret = ((((days * 24 + dts->hour) * 60 + dts->min) * 60 +
+ dts->sec) *
+ 1000000 +
+ dts->us) *
+ 1000 +
+ dts->ps / 1000;
+ break;
+ case NPY_FR_ps:
+ ret = ((((days * 24 + dts->hour) * 60 + dts->min) * 60 +
+ dts->sec) *
+ 1000000 +
+ dts->us) *
+ 1000000 +
+ dts->ps;
+ break;
+ case NPY_FR_fs:
+ /* only 2.6 hours */
+ ret = (((((days * 24 + dts->hour) * 60 + dts->min) * 60 +
+ dts->sec) *
+ 1000000 +
+ dts->us) *
+ 1000000 +
+ dts->ps) *
+ 1000 +
+ dts->as / 1000;
+ break;
+ case NPY_FR_as:
+ /* only 9.2 secs */
+ ret = (((((days * 24 + dts->hour) * 60 + dts->min) * 60 +
+ dts->sec) *
+ 1000000 +
+ dts->us) *
+ 1000000 +
+ dts->ps) *
+ 1000000 +
+ dts->as;
+ break;
+ default:
+ /* Something got corrupted */
+ PyErr_SetString(
+ PyExc_ValueError,
+ "NumPy datetime metadata with corrupt unit value");
+ return -1;
+ }
+ }
+ return ret;
+}
+
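
As a quick sanity check of the unit arithmetic above (illustrative, using NumPy's
own datetime64 rather than this copy of the code): one day plus one second after
the epoch at nanosecond resolution is ((1*24)*60)*60 + 1 = 86401 seconds, i.e.
86401 * 10**9 nanoseconds.

    import numpy as np

    int(np.datetime64('1970-01-02T00:00:01', 'ns').astype('int64'))  # 86401000000000
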
+/*
+ * Converts a datetime based on the given metadata into a datetimestruct
+ */
+void pandas_datetime_to_datetimestruct(npy_datetime dt,
+ NPY_DATETIMEUNIT base,
+ npy_datetimestruct *out) {
+ npy_int64 perday;
+
+ /* Initialize the output to all zeros */
+ memset(out, 0, sizeof(npy_datetimestruct));
+ out->year = 1970;
+ out->month = 1;
+ out->day = 1;
+
+ /*
+ * Note that care must be taken with the / and % operators
+ * for negative values.
+ */
+ switch (base) {
+ case NPY_FR_Y:
+ out->year = 1970 + dt;
+ break;
+
+ case NPY_FR_M:
+ if (dt >= 0) {
+ out->year = 1970 + dt / 12;
+ out->month = dt % 12 + 1;
+ } else {
+ out->year = 1969 + (dt + 1) / 12;
+ out->month = 12 + (dt + 1) % 12;
+ }
+ break;
+
+ case NPY_FR_W:
+ /* A week is 7 days */
+ set_datetimestruct_days(dt * 7, out);
+ break;
+
+ case NPY_FR_D:
+ set_datetimestruct_days(dt, out);
+ break;
+
+ case NPY_FR_h:
+ perday = 24LL;
+
+ if (dt >= 0) {
+ set_datetimestruct_days(dt / perday, out);
+ dt = dt % perday;
+ } else {
+ set_datetimestruct_days(
+ dt / perday - (dt % perday == 0 ? 0 : 1), out);
+ dt = (perday - 1) + (dt + 1) % perday;
+ }
+ out->hour = dt;
+ break;
+
+ case NPY_FR_m:
+ perday = 24LL * 60;
+
+ if (dt >= 0) {
+ set_datetimestruct_days(dt / perday, out);
+ dt = dt % perday;
+ } else {
+ set_datetimestruct_days(
+ dt / perday - (dt % perday == 0 ? 0 : 1), out);
+ dt = (perday - 1) + (dt + 1) % perday;
+ }
+ out->hour = dt / 60;
+ out->min = dt % 60;
+ break;
+
+ case NPY_FR_s:
+ perday = 24LL * 60 * 60;
+
+ if (dt >= 0) {
+ set_datetimestruct_days(dt / perday, out);
+ dt = dt % perday;
+ } else {
+ set_datetimestruct_days(
+ dt / perday - (dt % perday == 0 ? 0 : 1), out);
+ dt = (perday - 1) + (dt + 1) % perday;
+ }
+ out->hour = dt / (60 * 60);
+ out->min = (dt / 60) % 60;
+ out->sec = dt % 60;
+ break;
+
+ case NPY_FR_ms:
+ perday = 24LL * 60 * 60 * 1000;
+
+ if (dt >= 0) {
+ set_datetimestruct_days(dt / perday, out);
+ dt = dt % perday;
+ } else {
+ set_datetimestruct_days(
+ dt / perday - (dt % perday == 0 ? 0 : 1), out);
+ dt = (perday - 1) + (dt + 1) % perday;
+ }
+ out->hour = dt / (60 * 60 * 1000LL);
+ out->min = (dt / (60 * 1000LL)) % 60;
+ out->sec = (dt / 1000LL) % 60;
+ out->us = (dt % 1000LL) * 1000;
+ break;
+
+ case NPY_FR_us:
+ perday = 24LL * 60LL * 60LL * 1000LL * 1000LL;
+
+ if (dt >= 0) {
+ set_datetimestruct_days(dt / perday, out);
+ dt = dt % perday;
+ } else {
+ set_datetimestruct_days(
+ dt / perday - (dt % perday == 0 ? 0 : 1), out);
+ dt = (perday - 1) + (dt + 1) % perday;
+ }
+ out->hour = dt / (60 * 60 * 1000000LL);
+ out->min = (dt / (60 * 1000000LL)) % 60;
+ out->sec = (dt / 1000000LL) % 60;
+ out->us = dt % 1000000LL;
+ break;
+
+ case NPY_FR_ns:
+ perday = 24LL * 60LL * 60LL * 1000LL * 1000LL * 1000LL;
+
+ if (dt >= 0) {
+ set_datetimestruct_days(dt / perday, out);
+ dt = dt % perday;
+ } else {
+ set_datetimestruct_days(
+ dt / perday - (dt % perday == 0 ? 0 : 1), out);
+ dt = (perday - 1) + (dt + 1) % perday;
+ }
+ out->hour = dt / (60 * 60 * 1000000000LL);
+ out->min = (dt / (60 * 1000000000LL)) % 60;
+ out->sec = (dt / 1000000000LL) % 60;
+ out->us = (dt / 1000LL) % 1000000LL;
+ out->ps = (dt % 1000LL) * 1000;
+ break;
+
+ case NPY_FR_ps:
+ perday = 24LL * 60 * 60 * 1000 * 1000 * 1000 * 1000;
+
+ if (dt >= 0) {
+ set_datetimestruct_days(dt / perday, out);
+ dt = dt % perday;
+ } else {
+ set_datetimestruct_days(
+ dt / perday - (dt % perday == 0 ? 0 : 1), out);
+ dt = (perday - 1) + (dt + 1) % perday;
+ }
+ out->hour = dt / (60 * 60 * 1000000000000LL);
+ out->min = (dt / (60 * 1000000000000LL)) % 60;
+ out->sec = (dt / 1000000000000LL) % 60;
+ out->us = (dt / 1000000LL) % 1000000LL;
+ out->ps = dt % 1000000LL;
+ break;
+
+ case NPY_FR_fs:
+ /* entire range is only +- 2.6 hours */
+ if (dt >= 0) {
+ out->hour = dt / (60 * 60 * 1000000000000000LL);
+ out->min = (dt / (60 * 1000000000000000LL)) % 60;
+ out->sec = (dt / 1000000000000000LL) % 60;
+ out->us = (dt / 1000000000LL) % 1000000LL;
+ out->ps = (dt / 1000LL) % 1000000LL;
+ out->as = (dt % 1000LL) * 1000;
+ } else {
+ npy_datetime minutes;
+
+ minutes = dt / (60 * 1000000000000000LL);
+ dt = dt % (60 * 1000000000000000LL);
+ if (dt < 0) {
+ dt += (60 * 1000000000000000LL);
+ --minutes;
+ }
+ /* Offset the negative minutes */
+ add_minutes_to_datetimestruct(out, minutes);
+ out->sec = (dt / 1000000000000000LL) % 60;
+ out->us = (dt / 1000000000LL) % 1000000LL;
+ out->ps = (dt / 1000LL) % 1000000LL;
+ out->as = (dt % 1000LL) * 1000;
+ }
+ break;
+
+ case NPY_FR_as:
+ /* entire range is only +- 9.2 seconds */
+ if (dt >= 0) {
+ out->sec = (dt / 1000000000000000000LL) % 60;
+ out->us = (dt / 1000000000000LL) % 1000000LL;
+ out->ps = (dt / 1000000LL) % 1000000LL;
+ out->as = dt % 1000000LL;
+ } else {
+ npy_datetime seconds;
+
+ seconds = dt / 1000000000000000000LL;
+ dt = dt % 1000000000000000000LL;
+ if (dt < 0) {
+ dt += 1000000000000000000LL;
+ --seconds;
+ }
+ /* Offset the negative seconds */
+ add_seconds_to_datetimestruct(out, seconds);
+ out->us = (dt / 1000000000000LL) % 1000000LL;
+ out->ps = (dt / 1000000LL) % 1000000LL;
+ out->as = dt % 1000000LL;
+ }
+ break;
+
+ default:
+ PyErr_SetString(PyExc_RuntimeError,
+ "NumPy datetime metadata is corrupted with invalid "
+ "base unit");
+ }
+}
+
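
The negative-value branches above are what make pre-epoch values land on the
correct calendar fields. For example (checked here against NumPy's datetime64,
which follows the same rounding rules):

    import numpy as np

    str(np.datetime64(-1, 'M'))   # '1969-12', i.e. one month before 1970-01
    str(np.datetime64(-1, 'ns'))  # '1969-12-31T23:59:59.999999999'
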
+/*
+ * Converts a timedelta based on the given metadata unit into a
+ * pandas_timedeltastruct. The timedelta is assumed to be valid.
+ */
+void pandas_timedelta_to_timedeltastruct(npy_timedelta td,
+ NPY_DATETIMEUNIT base,
+ pandas_timedeltastruct *out) {
+ npy_int64 frac;
+ npy_int64 sfrac;
+ npy_int64 ifrac;
+ int sign;
+ npy_int64 DAY_NS = 86400000000000LL;
+
+ /* Initialize the output to all zeros */
+ memset(out, 0, sizeof(pandas_timedeltastruct));
+
+ switch (base) {
+ case NPY_FR_ns:
+
+ // put frac in seconds
+ if (td < 0 && td % (1000LL * 1000LL * 1000LL) != 0)
+ frac = td / (1000LL * 1000LL * 1000LL) - 1;
+ else
+ frac = td / (1000LL * 1000LL * 1000LL);
+
+ if (frac < 0) {
+ sign = -1;
+
+ // split the negative value into whole days plus a non-negative remainder
+ if ((-frac % 86400LL) != 0) {
+ out->days = -frac / 86400LL + 1;
+ frac += 86400LL * out->days;
+ } else {
+ frac = -frac;
+ }
+ } else {
+ sign = 1;
+ out->days = 0;
+ }
+
+ if (frac >= 86400) {
+ out->days += frac / 86400LL;
+ frac -= out->days * 86400LL;
+ }
+
+ if (frac >= 3600) {
+ out->hrs = frac / 3600LL;
+ frac -= out->hrs * 3600LL;
+ } else {
+ out->hrs = 0;
+ }
+
+ if (frac >= 60) {
+ out->min = frac / 60LL;
+ frac -= out->min * 60LL;
+ } else {
+ out->min = 0;
+ }
+
+ if (frac >= 0) {
+ out->sec = frac;
+ frac -= out->sec;
+ } else {
+ out->sec = 0;
+ }
+
+ sfrac = (out->hrs * 3600LL + out->min * 60LL
+ + out->sec) * (1000LL * 1000LL * 1000LL);
+
+ if (sign < 0)
+ out->days = -out->days;
+
+ ifrac = td - (out->days * DAY_NS + sfrac);
+
+ if (ifrac != 0) {
+ out->ms = ifrac / (1000LL * 1000LL);
+ ifrac -= out->ms * 1000LL * 1000LL;
+ out->us = ifrac / 1000LL;
+ ifrac -= out->us * 1000LL;
+ out->ns = ifrac;
+ } else {
+ out->ms = 0;
+ out->us = 0;
+ out->ns = 0;
+ }
+
+ out->seconds = out->hrs * 3600 + out->min * 60 + out->sec;
+ out->microseconds = out->ms * 1000 + out->us;
+ out->nanoseconds = out->ns;
+ break;
+
+ default:
+ PyErr_SetString(PyExc_RuntimeError,
+ "NumPy timedelta metadata is corrupted with "
+ "invalid base unit");
+ }
+}
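
A consequence of the sign handling above is that a negative timedelta is stored
as a negative day count plus non-negative sub-day fields, which is what
Timedelta.components exposes. A small pandas-level illustration (assuming a
pandas build that vendors this code):

    import pandas as pd

    td = pd.Timedelta(-1, unit='ns')
    td             # Timedelta('-1 days +23:59:59.999999999')
    td.components  # days=-1, hours=23, minutes=59, seconds=59,
                   # milliseconds=999, microseconds=999, nanoseconds=999
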
diff --git a/contrib/python/pandas/py2/pandas/_libs/tslibs/src/datetime/np_datetime.h b/contrib/python/pandas/py2/pandas/_libs/tslibs/src/datetime/np_datetime.h
new file mode 100644
index 00000000000..549d38409ca
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/_libs/tslibs/src/datetime/np_datetime.h
@@ -0,0 +1,80 @@
+/*
+
+Copyright (c) 2016, PyData Development Team
+All rights reserved.
+
+Distributed under the terms of the BSD Simplified License.
+
+The full license is in the LICENSE file, distributed with this software.
+
+Copyright (c) 2005-2011, NumPy Developers
+All rights reserved.
+
+This file is derived from NumPy 1.7. See NUMPY_LICENSE.txt
+
+*/
+
+#ifndef PANDAS__LIBS_TSLIBS_SRC_DATETIME_NP_DATETIME_H_
+#define PANDAS__LIBS_TSLIBS_SRC_DATETIME_NP_DATETIME_H_
+
+#ifndef NPY_NO_DEPRECATED_API
+#define NPY_NO_DEPRECATED_API NPY_1_7_API_VERSION
+#endif // NPY_NO_DEPRECATED_API
+
+#include <numpy/ndarraytypes.h>
+#include <datetime.h>
+
+typedef struct {
+ npy_int64 days;
+ npy_int32 hrs, min, sec, ms, us, ns, seconds, microseconds, nanoseconds;
+} pandas_timedeltastruct;
+
+extern const npy_datetimestruct _NS_MIN_DTS;
+extern const npy_datetimestruct _NS_MAX_DTS;
+
+// stuff pandas needs
+// ----------------------------------------------------------------------------
+
+int convert_pydatetime_to_datetimestruct(PyDateTime_Date *dtobj,
+ npy_datetimestruct *out);
+
+npy_datetime npy_datetimestruct_to_datetime(NPY_DATETIMEUNIT base,
+ const npy_datetimestruct *dts);
+
+void pandas_datetime_to_datetimestruct(npy_datetime val, NPY_DATETIMEUNIT fr,
+ npy_datetimestruct *result);
+
+void pandas_timedelta_to_timedeltastruct(npy_timedelta val,
+ NPY_DATETIMEUNIT fr,
+ pandas_timedeltastruct *result);
+
+extern const int days_per_month_table[2][12];
+
+// stuff numpy-derived code needs in header
+// ----------------------------------------------------------------------------
+
+int is_leapyear(npy_int64 year);
+
+/*
+ * Calculates the days offset from the 1970 epoch.
+ */
+npy_int64
+get_datetimestruct_days(const npy_datetimestruct *dts);
+
+
+/*
+ * Compares two npy_datetimestruct objects chronologically
+ */
+int cmp_npy_datetimestruct(const npy_datetimestruct *a,
+ const npy_datetimestruct *b);
+
+
+/*
+ * Adjusts a datetimestruct based on a minutes offset. Assumes
+ * the current values are valid.
+ */
+void
+add_minutes_to_datetimestruct(npy_datetimestruct *dts, int minutes);
+
+
+#endif // PANDAS__LIBS_TSLIBS_SRC_DATETIME_NP_DATETIME_H_
diff --git a/contrib/python/pandas/py2/pandas/_libs/tslibs/src/datetime/np_datetime_strings.c b/contrib/python/pandas/py2/pandas/_libs/tslibs/src/datetime/np_datetime_strings.c
new file mode 100644
index 00000000000..05ccdd13598
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/_libs/tslibs/src/datetime/np_datetime_strings.c
@@ -0,0 +1,886 @@
+/*
+
+Copyright (c) 2016, PyData Development Team
+All rights reserved.
+
+Distributed under the terms of the BSD Simplified License.
+
+The full license is in the LICENSE file, distributed with this software.
+
+Written by Mark Wiebe ([email protected])
+Copyright (c) 2011 by Enthought, Inc.
+
+Copyright (c) 2005-2011, NumPy Developers
+All rights reserved.
+
+See NUMPY_LICENSE.txt for the license.
+
+This file implements string parsing and creation for NumPy datetime.
+
+*/
+
+#define PY_SSIZE_T_CLEAN
+#define NO_IMPORT
+
+#ifndef NPY_NO_DEPRECATED_API
+#define NPY_NO_DEPRECATED_API NPY_1_7_API_VERSION
+#endif // NPY_NO_DEPRECATED_API
+
+#include <Python.h>
+
+#include <time.h>
+
+#include <numpy/arrayobject.h>
+#include <numpy/arrayscalars.h>
+#include <numpy/ndarraytypes.h>
+
+#include "np_datetime.h"
+#include "np_datetime_strings.h"
+
+
+/*
+ * Parses (almost) standard ISO 8601 date strings. The differences are:
+ *
+ * + Only seconds may have a decimal point, with up to 18 digits after it
+ * (maximum attoseconds precision).
+ * + Either a 'T' as in ISO 8601 or a ' ' may be used to separate
+ * the date and the time. Both are treated equivalently.
+ * + Doesn't (yet) handle the "YYYY-DDD" or "YYYY-Www" formats.
+ * + Doesn't handle leap seconds (seconds value has 60 in these cases).
+ * + Doesn't handle 24:00:00 as synonym for midnight (00:00:00) tomorrow
+ * + Accepts the special values "NaT" (not a time), "Today" (the current
+ * day according to local time) and "Now" (the current time in UTC).
+ * + ':' separator between hours, minutes, and seconds is optional. When
+ * omitted, each component must be 2 digits if it appears. (GH-10041)
+ *
+ * 'str' must be a NULL-terminated string, and 'len' must be its length.
+ *
+ * 'out' gets filled with the parsed date-time.
+ * 'out_local' gets set to 1 if the parsed time contains timezone,
+ * to 0 otherwise.
+ * 'out_tzoffset' gets set to timezone offset by minutes
+ * if the parsed time was in local time,
+ * to 0 otherwise. The values 'now' and 'today' don't get counted
+ * as local, and neither do UTC +/-#### timezone offsets, because
+ * they aren't using the computer's local timezone offset.
+ *
+ * Returns 0 on success, -1 on failure.
+ */
+int parse_iso_8601_datetime(char *str, int len,
+ npy_datetimestruct *out,
+ int *out_local, int *out_tzoffset) {
+ int year_leap = 0;
+ int i, numdigits;
+ char *substr;
+ int sublen;
+
+ /* If year-month-day are separated by a valid separator,
+ * months/days without leading zeroes will be parsed
+ * (though not iso8601). If the components aren't separated,
+ * 4 (YYYY) or 8 (YYYYMMDD) digits are expected. 6 digits are
+ * forbidden here (but parsed as YYMMDD elsewhere).
+ */
+ int has_ymd_sep = 0;
+ char ymd_sep = '\0';
+ char valid_ymd_sep[] = {'-', '.', '/', '\\', ' '};
+ int valid_ymd_sep_len = sizeof(valid_ymd_sep);
+
+ /* hour-minute-second may or may not be separated by ':'. If not, then
+ * each component must be 2 digits. */
+ int has_hms_sep = 0;
+ int hour_was_2_digits = 0;
+
+ /* Initialize the output to all zeros */
+ memset(out, 0, sizeof(npy_datetimestruct));
+ out->month = 1;
+ out->day = 1;
+
+ substr = str;
+ sublen = len;
+
+ /* Skip leading whitespace */
+ while (sublen > 0 && isspace(*substr)) {
+ ++substr;
+ --sublen;
+ }
+
+ /* Leading '-' sign for negative year */
+ if (*substr == '-') {
+ ++substr;
+ --sublen;
+ }
+
+ if (sublen == 0) {
+ goto parse_error;
+ }
+
+ /* PARSE THE YEAR (4 digits) */
+ out->year = 0;
+ if (sublen >= 4 && isdigit(substr[0]) && isdigit(substr[1]) &&
+ isdigit(substr[2]) && isdigit(substr[3])) {
+ out->year = 1000 * (substr[0] - '0') + 100 * (substr[1] - '0') +
+ 10 * (substr[2] - '0') + (substr[3] - '0');
+
+ substr += 4;
+ sublen -= 4;
+ }
+
+ /* Negate the year if necessary */
+ if (str[0] == '-') {
+ out->year = -out->year;
+ }
+ /* Check whether it's a leap-year */
+ year_leap = is_leapyear(out->year);
+
+ /* Next character must be a separator, start of month, or end of string */
+ if (sublen == 0) {
+ if (out_local != NULL) {
+ *out_local = 0;
+ }
+ goto finish;
+ }
+
+ if (!isdigit(*substr)) {
+ for (i = 0; i < valid_ymd_sep_len; ++i) {
+ if (*substr == valid_ymd_sep[i]) {
+ break;
+ }
+ }
+ if (i == valid_ymd_sep_len) {
+ goto parse_error;
+ }
+ has_ymd_sep = 1;
+ ymd_sep = valid_ymd_sep[i];
+ ++substr;
+ --sublen;
+ /* Cannot have trailing separator */
+ if (sublen == 0 || !isdigit(*substr)) {
+ goto parse_error;
+ }
+ }
+
+ /* PARSE THE MONTH */
+ /* First digit required */
+ out->month = (*substr - '0');
+ ++substr;
+ --sublen;
+ /* Second digit optional if there was a separator */
+ if (isdigit(*substr)) {
+ out->month = 10 * out->month + (*substr - '0');
+ ++substr;
+ --sublen;
+ } else if (!has_ymd_sep) {
+ goto parse_error;
+ }
+ if (out->month < 1 || out->month > 12) {
+ PyErr_Format(PyExc_ValueError,
+ "Month out of range in datetime string \"%s\"", str);
+ goto error;
+ }
+
+ /* Next character must be the separator, start of day, or end of string */
+ if (sublen == 0) {
+ /* Forbid YYYYMM. Parsed instead as YYMMDD by someone else. */
+ if (!has_ymd_sep) {
+ goto parse_error;
+ }
+ if (out_local != NULL) {
+ *out_local = 0;
+ }
+ goto finish;
+ }
+
+ if (has_ymd_sep) {
+ /* Must have separator, but cannot be trailing */
+ if (*substr != ymd_sep || sublen == 1) {
+ goto parse_error;
+ }
+ ++substr;
+ --sublen;
+ }
+
+ /* PARSE THE DAY */
+ /* First digit required */
+ if (!isdigit(*substr)) {
+ goto parse_error;
+ }
+ out->day = (*substr - '0');
+ ++substr;
+ --sublen;
+ /* Second digit optional if there was a separator */
+ if (isdigit(*substr)) {
+ out->day = 10 * out->day + (*substr - '0');
+ ++substr;
+ --sublen;
+ } else if (!has_ymd_sep) {
+ goto parse_error;
+ }
+ if (out->day < 1 ||
+ out->day > days_per_month_table[year_leap][out->month - 1]) {
+ PyErr_Format(PyExc_ValueError,
+ "Day out of range in datetime string \"%s\"", str);
+ goto error;
+ }
+
+ /* Next character must be a 'T', ' ', or end of string */
+ if (sublen == 0) {
+ if (out_local != NULL) {
+ *out_local = 0;
+ }
+ goto finish;
+ }
+
+ if ((*substr != 'T' && *substr != ' ') || sublen == 1) {
+ goto parse_error;
+ }
+ ++substr;
+ --sublen;
+
+ /* PARSE THE HOURS */
+ /* First digit required */
+ if (!isdigit(*substr)) {
+ goto parse_error;
+ }
+ out->hour = (*substr - '0');
+ ++substr;
+ --sublen;
+ /* Second digit optional */
+ if (isdigit(*substr)) {
+ hour_was_2_digits = 1;
+ out->hour = 10 * out->hour + (*substr - '0');
+ ++substr;
+ --sublen;
+ if (out->hour >= 24) {
+ PyErr_Format(PyExc_ValueError,
+ "Hours out of range in datetime string \"%s\"", str);
+ goto error;
+ }
+ }
+
+ /* Next character must be a ':' or the end of the string */
+ if (sublen == 0) {
+ if (!hour_was_2_digits) {
+ goto parse_error;
+ }
+ goto finish;
+ }
+
+ if (*substr == ':') {
+ has_hms_sep = 1;
+ ++substr;
+ --sublen;
+ /* Cannot have a trailing separator */
+ if (sublen == 0 || !isdigit(*substr)) {
+ goto parse_error;
+ }
+ } else if (!isdigit(*substr)) {
+ if (!hour_was_2_digits) {
+ goto parse_error;
+ }
+ goto parse_timezone;
+ }
+
+ /* PARSE THE MINUTES */
+ /* First digit required */
+ out->min = (*substr - '0');
+ ++substr;
+ --sublen;
+ /* Second digit optional if there was a separator */
+ if (isdigit(*substr)) {
+ out->min = 10 * out->min + (*substr - '0');
+ ++substr;
+ --sublen;
+ if (out->min >= 60) {
+ PyErr_Format(PyExc_ValueError,
+ "Minutes out of range in datetime string \"%s\"", str);
+ goto error;
+ }
+ } else if (!has_hms_sep) {
+ goto parse_error;
+ }
+
+ if (sublen == 0) {
+ goto finish;
+ }
+
+ /* If we make it through this condition block, then the next
+ * character is a digit. */
+ if (has_hms_sep && *substr == ':') {
+ ++substr;
+ --sublen;
+ /* Cannot have a trailing ':' */
+ if (sublen == 0 || !isdigit(*substr)) {
+ goto parse_error;
+ }
+ } else if (!has_hms_sep && isdigit(*substr)) {
+ } else {
+ goto parse_timezone;
+ }
+
+ /* PARSE THE SECONDS */
+ /* First digit required */
+ out->sec = (*substr - '0');
+ ++substr;
+ --sublen;
+ /* Second digit optional if there was a separator */
+ if (isdigit(*substr)) {
+ out->sec = 10 * out->sec + (*substr - '0');
+ ++substr;
+ --sublen;
+ if (out->sec >= 60) {
+ PyErr_Format(PyExc_ValueError,
+ "Seconds out of range in datetime string \"%s\"", str);
+ goto error;
+ }
+ } else if (!has_hms_sep) {
+ goto parse_error;
+ }
+
+ /* Next character may be a '.' indicating fractional seconds */
+ if (sublen > 0 && *substr == '.') {
+ ++substr;
+ --sublen;
+ } else {
+ goto parse_timezone;
+ }
+
+ /* PARSE THE MICROSECONDS (0 to 6 digits) */
+ numdigits = 0;
+ for (i = 0; i < 6; ++i) {
+ out->us *= 10;
+ if (sublen > 0 && isdigit(*substr)) {
+ out->us += (*substr - '0');
+ ++substr;
+ --sublen;
+ ++numdigits;
+ }
+ }
+
+ if (sublen == 0 || !isdigit(*substr)) {
+ goto parse_timezone;
+ }
+
+ /* PARSE THE PICOSECONDS (0 to 6 digits) */
+ numdigits = 0;
+ for (i = 0; i < 6; ++i) {
+ out->ps *= 10;
+ if (sublen > 0 && isdigit(*substr)) {
+ out->ps += (*substr - '0');
+ ++substr;
+ --sublen;
+ ++numdigits;
+ }
+ }
+
+ if (sublen == 0 || !isdigit(*substr)) {
+ goto parse_timezone;
+ }
+
+ /* PARSE THE ATTOSECONDS (0 to 6 digits) */
+ numdigits = 0;
+ for (i = 0; i < 6; ++i) {
+ out->as *= 10;
+ if (sublen > 0 && isdigit(*substr)) {
+ out->as += (*substr - '0');
+ ++substr;
+ --sublen;
+ ++numdigits;
+ }
+ }
+
+parse_timezone:
+ /* Trim any whitespace between the time and the timezone */
+ while (sublen > 0 && isspace(*substr)) {
+ ++substr;
+ --sublen;
+ }
+
+ if (sublen == 0) {
+ // Unlike NumPy, treating no time zone as naive
+ goto finish;
+ }
+
+ /* UTC specifier */
+ if (*substr == 'Z') {
+ /* "Z" should be equivalent to tz offset "+00:00" */
+ if (out_local != NULL) {
+ *out_local = 1;
+ }
+
+ if (out_tzoffset != NULL) {
+ *out_tzoffset = 0;
+ }
+
+ if (sublen == 1) {
+ goto finish;
+ } else {
+ ++substr;
+ --sublen;
+ }
+ } else if (*substr == '-' || *substr == '+') {
+ /* Time zone offset */
+ int offset_neg = 0, offset_hour = 0, offset_minute = 0;
+
+ /*
+ * Since "local" means local with respect to the current
+ * machine, we say this is non-local.
+ */
+
+ if (*substr == '-') {
+ offset_neg = 1;
+ }
+ ++substr;
+ --sublen;
+
+ /* The hours offset */
+ if (sublen >= 2 && isdigit(substr[0]) && isdigit(substr[1])) {
+ offset_hour = 10 * (substr[0] - '0') + (substr[1] - '0');
+ substr += 2;
+ sublen -= 2;
+ if (offset_hour >= 24) {
+ PyErr_Format(PyExc_ValueError,
+ "Timezone hours offset out of range "
+ "in datetime string \"%s\"",
+ str);
+ goto error;
+ }
+ } else if (sublen >= 1 && isdigit(substr[0])) {
+ offset_hour = substr[0] - '0';
+ ++substr;
+ --sublen;
+ } else {
+ goto parse_error;
+ }
+
+ /* The minutes offset is optional */
+ if (sublen > 0) {
+ /* Optional ':' */
+ if (*substr == ':') {
+ ++substr;
+ --sublen;
+ }
+
+ /* The minutes offset (at the end of the string) */
+ if (sublen >= 2 && isdigit(substr[0]) && isdigit(substr[1])) {
+ offset_minute = 10 * (substr[0] - '0') + (substr[1] - '0');
+ substr += 2;
+ sublen -= 2;
+ if (offset_minute >= 60) {
+ PyErr_Format(PyExc_ValueError,
+ "Timezone minutes offset out of range "
+ "in datetime string \"%s\"",
+ str);
+ goto error;
+ }
+ } else if (sublen >= 1 && isdigit(substr[0])) {
+ offset_minute = substr[0] - '0';
+ ++substr;
+ --sublen;
+ } else {
+ goto parse_error;
+ }
+ }
+
+ /* Apply the time zone offset */
+ if (offset_neg) {
+ offset_hour = -offset_hour;
+ offset_minute = -offset_minute;
+ }
+ if (out_local != NULL) {
+ *out_local = 1;
+ // Unlike NumPy, do not change internal value to local time
+ *out_tzoffset = 60 * offset_hour + offset_minute;
+ }
+ }
+
+ /* Skip trailing whitespace */
+ while (sublen > 0 && isspace(*substr)) {
+ ++substr;
+ --sublen;
+ }
+
+ if (sublen != 0) {
+ goto parse_error;
+ }
+
+finish:
+ return 0;
+
+parse_error:
+ PyErr_Format(PyExc_ValueError,
+ "Error parsing datetime string \"%s\" at position %d", str,
+ (int)(substr - str));
+ return -1;
+
+error:
+ return -1;
+}
+
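
Rough examples of strings this parser is meant to accept, shown here through the
pandas Timestamp constructor (which routes ISO-like strings to this code path in
this version; behavior may differ in other builds):

    import pandas as pd

    pd.Timestamp("2011-01-02 03:04:05.123456789")  # ' ' accepted in place of 'T'
    pd.Timestamp("2011-01-02T03:04:05Z")           # 'Z' -> UTC-aware Timestamp
    pd.Timestamp("2011-01-02T03:04:05+0530")       # offset recorded, value not shifted
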
+/*
+ * Provides a string length to use for converting datetime
+ * objects with the given local and unit settings.
+ */
+int get_datetime_iso_8601_strlen(int local, NPY_DATETIMEUNIT base) {
+ int len = 0;
+
+ switch (base) {
+ /* Generic units can only be used to represent NaT */
+ /* return 4;*/
+ case NPY_FR_as:
+ len += 3; /* "###" */
+ case NPY_FR_fs:
+ len += 3; /* "###" */
+ case NPY_FR_ps:
+ len += 3; /* "###" */
+ case NPY_FR_ns:
+ len += 3; /* "###" */
+ case NPY_FR_us:
+ len += 3; /* "###" */
+ case NPY_FR_ms:
+ len += 4; /* ".###" */
+ case NPY_FR_s:
+ len += 3; /* ":##" */
+ case NPY_FR_m:
+ len += 3; /* ":##" */
+ case NPY_FR_h:
+ len += 3; /* "T##" */
+ case NPY_FR_D:
+ case NPY_FR_W:
+ len += 3; /* "-##" */
+ case NPY_FR_M:
+ len += 3; /* "-##" */
+ case NPY_FR_Y:
+ len += 21; /* 64-bit year */
+ break;
+ default:
+ len += 3; /* handle the now defunct NPY_FR_B */
+ break;
+ }
+
+ if (base >= NPY_FR_h) {
+ if (local) {
+ len += 5; /* "+####" or "-####" */
+ } else {
+ len += 1; /* "Z" */
+ }
+ }
+
+ len += 1; /* NULL terminator */
+
+ return len;
+}
+
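
The fall-through switch above simply sums the per-unit field widths. For a
seconds-resolution, timezone-aware value the budget works out as below (21
characters are reserved for a 64-bit year, 5 for a '+####'/'-####' offset, and
1 for the NUL terminator):

    # ':##' + ':##' + 'T##' + '-##' + '-##' + year + offset + NUL
    3 + 3 + 3 + 3 + 3 + 21 + 5 + 1   # 42
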
+
+/*
+ * Converts an npy_datetimestruct to an (almost) ISO 8601
+ * NULL-terminated string using timezone Z (UTC). If the string fits in
+ * the space exactly, it leaves out the NULL terminator and returns success.
+ *
+ * The differences from ISO 8601 are the 'NaT' string, and
+ * the number of year digits is >= 4 instead of strictly 4.
+ *
+ * 'base' restricts the output to that unit. Set 'base' to
+ * -1 to auto-detect a base after which all the values are zero.
+ *
+ * Returns 0 on success, -1 on failure (for example if the output
+ * string was too short).
+ */
+int make_iso_8601_datetime(npy_datetimestruct *dts, char *outstr, int outlen,
+ NPY_DATETIMEUNIT base) {
+ char *substr = outstr;
+ int sublen = outlen;
+ int tmplen;
+
+ /*
+ * Print weeks with the same precision as days.
+ *
+ * TODO: Could print weeks with YYYY-Www format if the week
+ * epoch is a Monday.
+ */
+ if (base == NPY_FR_W) {
+ base = NPY_FR_D;
+ }
+
+/* YEAR */
+/*
+ * Can't use PyOS_snprintf, because it always produces a '\0'
+ * character at the end, and NumPy string types are permitted
+ * to have data all the way to the end of the buffer.
+ */
+#ifdef _WIN32
+ tmplen = _snprintf(substr, sublen, "%04" NPY_INT64_FMT, dts->year);
+#else
+ tmplen = snprintf(substr, sublen, "%04" NPY_INT64_FMT, dts->year);
+#endif
+ /* If it ran out of space or there isn't space for the NULL terminator */
+ if (tmplen < 0 || tmplen > sublen) {
+ goto string_too_short;
+ }
+ substr += tmplen;
+ sublen -= tmplen;
+
+ /* Stop if the unit is years */
+ if (base == NPY_FR_Y) {
+ if (sublen > 0) {
+ *substr = '\0';
+ }
+ return 0;
+ }
+
+ /* MONTH */
+ if (sublen < 1) {
+ goto string_too_short;
+ }
+ substr[0] = '-';
+ if (sublen < 2) {
+ goto string_too_short;
+ }
+ substr[1] = (char)((dts->month / 10) + '0');
+ if (sublen < 3) {
+ goto string_too_short;
+ }
+ substr[2] = (char)((dts->month % 10) + '0');
+ substr += 3;
+ sublen -= 3;
+
+ /* Stop if the unit is months */
+ if (base == NPY_FR_M) {
+ if (sublen > 0) {
+ *substr = '\0';
+ }
+ return 0;
+ }
+
+ /* DAY */
+ if (sublen < 1) {
+ goto string_too_short;
+ }
+ substr[0] = '-';
+ if (sublen < 2) {
+ goto string_too_short;
+ }
+ substr[1] = (char)((dts->day / 10) + '0');
+ if (sublen < 3) {
+ goto string_too_short;
+ }
+ substr[2] = (char)((dts->day % 10) + '0');
+ substr += 3;
+ sublen -= 3;
+
+ /* Stop if the unit is days */
+ if (base == NPY_FR_D) {
+ if (sublen > 0) {
+ *substr = '\0';
+ }
+ return 0;
+ }
+
+ /* HOUR */
+ if (sublen < 1) {
+ goto string_too_short;
+ }
+ substr[0] = 'T';
+ if (sublen < 2) {
+ goto string_too_short;
+ }
+ substr[1] = (char)((dts->hour / 10) + '0');
+ if (sublen < 3) {
+ goto string_too_short;
+ }
+ substr[2] = (char)((dts->hour % 10) + '0');
+ substr += 3;
+ sublen -= 3;
+
+ /* Stop if the unit is hours */
+ if (base == NPY_FR_h) {
+ goto add_time_zone;
+ }
+
+ /* MINUTE */
+ if (sublen < 1) {
+ goto string_too_short;
+ }
+ substr[0] = ':';
+ if (sublen < 2) {
+ goto string_too_short;
+ }
+ substr[1] = (char)((dts->min / 10) + '0');
+ if (sublen < 3) {
+ goto string_too_short;
+ }
+ substr[2] = (char)((dts->min % 10) + '0');
+ substr += 3;
+ sublen -= 3;
+
+ /* Stop if the unit is minutes */
+ if (base == NPY_FR_m) {
+ goto add_time_zone;
+ }
+
+ /* SECOND */
+ if (sublen < 1) {
+ goto string_too_short;
+ }
+ substr[0] = ':';
+ if (sublen < 2) {
+ goto string_too_short;
+ }
+ substr[1] = (char)((dts->sec / 10) + '0');
+ if (sublen < 3) {
+ goto string_too_short;
+ }
+ substr[2] = (char)((dts->sec % 10) + '0');
+ substr += 3;
+ sublen -= 3;
+
+ /* Stop if the unit is seconds */
+ if (base == NPY_FR_s) {
+ goto add_time_zone;
+ }
+
+ /* MILLISECOND */
+ if (sublen < 1) {
+ goto string_too_short;
+ }
+ substr[0] = '.';
+ if (sublen < 2) {
+ goto string_too_short;
+ }
+ substr[1] = (char)((dts->us / 100000) % 10 + '0');
+ if (sublen < 3) {
+ goto string_too_short;
+ }
+ substr[2] = (char)((dts->us / 10000) % 10 + '0');
+ if (sublen < 4) {
+ goto string_too_short;
+ }
+ substr[3] = (char)((dts->us / 1000) % 10 + '0');
+ substr += 4;
+ sublen -= 4;
+
+ /* Stop if the unit is milliseconds */
+ if (base == NPY_FR_ms) {
+ goto add_time_zone;
+ }
+
+ /* MICROSECOND */
+ if (sublen < 1) {
+ goto string_too_short;
+ }
+ substr[0] = (char)((dts->us / 100) % 10 + '0');
+ if (sublen < 2) {
+ goto string_too_short;
+ }
+ substr[1] = (char)((dts->us / 10) % 10 + '0');
+ if (sublen < 3) {
+ goto string_too_short;
+ }
+ substr[2] = (char)(dts->us % 10 + '0');
+ substr += 3;
+ sublen -= 3;
+
+ /* Stop if the unit is microseconds */
+ if (base == NPY_FR_us) {
+ goto add_time_zone;
+ }
+
+ /* NANOSECOND */
+ if (sublen < 1) {
+ goto string_too_short;
+ }
+ substr[0] = (char)((dts->ps / 100000) % 10 + '0');
+ if (sublen < 2) {
+ goto string_too_short;
+ }
+ substr[1] = (char)((dts->ps / 10000) % 10 + '0');
+ if (sublen < 3) {
+ goto string_too_short;
+ }
+ substr[2] = (char)((dts->ps / 1000) % 10 + '0');
+ substr += 3;
+ sublen -= 3;
+
+ /* Stop if the unit is nanoseconds */
+ if (base == NPY_FR_ns) {
+ goto add_time_zone;
+ }
+
+ /* PICOSECOND */
+ if (sublen < 1) {
+ goto string_too_short;
+ }
+ substr[0] = (char)((dts->ps / 100) % 10 + '0');
+ if (sublen < 2) {
+ goto string_too_short;
+ }
+ substr[1] = (char)((dts->ps / 10) % 10 + '0');
+ if (sublen < 3) {
+ goto string_too_short;
+ }
+ substr[2] = (char)(dts->ps % 10 + '0');
+ substr += 3;
+ sublen -= 3;
+
+ /* Stop if the unit is picoseconds */
+ if (base == NPY_FR_ps) {
+ goto add_time_zone;
+ }
+
+ /* FEMTOSECOND */
+ if (sublen < 1) {
+ goto string_too_short;
+ }
+ substr[0] = (char)((dts->as / 100000) % 10 + '0');
+ if (sublen < 2) {
+ goto string_too_short;
+ }
+ substr[1] = (char)((dts->as / 10000) % 10 + '0');
+ if (sublen < 3) {
+ goto string_too_short;
+ }
+ substr[2] = (char)((dts->as / 1000) % 10 + '0');
+ substr += 3;
+ sublen -= 3;
+
+ /* Stop if the unit is femtoseconds */
+ if (base == NPY_FR_fs) {
+ goto add_time_zone;
+ }
+
+ /* ATTOSECOND */
+ if (sublen < 1) {
+ goto string_too_short;
+ }
+ substr[0] = (char)((dts->as / 100) % 10 + '0');
+ if (sublen < 2) {
+ goto string_too_short;
+ }
+ substr[1] = (char)((dts->as / 10) % 10 + '0');
+ if (sublen < 3) {
+ goto string_too_short;
+ }
+ substr[2] = (char)(dts->as % 10 + '0');
+ substr += 3;
+ sublen -= 3;
+
+add_time_zone:
+ /* UTC "Zulu" time */
+ if (sublen < 1) {
+ goto string_too_short;
+ }
+ substr[0] = 'Z';
+ substr += 1;
+ sublen -= 1;
+
+ /* Add a NULL terminator, and return */
+ if (sublen > 0) {
+ substr[0] = '\0';
+ }
+
+ return 0;
+
+string_too_short:
+ PyErr_Format(PyExc_RuntimeError,
+ "The string provided for NumPy ISO datetime formatting "
+ "was too short, with length %d",
+ outlen);
+ return -1;
+}
diff --git a/contrib/python/pandas/py2/pandas/_libs/tslibs/src/datetime/np_datetime_strings.h b/contrib/python/pandas/py2/pandas/_libs/tslibs/src/datetime/np_datetime_strings.h
new file mode 100644
index 00000000000..15d5dd357ea
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/_libs/tslibs/src/datetime/np_datetime_strings.h
@@ -0,0 +1,83 @@
+/*
+
+Copyright (c) 2016, PyData Development Team
+All rights reserved.
+
+Distributed under the terms of the BSD Simplified License.
+
+The full license is in the LICENSE file, distributed with this software.
+
+Written by Mark Wiebe ([email protected])
+Copyright (c) 2011 by Enthought, Inc.
+
+Copyright (c) 2005-2011, NumPy Developers
+All rights reserved.
+
+See NUMPY_LICENSE.txt for the license.
+
+This file implements string parsing and creation for NumPy datetime.
+
+*/
+
+#ifndef PANDAS__LIBS_TSLIBS_SRC_DATETIME_NP_DATETIME_STRINGS_H_
+#define PANDAS__LIBS_TSLIBS_SRC_DATETIME_NP_DATETIME_STRINGS_H_
+
+#ifndef NPY_NO_DEPRECATED_API
+#define NPY_NO_DEPRECATED_API NPY_1_7_API_VERSION
+#endif // NPY_NO_DEPRECATED_API
+
+/*
+ * Parses (almost) standard ISO 8601 date strings. The differences are:
+ *
+ * + If the date components are not separated, 4 (YYYY) or 8 (YYYYMMDD)
+ * digits are expected; 6 unseparated digits (YYMMDD) are rejected here.
+ * + Only seconds may have a decimal point, with up to 18 digits after it
+ * (maximum attoseconds precision).
+ * + Either a 'T' as in ISO 8601 or a ' ' may be used to separate
+ * the date and the time. Both are treated equivalently.
+ * + Doesn't (yet) handle the "YYYY-DDD" or "YYYY-Www" formats.
+ * + Doesn't handle leap seconds (seconds value has 60 in these cases).
+ * + Doesn't handle 24:00:00 as synonym for midnight (00:00:00) tomorrow
+ * + Accepts the special values "NaT" (not a time), "Today" (the current
+ * day according to local time) and "Now" (the current time in UTC).
+ *
+ * 'str' must be a NULL-terminated string, and 'len' must be its length.
+ *
+ * 'out' gets filled with the parsed date-time.
+ * 'out_local' gets set to 1 if the parsed time contains a timezone,
+ * to 0 otherwise.
+ * 'out_tzoffset' gets set to timezone offset by minutes
+ * if the parsed time was in local time,
+ * to 0 otherwise. The values 'now' and 'today' don't get counted
+ * as local, and neither do UTC +/-#### timezone offsets, because
+ * they aren't using the computer's local timezone offset.
+ *
+ * Returns 0 on success, -1 on failure.
+ */
+int
+parse_iso_8601_datetime(char *str, int len,
+ npy_datetimestruct *out,
+ int *out_local,
+ int *out_tzoffset);
+
+/*
+ * Provides a string length to use for converting datetime
+ * objects with the given local and unit settings.
+ */
+int
+get_datetime_iso_8601_strlen(int local, NPY_DATETIMEUNIT base);
+
+/*
+ * Converts an npy_datetimestruct to an (almost) ISO 8601
+ * NULL-terminated string using timezone Z (UTC).
+ *
+ * 'base' restricts the output to that unit. Set 'base' to
+ * -1 to auto-detect a base after which all the values are zero.
+ *
+ * Returns 0 on success, -1 on failure (for example if the output
+ * string was too short).
+ */
+int
+make_iso_8601_datetime(npy_datetimestruct *dts, char *outstr, int outlen,
+ NPY_DATETIMEUNIT base);
+
+#endif // PANDAS__LIBS_TSLIBS_SRC_DATETIME_NP_DATETIME_STRINGS_H_
diff --git a/contrib/python/pandas/py2/pandas/_libs/tslibs/strptime.pyx b/contrib/python/pandas/py2/pandas/_libs/tslibs/strptime.pyx
new file mode 100644
index 00000000000..87658ae9217
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/_libs/tslibs/strptime.pyx
@@ -0,0 +1,668 @@
+# -*- coding: utf-8 -*-
+"""Strptime-related classes and functions.
+"""
+import time
+import locale
+import calendar
+import re
+from datetime import date as datetime_date
+
+
+# Python 2 vs Python 3
+try:
+ from thread import allocate_lock as _thread_allocate_lock
+except ImportError:
+ try:
+ from _thread import allocate_lock as _thread_allocate_lock
+ except ImportError:
+ try:
+ from dummy_thread import allocate_lock as _thread_allocate_lock
+ except ImportError:
+ from _dummy_thread import allocate_lock as _thread_allocate_lock
+
+
+import pytz
+
+import numpy as np
+from numpy cimport int64_t
+
+
+from pandas._libs.tslibs.np_datetime cimport (
+ check_dts_bounds, dtstruct_to_dt64, npy_datetimestruct)
+
+from pandas._libs.tslibs.util cimport is_string_object
+
+from pandas._libs.tslibs.nattype cimport checknull_with_nat, NPY_NAT
+from pandas._libs.tslibs.nattype import nat_strings
+
+cdef dict _parse_code_table = {'y': 0,
+ 'Y': 1,
+ 'm': 2,
+ 'B': 3,
+ 'b': 4,
+ 'd': 5,
+ 'H': 6,
+ 'I': 7,
+ 'M': 8,
+ 'S': 9,
+ 'f': 10,
+ 'A': 11,
+ 'a': 12,
+ 'w': 13,
+ 'j': 14,
+ 'U': 15,
+ 'W': 16,
+ 'Z': 17,
+ 'p': 18, # an additional key, only with I
+ 'z': 19}
+
+
+def array_strptime(object[:] values, object fmt,
+ bint exact=True, errors='raise'):
+ """
+ Calculates the datetime structs represented by the passed array of strings
+
+ Parameters
+ ----------
+ values : ndarray of string-like objects
+ fmt : string-like strptime format (compiled to a regex internally)
+ exact : matches must be exact if True, search if False
+ errors : string specifying error handling, {'raise', 'ignore', 'coerce'}
+ """
+
+ cdef:
+ Py_ssize_t i, n = len(values)
+ npy_datetimestruct dts
+ int64_t[:] iresult
+ object[:] result_timezone
+ int year, month, day, minute, hour, second, weekday, julian
+ int week_of_year, week_of_year_start, parse_code, ordinal
+ int64_t us, ns
+ object val, group_key, ampm, found, timezone
+ dict found_key
+ bint is_raise = errors=='raise'
+ bint is_ignore = errors=='ignore'
+ bint is_coerce = errors=='coerce'
+
+ assert is_raise or is_ignore or is_coerce
+
+ if fmt is not None:
+ if '%W' in fmt or '%U' in fmt:
+ if '%Y' not in fmt and '%y' not in fmt:
+ raise ValueError("Cannot use '%W' or '%U' without "
+ "day and year")
+ if ('%A' not in fmt and '%a' not in fmt and '%w' not
+ in fmt):
+ raise ValueError("Cannot use '%W' or '%U' without "
+ "day and year")
+ elif '%Z' in fmt and '%z' in fmt:
+ raise ValueError("Cannot parse both %Z and %z")
+
+ global _TimeRE_cache, _regex_cache
+ with _cache_lock:
+ if _getlang() != _TimeRE_cache.locale_time.lang:
+ _TimeRE_cache = TimeRE()
+ _regex_cache.clear()
+ if len(_regex_cache) > _CACHE_MAX_SIZE:
+ _regex_cache.clear()
+ locale_time = _TimeRE_cache.locale_time
+ format_regex = _regex_cache.get(fmt)
+ if not format_regex:
+ try:
+ format_regex = _TimeRE_cache.compile(fmt)
+ # KeyError raised when a bad format is found; can be specified as
+ # \\, in which case it was a stray % but with a space after it
+ except KeyError, err:
+ bad_directive = err.args[0]
+ if bad_directive == "\\":
+ bad_directive = "%"
+ del err
+ raise ValueError("'%s' is a bad directive in format '%s'" %
+ (bad_directive, fmt))
+ # IndexError only occurs when the format string is "%"
+ except IndexError:
+ raise ValueError("stray %% in format '%s'" % fmt)
+ _regex_cache[fmt] = format_regex
+
+ result = np.empty(n, dtype='M8[ns]')
+ iresult = result.view('i8')
+ result_timezone = np.empty(n, dtype='object')
+
+ dts.us = dts.ps = dts.as = 0
+
+ for i in range(n):
+ val = values[i]
+ if is_string_object(val):
+ if val in nat_strings:
+ iresult[i] = NPY_NAT
+ continue
+ else:
+ if checknull_with_nat(val):
+ iresult[i] = NPY_NAT
+ continue
+ else:
+ val = str(val)
+
+ # exact matching
+ if exact:
+ found = format_regex.match(val)
+ if not found:
+ if is_coerce:
+ iresult[i] = NPY_NAT
+ continue
+ raise ValueError("time data %r does not match "
+ "format %r (match)" % (values[i], fmt))
+ if len(val) != found.end():
+ if is_coerce:
+ iresult[i] = NPY_NAT
+ continue
+ raise ValueError("unconverted data remains: %s" %
+ values[i][found.end():])
+
+ # search
+ else:
+ found = format_regex.search(val)
+ if not found:
+ if is_coerce:
+ iresult[i] = NPY_NAT
+ continue
+ raise ValueError("time data %r does not match format "
+ "%r (search)" % (values[i], fmt))
+
+ year = 1900
+ month = day = 1
+ hour = minute = second = ns = us = 0
+ timezone = None
+ # Default to -1 to signify that the values are not known; not critical
+ # to have, though
+ week_of_year = -1
+ week_of_year_start = -1
+ # weekday and julian defaulted to -1 so as to signal need to calculate
+ # values
+ weekday = julian = -1
+ found_dict = found.groupdict()
+ for group_key in found_dict.iterkeys():
+ # Directives not explicitly handled below:
+ # c, x, X
+ # handled by making out of other directives
+ # U, W
+ # worthless without day of the week
+ parse_code = _parse_code_table[group_key]
+
+ if parse_code == 0:
+ year = int(found_dict['y'])
+ # Open Group specification for strptime() states that a %y
+ # value in the range of [00, 68] is in the century 2000, while
+ # [69,99] is in the century 1900
+ if year <= 68:
+ year += 2000
+ else:
+ year += 1900
+ elif parse_code == 1:
+ year = int(found_dict['Y'])
+ elif parse_code == 2:
+ month = int(found_dict['m'])
+ elif parse_code == 3:
+ # elif group_key == 'B':
+ month = locale_time.f_month.index(found_dict['B'].lower())
+ elif parse_code == 4:
+ # elif group_key == 'b':
+ month = locale_time.a_month.index(found_dict['b'].lower())
+ elif parse_code == 5:
+ # elif group_key == 'd':
+ day = int(found_dict['d'])
+ elif parse_code == 6:
+ # elif group_key == 'H':
+ hour = int(found_dict['H'])
+ elif parse_code == 7:
+ hour = int(found_dict['I'])
+ ampm = found_dict.get('p', '').lower()
+ # If there was no AM/PM indicator, we'll treat this like AM
+ if ampm in ('', locale_time.am_pm[0]):
+ # We're in AM so the hour is correct unless we're
+ # looking at 12 midnight.
+ # 12 midnight == 12 AM == hour 0
+ if hour == 12:
+ hour = 0
+ elif ampm == locale_time.am_pm[1]:
+ # We're in PM so we need to add 12 to the hour unless
+ # we're looking at 12 noon.
+ # 12 noon == 12 PM == hour 12
+ if hour != 12:
+ hour += 12
+ elif parse_code == 8:
+ minute = int(found_dict['M'])
+ elif parse_code == 9:
+ second = int(found_dict['S'])
+ elif parse_code == 10:
+ s = found_dict['f']
+ # Pad to always return nanoseconds
+ s += "0" * (9 - len(s))
+ us = long(s)
+ ns = us % 1000
+ us = us / 1000
+ elif parse_code == 11:
+ weekday = locale_time.f_weekday.index(found_dict['A'].lower())
+ elif parse_code == 12:
+ weekday = locale_time.a_weekday.index(found_dict['a'].lower())
+ elif parse_code == 13:
+ weekday = int(found_dict['w'])
+ if weekday == 0:
+ weekday = 6
+ else:
+ weekday -= 1
+ elif parse_code == 14:
+ julian = int(found_dict['j'])
+ elif parse_code == 15 or parse_code == 16:
+ week_of_year = int(found_dict[group_key])
+ if group_key == 'U':
+ # U starts week on Sunday.
+ week_of_year_start = 6
+ else:
+ # W starts week on Monday.
+ week_of_year_start = 0
+ elif parse_code == 17:
+ timezone = pytz.timezone(found_dict['Z'])
+ elif parse_code == 19:
+ timezone = parse_timezone_directive(found_dict['z'])
+
+ # If we know the week of the year and what day of that week, we can
+ # figure out the Julian day of the year.
+ if julian == -1 and week_of_year != -1 and weekday != -1:
+ week_starts_Mon = True if week_of_year_start == 0 else False
+ julian = _calc_julian_from_U_or_W(year, week_of_year, weekday,
+ week_starts_Mon)
+ # Cannot pre-calculate datetime_date() since it can change in the Julian
+ # calculation and thus could have a different value for the day-of-the-week
+ # calculation.
+ try:
+ if julian == -1:
+ # Need to add 1 to result since first day of the year is 1, not
+ # 0.
+ ordinal = datetime_date(year, month, day).toordinal()
+ julian = ordinal - datetime_date(year, 1, 1).toordinal() + 1
+ else:
+ # Assume that if they bothered to include Julian day it will
+ # be accurate.
+ datetime_result = datetime_date.fromordinal(
+ (julian - 1) + datetime_date(year, 1, 1).toordinal())
+ year = datetime_result.year
+ month = datetime_result.month
+ day = datetime_result.day
+ except ValueError:
+ if is_coerce:
+ iresult[i] = NPY_NAT
+ continue
+ raise
+ if weekday == -1:
+ weekday = datetime_date(year, month, day).weekday()
+
+ dts.year = year
+ dts.month = month
+ dts.day = day
+ dts.hour = hour
+ dts.min = minute
+ dts.sec = second
+ dts.us = us
+ dts.ps = ns * 1000
+
+ iresult[i] = dtstruct_to_dt64(&dts)
+ try:
+ check_dts_bounds(&dts)
+ except ValueError:
+ if is_coerce:
+ iresult[i] = NPY_NAT
+ continue
+ raise
+
+ result_timezone[i] = timezone
+
+ return result, result_timezone.base
+
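
At the pandas level this function backs pd.to_datetime(..., format=...). A brief
illustration of the errors handling described in the docstring (the exact API
surface may vary slightly across versions):

    import pandas as pd

    pd.to_datetime(["2019-03-01 12:00", "not a date"],
                   format="%Y-%m-%d %H:%M", errors="coerce")
    # DatetimeIndex(['2019-03-01 12:00:00', 'NaT'], dtype='datetime64[ns]', freq=None)
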
+
+"""_getlang, LocaleTime, TimeRE, _calc_julian_from_U_or_W are vendored
+from the standard library, see
+https://github.com/python/cpython/blob/master/Lib/_strptime.py
+The original module-level docstring follows.
+
+Strptime-related classes and functions.
+CLASSES:
+ LocaleTime -- Discovers and stores locale-specific time information
+ TimeRE -- Creates regexes for pattern matching a string of text containing
+ time information
+FUNCTIONS:
+ _getlang -- Figure out what language is being used for the locale
+ strptime -- Calculates the time struct represented by the passed-in string
+"""
+
+
+def _getlang():
+ """Figure out what language is being used for the locale"""
+ return locale.getlocale(locale.LC_TIME)
+
+
+class LocaleTime(object):
+ """Stores and handles locale-specific information related to time.
+
+ ATTRIBUTES:
+ f_weekday -- full weekday names (7-item list)
+ a_weekday -- abbreviated weekday names (7-item list)
+ f_month -- full month names (13-item list; dummy value in [0], which
+ is added by code)
+ a_month -- abbreviated month names (13-item list, dummy value in
+ [0], which is added by code)
+ am_pm -- AM/PM representation (2-item list)
+ LC_date_time -- format string for date/time representation (string)
+ LC_date -- format string for date representation (string)
+ LC_time -- format string for time representation (string)
+ timezone -- daylight- and non-daylight-savings timezone representation
+ (2-item list of sets)
+ lang -- Language used by instance (2-item tuple)
+ """
+
+ def __init__(self):
+ """Set all attributes.
+
+ Order of methods called matters for dependency reasons.
+
+ The locale language is set at the offset and then checked again before
+ exiting. This is to make sure that the attributes were not set with a
+ mix of information from more than one locale. This would most likely
+ happen when using threads where one thread calls a locale-dependent
+ function while another thread changes the locale while the function in
+ the other thread is still running. Proper coding would call for
+ locks to prevent changing the locale while locale-dependent code is
+ running. The check here is done in case someone does not think about
+ doing this.
+
+ Only other possible issue is if someone changed the timezone and did
+ not call tz.tzset . That is an issue for the programmer, though,
+ since changing the timezone is worthless without that call.
+
+ """
+ self.lang = _getlang()
+ self.__calc_weekday()
+ self.__calc_month()
+ self.__calc_am_pm()
+ self.__calc_timezone()
+ self.__calc_date_time()
+ if _getlang() != self.lang:
+ raise ValueError("locale changed during initialization")
+
+ def __pad(self, seq, front):
+ # Add '' to seq to either the front (is True), else the back.
+ seq = list(seq)
+ if front:
+ seq.insert(0, '')
+ else:
+ seq.append('')
+ return seq
+
+ def __calc_weekday(self):
+ # Set self.a_weekday and self.f_weekday using the calendar
+ # module.
+ a_weekday = [calendar.day_abbr[i].lower() for i in range(7)]
+ f_weekday = [calendar.day_name[i].lower() for i in range(7)]
+ self.a_weekday = a_weekday
+ self.f_weekday = f_weekday
+
+ def __calc_month(self):
+ # Set self.f_month and self.a_month using the calendar module.
+ a_month = [calendar.month_abbr[i].lower() for i in range(13)]
+ f_month = [calendar.month_name[i].lower() for i in range(13)]
+ self.a_month = a_month
+ self.f_month = f_month
+
+ def __calc_am_pm(self):
+ # Set self.am_pm by using time.strftime().
+
+ # The magic date (1999,3,17,hour,44,55,2,76,0) is not really that
+ # magical; just happened to have used it everywhere else where a
+ # static date was needed.
+ am_pm = []
+ for hour in (1, 22):
+ time_tuple = time.struct_time(
+ (1999, 3, 17, hour, 44, 55, 2, 76, 0))
+ am_pm.append(time.strftime("%p", time_tuple).lower())
+ self.am_pm = am_pm
+
+ def __calc_date_time(self):
+ # Set self.date_time, self.date, & self.time by using
+ # time.strftime().
+
+ # Use (1999,3,17,22,44,55,2,76,0) for magic date because the amount of
+ # overloaded numbers is minimized. The order in which searches for
+ # values within the format string is very important; it eliminates
+ # possible ambiguity for what something represents.
+ time_tuple = time.struct_time((1999, 3, 17, 22, 44, 55, 2, 76, 0))
+ date_time = [None, None, None]
+ date_time[0] = time.strftime("%c", time_tuple).lower()
+ date_time[1] = time.strftime("%x", time_tuple).lower()
+ date_time[2] = time.strftime("%X", time_tuple).lower()
+ replacement_pairs = [('%', '%%'), (self.f_weekday[2], '%A'),
+ (self.f_month[3], '%B'),
+ (self.a_weekday[2], '%a'),
+ (self.a_month[3], '%b'), (self.am_pm[1], '%p'),
+ ('1999', '%Y'), ('99', '%y'), ('22', '%H'),
+ ('44', '%M'), ('55', '%S'), ('76', '%j'),
+ ('17', '%d'), ('03', '%m'), ('3', '%m'),
+ # '3' needed for when no leading zero.
+ ('2', '%w'), ('10', '%I')]
+ replacement_pairs.extend([(tz, "%Z") for tz_values in self.timezone
+ for tz in tz_values])
+ for offset, directive in ((0, '%c'), (1, '%x'), (2, '%X')):
+ current_format = date_time[offset]
+ for old, new in replacement_pairs:
+ # Must deal with possible lack of locale info
+ # manifesting itself as the empty string (e.g., Swedish's
+ # lack of AM/PM info) or a platform returning a tuple of empty
+ # strings (e.g., MacOS 9 having timezone as ('','')).
+ if old:
+ current_format = current_format.replace(old, new)
+ # If %W is used, then Sunday, 2005-01-03 will fall on week 0 since
+ # 2005-01-03 occurs before the first Monday of the year. Otherwise
+ # %U is used.
+ time_tuple = time.struct_time((1999, 1, 3, 1, 1, 1, 6, 3, 0))
+ if '00' in time.strftime(directive, time_tuple):
+ U_W = '%W'
+ else:
+ U_W = '%U'
+ date_time[offset] = current_format.replace('11', U_W)
+ self.LC_date_time = date_time[0]
+ self.LC_date = date_time[1]
+ self.LC_time = date_time[2]
+
+ def __calc_timezone(self):
+ # Set self.timezone by using time.tzname.
+ # Do not worry about possibility of time.tzname[0] == timetzname[1]
+ # and time.daylight; handle that in strptime .
+ try:
+ time.tzset()
+ except AttributeError:
+ pass
+ no_saving = frozenset(["utc", "gmt", time.tzname[0].lower()])
+ if time.daylight:
+ has_saving = frozenset([time.tzname[1].lower()])
+ else:
+ has_saving = frozenset()
+ self.timezone = (no_saving, has_saving)
+
+
+class TimeRE(dict):
+ """
+ Handle conversion from format directives to regexes.
+
+ Creates regexes for pattern matching a string of text containing
+ time information
+ """
+
+ def __init__(self, locale_time=None):
+ """Create keys/values.
+
+ Order of execution is important for dependency reasons.
+
+ """
+ if locale_time:
+ self.locale_time = locale_time
+ else:
+ self.locale_time = LocaleTime()
+ base = super(TimeRE, self)
+ base.__init__({
+ # The " \d" part of the regex is to make %c from ANSI C work
+ 'd': r"(?P<d>3[0-1]|[1-2]\d|0[1-9]|[1-9]| [1-9])",
+ 'f': r"(?P<f>[0-9]{1,9})",
+ 'H': r"(?P<H>2[0-3]|[0-1]\d|\d)",
+ 'I': r"(?P<I>1[0-2]|0[1-9]|[1-9])",
+ 'j': (r"(?P<j>36[0-6]|3[0-5]\d|[1-2]\d\d|0[1-9]\d|00[1-9]|"
+ r"[1-9]\d|0[1-9]|[1-9])"),
+ 'm': r"(?P<m>1[0-2]|0[1-9]|[1-9])",
+ 'M': r"(?P<M>[0-5]\d|\d)",
+ 'S': r"(?P<S>6[0-1]|[0-5]\d|\d)",
+ 'U': r"(?P<U>5[0-3]|[0-4]\d|\d)",
+ 'w': r"(?P<w>[0-6])",
+ # W is set below by using 'U'
+ 'y': r"(?P<y>\d\d)",
+ # XXX: Does 'Y' need to worry about having less or more than
+ # 4 digits?
+ 'Y': r"(?P<Y>\d\d\d\d)",
+ 'z': r"(?P<z>[+-]\d\d:?[0-5]\d(:?[0-5]\d(\.\d{1,6})?)?|Z)",
+ 'A': self.__seqToRE(self.locale_time.f_weekday, 'A'),
+ 'a': self.__seqToRE(self.locale_time.a_weekday, 'a'),
+ 'B': self.__seqToRE(self.locale_time.f_month[1:], 'B'),
+ 'b': self.__seqToRE(self.locale_time.a_month[1:], 'b'),
+ 'p': self.__seqToRE(self.locale_time.am_pm, 'p'),
+ 'Z': self.__seqToRE(pytz.all_timezones, 'Z'),
+ '%': '%'})
+ base.__setitem__('W', base.__getitem__('U').replace('U', 'W'))
+ base.__setitem__('c', self.pattern(self.locale_time.LC_date_time))
+ base.__setitem__('x', self.pattern(self.locale_time.LC_date))
+ base.__setitem__('X', self.pattern(self.locale_time.LC_time))
+
+ def __seqToRE(self, to_convert, directive):
+ """Convert a list to a regex string for matching a directive.
+
+ Want possible matching values to be from longest to shortest. This
+ prevents the possibility of a match occurring for a value that is also
+ a substring of a larger value that should have matched (e.g., 'abc'
+ matching when 'abcdef' should have been the match).
+
+ """
+ to_convert = sorted(to_convert, key=len, reverse=True)
+ for value in to_convert:
+ if value != '':
+ break
+ else:
+ return ''
+ regex = '|'.join(re.escape(stuff) for stuff in to_convert)
+ regex = '(?P<%s>%s' % (directive, regex)
+ return '%s)' % regex
+
+ def pattern(self, format):
+ """Return regex pattern for the format string.
+
+ Need to make sure that any characters that might be interpreted as
+ regex syntax are escaped.
+
+ """
+ processed_format = ''
+ # The sub() call escapes all characters that might be misconstrued
+ # as regex syntax. Cannot use re.escape since we have to deal with
+ # format directives (%m, etc.).
+ regex_chars = re.compile(r"([\\.^$*+?\(\){}\[\]|])")
+ format = regex_chars.sub(r"\\\1", format)
+ whitespace_replacement = re.compile(r'\s+')
+ format = whitespace_replacement.sub(r'\\s+', format)
+ while '%' in format:
+ directive_index = format.index('%') +1
+ processed_format = "%s%s%s" % (processed_format,
+ format[:directive_index -1],
+ self[format[directive_index]])
+ format = format[directive_index +1:]
+ return "%s%s" % (processed_format, format)
+
+ def compile(self, format):
+ """Return a compiled re object for the format string."""
+ return re.compile(self.pattern(format), re.IGNORECASE)
+
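
The longest-first sorting in __seqToRE matters because regex alternation is
eager: a short alternative listed first would win even when a longer one also
matches. A minimal standalone illustration:

    import re

    pattern = "|".join(sorted(["mar", "march"], key=len, reverse=True))  # 'march|mar'
    re.match(pattern, "march 17").group()                                # 'march'
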
+
+_cache_lock = _thread_allocate_lock()
+# DO NOT modify _TimeRE_cache or _regex_cache without acquiring the cache lock
+# first!
+_TimeRE_cache = TimeRE()
+_CACHE_MAX_SIZE = 5 # Max number of regexes stored in _regex_cache
+_regex_cache = {}
+
+
+cdef _calc_julian_from_U_or_W(int year, int week_of_year,
+ int day_of_week, int week_starts_Mon):
+ """Calculate the Julian day based on the year, week of the year, and day of
+ the week, with week_start_day representing whether the week of the year
+ assumes the week starts on Sunday or Monday (6 or 0)."""
+
+ cdef:
+ int first_weekday, week_0_length, days_to_week
+
+ first_weekday = datetime_date(year, 1, 1).weekday()
+ # If we are dealing with the %U directive (week starts on Sunday), it's
+ # easier to just shift the view to Sunday being the first day of the
+ # week.
+ if not week_starts_Mon:
+ first_weekday = (first_weekday + 1) % 7
+ day_of_week = (day_of_week + 1) % 7
+
+ # Need to watch out for a week 0 (when the first day of the year is not
+ # the same as that specified by %U or %W).
+ week_0_length = (7 - first_weekday) % 7
+ if week_of_year == 0:
+ return 1 + day_of_week - first_weekday
+ else:
+ days_to_week = week_0_length + (7 * (week_of_year - 1))
+ return 1 + days_to_week + day_of_week
+
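
A worked example of the %U / %w path handled by this helper (using the standard
library's strptime, which follows the same rule): 1999-01-01 fell on a Friday,
so the first Sunday of 1999 is January 3rd, i.e. week 1, weekday 0 under %U.

    from datetime import datetime

    datetime.strptime("1999 1 0", "%Y %U %w")  # datetime.datetime(1999, 1, 3, 0, 0)
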
+
+cdef parse_timezone_directive(object z):
+ """
+ Parse the '%z' directive and return a pytz.FixedOffset
+
+ Parameters
+ ----------
+ z : string of the UTC offset
+
+ Returns
+ -------
+ pytz.FixedOffset
+
+ Notes
+ -----
+ This is essentially similar to the cpython implementation
+ https://github.com/python/cpython/blob/master/Lib/_strptime.py#L457-L479
+ """
+
+ cdef:
+ int gmtoff_fraction, hours, minutes, seconds, pad_number, microseconds
+ int total_minutes
+ object gmtoff_remainder, gmtoff_remainder_padding
+
+ if z == 'Z':
+ return pytz.FixedOffset(0)
+ if z[3] == ':':
+ z = z[:3] + z[4:]
+ if len(z) > 5:
+ if z[5] != ':':
+ msg = "Inconsistent use of : in {0}"
+ raise ValueError(msg.format(z))
+ z = z[:5] + z[6:]
+ hours = int(z[1:3])
+ minutes = int(z[3:5])
+ seconds = int(z[5:7] or 0)
+
+ # Pad to always return microseconds.
+ gmtoff_remainder = z[8:]
+ pad_number = 6 - len(gmtoff_remainder)
+ gmtoff_remainder_padding = "0" * pad_number
+ microseconds = int(gmtoff_remainder + gmtoff_remainder_padding)
+
+ total_minutes = ((hours * 60) + minutes + (seconds / 60) +
+ (microseconds / 60000000))
+ total_minutes = -total_minutes if z.startswith("-") else total_minutes
+ return pytz.FixedOffset(total_minutes)
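
For reference, pytz.FixedOffset takes the offset in whole minutes, which is the
value computed above. For example:

    import pytz

    pytz.FixedOffset(5 * 60 + 30)  # what '+05:30' maps to: pytz.FixedOffset(330)
    pytz.FixedOffset(-8 * 60)      # what '-08:00' maps to: pytz.FixedOffset(-480)
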
diff --git a/contrib/python/pandas/py2/pandas/_libs/tslibs/timedeltas.pxd b/contrib/python/pandas/py2/pandas/_libs/tslibs/timedeltas.pxd
new file mode 100644
index 00000000000..097309b1782
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/_libs/tslibs/timedeltas.pxd
@@ -0,0 +1,8 @@
+# -*- coding: utf-8 -*-
+
+from numpy cimport int64_t
+
+# Exposed for tslib, not intended for outside use.
+cdef int64_t cast_from_unit(object ts, object unit) except? -1
+cpdef int64_t delta_to_nanoseconds(delta) except? -1
+cdef convert_to_timedelta64(object ts, object unit)
diff --git a/contrib/python/pandas/py2/pandas/_libs/tslibs/timedeltas.pyx b/contrib/python/pandas/py2/pandas/_libs/tslibs/timedeltas.pyx
new file mode 100644
index 00000000000..8a46a0d1f1c
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/_libs/tslibs/timedeltas.pyx
@@ -0,0 +1,1534 @@
+# -*- coding: utf-8 -*-
+import collections
+import textwrap
+import warnings
+
+import sys
+cdef bint PY3 = (sys.version_info[0] >= 3)
+
+import cython
+
+from cpython cimport Py_NE, Py_EQ, PyObject_RichCompare
+
+import numpy as np
+cimport numpy as cnp
+from numpy cimport int64_t
+cnp.import_array()
+
+from cpython.datetime cimport (datetime, timedelta,
+ PyDateTime_CheckExact,
+ PyDateTime_Check, PyDelta_Check,
+ PyDateTime_IMPORT)
+PyDateTime_IMPORT
+
+
+cimport pandas._libs.tslibs.util as util
+from pandas._libs.tslibs.util cimport (
+ is_timedelta64_object, is_datetime64_object, is_integer_object,
+ is_float_object, is_string_object)
+
+from pandas._libs.tslibs.ccalendar import DAY_SECONDS
+
+from pandas._libs.tslibs.np_datetime cimport (
+ cmp_scalar, reverse_ops, td64_to_tdstruct, pandas_timedeltastruct)
+
+from pandas._libs.tslibs.nattype import nat_strings
+from pandas._libs.tslibs.nattype cimport (
+ checknull_with_nat, NPY_NAT, c_NaT as NaT)
+from pandas._libs.tslibs.offsets cimport to_offset
+from pandas._libs.tslibs.offsets import _Tick as Tick
+
+# ----------------------------------------------------------------------
+# Constants
+
+# components named tuple
+Components = collections.namedtuple('Components', [
+ 'days', 'hours', 'minutes', 'seconds',
+ 'milliseconds', 'microseconds', 'nanoseconds'])
+
+
+cdef dict timedelta_abbrevs = { 'Y': 'Y',
+ 'y': 'Y',
+ 'M': 'M',
+ 'W': 'W',
+ 'w': 'W',
+ 'D': 'D',
+ 'd': 'D',
+ 'days': 'D',
+ 'day': 'D',
+ 'hours': 'h',
+ 'hour': 'h',
+ 'hr': 'h',
+ 'h': 'h',
+ 'm': 'm',
+ 'minute': 'm',
+ 'min': 'm',
+ 'minutes': 'm',
+ 't': 'm',
+ 's': 's',
+ 'seconds': 's',
+ 'sec': 's',
+ 'second': 's',
+ 'ms': 'ms',
+ 'milliseconds': 'ms',
+ 'millisecond': 'ms',
+ 'milli': 'ms',
+ 'millis': 'ms',
+ 'l': 'ms',
+ 'us': 'us',
+ 'microseconds': 'us',
+ 'microsecond': 'us',
+ 'micro': 'us',
+ 'micros': 'us',
+ 'u': 'us',
+ 'ns': 'ns',
+ 'nanoseconds': 'ns',
+ 'nano': 'ns',
+ 'nanos': 'ns',
+ 'nanosecond': 'ns',
+ 'n': 'ns'}
+
+_no_input = object()
+
+
+# ----------------------------------------------------------------------
+# API
+
+def ints_to_pytimedelta(int64_t[:] arr, box=False):
+ """
+ convert an i8 repr to an ndarray of timedelta or Timedelta (if box ==
+ True)
+
+ Parameters
+ ----------
+ arr : ndarray[int64_t]
+ box : bool, default False
+
+ Returns
+ -------
+ result : ndarray[object]
+ array of Timedelta or datetime.timedelta objects
+ """
+ cdef:
+ Py_ssize_t i, n = len(arr)
+ int64_t value
+ object[:] result = np.empty(n, dtype=object)
+
+ for i in range(n):
+
+ value = arr[i]
+ if value == NPY_NAT:
+ result[i] = NaT
+ else:
+ if box:
+ result[i] = Timedelta(value)
+ else:
+ result[i] = timedelta(microseconds=int(value) / 1000)
+
+ return result.base # .base to access underlying np.ndarray
+
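+# Illustrative sketch (annotation, not part of the upstream pandas source):
+# given i8 nanosecond values such as arr = np.array([86400 * 10**9, NPY_NAT],
+# dtype='i8'), ints_to_pytimedelta(arr) would yield roughly
+#   [datetime.timedelta(days=1), NaT]
+# while box=True would box the first element as a Timedelta of one day.
+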
+
+# ----------------------------------------------------------------------
+
+cpdef int64_t delta_to_nanoseconds(delta) except? -1:
+ if hasattr(delta, 'nanos'):
+ return delta.nanos
+ if hasattr(delta, 'delta'):
+ delta = delta.delta
+ if is_timedelta64_object(delta):
+ return delta.astype("timedelta64[ns]").item()
+ if is_integer_object(delta):
+ return delta
+ if PyDelta_Check(delta):
+ return (delta.days * 24 * 60 * 60 * 1000000 +
+ delta.seconds * 1000000 +
+ delta.microseconds) * 1000
+
+ raise TypeError(type(delta))
+
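+# Illustrative sketch (annotation, not part of the upstream pandas source):
+#   delta_to_nanoseconds(timedelta(days=1, seconds=1)) -> 86401000000000
+#   delta_to_nanoseconds(np.timedelta64(5, 'ms'))      -> 5000000
+#   delta_to_nanoseconds(42)                           -> 42
+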
+
+cdef convert_to_timedelta64(object ts, object unit):
+ """
+ Convert an incoming object to a timedelta64 if possible.
+ Before calling, unit must be standardized to avoid repeated unit conversion
+
+ Handle these types of objects:
+ - timedelta/Timedelta
+ - timedelta64
+ - an offset
+ - np.int64 (with unit providing a possible modifier)
+ - None/NaT
+
+ Return an ns based int64
+ """
+ if checknull_with_nat(ts):
+ return np.timedelta64(NPY_NAT)
+ elif isinstance(ts, Timedelta):
+ # already in the proper format
+ ts = np.timedelta64(ts.value)
+ elif is_datetime64_object(ts):
+ # only accept a NaT here
+ if ts.astype('int64') == NPY_NAT:
+ return np.timedelta64(NPY_NAT)
+ elif is_timedelta64_object(ts):
+ ts = ts.astype("m8[{unit}]".format(unit=unit.lower()))
+ elif is_integer_object(ts):
+ if ts == NPY_NAT:
+ return np.timedelta64(NPY_NAT)
+ else:
+ if unit in ['Y', 'M', 'W']:
+ ts = np.timedelta64(ts, unit)
+ else:
+ ts = cast_from_unit(ts, unit)
+ ts = np.timedelta64(ts)
+ elif is_float_object(ts):
+ if unit in ['Y', 'M', 'W']:
+ ts = np.timedelta64(int(ts), unit)
+ else:
+ ts = cast_from_unit(ts, unit)
+ ts = np.timedelta64(ts)
+ elif is_string_object(ts):
+ if len(ts) > 0 and ts[0] == 'P':
+ ts = parse_iso_format_string(ts)
+ else:
+ ts = parse_timedelta_string(ts)
+ ts = np.timedelta64(ts)
+ elif hasattr(ts, 'delta'):
+ ts = np.timedelta64(delta_to_nanoseconds(ts), 'ns')
+
+ if PyDelta_Check(ts):
+ ts = np.timedelta64(delta_to_nanoseconds(ts), 'ns')
+ elif not is_timedelta64_object(ts):
+ raise ValueError("Invalid type for timedelta "
+ "scalar: {ts_type}".format(ts_type=type(ts)))
+ return ts.astype('timedelta64[ns]')
+
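+# Illustrative sketch (annotation, not part of the upstream pandas source),
+# assuming the unit has already been standardized as described above:
+#   convert_to_timedelta64(10, 's')       -> np.timedelta64(10000000000, 'ns')
+#   convert_to_timedelta64('1 min', 'ns') -> np.timedelta64(60000000000, 'ns')
+#   convert_to_timedelta64(None, 'ns')    -> np.timedelta64('NaT')
+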
+
+def array_to_timedelta64(object[:] values, unit='ns', errors='raise'):
+ """
+ Convert an ndarray to an array of timedeltas. If errors == 'coerce',
+ coerce non-convertible objects to NaT. Otherwise, raise.
+ """
+
+ cdef:
+ Py_ssize_t i, n
+ int64_t[:] iresult
+
+ if errors not in ('ignore', 'raise', 'coerce'):
+ raise ValueError("errors must be one of 'ignore', "
+ "'raise', or 'coerce'}")
+
+ n = values.shape[0]
+ result = np.empty(n, dtype='m8[ns]')
+ iresult = result.view('i8')
+
+ # Usually, we have all strings. If so, we hit the fast path.
+ # If this path fails, we try conversion a different way, and
+ # this is where all of the error handling will take place.
+ try:
+ for i in range(n):
+ result[i] = parse_timedelta_string(values[i])
+ except:
+ unit = parse_timedelta_unit(unit)
+ for i in range(n):
+ try:
+ result[i] = convert_to_timedelta64(values[i], unit)
+ except ValueError:
+ if errors == 'coerce':
+ result[i] = NPY_NAT
+ else:
+ raise
+
+ return iresult.base # .base to access underlying np.ndarray
+
+
+cpdef inline object precision_from_unit(object unit):
+ """
+ Return a casting of the unit represented to nanoseconds + the precision
+ to round the fractional part.
+ """
+ cdef:
+ int64_t m
+ int p
+
+ if unit == 'Y':
+ m = 1000000000L * 31556952
+ p = 9
+ elif unit == 'M':
+ m = 1000000000L * 2629746
+ p = 9
+ elif unit == 'W':
+ m = 1000000000L * DAY_SECONDS * 7
+ p = 9
+ elif unit == 'D' or unit == 'd':
+ m = 1000000000L * DAY_SECONDS
+ p = 9
+ elif unit == 'h':
+ m = 1000000000L * 3600
+ p = 9
+ elif unit == 'm':
+ m = 1000000000L * 60
+ p = 9
+ elif unit == 's':
+ m = 1000000000L
+ p = 9
+ elif unit == 'ms':
+ m = 1000000L
+ p = 6
+ elif unit == 'us':
+ m = 1000L
+ p = 3
+ elif unit == 'ns' or unit is None:
+ m = 1L
+ p = 0
+ else:
+ raise ValueError("cannot cast unit {unit}".format(unit=unit))
+ return m, p
+
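+# Illustrative sketch (annotation, not part of the upstream pandas source):
+#   precision_from_unit('s')  -> (1000000000, 9)
+#   precision_from_unit('ms') -> (1000000, 6)
+#   precision_from_unit('us') -> (1000, 3)
+#   precision_from_unit(None) -> (1, 0)    # None is treated as 'ns'
+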
+
+cdef inline int64_t cast_from_unit(object ts, object unit) except? -1:
+ """ return a casting of the unit represented to nanoseconds
+ round the fractional part of a float to our precision, p """
+ cdef:
+ int64_t m
+ int p
+
+ m, p = precision_from_unit(unit)
+
+ # just give me the unit back
+ if ts is None:
+ return m
+
+ # cast the unit, multiply base/frac separately
+ # to avoid precision issues from float -> int
+ base = <int64_t>ts
+ frac = ts - base
+ if p:
+ frac = round(frac, p)
+ return <int64_t>(base * m) + <int64_t>(frac * m)
+
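+# Illustrative sketch (annotation, not part of the upstream pandas source):
+# the integer and fractional parts are scaled separately, e.g. for
+# cast_from_unit(2.5, 'ms'): m = 1000000, base = 2, frac = 0.5, so the
+# result is 2 * 1000000 + 0.5 * 1000000 == 2500000 nanoseconds.
+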
+
+cdef inline _decode_if_necessary(object ts):
+ # decode ts if necessary
+ if not isinstance(ts, unicode) and not PY3:
+ ts = str(ts).decode('utf-8')
+
+ return ts
+
+
+cdef inline parse_timedelta_string(object ts):
+ """
+ Parse a regular format timedelta string. Return an int64_t (in ns)
+ or raise a ValueError on an invalid parse.
+ """
+
+ cdef:
+ unicode c
+ bint neg = 0, have_dot = 0, have_value = 0, have_hhmmss = 0
+ object current_unit = None
+ int64_t result = 0, m = 0, r
+ list number = [], frac = [], unit = []
+
+ # neg : tracks if we have a leading negative for the value
+ # have_dot : tracks if we are processing a dot (either post hhmmss or
+ # inside an expression)
+ # have_value : track if we have at least 1 leading unit
+ # have_hhmmss : tracks if we have a regular format hh:mm:ss
+
+ if len(ts) == 0 or ts in nat_strings:
+ return NPY_NAT
+
+ ts = _decode_if_necessary(ts)
+
+ for c in ts:
+
+ # skip whitespace / commas
+ if c == ' ' or c == ',':
+ pass
+
+ # positive signs are ignored
+ elif c == '+':
+ pass
+
+ # neg
+ elif c == '-':
+
+ if neg or have_value or have_hhmmss:
+ raise ValueError("only leading negative signs are allowed")
+
+ neg = 1
+
+ # number (ascii codes)
+ elif ord(c) >= 48 and ord(c) <= 57:
+
+ if have_dot:
+
+ # we found a dot, but now it's just a fraction
+ if len(unit):
+ number.append(c)
+ have_dot = 0
+ else:
+ frac.append(c)
+
+ elif not len(unit):
+ number.append(c)
+
+ else:
+ r = timedelta_from_spec(number, frac, unit)
+ unit, number, frac = [], [c], []
+
+ result += timedelta_as_neg(r, neg)
+
+ # hh:mm:ss.
+ elif c == ':':
+
+ # we flip this off if we have a leading value
+ if have_value:
+ neg = 0
+
+ # we are in the pattern hh:mm:ss pattern
+ if len(number):
+ if current_unit is None:
+ current_unit = 'h'
+ m = 1000000000L * 3600
+ elif current_unit == 'h':
+ current_unit = 'm'
+ m = 1000000000L * 60
+ elif current_unit == 'm':
+ current_unit = 's'
+ m = 1000000000L
+ r = <int64_t>int(''.join(number)) * m
+ result += timedelta_as_neg(r, neg)
+ have_hhmmss = 1
+ else:
+ raise ValueError("expecting hh:mm:ss format, "
+ "received: {ts}".format(ts=ts))
+
+ unit, number = [], []
+
+ # after the decimal point
+ elif c == '.':
+
+ if len(number) and current_unit is not None:
+
+ # by definition we had something like hh:mm:ss,
+ # so we need to evaluate the final (seconds) field
+ # (current_unit is still 'm' at this point)
+ if current_unit != 'm':
+ raise ValueError("expected hh:mm:ss format before .")
+ m = 1000000000L
+ r = <int64_t>int(''.join(number)) * m
+ result += timedelta_as_neg(r, neg)
+ have_value = 1
+ unit, number, frac = [], [], []
+
+ have_dot = 1
+
+ # unit
+ else:
+ unit.append(c)
+ have_value = 1
+ have_dot = 0
+
+ # we had a dot, but we have a fractional
+ # value since we have a unit
+ if have_dot and len(unit):
+ r = timedelta_from_spec(number, frac, unit)
+ result += timedelta_as_neg(r, neg)
+
+ # we have a dot as part of a regular format
+ # e.g. hh:mm:ss.fffffff
+ elif have_dot:
+
+ if ((len(number) or len(frac)) and not len(unit)
+ and current_unit is None):
+ raise ValueError("no units specified")
+
+ if len(frac) > 0 and len(frac) <= 3:
+ m = 10**(3 - len(frac)) * 1000L * 1000L
+ elif len(frac) > 3 and len(frac) <= 6:
+ m = 10**(6 - len(frac)) * 1000L
+ else:
+ m = 10**(9 - len(frac))
+
+ r = <int64_t>int(''.join(frac)) * m
+ result += timedelta_as_neg(r, neg)
+
+ # we have a regular format
+ # we must have seconds at this point (hence the unit is still 'm')
+ elif current_unit is not None:
+ if current_unit != 'm':
+ raise ValueError("expected hh:mm:ss format")
+ m = 1000000000L
+ r = <int64_t>int(''.join(number)) * m
+ result += timedelta_as_neg(r, neg)
+
+ # we have a last abbreviation
+ elif len(unit):
+ if len(number):
+ r = timedelta_from_spec(number, frac, unit)
+ result += timedelta_as_neg(r, neg)
+ else:
+ raise ValueError("unit abbreviation w/o a number")
+
+ # treat as nanoseconds
+ # but only if we don't have anything else
+ else:
+ if have_value:
+ raise ValueError("have leftover units")
+ if len(number):
+ r = timedelta_from_spec(number, frac, 'ns')
+ result += timedelta_as_neg(r, neg)
+
+ return result
+
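+# Illustrative sketch (annotation, not part of the upstream pandas source):
+# all results are in nanoseconds, e.g.
+#   parse_timedelta_string('1 days 2 hours') -> 93600000000000
+#   parse_timedelta_string('01:02:03')       -> 3723000000000
+#   parse_timedelta_string('-1 days')        -> -86400000000000
+#   parse_timedelta_string('')               -> NPY_NAT
+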
+
+cdef inline int64_t timedelta_as_neg(int64_t value, bint neg):
+ """
+
+ Parameters
+ ----------
+ value : int64_t of the timedelta value
+ neg : bool, whether the value is negative
+ """
+ if neg:
+ return -value
+ return value
+
+
+cdef inline timedelta_from_spec(object number, object frac, object unit):
+ """
+
+ Parameters
+ ----------
+ number : a list of number digits
+ frac : a list of frac digits
+ unit : a list of unit characters
+ """
+ cdef object n
+
+ try:
+ unit = ''.join(unit)
+ if unit == 'M':
+ # To parse ISO 8601 string, 'M' should be treated as minute,
+ # not month
+ unit = 'm'
+ unit = parse_timedelta_unit(unit)
+ except KeyError:
+ raise ValueError("invalid abbreviation: {unit}".format(unit=unit))
+
+ n = ''.join(number) + '.' + ''.join(frac)
+ return cast_from_unit(float(n), unit)
+
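+# Illustrative sketch (annotation, not part of the upstream pandas source):
+# digits and fraction are joined and scaled by the parsed unit, e.g.
+#   timedelta_from_spec(['1'], ['5'], ['m']) == cast_from_unit(1.5, 'm')
+#                                            == 90000000000  # 1.5 min in ns
+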
+
+cpdef inline object parse_timedelta_unit(object unit):
+ """
+ Parameters
+ ----------
+ unit : a unit string
+ """
+ if unit is None:
+ return 'ns'
+ elif unit == 'M':
+ return unit
+ try:
+ return timedelta_abbrevs[unit.lower()]
+ except (KeyError, AttributeError):
+ raise ValueError("invalid unit abbreviation: {unit}"
+ .format(unit=unit))
+
+# ----------------------------------------------------------------------
+# Timedelta ops utilities
+
+cdef bint _validate_ops_compat(other):
+ # return True if we are compat with operating
+ if checknull_with_nat(other):
+ return True
+ elif PyDelta_Check(other) or is_timedelta64_object(other):
+ return True
+ elif is_string_object(other):
+ return True
+ elif hasattr(other, 'delta'):
+ return True
+ return False
+
+
+def _op_unary_method(func, name):
+ def f(self):
+ return Timedelta(func(self.value), unit='ns')
+ f.__name__ = name
+ return f
+
+
+def _binary_op_method_timedeltalike(op, name):
+ # define a binary operation that only works if the other argument is
+ # timedelta like or an array of timedeltalike
+ def f(self, other):
+ if hasattr(other, '_typ'):
+ # Series, DataFrame, ...
+ if other._typ == 'dateoffset' and hasattr(other, 'delta'):
+ # Tick offset
+ return op(self, other.delta)
+ return NotImplemented
+
+ elif other is NaT:
+ return NaT
+
+ elif is_timedelta64_object(other):
+ # convert to Timedelta below; avoid catching this in
+ # has-dtype check before then
+ pass
+
+ elif is_datetime64_object(other) or PyDateTime_CheckExact(other):
+ # the PyDateTime_CheckExact case is for a datetime object that
+ # is specifically *not* a Timestamp, as the Timestamp case will be
+ # handled after `_validate_ops_compat` returns False below
+ from pandas._libs.tslibs.timestamps import Timestamp
+ return op(self, Timestamp(other))
+ # We are implicitly requiring the canonical behavior to be
+ # defined by Timestamp methods.
+
+ elif hasattr(other, 'dtype'):
+ # nd-array like
+ if other.dtype.kind in ['m', 'M']:
+ return op(self.to_timedelta64(), other)
+ elif other.dtype.kind == 'O':
+ return np.array([op(self, x) for x in other])
+ else:
+ return NotImplemented
+
+ elif not _validate_ops_compat(other):
+ return NotImplemented
+
+ try:
+ other = Timedelta(other)
+ except ValueError:
+ # failed to parse as timedelta
+ return NotImplemented
+
+ if other is NaT:
+ # e.g. if original other was timedelta64('NaT')
+ return NaT
+ return Timedelta(op(self.value, other.value), unit='ns')
+
+ f.__name__ = name
+ return f
+
+
+# ----------------------------------------------------------------------
+# Timedelta Construction
+
+cdef inline int64_t parse_iso_format_string(object ts) except? -1:
+ """
+ Extracts and cleanses the appropriate values from a match object with
+ groups for each component of an ISO 8601 duration
+
+ Parameters
+ ----------
+ ts:
+ ISO 8601 Duration formatted string
+
+ Returns
+ -------
+ ns: int64_t
+ The matched ISO 8601 duration, expressed in nanoseconds
+
+ Raises
+ ------
+ ValueError
+ If ``ts`` cannot be parsed
+ """
+
+ cdef:
+ unicode c
+ int64_t result = 0, r
+ int p = 0
+ object dec_unit = 'ms', err_msg
+ bint have_dot = 0, have_value = 0, neg = 0
+ list number = [], unit = []
+
+ ts = _decode_if_necessary(ts)
+
+ err_msg = "Invalid ISO 8601 Duration format - {}".format(ts)
+
+ for c in ts:
+ # number (ascii codes)
+ if ord(c) >= 48 and ord(c) <= 57:
+
+ have_value = 1
+ if have_dot:
+ if p == 3 and dec_unit != 'ns':
+ unit.append(dec_unit)
+ if dec_unit == 'ms':
+ dec_unit = 'us'
+ elif dec_unit == 'us':
+ dec_unit = 'ns'
+ p = 0
+ p += 1
+
+ if not len(unit):
+ number.append(c)
+ else:
+ # if in days, pop trailing T
+ if unit[-1] == 'T':
+ unit.pop()
+ elif 'H' in unit or 'M' in unit:
+ if len(number) > 2:
+ raise ValueError(err_msg)
+ r = timedelta_from_spec(number, '0', unit)
+ result += timedelta_as_neg(r, neg)
+
+ neg = 0
+ unit, number = [], [c]
+ else:
+ if c == 'P':
+ pass # ignore leading character
+ elif c == '-':
+ if neg or have_value:
+ raise ValueError(err_msg)
+ else:
+ neg = 1
+ elif c in ['D', 'T', 'H', 'M']:
+ unit.append(c)
+ elif c == '.':
+ # append any seconds
+ if len(number):
+ r = timedelta_from_spec(number, '0', 'S')
+ result += timedelta_as_neg(r, neg)
+ unit, number = [], []
+ have_dot = 1
+ elif c == 'S':
+ if have_dot: # ms, us, or ns
+ if not len(number) or p > 3:
+ raise ValueError(err_msg)
+ # pad to 3 digits as required
+ pad = 3 - p
+ while pad > 0:
+ number.append('0')
+ pad -= 1
+
+ r = timedelta_from_spec(number, '0', dec_unit)
+ result += timedelta_as_neg(r, neg)
+ else: # seconds
+ if len(number) <= 2:
+ r = timedelta_from_spec(number, '0', 'S')
+ result += timedelta_as_neg(r, neg)
+ else:
+ raise ValueError(err_msg)
+ else:
+ raise ValueError(err_msg)
+
+ if not have_value:
+ # Received string only - never parsed any values
+ raise ValueError(err_msg)
+
+ return result
+
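+# Illustrative sketch (annotation, not part of the upstream pandas source):
+# this accepts ISO 8601 duration strings of the kind produced by
+# _Timedelta.isoformat below, e.g.
+#   parse_iso_format_string('P0DT0H50M3.010010012S') -> 3003010010012
+# i.e. 50 minutes, 3 seconds, 10 ms, 10 us and 12 ns expressed in nanoseconds.
+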
+
+cdef _to_py_int_float(v):
+ # Note: This used to be defined inside Timedelta.__new__
+ # but cython will not allow `cdef` functions to be defined dynamically.
+ if is_integer_object(v):
+ return int(v)
+ elif is_float_object(v):
+ return float(v)
+ raise TypeError("Invalid type {typ}. Must be int or "
+ "float.".format(typ=type(v)))
+
+
+# Similar to Timestamp/datetime, construction requirements mean that object
+# instantiation for timedeltas has to happen in Python. This class will serve
+# as a C extension type that shadows the Python class, where we do any heavy
+# lifting.
+cdef class _Timedelta(timedelta):
+ cdef readonly:
+ int64_t value # nanoseconds
+ object freq # frequency reference
+ bint is_populated # are my components populated
+ int64_t _d, _h, _m, _s, _ms, _us, _ns
+
+ # higher than np.ndarray and np.matrix
+ __array_priority__ = 100
+
+ def __hash__(_Timedelta self):
+ if self._has_ns():
+ return hash(self.value)
+ else:
+ return timedelta.__hash__(self)
+
+ def __richcmp__(_Timedelta self, object other, int op):
+ cdef:
+ _Timedelta ots
+ int ndim
+
+ if isinstance(other, _Timedelta):
+ ots = other
+ elif PyDelta_Check(other) or isinstance(other, Tick):
+ ots = Timedelta(other)
+ else:
+ ndim = getattr(other, "ndim", -1)
+
+ if ndim != -1:
+ if ndim == 0:
+ if is_timedelta64_object(other):
+ other = Timedelta(other)
+ else:
+ if op == Py_EQ:
+ return False
+ elif op == Py_NE:
+ return True
+ # only allow ==, != ops
+ raise TypeError('Cannot compare type {cls} with '
+ 'type {other}'
+ .format(cls=type(self).__name__,
+ other=type(other).__name__))
+ if util.is_array(other):
+ return PyObject_RichCompare(np.array([self]), other, op)
+ return PyObject_RichCompare(other, self, reverse_ops[op])
+ else:
+ if op == Py_EQ:
+ return False
+ elif op == Py_NE:
+ return True
+ raise TypeError('Cannot compare type {cls} with type {other}'
+ .format(cls=type(self).__name__,
+ other=type(other).__name__))
+
+ return cmp_scalar(self.value, ots.value, op)
+
+ cpdef bint _has_ns(self):
+ return self.value % 1000 != 0
+
+ def _ensure_components(_Timedelta self):
+ """
+ compute the components
+ """
+ if self.is_populated:
+ return
+
+ cdef:
+ pandas_timedeltastruct tds
+
+ td64_to_tdstruct(self.value, &tds)
+ self._d = tds.days
+ self._h = tds.hrs
+ self._m = tds.min
+ self._s = tds.sec
+ self._ms = tds.ms
+ self._us = tds.us
+ self._ns = tds.ns
+ self._seconds = tds.seconds
+ self._microseconds = tds.microseconds
+
+ self.is_populated = 1
+
+ cpdef timedelta to_pytimedelta(_Timedelta self):
+ """
+ return an actual datetime.timedelta object
+ note: we lose nanosecond resolution if any
+ """
+ return timedelta(microseconds=int(self.value) / 1000)
+
+ def to_timedelta64(self):
+ """ Returns a numpy.timedelta64 object with 'ns' precision """
+ return np.timedelta64(self.value, 'ns')
+
+ def total_seconds(self):
+ """
+ Total duration of timedelta in seconds (to ns precision)
+ """
+ return self.value / 1e9
+
+ def view(self, dtype):
+ """ array view compat """
+ return np.timedelta64(self.value).view(dtype)
+
+ @property
+ def components(self):
+ """ Return a Components NamedTuple-like """
+ self._ensure_components()
+ # return the named tuple
+ return Components(self._d, self._h, self._m, self._s,
+ self._ms, self._us, self._ns)
+
+ @property
+ def delta(self):
+ """
+ Return the timedelta in nanoseconds (ns), for internal compatibility.
+
+ Returns
+ -------
+ int
+ Timedelta in nanoseconds.
+
+ Examples
+ --------
+ >>> td = pd.Timedelta('1 days 42 ns')
+ >>> td.delta
+ 86400000000042
+
+ >>> td = pd.Timedelta('3 s')
+ >>> td.delta
+ 3000000000
+
+ >>> td = pd.Timedelta('3 ms 5 us')
+ >>> td.delta
+ 3005000
+
+ >>> td = pd.Timedelta(42, unit='ns')
+ >>> td.delta
+ 42
+ """
+ return self.value
+
+ @property
+ def asm8(self):
+ """
+ Return a numpy timedelta64 array scalar view.
+
+ Provides access to the array scalar view (i.e. a combination of the
+ value and the units) associated with the numpy.timedelta64().view(),
+ including a 64-bit integer representation of the timedelta in
+ nanoseconds (Python int compatible).
+
+ Returns
+ -------
+ numpy timedelta64 array scalar view
+ Array scalar view of the timedelta in nanoseconds.
+
+ Examples
+ --------
+ >>> td = pd.Timedelta('1 days 2 min 3 us 42 ns')
+ >>> td.asm8
+ numpy.timedelta64(86520000003042,'ns')
+
+ >>> td = pd.Timedelta('2 min 3 s')
+ >>> td.asm8
+ numpy.timedelta64(123000000000,'ns')
+
+ >>> td = pd.Timedelta('3 ms 5 us')
+ >>> td.asm8
+ numpy.timedelta64(3005000,'ns')
+
+ >>> td = pd.Timedelta(42, unit='ns')
+ >>> td.asm8
+ numpy.timedelta64(42,'ns')
+ """
+ return np.int64(self.value).view('m8[ns]')
+
+ @property
+ def resolution(self):
+ """
+ Return a string representing the lowest timedelta resolution.
+
+ Each timedelta has a defined resolution that represents the lowest OR
+ most granular level of precision. Each level of resolution is
+ represented by a short string as defined below:
+
+ Resolution: Return value
+
+ * Days: 'D'
+ * Hours: 'H'
+ * Minutes: 'T'
+ * Seconds: 'S'
+ * Milliseconds: 'L'
+ * Microseconds: 'U'
+ * Nanoseconds: 'N'
+
+ Returns
+ -------
+ str
+ Timedelta resolution.
+
+ Examples
+ --------
+ >>> td = pd.Timedelta('1 days 2 min 3 us 42 ns')
+ >>> td.resolution
+ 'N'
+
+ >>> td = pd.Timedelta('1 days 2 min 3 us')
+ >>> td.resolution
+ 'U'
+
+ >>> td = pd.Timedelta('2 min 3 s')
+ >>> td.resolution
+ 'S'
+
+ >>> td = pd.Timedelta(36, unit='us')
+ >>> td.resolution
+ 'U'
+ """
+
+ self._ensure_components()
+ if self._ns:
+ return "N"
+ elif self._us:
+ return "U"
+ elif self._ms:
+ return "L"
+ elif self._s:
+ return "S"
+ elif self._m:
+ return "T"
+ elif self._h:
+ return "H"
+ else:
+ return "D"
+
+ @property
+ def nanoseconds(self):
+ """
+ Return the number of nanoseconds (n), where 0 <= n < 1 microsecond.
+
+ Returns
+ -------
+ int
+ Number of nanoseconds.
+
+ See Also
+ --------
+ Timedelta.components : Return all attributes with assigned values
+ (i.e. days, hours, minutes, seconds, milliseconds, microseconds,
+ nanoseconds).
+
+ Examples
+ --------
+ **Using string input**
+
+ >>> td = pd.Timedelta('1 days 2 min 3 us 42 ns')
+ >>> td.nanoseconds
+ 42
+
+ **Using integer input**
+
+ >>> td = pd.Timedelta(42, unit='ns')
+ >>> td.nanoseconds
+ 42
+ """
+ self._ensure_components()
+ return self._ns
+
+ def _repr_base(self, format=None):
+ """
+
+ Parameters
+ ----------
+ format : None|all|sub_day|long
+
+ Returns
+ -------
+ converted : string of a Timedelta
+
+ """
+ cdef object sign, seconds_pretty, subs, fmt, comp_dict
+
+ self._ensure_components()
+
+ if self._d < 0:
+ sign = " +"
+ else:
+ sign = " "
+
+ if format == 'all':
+ fmt = ("{days} days{sign}{hours:02}:{minutes:02}:{seconds:02}."
+ "{milliseconds:03}{microseconds:03}{nanoseconds:03}")
+ else:
+ # if we have a partial day
+ subs = (self._h or self._m or self._s or
+ self._ms or self._us or self._ns)
+
+ # by default not showing nano
+ if self._ms or self._us or self._ns:
+ seconds_fmt = "{seconds:02}.{milliseconds:03}{microseconds:03}"
+ else:
+ seconds_fmt = "{seconds:02}"
+
+ if format == 'sub_day' and not self._d:
+ fmt = "{hours:02}:{minutes:02}:" + seconds_fmt
+ elif subs or format == 'long':
+ fmt = "{days} days{sign}{hours:02}:{minutes:02}:" + seconds_fmt
+ else:
+ fmt = "{days} days"
+
+ comp_dict = self.components._asdict()
+ comp_dict['sign'] = sign
+
+ return fmt.format(**comp_dict)
+
+ def __repr__(self):
+ return "Timedelta('{val}')".format(val=self._repr_base(format='long'))
+
+ def __str__(self):
+ return self._repr_base(format='long')
+
+ def __bool__(self):
+ return self.value != 0
+
+ def isoformat(self):
+ """
+ Format Timedelta as ISO 8601 Duration like
+ ``P[n]Y[n]M[n]DT[n]H[n]M[n]S``, where the ``[n]`` s are replaced by the
+ values. See https://en.wikipedia.org/wiki/ISO_8601#Durations
+
+ .. versionadded:: 0.20.0
+
+ Returns
+ -------
+ formatted : str
+
+ See Also
+ --------
+ Timestamp.isoformat
+
+ Notes
+ -----
+ The longest component is days, whose value may be larger than
+ 365.
+ Every component is always included, even if its value is 0.
+ Pandas uses nanosecond precision, so up to 9 decimal places may
+ be included in the seconds component.
+ Trailing 0's are removed from the seconds component after the decimal.
+ We do not 0 pad components, so it's `...T5H...`, not `...T05H...`
+
+ Examples
+ --------
+ >>> td = pd.Timedelta(days=6, minutes=50, seconds=3,
+ ... milliseconds=10, microseconds=10, nanoseconds=12)
+ >>> td.isoformat()
+ 'P6DT0H50M3.010010012S'
+ >>> pd.Timedelta(hours=1, seconds=10).isoformat()
+ 'P0DT1H0M10S'
+ >>> pd.Timedelta(days=500.5).isoformat()
+ 'P500DT12H0MS'
+ """
+ components = self.components
+ seconds = '{}.{:0>3}{:0>3}{:0>3}'.format(components.seconds,
+ components.milliseconds,
+ components.microseconds,
+ components.nanoseconds)
+ # Trim unnecessary 0s, 1.000000000 -> 1
+ seconds = seconds.rstrip('0').rstrip('.')
+ tpl = ('P{td.days}DT{td.hours}H{td.minutes}M{seconds}S'
+ .format(td=components, seconds=seconds))
+ return tpl
+
+
+# Python front end to C extension type _Timedelta
+# This serves as the box for timedelta64
+
+class Timedelta(_Timedelta):
+ """
+ Represents a duration, the difference between two dates or times.
+
+ Timedelta is the pandas equivalent of python's ``datetime.timedelta``
+ and is interchangeable with it in most cases.
+
+ Parameters
+ ----------
+ value : Timedelta, timedelta, np.timedelta64, string, or integer
+ unit : str, optional
+ Denote the unit of the input, if input is an integer. Default 'ns'.
+ Possible values:
+ {'Y', 'M', 'W', 'D', 'days', 'day', 'hours', 'hour', 'hr', 'h',
+ 'm', 'minute', 'min', 'minutes', 'T', 'S', 'seconds', 'sec', 'second',
+ 'ms', 'milliseconds', 'millisecond', 'milli', 'millis', 'L',
+ 'us', 'microseconds', 'microsecond', 'micro', 'micros', 'U',
+ 'ns', 'nanoseconds', 'nano', 'nanos', 'nanosecond', 'N'}
+ days, seconds, microseconds,
+ milliseconds, minutes, hours, weeks : numeric, optional
+ Values for construction in compat with datetime.timedelta.
+ np ints and floats will be coerced to python ints and floats.
+
+ Notes
+ -----
+ The ``.value`` attribute is always in ns.
+
+ """
+ def __new__(cls, object value=_no_input, unit=None, **kwargs):
+ cdef _Timedelta td_base
+
+ if value is _no_input:
+ if not len(kwargs):
+ raise ValueError("cannot construct a Timedelta without a "
+ "value/unit or descriptive keywords "
+ "(days,seconds....)")
+
+ kwargs = {key: _to_py_int_float(kwargs[key]) for key in kwargs}
+
+ nano = kwargs.pop('nanoseconds', 0)
+ try:
+ value = nano + convert_to_timedelta64(timedelta(**kwargs),
+ 'ns')
+ except TypeError as e:
+ raise ValueError("cannot construct a Timedelta from the "
+ "passed arguments, allowed keywords are "
+ "[weeks, days, hours, minutes, seconds, "
+ "milliseconds, microseconds, nanoseconds]")
+
+ if isinstance(value, Timedelta):
+ value = value.value
+ elif is_string_object(value):
+ if len(value) > 0 and value[0] == 'P':
+ value = parse_iso_format_string(value)
+ else:
+ value = parse_timedelta_string(value)
+ value = np.timedelta64(value)
+ elif PyDelta_Check(value):
+ value = convert_to_timedelta64(value, 'ns')
+ elif is_timedelta64_object(value):
+ if unit is not None:
+ value = value.astype('timedelta64[{0}]'.format(unit))
+ value = value.astype('timedelta64[ns]')
+ elif hasattr(value, 'delta'):
+ value = np.timedelta64(delta_to_nanoseconds(value.delta), 'ns')
+ elif is_integer_object(value) or is_float_object(value):
+ # unit=None is de-facto 'ns'
+ unit = parse_timedelta_unit(unit)
+ value = convert_to_timedelta64(value, unit)
+ elif checknull_with_nat(value):
+ return NaT
+ else:
+ raise ValueError(
+ "Value must be Timedelta, string, integer, "
+ "float, timedelta or convertible")
+
+ if is_timedelta64_object(value):
+ value = value.view('i8')
+
+ # nat
+ if value == NPY_NAT:
+ return NaT
+
+ # make timedelta happy
+ td_base = _Timedelta.__new__(cls, microseconds=int(value) / 1000)
+ td_base.value = value
+ td_base.is_populated = 0
+ return td_base
+
+ def __setstate__(self, state):
+ (value) = state
+ self.value = value
+
+ def __reduce__(self):
+ object_state = self.value,
+ return (Timedelta, object_state)
+
+ def _round(self, freq, rounder):
+ cdef:
+ int64_t result, unit
+
+ unit = to_offset(freq).nanos
+ result = unit * rounder(self.value / float(unit))
+ return Timedelta(result, unit='ns')
+
+ def round(self, freq):
+ """
+ Round the Timedelta to the specified resolution
+
+ Parameters
+ ----------
+ freq : a freq string indicating the rounding resolution
+
+ Returns
+ -------
+ a new Timedelta rounded to the given resolution of `freq`
+
+ Raises
+ ------
+ ValueError if the freq cannot be converted
+ """
+ return self._round(freq, np.round)
+
+ def floor(self, freq):
+ """
+ return a new Timedelta floored to this resolution
+
+ Parameters
+ ----------
+ freq : a freq string indicating the flooring resolution
+ """
+ return self._round(freq, np.floor)
+
+ def ceil(self, freq):
+ """
+ return a new Timedelta ceiled to this resolution
+
+ Parameters
+ ----------
+ freq : a freq string indicating the ceiling resolution
+ """
+ return self._round(freq, np.ceil)
+
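+ # Illustrative sketch (annotation, not part of the upstream pandas
+ # source): _round scales by to_offset(freq).nanos, so for a 92-second
+ # Timedelta:
+ #   Timedelta('1 min 32 s').round('min') -> Timedelta('0 days 00:02:00')
+ #   Timedelta('1 min 32 s').floor('min') -> Timedelta('0 days 00:01:00')
+ #   Timedelta('1 min 32 s').ceil('min')  -> Timedelta('0 days 00:02:00')
+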
+ # ----------------------------------------------------------------
+ # Arithmetic Methods
+ # TODO: Can some of these be defined in the cython class?
+
+ __inv__ = _op_unary_method(lambda x: -x, '__inv__')
+ __neg__ = _op_unary_method(lambda x: -x, '__neg__')
+ __pos__ = _op_unary_method(lambda x: x, '__pos__')
+ __abs__ = _op_unary_method(lambda x: abs(x), '__abs__')
+
+ __add__ = _binary_op_method_timedeltalike(lambda x, y: x + y, '__add__')
+ __radd__ = _binary_op_method_timedeltalike(lambda x, y: x + y, '__radd__')
+ __sub__ = _binary_op_method_timedeltalike(lambda x, y: x - y, '__sub__')
+ __rsub__ = _binary_op_method_timedeltalike(lambda x, y: y - x, '__rsub__')
+
+ def __mul__(self, other):
+ if hasattr(other, '_typ'):
+ # Series, DataFrame, ...
+ if other._typ == 'dateoffset' and hasattr(other, 'delta'):
+ # Tick offset; this op will raise TypeError
+ return other.delta * self
+ return NotImplemented
+
+ elif util.is_nan(other):
+ # i.e. np.nan, but also catch np.float64("NaN") which would
+ # otherwise get caught by the hasattr(other, "dtype") branch
+ # incorrectly return a np.timedelta64 object.
+ return NaT
+
+ elif hasattr(other, 'dtype'):
+ # ndarray-like
+ return other * self.to_timedelta64()
+
+ elif other is NaT:
+ raise TypeError('Cannot multiply Timedelta with NaT')
+
+ elif not (is_integer_object(other) or is_float_object(other)):
+ # only integers and floats allowed
+ return NotImplemented
+
+ return Timedelta(other * self.value, unit='ns')
+
+ __rmul__ = __mul__
+
+ def __truediv__(self, other):
+ if hasattr(other, '_typ'):
+ # Series, DataFrame, ...
+ if other._typ == 'dateoffset' and hasattr(other, 'delta'):
+ # Tick offset
+ return self / other.delta
+ return NotImplemented
+
+ elif is_timedelta64_object(other):
+ # convert to Timedelta below
+ pass
+
+ elif util.is_nan(other):
+ # i.e. np.nan, but also catch np.float64("NaN") which would
+ # otherwise get caught by the hasattr(other, "dtype") branch
+ # incorrectly return a np.timedelta64 object.
+ return NaT
+
+ elif hasattr(other, 'dtype'):
+ return self.to_timedelta64() / other
+
+ elif is_integer_object(other) or is_float_object(other):
+ # integers or floats
+ return Timedelta(self.value / other, unit='ns')
+
+ elif not _validate_ops_compat(other):
+ return NotImplemented
+
+ other = Timedelta(other)
+ if other is NaT:
+ return np.nan
+ return self.value / float(other.value)
+
+ def __rtruediv__(self, other):
+ if hasattr(other, '_typ'):
+ # Series, DataFrame, ...
+ if other._typ == 'dateoffset' and hasattr(other, 'delta'):
+ # Tick offset
+ return other.delta / self
+ return NotImplemented
+
+ elif is_timedelta64_object(other):
+ # convert to Timedelta below
+ pass
+
+ elif hasattr(other, 'dtype'):
+ return other / self.to_timedelta64()
+
+ elif not _validate_ops_compat(other):
+ return NotImplemented
+
+ other = Timedelta(other)
+ if other is NaT:
+ return NaT
+ return float(other.value) / self.value
+
+ if not PY3:
+ __div__ = __truediv__
+ __rdiv__ = __rtruediv__
+
+ def __floordiv__(self, other):
+ # numpy does not implement floordiv for timedelta64 dtype, so we cannot
+ # just defer
+ if hasattr(other, '_typ'):
+ # Series, DataFrame, ...
+ if other._typ == 'dateoffset' and hasattr(other, 'delta'):
+ # Tick offset
+ return self // other.delta
+ return NotImplemented
+
+ elif is_timedelta64_object(other):
+ # convert to Timedelta below
+ pass
+
+ elif hasattr(other, 'dtype'):
+ if other.dtype.kind == 'm':
+ # also timedelta-like
+ return _broadcast_floordiv_td64(self.value, other, _floordiv)
+ elif other.dtype.kind in ['i', 'u', 'f']:
+ if other.ndim == 0:
+ return Timedelta(self.value // other)
+ else:
+ return self.to_timedelta64() // other
+
+ raise TypeError('Invalid dtype {dtype} for '
+ '{op}'.format(dtype=other.dtype,
+ op='__floordiv__'))
+
+ elif is_integer_object(other) or is_float_object(other):
+ return Timedelta(self.value // other, unit='ns')
+
+ elif not _validate_ops_compat(other):
+ return NotImplemented
+
+ other = Timedelta(other)
+ if other is NaT:
+ return np.nan
+ return self.value // other.value
+
+ def __rfloordiv__(self, other):
+ # numpy does not implement floordiv for timedelta64 dtype, so we cannot
+ # just defer
+ if hasattr(other, '_typ'):
+ # Series, DataFrame, ...
+ if other._typ == 'dateoffset' and hasattr(other, 'delta'):
+ # Tick offset
+ return other.delta // self
+ return NotImplemented
+
+ elif is_timedelta64_object(other):
+ # convert to Timedelta below
+ pass
+
+ elif hasattr(other, 'dtype'):
+ if other.dtype.kind == 'm':
+ # also timedelta-like
+ return _broadcast_floordiv_td64(self.value, other, _rfloordiv)
+ elif other.dtype.kind == 'i':
+ # Backwards compatibility
+ # GH-19761
+ msg = textwrap.dedent("""\
+ Floor division between integer array and Timedelta is
+ deprecated. Use 'array // timedelta.value' instead.
+ If you want to obtain epochs from an array of timestamps,
+ you can rather use
+ '(array - pd.Timestamp("1970-01-01")) // pd.Timedelta("1s")'.
+ """)
+ warnings.warn(msg, FutureWarning)
+ return other // self.value
+ raise TypeError('Invalid dtype {dtype} for '
+ '{op}'.format(dtype=other.dtype,
+ op='__rfloordiv__'))
+
+ elif is_float_object(other) and util.is_nan(other):
+ # i.e. np.nan
+ return NotImplemented
+
+ elif not _validate_ops_compat(other):
+ return NotImplemented
+
+ other = Timedelta(other)
+ if other is NaT:
+ return np.nan
+ return other.value // self.value
+
+ def __mod__(self, other):
+ # Naive implementation, room for optimization
+ return self.__divmod__(other)[1]
+
+ def __rmod__(self, other):
+ # Naive implementation, room for optimization
+ if hasattr(other, 'dtype') and other.dtype.kind == 'i':
+ # TODO: Remove this check with backwards-compat shim
+ # for integer / Timedelta is removed.
+ raise TypeError("Invalid type {dtype} for "
+ "{op}".format(dtype=other.dtype, op='__mod__'))
+ return self.__rdivmod__(other)[1]
+
+ def __divmod__(self, other):
+ # Naive implementation, room for optimization
+ div = self // other
+ return div, self - div * other
+
+ def __rdivmod__(self, other):
+ # Naive implementation, room for optimization
+ if hasattr(other, 'dtype') and other.dtype.kind == 'i':
+ # TODO: Remove this check with backwards-compat shim
+ # for integer / Timedelta is removed.
+ raise TypeError("Invalid type {dtype} for "
+ "{op}".format(dtype=other.dtype, op='__mod__'))
+ div = other // self
+ return div, other - div * self
+
+
+cdef _floordiv(int64_t value, right):
+ return value // right
+
+
+cdef _rfloordiv(int64_t value, right):
+ # analogous to referencing operator.div, but there is no operator.rfloordiv
+ return right // value
+
+
+cdef _broadcast_floordiv_td64(int64_t value, object other,
+ object (*operation)(int64_t value,
+ object right)):
+ """Boilerplate code shared by Timedelta.__floordiv__ and
+ Timedelta.__rfloordiv__ because np.timedelta64 does not implement these.
+
+ Parameters
+ ----------
+ value : int64_t; `self.value` from a Timedelta object
+ other : object
+ operation : function, either _floordiv or _rfloordiv
+
+ Returns
+ -------
+ result : varies based on `other`
+ """
+ # assumes other.dtype.kind == 'm', i.e. other is timedelta-like
+ cdef:
+ int ndim = getattr(other, 'ndim', -1)
+
+ # We need to watch out for np.timedelta64('NaT').
+ mask = other.view('i8') == NPY_NAT
+
+ if ndim == 0:
+ if mask:
+ return np.nan
+
+ return operation(value, other.astype('m8[ns]').astype('i8'))
+
+ else:
+ res = operation(value, other.astype('m8[ns]').astype('i8'))
+
+ if mask.any():
+ res = res.astype('f8')
+ res[mask] = np.nan
+ return res
+
+
+# resolution in ns
+Timedelta.min = Timedelta(np.iinfo(np.int64).min + 1)
+Timedelta.max = Timedelta(np.iinfo(np.int64).max)
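+
+# Annotation (not part of the upstream pandas source): because .value is a
+# signed 64-bit count of nanoseconds, Timedelta.min/Timedelta.max above bound
+# the representable range to roughly +/- 292 years around zero.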
diff --git a/contrib/python/pandas/py2/pandas/_libs/tslibs/timestamps.pxd b/contrib/python/pandas/py2/pandas/_libs/tslibs/timestamps.pxd
new file mode 100644
index 00000000000..b7282e02ff1
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/_libs/tslibs/timestamps.pxd
@@ -0,0 +1,8 @@
+# -*- coding: utf-8 -*-
+
+from numpy cimport int64_t
+from pandas._libs.tslibs.np_datetime cimport npy_datetimestruct
+
+cdef object create_timestamp_from_ts(int64_t value,
+ npy_datetimestruct dts,
+ object tz, object freq)
diff --git a/contrib/python/pandas/py2/pandas/_libs/tslibs/timestamps.pyx b/contrib/python/pandas/py2/pandas/_libs/tslibs/timestamps.pyx
new file mode 100644
index 00000000000..fe0564cb62c
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/_libs/tslibs/timestamps.pyx
@@ -0,0 +1,1349 @@
+# -*- coding: utf-8 -*-
+import warnings
+
+from cpython cimport (PyObject_RichCompareBool, PyObject_RichCompare,
+ Py_GT, Py_GE, Py_EQ, Py_NE, Py_LT, Py_LE)
+
+import numpy as np
+cimport numpy as cnp
+from numpy cimport int64_t, int32_t, int8_t
+cnp.import_array()
+
+from datetime import time as datetime_time, timedelta
+from cpython.datetime cimport (datetime,
+ PyDateTime_Check, PyDelta_Check, PyTZInfo_Check,
+ PyDateTime_IMPORT)
+PyDateTime_IMPORT
+
+from pandas._libs.tslibs.util cimport (
+ is_datetime64_object, is_timedelta64_object, is_integer_object,
+ is_string_object, is_array, is_offset_object)
+
+cimport pandas._libs.tslibs.ccalendar as ccalendar
+from pandas._libs.tslibs.ccalendar import DAY_SECONDS
+from pandas._libs.tslibs.conversion import (
+ tz_localize_to_utc, normalize_i8_timestamps)
+from pandas._libs.tslibs.conversion cimport (
+ tz_convert_single, _TSObject, convert_to_tsobject,
+ convert_datetime_to_tsobject)
+from pandas._libs.tslibs.fields import get_start_end_field, get_date_name_field
+from pandas._libs.tslibs.nattype cimport NPY_NAT, c_NaT as NaT
+from pandas._libs.tslibs.np_datetime import OutOfBoundsDatetime
+from pandas._libs.tslibs.np_datetime cimport (
+ reverse_ops, cmp_scalar, check_dts_bounds, npy_datetimestruct,
+ dt64_to_dtstruct)
+from pandas._libs.tslibs.offsets cimport to_offset
+from pandas._libs.tslibs.timedeltas import Timedelta
+from pandas._libs.tslibs.timedeltas cimport delta_to_nanoseconds
+from pandas._libs.tslibs.timezones cimport (
+ get_timezone, is_utc, maybe_get_tz, treat_tz_as_pytz, tz_compare)
+from pandas._libs.tslibs.timezones import UTC
+
+# ----------------------------------------------------------------------
+# Constants
+_zero_time = datetime_time(0, 0)
+_no_input = object()
+
+
+# ----------------------------------------------------------------------
+
+def maybe_integer_op_deprecated(obj):
+ # GH#22535 add/sub of integers and int-arrays is deprecated
+ if obj.freq is not None:
+ warnings.warn("Addition/subtraction of integers and integer-arrays "
+ "to {cls} is deprecated, will be removed in a future "
+ "version. Instead of adding/subtracting `n`, use "
+ "`n * self.freq`"
+ .format(cls=type(obj).__name__),
+ FutureWarning)
+
+
+cdef inline object create_timestamp_from_ts(int64_t value,
+ npy_datetimestruct dts,
+ object tz, object freq):
+ """ convenience routine to construct a Timestamp from its parts """
+ cdef _Timestamp ts_base
+ ts_base = _Timestamp.__new__(Timestamp, dts.year, dts.month,
+ dts.day, dts.hour, dts.min,
+ dts.sec, dts.us, tz)
+ ts_base.value = value
+ ts_base.freq = freq
+ ts_base.nanosecond = dts.ps / 1000
+
+ return ts_base
+
+
+class RoundTo(object):
+ """
+ enumeration defining the available rounding modes
+
+ Attributes
+ ----------
+ MINUS_INFTY
+ round towards -∞, or floor [2]_
+ PLUS_INFTY
+ round towards +∞, or ceil [3]_
+ NEAREST_HALF_EVEN
+ round to nearest, tie-break half to even [6]_
+ NEAREST_HALF_MINUS_INFTY
+ round to nearest, tie-break half to -∞ [5]_
+ NEAREST_HALF_PLUS_INFTY
+ round to nearest, tie-break half to +∞ [4]_
+
+
+ References
+ ----------
+ .. [1] "Rounding - Wikipedia"
+ https://en.wikipedia.org/wiki/Rounding
+ .. [2] "Rounding down"
+ https://en.wikipedia.org/wiki/Rounding#Rounding_down
+ .. [3] "Rounding up"
+ https://en.wikipedia.org/wiki/Rounding#Rounding_up
+ .. [4] "Round half up"
+ https://en.wikipedia.org/wiki/Rounding#Round_half_up
+ .. [5] "Round half down"
+ https://en.wikipedia.org/wiki/Rounding#Round_half_down
+ .. [6] "Round half to even"
+ https://en.wikipedia.org/wiki/Rounding#Round_half_to_even
+ """
+ @property
+ def MINUS_INFTY(self):
+ return 0
+
+ @property
+ def PLUS_INFTY(self):
+ return 1
+
+ @property
+ def NEAREST_HALF_EVEN(self):
+ return 2
+
+ @property
+ def NEAREST_HALF_PLUS_INFTY(self):
+ return 3
+
+ @property
+ def NEAREST_HALF_MINUS_INFTY(self):
+ return 4
+
+
+cdef inline _npdivmod(x1, x2):
+ """implement divmod for numpy < 1.13"""
+ return np.floor_divide(x1, x2), np.remainder(x1, x2)
+
+
+try:
+ from numpy import divmod as npdivmod
+except ImportError:
+ # numpy < 1.13
+ npdivmod = _npdivmod
+
+
+cdef inline _floor_int64(values, unit):
+ return values - np.remainder(values, unit)
+
+cdef inline _ceil_int64(values, unit):
+ return values + np.remainder(-values, unit)
+
+cdef inline _rounddown_int64(values, unit):
+ return _ceil_int64(values - unit//2, unit)
+
+cdef inline _roundup_int64(values, unit):
+ return _floor_int64(values + unit//2, unit)
+
+
+def round_nsint64(values, mode, freq):
+ """
+ Applies rounding mode at given frequency
+
+ Parameters
+ ----------
+ values : :obj:`ndarray`
+ mode : instance of `RoundTo` enumeration
+ freq : str, obj
+
+ Returns
+ -------
+ :obj:`ndarray`
+ """
+
+ unit = to_offset(freq).nanos
+
+ if mode == RoundTo.MINUS_INFTY:
+ return _floor_int64(values, unit)
+ elif mode == RoundTo.PLUS_INFTY:
+ return _ceil_int64(values, unit)
+ elif mode == RoundTo.NEAREST_HALF_MINUS_INFTY:
+ return _rounddown_int64(values, unit)
+ elif mode == RoundTo.NEAREST_HALF_PLUS_INFTY:
+ return _roundup_int64(values, unit)
+ elif mode == RoundTo.NEAREST_HALF_EVEN:
+ # for odd unit there is no need of a tie break
+ if unit % 2:
+ return _rounddown_int64(values, unit)
+ quotient, remainder = npdivmod(values, unit)
+ mask = np.logical_or(
+ remainder > (unit // 2),
+ np.logical_and(remainder == (unit // 2), quotient % 2)
+ )
+ quotient[mask] += 1
+ return quotient * unit
+
+ # if/elif above should catch all rounding modes defined in enum 'RoundTo':
+ # if flow of control arrives here, it is a bug
+ raise ValueError("round_nsint64 called with an unrecognized "
+ "rounding mode")
+
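+# Illustrative sketch (annotation, not part of the upstream pandas source):
+# with freq='us' the unit is 1000 ns, so under NEAREST_HALF_EVEN both values
+# of np.array([1500, 2500]) land on 2000: 1500 has an odd quotient and a half
+# remainder (rounded up), 2500 has an even quotient (rounded down).
+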
+
+# This is a PITA. Because we inherit from datetime, which has very specific
+# construction requirements, we need to do object instantiation in python
+# (see the Timestamp class below). This will serve as a C extension type that
+# shadows the python class, where we do any heavy lifting.
+cdef class _Timestamp(datetime):
+
+ cdef readonly:
+ int64_t value, nanosecond
+ object freq # frequency reference
+ list _date_attributes
+
+ def __hash__(_Timestamp self):
+ if self.nanosecond:
+ return hash(self.value)
+ return datetime.__hash__(self)
+
+ def __richcmp__(_Timestamp self, object other, int op):
+ cdef:
+ _Timestamp ots
+ int ndim
+
+ if isinstance(other, _Timestamp):
+ ots = other
+ elif other is NaT:
+ return op == Py_NE
+ elif PyDateTime_Check(other):
+ if self.nanosecond == 0:
+ val = self.to_pydatetime()
+ return PyObject_RichCompareBool(val, other, op)
+
+ try:
+ ots = Timestamp(other)
+ except ValueError:
+ return self._compare_outside_nanorange(other, op)
+ else:
+ ndim = getattr(other, "ndim", -1)
+
+ if ndim != -1:
+ if ndim == 0:
+ if is_datetime64_object(other):
+ other = Timestamp(other)
+ else:
+ if op == Py_EQ:
+ return False
+ elif op == Py_NE:
+ return True
+
+ # only allow ==, != ops
+ raise TypeError('Cannot compare type %r with type %r' %
+ (type(self).__name__,
+ type(other).__name__))
+ elif is_array(other):
+ # avoid recursion error GH#15183
+ return PyObject_RichCompare(np.array([self]), other, op)
+ return PyObject_RichCompare(other, self, reverse_ops[op])
+ else:
+ if op == Py_EQ:
+ return False
+ elif op == Py_NE:
+ return True
+ raise TypeError('Cannot compare type %r with type %r' %
+ (type(self).__name__, type(other).__name__))
+
+ self._assert_tzawareness_compat(other)
+ return cmp_scalar(self.value, ots.value, op)
+
+ def __reduce_ex__(self, protocol):
+ # python 3.6 compat
+ # http://bugs.python.org/issue28730
+ # now __reduce_ex__ is defined and higher priority than __reduce__
+ return self.__reduce__()
+
+ def __repr__(self):
+ stamp = self._repr_base
+ zone = None
+
+ try:
+ stamp += self.strftime('%z')
+ if self.tzinfo:
+ zone = get_timezone(self.tzinfo)
+ except ValueError:
+ year2000 = self.replace(year=2000)
+ stamp += year2000.strftime('%z')
+ if self.tzinfo:
+ zone = get_timezone(self.tzinfo)
+
+ try:
+ stamp += zone.strftime(' %%Z')
+ except:
+ pass
+
+ tz = ", tz='{0}'".format(zone) if zone is not None else ""
+ freq = "" if self.freq is None else ", freq='{0}'".format(self.freqstr)
+
+ return "Timestamp('{stamp}'{tz}{freq})".format(stamp=stamp,
+ tz=tz, freq=freq)
+
+ cdef bint _compare_outside_nanorange(_Timestamp self, datetime other,
+ int op) except -1:
+ cdef:
+ datetime dtval = self.to_pydatetime()
+
+ self._assert_tzawareness_compat(other)
+
+ if self.nanosecond == 0:
+ return PyObject_RichCompareBool(dtval, other, op)
+ else:
+ if op == Py_EQ:
+ return False
+ elif op == Py_NE:
+ return True
+ elif op == Py_LT:
+ return dtval < other
+ elif op == Py_LE:
+ return dtval < other
+ elif op == Py_GT:
+ return dtval >= other
+ elif op == Py_GE:
+ return dtval >= other
+
+ cdef _assert_tzawareness_compat(_Timestamp self, datetime other):
+ if self.tzinfo is None:
+ if other.tzinfo is not None:
+ raise TypeError('Cannot compare tz-naive and tz-aware '
+ 'timestamps')
+ elif other.tzinfo is None:
+ raise TypeError('Cannot compare tz-naive and tz-aware timestamps')
+
+ cpdef datetime to_pydatetime(_Timestamp self, bint warn=True):
+ """
+ Convert a Timestamp object to a native Python datetime object.
+
+ If warn=True, issue a warning if nanoseconds is nonzero.
+ """
+ if self.nanosecond != 0 and warn:
+ warnings.warn("Discarding nonzero nanoseconds in conversion",
+ UserWarning, stacklevel=2)
+
+ return datetime(self.year, self.month, self.day,
+ self.hour, self.minute, self.second,
+ self.microsecond, self.tzinfo)
+
+ cpdef to_datetime64(self):
+ """ Returns a numpy.datetime64 object with 'ns' precision """
+ return np.datetime64(self.value, 'ns')
+
+ def __add__(self, other):
+ cdef:
+ int64_t other_int, nanos
+
+ if is_timedelta64_object(other):
+ other_int = other.astype('timedelta64[ns]').view('i8')
+ return Timestamp(self.value + other_int,
+ tz=self.tzinfo, freq=self.freq)
+
+ elif is_integer_object(other):
+ maybe_integer_op_deprecated(self)
+
+ if self is NaT:
+ # to be compat with Period
+ return NaT
+ elif self.freq is None:
+ raise ValueError("Cannot add integral value to Timestamp "
+ "without freq.")
+ return Timestamp((self.freq * other).apply(self), freq=self.freq)
+
+ elif PyDelta_Check(other) or hasattr(other, 'delta'):
+ # delta --> offsets.Tick
+ nanos = delta_to_nanoseconds(other)
+ result = Timestamp(self.value + nanos,
+ tz=self.tzinfo, freq=self.freq)
+ if getattr(other, 'normalize', False):
+ # DateOffset
+ result = result.normalize()
+ return result
+
+ # index/series like
+ elif hasattr(other, '_typ'):
+ return NotImplemented
+
+ result = datetime.__add__(self, other)
+ if PyDateTime_Check(result):
+ result = Timestamp(result)
+ result.nanosecond = self.nanosecond
+ return result
+
+ def __sub__(self, other):
+ if (is_timedelta64_object(other) or is_integer_object(other) or
+ PyDelta_Check(other) or hasattr(other, 'delta')):
+ # `delta` attribute is for offsets.Tick or offsets.Week obj
+ neg_other = -other
+ return self + neg_other
+
+ typ = getattr(other, '_typ', None)
+
+ # a Timestamp-DatetimeIndex -> yields a negative TimedeltaIndex
+ if typ in ('datetimeindex', 'datetimearray'):
+ # timezone comparison is performed in DatetimeIndex._sub_datelike
+ return -other.__sub__(self)
+
+ # a Timestamp-TimedeltaIndex -> yields a negative TimedeltaIndex
+ elif typ in ('timedeltaindex', 'timedeltaarray'):
+ return (-other).__add__(self)
+
+ elif other is NaT:
+ return NaT
+
+ # coerce if necessary if we are a Timestamp-like
+ if (PyDateTime_Check(self)
+ and (PyDateTime_Check(other) or is_datetime64_object(other))):
+ self = Timestamp(self)
+ other = Timestamp(other)
+
+ # validate tz's
+ if not tz_compare(self.tzinfo, other.tzinfo):
+ raise TypeError("Timestamp subtraction must have the "
+ "same timezones or no timezones")
+
+ # scalar Timestamp/datetime - Timestamp/datetime -> yields a
+ # Timedelta
+ try:
+ return Timedelta(self.value - other.value)
+ except (OverflowError, OutOfBoundsDatetime):
+ pass
+
+ # scalar Timestamp/datetime - Timedelta -> yields a Timestamp (with
+ # same timezone if specified)
+ return datetime.__sub__(self, other)
+
+ cdef int64_t _maybe_convert_value_to_local(self):
+ """Convert UTC i8 value to local i8 value if tz exists"""
+ cdef:
+ int64_t val
+ val = self.value
+ if self.tz is not None and not is_utc(self.tz):
+ val = tz_convert_single(self.value, UTC, self.tz)
+ return val
+
+ cpdef bint _get_start_end_field(self, str field):
+ cdef:
+ int64_t val
+ dict kwds
+ int8_t out[1]
+ int month_kw
+
+ freq = self.freq
+ if freq:
+ kwds = freq.kwds
+ month_kw = kwds.get('startingMonth', kwds.get('month', 12))
+ freqstr = self.freqstr
+ else:
+ month_kw = 12
+ freqstr = None
+
+ val = self._maybe_convert_value_to_local()
+ out = get_start_end_field(np.array([val], dtype=np.int64),
+ field, freqstr, month_kw)
+ return out[0]
+
+ cpdef _get_date_name_field(self, object field, object locale):
+ cdef:
+ int64_t val
+ object[:] out
+
+ val = self._maybe_convert_value_to_local()
+ out = get_date_name_field(np.array([val], dtype=np.int64),
+ field, locale=locale)
+ return out[0]
+
+ @property
+ def _repr_base(self):
+ return '{date} {time}'.format(date=self._date_repr,
+ time=self._time_repr)
+
+ @property
+ def _date_repr(self):
+ # Ideal here would be self.strftime("%Y-%m-%d"), but
+ # the datetime strftime() methods require year >= 1900
+ return '%d-%.2d-%.2d' % (self.year, self.month, self.day)
+
+ @property
+ def _time_repr(self):
+ result = '%.2d:%.2d:%.2d' % (self.hour, self.minute, self.second)
+
+ if self.nanosecond != 0:
+ result += '.%.9d' % (self.nanosecond + 1000 * self.microsecond)
+ elif self.microsecond != 0:
+ result += '.%.6d' % self.microsecond
+
+ return result
+
+ @property
+ def _short_repr(self):
+ # format a Timestamp with only _date_repr if possible
+ # otherwise _repr_base
+ if (self.hour == 0 and
+ self.minute == 0 and
+ self.second == 0 and
+ self.microsecond == 0 and
+ self.nanosecond == 0):
+ return self._date_repr
+ return self._repr_base
+
+ @property
+ def asm8(self):
+ return np.datetime64(self.value, 'ns')
+
+ @property
+ def resolution(self):
+ """
+ Return resolution describing the smallest difference between two
+ times that can be represented by a Timestamp object.
+ """
+ # GH#21336, GH#21365
+ return Timedelta(nanoseconds=1)
+
+ def timestamp(self):
+ """Return POSIX timestamp as float."""
+ # py27 compat, see GH#17329
+ return round(self.value / 1e9, 6)
+
+
+# ----------------------------------------------------------------------
+
+# Python front end to C extension type _Timestamp
+# This serves as the box for datetime64
+
+
+class Timestamp(_Timestamp):
+ """Pandas replacement for datetime.datetime
+
+ Timestamp is the pandas equivalent of python's Datetime
+ and is interchangeable with it in most cases. It's the type used
+ for the entries that make up a DatetimeIndex, and other timeseries
+ oriented data structures in pandas.
+
+ Parameters
+ ----------
+ ts_input : datetime-like, str, int, float
+ Value to be converted to Timestamp
+ freq : str, DateOffset
+ Offset which Timestamp will have
+ tz : str, pytz.timezone, dateutil.tz.tzfile or None
+ Time zone for time which Timestamp will have.
+ unit : str
+ Unit used for conversion if ts_input is of type int or float. The
+ valid values are 'D', 'h', 'm', 's', 'ms', 'us', and 'ns'. For
+ example, 's' means seconds and 'ms' means milliseconds.
+ year, month, day : int
+ .. versionadded:: 0.19.0
+ hour, minute, second, microsecond : int, optional, default 0
+ .. versionadded:: 0.19.0
+ nanosecond : int, optional, default 0
+ .. versionadded:: 0.23.0
+ tzinfo : datetime.tzinfo, optional, default None
+ .. versionadded:: 0.19.0
+
+ Notes
+ -----
+ There are essentially three calling conventions for the constructor. The
+ primary form accepts four parameters. They can be passed by position or
+ keyword.
+
+ The other two forms mimic the parameters from ``datetime.datetime``. They
+ can be passed by either position or keyword, but not both mixed together.
+
+ Examples
+ --------
+ Using the primary calling convention:
+
+ This converts a datetime-like string
+ >>> pd.Timestamp('2017-01-01T12')
+ Timestamp('2017-01-01 12:00:00')
+
+ This converts a float representing a Unix epoch in units of seconds
+ >>> pd.Timestamp(1513393355.5, unit='s')
+ Timestamp('2017-12-16 03:02:35.500000')
+
+ This converts an int representing a Unix-epoch in units of seconds
+ and for a particular timezone
+ >>> pd.Timestamp(1513393355, unit='s', tz='US/Pacific')
+ Timestamp('2017-12-15 19:02:35-0800', tz='US/Pacific')
+
+ Using the other two forms that mimic the API for ``datetime.datetime``:
+
+ >>> pd.Timestamp(2017, 1, 1, 12)
+ Timestamp('2017-01-01 12:00:00')
+
+ >>> pd.Timestamp(year=2017, month=1, day=1, hour=12)
+ Timestamp('2017-01-01 12:00:00')
+ """
+
+ @classmethod
+ def fromordinal(cls, ordinal, freq=None, tz=None):
+ """
+ Timestamp.fromordinal(ordinal, freq=None, tz=None)
+
+ Passed an ordinal, translate and convert to a Timestamp.
+ note: by definition there cannot be any tz info on the ordinal itself
+
+ Parameters
+ ----------
+ ordinal : int
+ date corresponding to a proleptic Gregorian ordinal
+ freq : str, DateOffset
+ Offset which Timestamp will have
+ tz : str, pytz.timezone, dateutil.tz.tzfile or None
+ Time zone for time which Timestamp will have.
+ """
+ return cls(datetime.fromordinal(ordinal),
+ freq=freq, tz=tz)
+
+ @classmethod
+ def now(cls, tz=None):
+ """
+ Timestamp.now(tz=None)
+
+ Returns new Timestamp object representing current time local to
+ tz.
+
+ Parameters
+ ----------
+ tz : str or timezone object, default None
+ Timezone to localize to
+ """
+ if is_string_object(tz):
+ tz = maybe_get_tz(tz)
+ return cls(datetime.now(tz))
+
+ @classmethod
+ def today(cls, tz=None):
+ """
+ Timestamp.today(cls, tz=None)
+
+ Return the current time in the local timezone. This differs
+ from datetime.today() in that it can be localized to a
+ passed timezone.
+
+ Parameters
+ ----------
+ tz : str or timezone object, default None
+ Timezone to localize to
+ """
+ return cls.now(tz)
+
+ @classmethod
+ def utcnow(cls):
+ """
+ Timestamp.utcnow()
+
+ Return a new Timestamp representing UTC day and time.
+ """
+ return cls.now(UTC)
+
+ @classmethod
+ def utcfromtimestamp(cls, ts):
+ """
+ Timestamp.utcfromtimestamp(ts)
+
+ Construct a naive UTC datetime from a POSIX timestamp.
+ """
+ return cls(datetime.utcfromtimestamp(ts))
+
+ @classmethod
+ def fromtimestamp(cls, ts):
+ """
+ Timestamp.fromtimestamp(ts)
+
+ timestamp[, tz] -> tz's local time from POSIX timestamp.
+ """
+ return cls(datetime.fromtimestamp(ts))
+
+ @classmethod
+ def combine(cls, date, time):
+ """
+ Timestamp.combine(date, time)
+
+ date, time -> datetime with same date and time fields
+ """
+ return cls(datetime.combine(date, time))
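+
+ # Illustrative usage (an added sketch, not part of the upstream source):
+ # >>> from datetime import date, time
+ # >>> Timestamp.combine(date(2020, 3, 14), time(15, 30))
+ # Timestamp('2020-03-14 15:30:00')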
+
+ def __new__(cls, object ts_input=_no_input,
+ object freq=None, tz=None, unit=None,
+ year=None, month=None, day=None,
+ hour=None, minute=None, second=None, microsecond=None,
+ nanosecond=None, tzinfo=None):
+ # The parameter list folds together legacy parameter names (the first
+ # four) and positional and keyword parameter names from pydatetime.
+ #
+ # There are three calling forms:
+ #
+ # - In the legacy form, the first parameter, ts_input, is required
+ # and may be datetime-like, str, int, or float. The second
+ # parameter, offset, is optional and may be str or DateOffset.
+ #
+ # - ints in the first, second, and third arguments indicate
+ # pydatetime positional arguments. Only the first 8 arguments
+ # (standing in for year, month, day, hour, minute, second,
+ # microsecond, tzinfo) may be non-None. As a shortcut, we just
+ # check that the second argument is an int.
+ #
+ # - Nones for the first four (legacy) arguments indicate pydatetime
+ # keyword arguments. year, month, and day are required. As a
+ # shortcut, we just check that the first argument was not passed.
+ #
+ # Mixing pydatetime positional and keyword arguments is forbidden!
+
+ cdef _TSObject ts
+
+ _date_attributes = [year, month, day, hour, minute, second,
+ microsecond, nanosecond]
+
+ if tzinfo is not None:
+ if not PyTZInfo_Check(tzinfo):
+ # tzinfo must be a datetime.tzinfo object, GH#17690
+ raise TypeError('tzinfo must be a datetime.tzinfo object, '
+ 'not %s' % type(tzinfo))
+ elif tz is not None:
+ raise ValueError('Can provide at most one of tz, tzinfo')
+
+ # User passed tzinfo instead of tz; avoid silently ignoring
+ tz, tzinfo = tzinfo, None
+
+ if is_string_object(ts_input):
+ # User passed a date string to parse.
+ # Check that the user didn't also pass a date attribute kwarg.
+ if any(arg is not None for arg in _date_attributes):
+ raise ValueError('Cannot pass a date attribute keyword '
+ 'argument when passing a date string')
+
+ elif ts_input is _no_input:
+ # User passed keyword arguments.
+ ts_input = datetime(year, month, day, hour or 0,
+ minute or 0, second or 0,
+ microsecond or 0)
+ elif is_integer_object(freq):
+ # User passed positional arguments:
+ # Timestamp(year, month, day[, hour[, minute[, second[,
+ # microsecond[, nanosecond[, tzinfo]]]]]])
+ ts_input = datetime(ts_input, freq, tz, unit or 0,
+ year or 0, month or 0, day or 0)
+ nanosecond = hour
+ tz = minute
+ freq = None
+
+ if getattr(ts_input, 'tzinfo', None) is not None and tz is not None:
+ warnings.warn("Passing a datetime or Timestamp with tzinfo and the"
+ " tz parameter will raise in the future. Use"
+ " tz_convert instead.", FutureWarning)
+
+ ts = convert_to_tsobject(ts_input, tz, unit, 0, 0, nanosecond or 0)
+
+ if ts.value == NPY_NAT:
+ return NaT
+
+ if freq is None:
+ # GH 22311: Try to extract the frequency of a given Timestamp input
+ freq = getattr(ts_input, 'freq', None)
+ elif not is_offset_object(freq):
+ freq = to_offset(freq)
+
+ return create_timestamp_from_ts(ts.value, ts.dts, ts.tzinfo, freq)
+
+ def _round(self, freq, mode, ambiguous='raise', nonexistent='raise'):
+ if self.tz is not None:
+ value = self.tz_localize(None).value
+ else:
+ value = self.value
+
+ value = np.array([value], dtype=np.int64)
+
+ # Will only ever contain 1 element for timestamp
+ r = round_nsint64(value, mode, freq)[0]
+ result = Timestamp(r, unit='ns')
+ if self.tz is not None:
+ result = result.tz_localize(
+ self.tz, ambiguous=ambiguous, nonexistent=nonexistent
+ )
+ return result
+
+ def round(self, freq, ambiguous='raise', nonexistent='raise'):
+ """
+ Round the Timestamp to the specified resolution.
+
+ Parameters
+ ----------
+ freq : a freq string indicating the rounding resolution
+ ambiguous : bool, 'NaT', default 'raise'
+ - bool contains flags to determine if time is dst or not (note
+ that this flag is only applicable for ambiguous fall dst dates)
+ - 'NaT' will return NaT for an ambiguous time
+ - 'raise' will raise an AmbiguousTimeError for an ambiguous time
+
+ .. versionadded:: 0.24.0
+ nonexistent : 'shift_forward', 'shift_backward', 'NaT', timedelta,
+ default 'raise'
+ A nonexistent time does not exist in a particular timezone
+ where clocks moved forward due to DST.
+
+ - 'shift_forward' will shift the nonexistent time forward to the
+ closest existing time
+ - 'shift_backward' will shift the nonexistent time backward to the
+ closest existing time
+ - 'NaT' will return NaT where there are nonexistent times
+ - timedelta objects will shift nonexistent times by the timedelta
+ - 'raise' will raise a NonExistentTimeError if there are
+ nonexistent times
+
+ .. versionadded:: 0.24.0
+
+ Returns
+ -------
+ a new Timestamp rounded to the given resolution of `freq`
+
+ Raises
+ ------
+ ValueError if the freq cannot be converted
+ """
+ return self._round(
+ freq, RoundTo.NEAREST_HALF_EVEN, ambiguous, nonexistent
+ )
+
+ def floor(self, freq, ambiguous='raise', nonexistent='raise'):
+ """
+ Return a new Timestamp floored to this resolution.
+
+ Parameters
+ ----------
+ freq : a freq string indicating the flooring resolution
+ ambiguous : bool, 'NaT', default 'raise'
+ - bool contains flags to determine if time is dst or not (note
+ that this flag is only applicable for ambiguous fall dst dates)
+ - 'NaT' will return NaT for an ambiguous time
+ - 'raise' will raise an AmbiguousTimeError for an ambiguous time
+
+ .. versionadded:: 0.24.0
+ nonexistent : 'shift_forward', 'shift_backward', 'NaT', timedelta,
+ default 'raise'
+ A nonexistent time does not exist in a particular timezone
+ where clocks moved forward due to DST.
+
+ - 'shift_forward' will shift the nonexistent time forward to the
+ closest existing time
+ - 'shift_backward' will shift the nonexistent time backward to the
+ closest existing time
+ - 'NaT' will return NaT where there are nonexistent times
+ - timedelta objects will shift nonexistent times by the timedelta
+ - 'raise' will raise a NonExistentTimeError if there are
+ nonexistent times
+
+ .. versionadded:: 0.24.0
+
+ Raises
+ ------
+ ValueError if the freq cannot be converted
+ """
+ return self._round(freq, RoundTo.MINUS_INFTY, ambiguous, nonexistent)
+
+ def ceil(self, freq, ambiguous='raise', nonexistent='raise'):
+ """
+ Return a new Timestamp ceiled to this resolution.
+
+ Parameters
+ ----------
+ freq : a freq string indicating the ceiling resolution
+ ambiguous : bool, 'NaT', default 'raise'
+ - bool contains flags to determine if time is dst or not (note
+ that this flag is only applicable for ambiguous fall dst dates)
+ - 'NaT' will return NaT for an ambiguous time
+ - 'raise' will raise an AmbiguousTimeError for an ambiguous time
+
+ .. versionadded:: 0.24.0
+ nonexistent : 'shift_forward', 'shift_backward', 'NaT', timedelta,
+ default 'raise'
+ A nonexistent time does not exist in a particular timezone
+ where clocks moved forward due to DST.
+
+ - 'shift_forward' will shift the nonexistent time forward to the
+ closest existing time
+ - 'shift_backward' will shift the nonexistent time backward to the
+ closest existing time
+ - 'NaT' will return NaT where there are nonexistent times
+ - timedelta objects will shift nonexistent times by the timedelta
+ - 'raise' will raise a NonExistentTimeError if there are
+ nonexistent times
+
+ .. versionadded:: 0.24.0
+
+ Raises
+ ------
+ ValueError if the freq cannot be converted
+ """
+ return self._round(freq, RoundTo.PLUS_INFTY, ambiguous, nonexistent)
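+
+ # Illustrative usage of round/floor/ceil (an added sketch, not part of
+ # the upstream source), assuming the hourly frequency string 'H':
+ # >>> ts = Timestamp('2018-01-01 11:59:00')
+ # >>> ts.floor('H'), ts.round('H'), ts.ceil('H')
+ # (Timestamp('2018-01-01 11:00:00'),
+ #  Timestamp('2018-01-01 12:00:00'),
+ #  Timestamp('2018-01-01 12:00:00'))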
+
+ @property
+ def tz(self):
+ """
+ Alias for tzinfo
+ """
+ return self.tzinfo
+
+ @tz.setter
+ def tz(self, value):
+ # GH 3746: Prevent localizing or converting the index by setting tz
+ raise AttributeError("Cannot directly set timezone. Use tz_localize() "
+ "or tz_convert() as appropriate")
+
+ def __setstate__(self, state):
+ self.value = state[0]
+ self.freq = state[1]
+ self.tzinfo = state[2]
+
+ def __reduce__(self):
+ object_state = self.value, self.freq, self.tzinfo
+ return (Timestamp, object_state)
+
+ def to_period(self, freq=None):
+ """
+ Return a Period of which this Timestamp is an observation.
+ """
+ from pandas import Period
+
+ if self.tz is not None:
+ # GH#21333
+ warnings.warn("Converting to Period representation will "
+ "drop timezone information.",
+ UserWarning)
+
+ if freq is None:
+ freq = self.freq
+
+ return Period(self, freq=freq)
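+
+ # Illustrative usage (an added sketch, not part of the upstream source):
+ # >>> Timestamp('2020-03-14').to_period('M')
+ # Period('2020-03', 'M')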
+
+ @property
+ def dayofweek(self):
+ return self.weekday()
+
+ def day_name(self, locale=None):
+ """
+ Return the day name of the Timestamp with specified locale.
+
+ Parameters
+ ----------
+ locale : string, default None (English locale)
+ locale determining the language in which to return the day name
+
+ Returns
+ -------
+ day_name : string
+
+ .. versionadded:: 0.23.0
+ """
+ return self._get_date_name_field('day_name', locale)
+
+ def month_name(self, locale=None):
+ """
+ Return the month name of the Timestamp with specified locale.
+
+ Parameters
+ ----------
+ locale : string, default None (English locale)
+ locale determining the language in which to return the month name
+
+ Returns
+ -------
+ month_name : string
+
+ .. versionadded:: 0.23.0
+ """
+ return self._get_date_name_field('month_name', locale)
+
+ @property
+ def weekday_name(self):
+ """
+ .. deprecated:: 0.23.0
+ Use ``Timestamp.day_name()`` instead
+ """
+ warnings.warn("`weekday_name` is deprecated and will be removed in a "
+ "future version. Use `day_name` instead",
+ FutureWarning)
+ return self.day_name()
+
+ @property
+ def dayofyear(self):
+ return ccalendar.get_day_of_year(self.year, self.month, self.day)
+
+ @property
+ def week(self):
+ return ccalendar.get_week_of_year(self.year, self.month, self.day)
+
+ weekofyear = week
+
+ @property
+ def quarter(self):
+ return ((self.month - 1) // 3) + 1
+
+ @property
+ def days_in_month(self):
+ return ccalendar.get_days_in_month(self.year, self.month)
+
+ daysinmonth = days_in_month
+
+ @property
+ def freqstr(self):
+ return getattr(self.freq, 'freqstr', self.freq)
+
+ @property
+ def is_month_start(self):
+ if self.freq is None:
+ # fast-path for non-business frequencies
+ return self.day == 1
+ return self._get_start_end_field('is_month_start')
+
+ @property
+ def is_month_end(self):
+ if self.freq is None:
+ # fast-path for non-business frequencies
+ return self.day == self.days_in_month
+ return self._get_start_end_field('is_month_end')
+
+ @property
+ def is_quarter_start(self):
+ if self.freq is None:
+ # fast-path for non-business frequencies
+ return self.day == 1 and self.month % 3 == 1
+ return self._get_start_end_field('is_quarter_start')
+
+ @property
+ def is_quarter_end(self):
+ if self.freq is None:
+ # fast-path for non-business frequencies
+ return (self.month % 3) == 0 and self.day == self.days_in_month
+ return self._get_start_end_field('is_quarter_end')
+
+ @property
+ def is_year_start(self):
+ if self.freq is None:
+ # fast-path for non-business frequencies
+ return self.day == self.month == 1
+ return self._get_start_end_field('is_year_start')
+
+ @property
+ def is_year_end(self):
+ if self.freq is None:
+ # fast-path for non-business frequencies
+ return self.month == 12 and self.day == 31
+ return self._get_start_end_field('is_year_end')
+
+ @property
+ def is_leap_year(self):
+ return bool(ccalendar.is_leapyear(self.year))
+
+ def tz_localize(self, tz, ambiguous='raise', nonexistent='raise',
+ errors=None):
+ """
+ Convert naive Timestamp to local time zone, or remove
+ timezone from tz-aware Timestamp.
+
+ Parameters
+ ----------
+ tz : str, pytz.timezone, dateutil.tz.tzfile or None
+ Time zone for time which Timestamp will be converted to.
+ None will remove timezone holding local time.
+
+ ambiguous : bool, 'NaT', default 'raise'
+ When clocks moved backward due to DST, ambiguous times may arise.
+ For example in Central European Time (UTC+01), when going from
+ 03:00 DST to 02:00 non-DST, 02:30:00 local time occurs both at
+ 00:30:00 UTC and at 01:30:00 UTC. In such a situation, the
+ `ambiguous` parameter dictates how ambiguous times should be
+ handled.
+
+ - bool contains flags to determine if time is dst or not (note
+ that this flag is only applicable for ambiguous fall dst dates)
+ - 'NaT' will return NaT for an ambiguous time
+ - 'raise' will raise an AmbiguousTimeError for an ambiguous time
+
+ nonexistent : 'shift_forward', 'shift_backward', 'NaT', timedelta,
+ default 'raise'
+ A nonexistent time does not exist in a particular timezone
+ where clocks moved forward due to DST.
+
+ - 'shift_forward' will shift the nonexistent time forward to the
+ closest existing time
+ - 'shift_backward' will shift the nonexistent time backward to the
+ closest existing time
+ - 'NaT' will return NaT where there are nonexistent times
+ - timedelta objects will shift nonexistent times by the timedelta
+ - 'raise' will raise a NonExistentTimeError if there are
+ nonexistent times
+
+ .. versionadded:: 0.24.0
+
+ errors : 'raise', 'coerce', default None
+ - 'raise' will raise a NonExistentTimeError if a timestamp is not
+ valid in the specified timezone (e.g. due to a transition from
+ or to DST time). Use ``nonexistent='raise'`` instead.
+ - 'coerce' will return NaT if the timestamp can not be converted
+ into the specified timezone. Use ``nonexistent='NaT'`` instead.
+
+ .. deprecated:: 0.24.0
+
+ Returns
+ -------
+ localized : Timestamp
+
+ Raises
+ ------
+ TypeError
+ If the Timestamp is tz-aware and tz is not None.
+ """
+ if ambiguous == 'infer':
+ raise ValueError('Cannot infer offset with only one time.')
+
+ if errors is not None:
+ warnings.warn("The errors argument is deprecated and will be "
+ "removed in a future release. Use "
+ "nonexistent='NaT' or nonexistent='raise' "
+ "instead.", FutureWarning)
+ if errors == 'coerce':
+ nonexistent = 'NaT'
+ elif errors == 'raise':
+ nonexistent = 'raise'
+ else:
+ raise ValueError("The errors argument must be either 'coerce' "
+ "or 'raise'.")
+
+ nonexistent_options = ('raise', 'NaT', 'shift_forward',
+ 'shift_backward')
+ if nonexistent not in nonexistent_options and not isinstance(
+ nonexistent, timedelta):
+ raise ValueError("The nonexistent argument must be one of 'raise',"
+ " 'NaT', 'shift_forward', 'shift_backward' or"
+ " a timedelta object")
+
+ if self.tzinfo is None:
+ # tz naive, localize
+ tz = maybe_get_tz(tz)
+ if not is_string_object(ambiguous):
+ ambiguous = [ambiguous]
+ value = tz_localize_to_utc(np.array([self.value], dtype='i8'), tz,
+ ambiguous=ambiguous,
+ nonexistent=nonexistent)[0]
+ return Timestamp(value, tz=tz)
+ else:
+ if tz is None:
+ # reset tz
+ value = tz_convert_single(self.value, UTC, self.tz)
+ return Timestamp(value, tz=None)
+ else:
+ raise TypeError('Cannot localize tz-aware Timestamp, use '
+ 'tz_convert for conversions')
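+
+ # Illustrative usage (an added sketch, not part of the upstream source):
+ # >>> ts = Timestamp('2020-03-14 15:30:00').tz_localize('US/Pacific')
+ # >>> ts
+ # Timestamp('2020-03-14 15:30:00-0700', tz='US/Pacific')
+ # >>> ts.tz_localize(None)    # drop the timezone, keep the wall time
+ # Timestamp('2020-03-14 15:30:00')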
+
+ def tz_convert(self, tz):
+ """
+ Convert tz-aware Timestamp to another time zone.
+
+ Parameters
+ ----------
+ tz : str, pytz.timezone, dateutil.tz.tzfile or None
+ Time zone for time which Timestamp will be converted to.
+ None will remove timezone holding UTC time.
+
+ Returns
+ -------
+ converted : Timestamp
+
+ Raises
+ ------
+ TypeError
+ If Timestamp is tz-naive.
+ """
+ if self.tzinfo is None:
+ # tz naive, use tz_localize
+ raise TypeError('Cannot convert tz-naive Timestamp, use '
+ 'tz_localize to localize')
+ else:
+ # Same UTC timestamp, different time zone
+ return Timestamp(self.value, tz=tz)
+
+ astimezone = tz_convert
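+
+ # Illustrative usage (an added sketch, not part of the upstream source);
+ # conversion keeps the same instant and only changes the zone:
+ # >>> Timestamp('2020-01-01 12:00:00', tz='UTC').tz_convert('US/Pacific')
+ # Timestamp('2020-01-01 04:00:00-0800', tz='US/Pacific')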
+
+ def replace(self, year=None, month=None, day=None,
+ hour=None, minute=None, second=None, microsecond=None,
+ nanosecond=None, tzinfo=object, fold=0):
+ """
+ Implements datetime.replace; additionally handles nanoseconds.
+
+ Parameters
+ ----------
+ year : int, optional
+ month : int, optional
+ day : int, optional
+ hour : int, optional
+ minute : int, optional
+ second : int, optional
+ microsecond : int, optional
+ nanosecond : int, optional
+ tzinfo : tz-convertible, optional
+ fold : int, optional, default is 0
+ Added in Python 3.6; not implemented here.
+
+ Returns
+ -------
+ Timestamp with fields replaced
+ """
+
+ cdef:
+ npy_datetimestruct dts
+ int64_t value, value_tz, offset
+ object _tzinfo, result, k, v
+ datetime ts_input
+
+ # set to naive if needed
+ _tzinfo = self.tzinfo
+ value = self.value
+ if _tzinfo is not None:
+ value_tz = tz_convert_single(value, _tzinfo, UTC)
+ value += value - value_tz
+
+ # setup components
+ dt64_to_dtstruct(value, &dts)
+ dts.ps = self.nanosecond * 1000
+
+ # replace
+ def validate(k, v):
+ """ validate integers """
+ if not is_integer_object(v):
+ raise ValueError("value must be an integer, received "
+ "{v} for {k}".format(v=type(v), k=k))
+ return v
+
+ if year is not None:
+ dts.year = validate('year', year)
+ if month is not None:
+ dts.month = validate('month', month)
+ if day is not None:
+ dts.day = validate('day', day)
+ if hour is not None:
+ dts.hour = validate('hour', hour)
+ if minute is not None:
+ dts.min = validate('minute', minute)
+ if second is not None:
+ dts.sec = validate('second', second)
+ if microsecond is not None:
+ dts.us = validate('microsecond', microsecond)
+ if nanosecond is not None:
+ dts.ps = validate('nanosecond', nanosecond) * 1000
+ if tzinfo is not object:
+ _tzinfo = tzinfo
+
+ # reconstruct & check bounds
+ if _tzinfo is not None and treat_tz_as_pytz(_tzinfo):
+ # replacing across a DST boundary may induce a new tzinfo object
+ # see GH#18319
+ ts_input = _tzinfo.localize(datetime(dts.year, dts.month, dts.day,
+ dts.hour, dts.min, dts.sec,
+ dts.us))
+ _tzinfo = ts_input.tzinfo
+ else:
+ ts_input = datetime(dts.year, dts.month, dts.day,
+ dts.hour, dts.min, dts.sec, dts.us,
+ tzinfo=_tzinfo)
+
+ ts = convert_datetime_to_tsobject(ts_input, _tzinfo)
+ value = ts.value + (dts.ps // 1000)
+ if value != NPY_NAT:
+ check_dts_bounds(&dts)
+
+ return create_timestamp_from_ts(value, dts, _tzinfo, self.freq)
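+
+ # Illustrative usage (an added sketch, not part of the upstream source);
+ # unlike datetime.replace, the nanosecond field can also be set:
+ # >>> Timestamp('2020-03-14 15:30:00').replace(hour=0, minute=0)
+ # Timestamp('2020-03-14 00:00:00')
+ # >>> Timestamp('2020-03-14 15:30:00').replace(nanosecond=5)
+ # Timestamp('2020-03-14 15:30:00.000000005')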
+
+ def isoformat(self, sep='T'):
+ base = super(_Timestamp, self).isoformat(sep=sep)
+ if self.nanosecond == 0:
+ return base
+
+ if self.tzinfo is not None:
+ base1, base2 = base[:-6], base[-6:]
+ else:
+ base1, base2 = base, ""
+
+ if self.microsecond != 0:
+ base1 += "%.3d" % self.nanosecond
+ else:
+ base1 += ".%.9d" % self.nanosecond
+
+ return base1 + base2
+
+ def _has_time_component(self):
+ """
+ Return True if the Timestamp has a time component
+ in addition to the date part.
+ """
+ return (self.time() != _zero_time
+ or self.tzinfo is not None
+ or self.nanosecond != 0)
+
+ def to_julian_date(self):
+ """
+ Convert Timestamp to a Julian date.
+ Julian date 0 is noon on January 1, 4713 BC.
+ """
+ year = self.year
+ month = self.month
+ day = self.day
+ if month <= 2:
+ year -= 1
+ month += 12
+ return (day +
+ np.fix((153 * month - 457) / 5) +
+ 365 * year +
+ np.floor(year / 4) -
+ np.floor(year / 100) +
+ np.floor(year / 400) +
+ 1721118.5 +
+ (self.hour +
+ self.minute / 60.0 +
+ self.second / 3600.0 +
+ self.microsecond / 3600.0 / 1e+6 +
+ self.nanosecond / 3600.0 / 1e+9
+ ) / 24.0)
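+
+ # Worked check of the formula above (an added note, not upstream code):
+ # the J2000.0 epoch, noon UTC on 2000-01-01, is Julian date 2451545.0:
+ # >>> Timestamp('2000-01-01 12:00:00').to_julian_date()
+ # 2451545.0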
+
+ def normalize(self):
+ """
+ Normalize Timestamp to midnight, preserving
+ tz information.
+ """
+ if self.tz is None or is_utc(self.tz):
+ DAY_NS = DAY_SECONDS * 1000000000
+ normalized_value = self.value - (self.value % DAY_NS)
+ return Timestamp(normalized_value).tz_localize(self.tz)
+ normalized_value = normalize_i8_timestamps(
+ np.array([self.value], dtype='i8'), tz=self.tz)[0]
+ return Timestamp(normalized_value).tz_localize(self.tz)
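+
+ # Illustrative usage (an added sketch, not part of the upstream source):
+ # >>> Timestamp('2014-08-01 10:00:00', tz='US/Eastern').normalize()
+ # Timestamp('2014-08-01 00:00:00-0400', tz='US/Eastern')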
+
+ def __radd__(self, other):
+ # __radd__ on cython extension types like _Timestamp is not used, so
+ # define it here instead
+ return self + other
+
+
+# Add the min and max fields at the class level
+cdef int64_t _NS_UPPER_BOUND = np.iinfo(np.int64).max
+# the smallest value we could actually represent is
+# INT64_MIN + 1 == -9223372036854775807
+# but to allow overflow free conversion with a microsecond resolution
+# use the smallest value with a 0 nanosecond unit (0s in last 3 digits)
+cdef int64_t _NS_LOWER_BOUND = -9223372036854775000
+
+# Resolution is in nanoseconds
+Timestamp.min = Timestamp(_NS_LOWER_BOUND)
+Timestamp.max = Timestamp(_NS_UPPER_BOUND)
diff --git a/contrib/python/pandas/py2/pandas/_libs/tslibs/timezones.pxd b/contrib/python/pandas/py2/pandas/_libs/tslibs/timezones.pxd
new file mode 100644
index 00000000000..50c4a41f97a
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/_libs/tslibs/timezones.pxd
@@ -0,0 +1,16 @@
+# -*- coding: utf-8 -*-
+
+cpdef bint is_utc(object tz)
+cdef bint is_tzlocal(object tz)
+
+cdef bint treat_tz_as_pytz(object tz)
+cdef bint treat_tz_as_dateutil(object tz)
+
+cpdef bint tz_compare(object start, object end)
+cpdef object get_timezone(object tz)
+cpdef object maybe_get_tz(object tz)
+
+cdef get_utcoffset(tzinfo, obj)
+cdef bint is_fixed_offset(object tz)
+
+cdef object get_dst_info(object tz)
diff --git a/contrib/python/pandas/py2/pandas/_libs/tslibs/timezones.pyx b/contrib/python/pandas/py2/pandas/_libs/tslibs/timezones.pyx
new file mode 100644
index 00000000000..43a35d77dd1
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/_libs/tslibs/timezones.pyx
@@ -0,0 +1,359 @@
+# -*- coding: utf-8 -*-
+
+# dateutil compat
+from dateutil.tz import (
+ tzutc as _dateutil_tzutc,
+ tzlocal as _dateutil_tzlocal,
+ tzfile as _dateutil_tzfile)
+
+from dateutil.tz import gettz as dateutil_gettz
+
+from pytz.tzinfo import BaseTzInfo as _pytz_BaseTzInfo
+import pytz
+UTC = pytz.utc
+
+
+import numpy as np
+cimport numpy as cnp
+from numpy cimport int64_t
+cnp.import_array()
+
+# ----------------------------------------------------------------------
+from pandas._libs.tslibs.util cimport (
+ is_string_object, is_integer_object, get_nat)
+
+cdef int64_t NPY_NAT = get_nat()
+
+# ----------------------------------------------------------------------
+
+cpdef inline bint is_utc(object tz):
+ return tz is UTC or isinstance(tz, _dateutil_tzutc)
+
+
+cdef inline bint is_tzlocal(object tz):
+ return isinstance(tz, _dateutil_tzlocal)
+
+
+cdef inline bint treat_tz_as_pytz(object tz):
+ return (hasattr(tz, '_utc_transition_times') and
+ hasattr(tz, '_transition_info'))
+
+
+cdef inline bint treat_tz_as_dateutil(object tz):
+ return hasattr(tz, '_trans_list') and hasattr(tz, '_trans_idx')
+
+
+cpdef inline object get_timezone(object tz):
+ """
+ We need to do several things here:
+ 1) Distinguish between pytz and dateutil timezones
+ 2) Not be over-specific (e.g. US/Eastern with/without DST is same *zone*
+ but a different tz object)
+ 3) Provide something to serialize when we're storing a datetime object
+ in pytables.
+
+ We return a string prefaced with dateutil if it's a dateutil tz, else just
+ the tz name. It needs to be a string so that we can serialize it with
+ UJSON/pytables. maybe_get_tz (below) is the inverse of this process.
+ """
+ if is_utc(tz):
+ return tz
+ else:
+ if treat_tz_as_dateutil(tz):
+ if '.tar.gz' in tz._filename:
+ raise ValueError(
+ 'Bad tz filename. Dateutil on python 3 on windows has a '
+ 'bug which causes tzfile._filename to be the same for all '
+ 'timezone files. Please construct dateutil timezones '
+ 'implicitly by passing a string like "dateutil/Europe'
+ '/London" when you construct your pandas objects instead '
+ 'of passing a timezone object. See '
+ 'https://github.com/pandas-dev/pandas/pull/7362')
+ return 'dateutil/' + tz._filename
+ else:
+ # tz is a pytz timezone or unknown.
+ try:
+ zone = tz.zone
+ if zone is None:
+ return tz
+ return zone
+ except AttributeError:
+ return tz
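+
+# Illustrative behaviour (an added sketch, not part of the vendored source):
+# >>> get_timezone(pytz.timezone('US/Eastern'))
+# 'US/Eastern'
+# >>> get_timezone(UTC) is UTC    # UTC objects are returned unchanged
+# True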
+
+
+cpdef inline object maybe_get_tz(object tz):
+ """
+ (Maybe) Construct a timezone object from a string. If tz is a string, use
+ it to construct a timezone object. Otherwise, just return tz.
+ """
+ if is_string_object(tz):
+ if tz == 'tzlocal()':
+ tz = _dateutil_tzlocal()
+ elif tz.startswith('dateutil/'):
+ zone = tz[9:]
+ tz = dateutil_gettz(zone)
+ # On Python 3 on Windows, the filename is not always set correctly.
+ if isinstance(tz, _dateutil_tzfile) and '.tar.gz' in tz._filename:
+ tz._filename = zone
+ else:
+ tz = pytz.timezone(tz)
+ elif is_integer_object(tz):
+ tz = pytz.FixedOffset(tz / 60)
+ return tz
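+
+# Illustrative behaviour (an added sketch, not part of the vendored source):
+# maybe_get_tz('US/Eastern') returns pytz.timezone('US/Eastern'),
+# maybe_get_tz('dateutil/Europe/London') builds a dateutil tzfile, and
+# maybe_get_tz(3600) treats the integer as seconds east of UTC and
+# returns pytz.FixedOffset(60).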
+
+
+def _p_tz_cache_key(tz):
+ """ Python interface for cache function to facilitate testing."""
+ return tz_cache_key(tz)
+
+
+# Timezone data caches, key is the pytz string or dateutil file name.
+dst_cache = {}
+
+
+cdef inline object tz_cache_key(object tz):
+ """
+ Return the key in the cache for the timezone info object or None
+ if unknown.
+
+ The key is currently the tz string for pytz timezones, the filename for
+ dateutil timezones.
+
+ Notes
+ -----
+ This cannot just be the hash of a timezone object. Unfortunately, the
+ hashes of two dateutil tz objects which represent the same timezone are
+ not equal (even though the tz objects will compare equal and represent
+ the same tz file). Also, pytz objects are not always hashable so we use
+ str(tz) instead.
+ """
+ if isinstance(tz, _pytz_BaseTzInfo):
+ return tz.zone
+ elif isinstance(tz, _dateutil_tzfile):
+ if '.tar.gz' in tz._filename:
+ raise ValueError('Bad tz filename. Dateutil on python 3 on '
+ 'windows has a bug which causes tzfile._filename '
+ 'to be the same for all timezone files. Please '
+ 'construct dateutil timezones implicitly by '
+ 'passing a string like "dateutil/Europe/London" '
+ 'when you construct your pandas objects instead '
+ 'of passing a timezone object. See '
+ 'https://github.com/pandas-dev/pandas/pull/7362')
+ return 'dateutil' + tz._filename
+ else:
+ return None
+
+
+# ----------------------------------------------------------------------
+# UTC Offsets
+
+
+cdef get_utcoffset(tzinfo, obj):
+ try:
+ return tzinfo._utcoffset
+ except AttributeError:
+ return tzinfo.utcoffset(obj)
+
+
+cdef inline bint is_fixed_offset(object tz):
+ if treat_tz_as_dateutil(tz):
+ if len(tz._trans_idx) == 0 and len(tz._trans_list) == 0:
+ return 1
+ else:
+ return 0
+ elif treat_tz_as_pytz(tz):
+ if (len(tz._transition_info) == 0
+ and len(tz._utc_transition_times) == 0):
+ return 1
+ else:
+ return 0
+ return 1
+
+
+cdef object get_utc_trans_times_from_dateutil_tz(object tz):
+ """
+ Transition times in dateutil timezones are stored in local non-dst
+ time. This code converts them to UTC. It's the reverse of the code
+ in dateutil.tz.tzfile.__init__.
+ """
+ new_trans = list(tz._trans_list)
+ last_std_offset = 0
+ for i, (trans, tti) in enumerate(zip(tz._trans_list, tz._trans_idx)):
+ if not tti.isdst:
+ last_std_offset = tti.offset
+ new_trans[i] = trans - last_std_offset
+ return new_trans
+
+
+cdef int64_t[:] unbox_utcoffsets(object transinfo):
+ cdef:
+ Py_ssize_t i, sz
+ int64_t[:] arr
+
+ sz = len(transinfo)
+ arr = np.empty(sz, dtype='i8')
+
+ for i in range(sz):
+ arr[i] = int(transinfo[i][0].total_seconds()) * 1000000000
+
+ return arr
+
+
+# ----------------------------------------------------------------------
+# Daylight Savings
+
+
+cdef object get_dst_info(object tz):
+ """
+ return a tuple of :
+ (UTC times of DST transitions,
+ UTC offsets in microseconds corresponding to DST transitions,
+ string of type of transitions)
+
+ """
+ cache_key = tz_cache_key(tz)
+ if cache_key is None:
+ # e.g. pytz.FixedOffset, matplotlib.dates._UTC,
+ # psycopg2.tz.FixedOffsetTimezone
+ num = int(get_utcoffset(tz, None).total_seconds()) * 1000000000
+ return (np.array([NPY_NAT + 1], dtype=np.int64),
+ np.array([num], dtype=np.int64),
+ None)
+
+ if cache_key not in dst_cache:
+ if treat_tz_as_pytz(tz):
+ trans = np.array(tz._utc_transition_times, dtype='M8[ns]')
+ trans = trans.view('i8')
+ try:
+ if tz._utc_transition_times[0].year == 1:
+ trans[0] = NPY_NAT + 1
+ except Exception:
+ pass
+ deltas = unbox_utcoffsets(tz._transition_info)
+ typ = 'pytz'
+
+ elif treat_tz_as_dateutil(tz):
+ if len(tz._trans_list):
+ # get utc trans times
+ trans_list = get_utc_trans_times_from_dateutil_tz(tz)
+ trans = np.hstack([
+ np.array([0], dtype='M8[s]'), # place holder for 1st item
+ np.array(trans_list, dtype='M8[s]')]).astype(
+ 'M8[ns]') # all trans listed
+ trans = trans.view('i8')
+ trans[0] = NPY_NAT + 1
+
+ # deltas
+ deltas = np.array([v.offset for v in (
+ tz._ttinfo_before,) + tz._trans_idx], dtype='i8')
+ deltas *= 1000000000
+ typ = 'dateutil'
+
+ elif is_fixed_offset(tz):
+ trans = np.array([NPY_NAT + 1], dtype=np.int64)
+ deltas = np.array([tz._ttinfo_std.offset],
+ dtype='i8') * 1000000000
+ typ = 'fixed'
+ else:
+ # 2018-07-12 this is not reached in the tests, and this case
+ # is not handled in any of the functions that call
+ # get_dst_info. If this case _were_ hit the calling
+ # functions would then hit an IndexError because they assume
+ # `deltas` is non-empty.
+ # (under the just-deleted code that returned empty arrays)
+ raise AssertionError("dateutil tzinfo is not a FixedOffset "
+ "and has an empty `_trans_list`.", tz)
+
+ else:
+ # static tzinfo
+ # TODO: This case is not hit in tests (2018-07-17); is it possible?
+ trans = np.array([NPY_NAT + 1], dtype=np.int64)
+ num = int(get_utcoffset(tz, None).total_seconds()) * 1000000000
+ deltas = np.array([num], dtype=np.int64)
+ typ = 'static'
+
+ dst_cache[cache_key] = (trans, deltas, typ)
+
+ return dst_cache[cache_key]
+
+
+def infer_tzinfo(start, end):
+ if start is not None and end is not None:
+ tz = start.tzinfo
+ if not tz_compare(tz, end.tzinfo):
+ msg = 'Inputs must both have the same timezone, {tz1} != {tz2}'
+ raise AssertionError(msg.format(tz1=tz, tz2=end.tzinfo))
+ elif start is not None:
+ tz = start.tzinfo
+ elif end is not None:
+ tz = end.tzinfo
+ else:
+ tz = None
+ return tz
+
+
+cpdef bint tz_compare(object start, object end):
+ """
+ Compare string representations of timezones
+
+ The same timezone can be represented by different tzinfo instances.
+ For example,
+ `<DstTzInfo 'Europe/Paris' LMT+0:09:00 STD>` and
+ `<DstTzInfo 'Europe/Paris' CET+1:00:00 STD>` are essentially the same
+ timezone but do not evaluate as equal, even though the string
+ representation for both of these is `'Europe/Paris'`.
+
+ This exists only to add a notion of equality to pytz-style zones
+ that is compatible with the notion of equality expected of tzinfo
+ subclasses.
+
+ Parameters
+ ----------
+ start : tzinfo
+ end : tzinfo
+
+ Returns
+ -------
+ compare : bint
+
+ """
+ # GH 18523
+ return get_timezone(start) == get_timezone(end)
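+
+# Illustrative behaviour (an added sketch, not part of the vendored source):
+# two distinct tzinfo instances for the same zone compare equal here,
+# while different zones do not:
+# >>> import datetime
+# >>> eastern = pytz.timezone('US/Eastern')
+# >>> localized = eastern.localize(datetime.datetime(2020, 1, 1)).tzinfo
+# >>> tz_compare(eastern, localized)
+# True
+# >>> tz_compare(eastern, pytz.utc)
+# False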
+
+
+def tz_standardize(tz: object):
+ """
+ If the passed tz is a pytz timezone object, "normalize" it to a
+ consistent version
+
+ Parameters
+ ----------
+ tz : tz object
+
+ Returns
+ -------
+ tz object
+
+ Examples
+ --------
+ >>> tz
+ <DstTzInfo 'US/Pacific' PST-1 day, 16:00:00 STD>
+
+ >>> tz_standardize(tz)
+ <DstTzInfo 'US/Pacific' LMT-1 day, 16:07:00 STD>
+
+ >>> tz
+ <DstTzInfo 'US/Pacific' LMT-1 day, 16:07:00 STD>
+
+ >>> tz_standardize(tz)
+ <DstTzInfo 'US/Pacific' LMT-1 day, 16:07:00 STD>
+
+ >>> tz
+ dateutil.tz.tz.tzutc
+
+ >>> tz_standardize(tz)
+ dateutil.tz.tz.tzutc
+ """
+ if treat_tz_as_pytz(tz):
+ return pytz.timezone(str(tz))
+ return tz
diff --git a/contrib/python/pandas/py2/pandas/_libs/tslibs/util.pxd b/contrib/python/pandas/py2/pandas/_libs/tslibs/util.pxd
new file mode 100644
index 00000000000..84fb4c4ce7b
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/_libs/tslibs/util.pxd
@@ -0,0 +1,229 @@
+
+from cpython cimport PyTypeObject
+
+cdef extern from *:
+ """
+ static PyObject* char_to_string(const char* data) {
+ #if PY_VERSION_HEX >= 0x03000000
+ return PyUnicode_FromString(data);
+ #else
+ return PyString_FromString(data);
+ #endif
+ }
+ """
+ object char_to_string(const char* data)
+
+
+cdef extern from "Python.h":
+ # Note: importing extern-style allows us to declare these as nogil
+ # functions, whereas `from cpython cimport` does not.
+ bint PyUnicode_Check(object obj) nogil
+ bint PyString_Check(object obj) nogil
+ bint PyBool_Check(object obj) nogil
+ bint PyFloat_Check(object obj) nogil
+ bint PyComplex_Check(object obj) nogil
+ bint PyObject_TypeCheck(object obj, PyTypeObject* type) nogil
+
+from numpy cimport int64_t
+
+cdef extern from "numpy/arrayobject.h":
+ PyTypeObject PyFloatingArrType_Type
+ int _import_array() except -1
+
+cdef extern from "numpy/ndarrayobject.h":
+ PyTypeObject PyTimedeltaArrType_Type
+ PyTypeObject PyDatetimeArrType_Type
+ PyTypeObject PyComplexFloatingArrType_Type
+ PyTypeObject PyBoolArrType_Type
+
+ bint PyArray_IsIntegerScalar(obj) nogil
+ bint PyArray_Check(obj) nogil
+
+cdef extern from "numpy/npy_common.h":
+ int64_t NPY_MIN_INT64
+
+
+cdef inline int64_t get_nat():
+ return NPY_MIN_INT64
+
+
+cdef inline int import_array() except -1:
+ _import_array()
+
+
+# --------------------------------------------------------------------
+# Type Checking
+
+cdef inline bint is_string_object(object obj) nogil:
+ """
+ Cython equivalent of `isinstance(val, compat.string_types)`
+
+ Parameters
+ ----------
+ val : object
+
+ Returns
+ -------
+ is_string : bool
+ """
+ return PyString_Check(obj) or PyUnicode_Check(obj)
+
+
+cdef inline bint is_integer_object(object obj) nogil:
+ """
+ Cython equivalent of
+
+ `isinstance(val, (int, long, np.integer)) and not isinstance(val, bool)`
+
+ Parameters
+ ----------
+ val : object
+
+ Returns
+ -------
+ is_integer : bool
+
+ Notes
+ -----
+ This counts np.timedelta64 objects as integers.
+ """
+ return not PyBool_Check(obj) and PyArray_IsIntegerScalar(obj)
+
+
+cdef inline bint is_float_object(object obj) nogil:
+ """
+ Cython equivalent of `isinstance(val, (float, np.floating))`
+
+ Parameters
+ ----------
+ val : object
+
+ Returns
+ -------
+ is_float : bool
+ """
+ return (PyFloat_Check(obj) or
+ (PyObject_TypeCheck(obj, &PyFloatingArrType_Type)))
+
+
+cdef inline bint is_complex_object(object obj) nogil:
+ """
+ Cython equivalent of `isinstance(val, (complex, np.complex_))`
+
+ Parameters
+ ----------
+ val : object
+
+ Returns
+ -------
+ is_complex : bool
+ """
+ return (PyComplex_Check(obj) or
+ PyObject_TypeCheck(obj, &PyComplexFloatingArrType_Type))
+
+
+cdef inline bint is_bool_object(object obj) nogil:
+ """
+ Cython equivalent of `isinstance(val, (bool, np.bool_))`
+
+ Parameters
+ ----------
+ val : object
+
+ Returns
+ -------
+ is_bool : bool
+ """
+ return (PyBool_Check(obj) or
+ PyObject_TypeCheck(obj, &PyBoolArrType_Type))
+
+
+cdef inline bint is_timedelta64_object(object obj) nogil:
+ """
+ Cython equivalent of `isinstance(val, np.timedelta64)`
+
+ Parameters
+ ----------
+ val : object
+
+ Returns
+ -------
+ is_timedelta64 : bool
+ """
+ return PyObject_TypeCheck(obj, &PyTimedeltaArrType_Type)
+
+
+cdef inline bint is_datetime64_object(object obj) nogil:
+ """
+ Cython equivalent of `isinstance(val, np.datetime64)`
+
+ Parameters
+ ----------
+ val : object
+
+ Returns
+ -------
+ is_datetime64 : bool
+ """
+ return PyObject_TypeCheck(obj, &PyDatetimeArrType_Type)
+
+
+cdef inline bint is_array(object val):
+ """
+ Cython equivalent of `isinstance(val, np.ndarray)`
+
+ Parameters
+ ----------
+ val : object
+
+ Returns
+ -------
+ is_ndarray : bool
+ """
+ return PyArray_Check(val)
+
+
+cdef inline bint is_period_object(object val):
+ """
+ Cython equivalent of `isinstance(val, pd.Period)`
+
+ Parameters
+ ----------
+ val : object
+
+ Returns
+ -------
+ is_period : bool
+ """
+ return getattr(val, '_typ', '_typ') == 'period'
+
+
+cdef inline bint is_offset_object(object val):
+ """
+ Check if an object is a DateOffset object.
+
+ Parameters
+ ----------
+ val : object
+
+ Returns
+ -------
+ is_date_offset : bool
+ """
+ return getattr(val, '_typ', None) == "dateoffset"
+
+
+cdef inline bint is_nan(object val):
+ """
+ Check if val is a Not-A-Number float or complex, including
+ float('NaN') and np.nan.
+
+ Parameters
+ ----------
+ val : object
+
+ Returns
+ -------
+ is_nan : bool
+ """
+ return (is_float_object(val) or is_complex_object(val)) and val != val
diff --git a/contrib/python/pandas/py2/pandas/_libs/util.pxd b/contrib/python/pandas/py2/pandas/_libs/util.pxd
new file mode 100644
index 00000000000..05a013ec0d7
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/_libs/util.pxd
@@ -0,0 +1,114 @@
+from pandas._libs.tslibs.util cimport *
+
+from cython cimport Py_ssize_t
+
+cimport numpy as cnp
+from numpy cimport ndarray
+
+cdef extern from "numpy/ndarraytypes.h":
+ void PyArray_CLEARFLAGS(ndarray arr, int flags) nogil
+
+
+cdef extern from "numpy/arrayobject.h":
+ enum:
+ NPY_ARRAY_C_CONTIGUOUS
+ NPY_ARRAY_F_CONTIGUOUS
+
+
+cdef extern from *:
+ """
+ // returns ASCII or UTF8 (py3) view on python str
+ // python object owns memory, should not be freed
+ static const char* get_c_string(PyObject* obj) {
+ #if PY_VERSION_HEX >= 0x03000000
+ return PyUnicode_AsUTF8(obj);
+ #else
+ return PyString_AsString(obj);
+ #endif
+ }
+ """
+ const char *get_c_string(object) except NULL
+
+
+cdef extern from "src/headers/stdint.h":
+ enum: UINT8_MAX
+ enum: UINT16_MAX
+ enum: UINT32_MAX
+ enum: UINT64_MAX
+ enum: INT8_MIN
+ enum: INT8_MAX
+ enum: INT16_MIN
+ enum: INT16_MAX
+ enum: INT32_MAX
+ enum: INT32_MIN
+ enum: INT64_MAX
+ enum: INT64_MIN
+
+
+ctypedef fused numeric:
+ cnp.int8_t
+ cnp.int16_t
+ cnp.int32_t
+ cnp.int64_t
+
+ cnp.uint8_t
+ cnp.uint16_t
+ cnp.uint32_t
+ cnp.uint64_t
+
+ cnp.float32_t
+ cnp.float64_t
+
+
+cdef inline void set_array_not_contiguous(ndarray ao) nogil:
+ # Numpy>=1.8-compliant equivalent to:
+ # ao->flags &= ~(NPY_ARRAY_C_CONTIGUOUS | NPY_ARRAY_F_CONTIGUOUS);
+ PyArray_CLEARFLAGS(ao,
+ (NPY_ARRAY_C_CONTIGUOUS | NPY_ARRAY_F_CONTIGUOUS))
+
+
+cdef inline Py_ssize_t validate_indexer(ndarray arr, object loc) except -1:
+ """
+ Cast the given indexer `loc` to an integer. If it is negative, i.e. a
+ python-style indexing-from-the-end indexer, translate it to a
+ from-the-front indexer. Raise if this is not possible.
+
+ Parameters
+ ----------
+ arr : ndarray
+ loc : object
+
+ Returns
+ -------
+ idx : Py_ssize_t
+
+ Raises
+ ------
+ IndexError
+ """
+ cdef:
+ Py_ssize_t idx, size
+ int casted
+
+ if is_float_object(loc):
+ casted = int(loc)
+ if casted == loc:
+ loc = casted
+
+ idx = <Py_ssize_t>loc
+ size = cnp.PyArray_SIZE(arr)
+
+ if idx < 0 and size > 0:
+ idx += size
+ if idx >= size or size == 0 or idx < 0:
+ raise IndexError('index out of bounds')
+
+ return idx
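+
+# Behaviour sketch (added comment; cdef inline functions are not callable
+# from Python): for an ndarray of length 5, loc=-1 translates to index 4,
+# loc=2.0 is accepted because it equals the integer 2, and loc=5 raises
+# IndexError('index out of bounds').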
+
+
+cdef inline object get_value_at(ndarray arr, object loc):
+ cdef:
+ Py_ssize_t i
+
+ i = validate_indexer(arr, loc)
+ return arr[i]
diff --git a/contrib/python/pandas/py2/pandas/_libs/window.pyx b/contrib/python/pandas/py2/pandas/_libs/window.pyx
new file mode 100644
index 00000000000..e8f3de64c38
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/_libs/window.pyx
@@ -0,0 +1,1910 @@
+# -*- coding: utf-8 -*-
+# cython: boundscheck=False, wraparound=False, cdivision=True
+
+import cython
+from cython import Py_ssize_t
+from libcpp.deque cimport deque
+
+from libc.stdlib cimport malloc, free
+
+import numpy as np
+cimport numpy as cnp
+from numpy cimport ndarray, int64_t, float64_t, float32_t
+cnp.import_array()
+
+
+cdef extern from "src/headers/cmath" namespace "std":
+ bint isnan(float64_t) nogil
+ bint notnan(float64_t) nogil
+ int signbit(float64_t) nogil
+ float64_t sqrt(float64_t x) nogil
+
+cimport pandas._libs.util as util
+from pandas._libs.util cimport numeric
+
+from pandas._libs.skiplist cimport (
+ skiplist_t, skiplist_init, skiplist_destroy, skiplist_get, skiplist_insert,
+ skiplist_remove)
+
+cdef float32_t MINfloat32 = np.NINF
+cdef float64_t MINfloat64 = np.NINF
+
+cdef float32_t MAXfloat32 = np.inf
+cdef float64_t MAXfloat64 = np.inf
+
+cdef float64_t NaN = <float64_t>np.NaN
+
+cdef inline int int_max(int a, int b): return a if a >= b else b
+cdef inline int int_min(int a, int b): return a if a <= b else b
+
+
+# Cython implementations of rolling sum, mean, variance, skewness,
+# other statistical moment functions
+#
+# Misc implementation notes
+# -------------------------
+#
+# - In Cython x * x is faster than x ** 2 for C types, this should be
+# periodically revisited to see if it's still true.
+#
+
+
+def _check_minp(win, minp, N, floor=None):
+ """
+ Parameters
+ ----------
+ win: int
+ minp: int or None
+ N: int, length of the values array
+ floor: int, optional
+ default 1
+
+ Returns
+ -------
+ minimum period
+ """
+
+ if minp is None:
+ minp = 1
+ if not util.is_integer_object(minp):
+ raise ValueError("min_periods must be an integer")
+ if minp > win:
+ raise ValueError("min_periods (%d) must be <= "
+ "window (%d)" % (minp, win))
+ elif minp > N:
+ minp = N + 1
+ elif minp < 0:
+ raise ValueError('min_periods must be >= 0')
+ if floor is None:
+ floor = 1
+
+ return max(minp, floor)
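+
+# Illustrative behaviour (an added sketch, not part of the vendored source):
+# _check_minp(5, None, 10) -> 1 (defaults to floor=1)
+# _check_minp(5, 3, 10)    -> 3
+# _check_minp(5, 7, 10)    -> raises ValueError, min_periods > window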
+
+
+# original C implementation by N. Devillard.
+# This code in public domain.
+# Function : kth_smallest()
+# In : array of elements, # of elements in the array, rank k
+# Out : one element
+# Job : find the kth smallest element in the array
+
+# Reference:
+
+# Author: Wirth, Niklaus
+# Title: Algorithms + data structures = programs
+# Publisher: Englewood Cliffs: Prentice-Hall, 1976
+# Physical description: 366 p.
+# Series: Prentice-Hall Series in Automatic Computation
+
+# ----------------------------------------------------------------------
+# The indexer objects for rolling
+# These define start/end indexers to compute offsets
+
+
+cdef class WindowIndexer:
+
+ cdef:
+ ndarray start, end
+ int64_t N, minp, win
+ bint is_variable
+
+ def get_data(self):
+ return (self.start, self.end, <int64_t>self.N,
+ <int64_t>self.win, <int64_t>self.minp,
+ self.is_variable)
+
+
+cdef class MockFixedWindowIndexer(WindowIndexer):
+ """
+
+ We are just checking parameters of the indexer,
+ and returning a consistent API with fixed/variable
+ indexers.
+
+ Parameters
+ ----------
+ values: ndarray
+ values data array
+ win: int64_t
+ window size
+ minp: int64_t
+ min number of obs in a window to consider non-NaN
+ index: object
+ index of the values
+ floor: optional
+ unit for flooring
+ left_closed: bint
+ left endpoint closedness
+ right_closed: bint
+ right endpoint closedness
+
+ """
+ def __init__(self, ndarray values, int64_t win, int64_t minp,
+ bint left_closed, bint right_closed,
+ object index=None, object floor=None):
+
+ assert index is None
+ self.is_variable = 0
+ self.N = len(values)
+ self.minp = _check_minp(win, minp, self.N, floor=floor)
+ self.start = np.empty(0, dtype='int64')
+ self.end = np.empty(0, dtype='int64')
+ self.win = win
+
+
+cdef class FixedWindowIndexer(WindowIndexer):
+ """
+ Create a fixed-length window indexer object with start and end
+ arrays that point to offsets in the index object; these are
+ defined based on the win argument.
+
+ Parameters
+ ----------
+ values: ndarray
+ values data array
+ win: int64_t
+ window size
+ minp: int64_t
+ min number of obs in a window to consider non-NaN
+ index: object
+ index of the values
+ floor: optional
+ unit for flooring the unit
+ left_closed: bint
+ left endpoint closedness
+ right_closed: bint
+ right endpoint closedness
+
+ """
+ def __init__(self, ndarray values, int64_t win, int64_t minp,
+ bint left_closed, bint right_closed,
+ object index=None, object floor=None):
+ cdef ndarray start_s, start_e, end_s, end_e
+
+ assert index is None
+ self.is_variable = 0
+ self.N = len(values)
+ self.minp = _check_minp(win, minp, self.N, floor=floor)
+
+ start_s = np.zeros(win, dtype='int64')
+ start_e = np.arange(win, self.N, dtype='int64') - win + 1
+ self.start = np.concatenate([start_s, start_e])
+
+ end_s = np.arange(win, dtype='int64') + 1
+ end_e = start_e + win
+ self.end = np.concatenate([end_s, end_e])
+ self.win = win
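+
+ # Worked example (an added note, not upstream code): for len(values) == 5
+ # and win == 3 the windows are values[0:1], [0:2], [0:3], [1:4], [2:5],
+ # i.e. start == [0, 0, 0, 1, 2] and end == [1, 2, 3, 4, 5].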
+
+
+cdef class VariableWindowIndexer(WindowIndexer):
+ """
+ Create a variable-length window indexer object with start and end
+ arrays that point to offsets in the index object; these are
+ defined based on the win argument.
+
+ Parameters
+ ----------
+ values: ndarray
+ values data array
+ win: int64_t
+ window size
+ minp: int64_t
+ min number of obs in a window to consider non-NaN
+ index: ndarray
+ index of the values
+ left_closed: bint
+ left endpoint closedness
+ True if the left endpoint is closed, False if open
+ right_closed: bint
+ right endpoint closedness
+ True if the right endpoint is closed, False if open
+ floor: optional
+ unit for flooring the unit
+ """
+ def __init__(self, ndarray values, int64_t win, int64_t minp,
+ bint left_closed, bint right_closed, ndarray index,
+ object floor=None):
+
+ self.is_variable = 1
+ self.N = len(index)
+ self.minp = _check_minp(win, minp, self.N, floor=floor)
+
+ self.start = np.empty(self.N, dtype='int64')
+ self.start.fill(-1)
+
+ self.end = np.empty(self.N, dtype='int64')
+ self.end.fill(-1)
+
+ self.build(index, win, left_closed, right_closed)
+
+ # max window size
+ self.win = (self.end - self.start).max()
+
+ def build(self, ndarray[int64_t] index, int64_t win, bint left_closed,
+ bint right_closed):
+
+ cdef:
+ ndarray[int64_t] start, end
+ int64_t start_bound, end_bound, N
+ Py_ssize_t i, j
+
+ start = self.start
+ end = self.end
+ N = self.N
+
+ start[0] = 0
+
+ # right endpoint is closed
+ if right_closed:
+ end[0] = 1
+ # right endpoint is open
+ else:
+ end[0] = 0
+
+ with nogil:
+
+ # start is start of slice interval (including)
+ # end is end of slice interval (not including)
+ for i in range(1, N):
+ end_bound = index[i]
+ start_bound = index[i] - win
+
+ # left endpoint is closed
+ if left_closed:
+ start_bound -= 1
+
+ # advance the start bound until we are
+ # within the constraint
+ start[i] = i
+ for j in range(start[i - 1], i):
+ if index[j] > start_bound:
+ start[i] = j
+ break
+
+ # end bound is previous end
+ # or current index
+ if index[end[i - 1]] <= end_bound:
+ end[i] = i + 1
+ else:
+ end[i] = end[i - 1]
+
+ # right endpoint is open
+ if not right_closed:
+ end[i] -= 1
+
+
+def get_window_indexer(values, win, minp, index, closed,
+ floor=None, use_mock=True):
+ """
+ return the correct window indexer for the computation
+
+ Parameters
+ ----------
+ values: 1d ndarray
+ win: integer, window size
+ minp: integer, minimum periods
+ index: 1d ndarray, optional
+ index to the values array
+ closed: string, default None
+ {'right', 'left', 'both', 'neither'}
+ window endpoint closedness. Defaults to 'right' in
+ VariableWindowIndexer and to 'both' in FixedWindowIndexer
+ floor: optional
+ unit for flooring the unit
+ use_mock: boolean, default True
+ if we are a fixed indexer, return a mock indexer
+ instead of the FixedWindow Indexer. This is a type
+ compat Indexer that allows us to use a standard
+ code path with all of the indexers.
+
+
+ Returns
+ -------
+ tuple of 1d int64 ndarrays of the offsets & data about the window
+
+ """
+
+ cdef:
+ bint left_closed = False
+ bint right_closed = False
+
+ assert closed is None or closed in ['right', 'left', 'both', 'neither']
+
+ # if the window is variable, default is 'right', otherwise default is 'both'
+ if closed is None:
+ closed = 'right' if index is not None else 'both'
+
+ if closed in ['right', 'both']:
+ right_closed = True
+
+ if closed in ['left', 'both']:
+ left_closed = True
+
+ if index is not None:
+ indexer = VariableWindowIndexer(values, win, minp, left_closed,
+ right_closed, index, floor)
+ elif use_mock:
+ indexer = MockFixedWindowIndexer(values, win, minp, left_closed,
+ right_closed, index, floor)
+ else:
+ indexer = FixedWindowIndexer(values, win, minp, left_closed,
+ right_closed, index, floor)
+ return indexer.get_data()
+
+
+# ----------------------------------------------------------------------
+# Rolling count
+# this is only an implementation for index not None, i.e. freq-aware
+
+
+def roll_count(ndarray[float64_t] values, int64_t win, int64_t minp,
+ object index, object closed):
+ cdef:
+ float64_t val, count_x = 0.0
+ int64_t s, e, nobs, N
+ Py_ssize_t i, j
+ ndarray[int64_t] start, end
+ ndarray[float64_t] output
+
+ start, end, N, win, minp, _ = get_window_indexer(values, win,
+ minp, index, closed)
+ output = np.empty(N, dtype=float)
+
+ with nogil:
+
+ for i in range(0, N):
+ s = start[i]
+ e = end[i]
+
+ if i == 0:
+
+ # setup
+ count_x = 0.0
+ for j in range(s, e):
+ val = values[j]
+ if notnan(val):
+ count_x += 1.0
+
+ else:
+
+ # calculate deletes
+ for j in range(start[i - 1], s):
+ val = values[j]
+ if notnan(val):
+ count_x -= 1.0
+
+ # calculate adds
+ for j in range(end[i - 1], e):
+ val = values[j]
+ if notnan(val):
+ count_x += 1.0
+
+ if count_x >= minp:
+ output[i] = count_x
+ else:
+ output[i] = NaN
+
+ return output
+
+
+# ----------------------------------------------------------------------
+# Rolling sum
+
+
+cdef inline float64_t calc_sum(int64_t minp, int64_t nobs,
+ float64_t sum_x) nogil:
+ cdef:
+ float64_t result
+
+ if nobs >= minp:
+ result = sum_x
+ else:
+ result = NaN
+
+ return result
+
+
+cdef inline void add_sum(float64_t val, int64_t *nobs, float64_t *sum_x) nogil:
+ """ add a value from the sum calc """
+
+ # Not NaN
+ if notnan(val):
+ nobs[0] = nobs[0] + 1
+ sum_x[0] = sum_x[0] + val
+
+
+cdef inline void remove_sum(float64_t val,
+ int64_t *nobs, float64_t *sum_x) nogil:
+ """ remove a value from the sum calc """
+
+ if notnan(val):
+ nobs[0] = nobs[0] - 1
+ sum_x[0] = sum_x[0] - val
+
+
+def roll_sum(ndarray[float64_t] values, int64_t win, int64_t minp,
+ object index, object closed):
+ cdef:
+ float64_t val, prev_x, sum_x = 0
+ int64_t s, e, range_endpoint
+ int64_t nobs = 0, i, j, N
+ bint is_variable
+ ndarray[int64_t] start, end
+ ndarray[float64_t] output
+
+ start, end, N, win, minp, is_variable = get_window_indexer(values, win,
+ minp, index,
+ closed,
+ floor=0)
+ output = np.empty(N, dtype=float)
+
+ # for performance we are going to iterate
+ # fixed windows separately, makes the code more complex as we have 2 paths
+ # but is faster
+
+ if is_variable:
+
+ # variable window
+ with nogil:
+
+ for i in range(0, N):
+ s = start[i]
+ e = end[i]
+
+ if i == 0:
+
+ # setup
+ sum_x = 0.0
+ nobs = 0
+ for j in range(s, e):
+ add_sum(values[j], &nobs, &sum_x)
+
+ else:
+
+ # calculate deletes
+ for j in range(start[i - 1], s):
+ remove_sum(values[j], &nobs, &sum_x)
+
+ # calculate adds
+ for j in range(end[i - 1], e):
+ add_sum(values[j], &nobs, &sum_x)
+
+ output[i] = calc_sum(minp, nobs, sum_x)
+
+ else:
+
+ # fixed window
+
+ range_endpoint = int_max(minp, 1) - 1
+
+ with nogil:
+
+ for i in range(0, range_endpoint):
+ add_sum(values[i], &nobs, &sum_x)
+ output[i] = NaN
+
+ for i in range(range_endpoint, N):
+ val = values[i]
+ add_sum(val, &nobs, &sum_x)
+
+ if i > win - 1:
+ prev_x = values[i - win]
+ remove_sum(prev_x, &nobs, &sum_x)
+
+ output[i] = calc_sum(minp, nobs, sum_x)
+
+ return output
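+
+# Illustrative result (an added sketch, not part of the vendored source);
+# arguments are (values, win, minp, index, closed):
+# >>> roll_sum(np.array([1., 2., 3., 4., 5.]), 3, 3, None, None)
+# array([nan, nan,  6.,  9., 12.])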
+
+
+# ----------------------------------------------------------------------
+# Rolling mean
+
+
+cdef inline float64_t calc_mean(int64_t minp, Py_ssize_t nobs,
+ Py_ssize_t neg_ct, float64_t sum_x) nogil:
+ cdef:
+ float64_t result
+
+ if nobs >= minp:
+ result = sum_x / <float64_t>nobs
+ if neg_ct == 0 and result < 0:
+ # all positive
+ result = 0
+ elif neg_ct == nobs and result > 0:
+ # all negative
+ result = 0
+ else:
+ pass
+ else:
+ result = NaN
+ return result
+
+
+cdef inline void add_mean(float64_t val, Py_ssize_t *nobs, float64_t *sum_x,
+ Py_ssize_t *neg_ct) nogil:
+ """ add a value from the mean calc """
+
+ # Not NaN
+ if notnan(val):
+ nobs[0] = nobs[0] + 1
+ sum_x[0] = sum_x[0] + val
+ if signbit(val):
+ neg_ct[0] = neg_ct[0] + 1
+
+
+cdef inline void remove_mean(float64_t val, Py_ssize_t *nobs, float64_t *sum_x,
+ Py_ssize_t *neg_ct) nogil:
+ """ remove a value from the mean calc """
+
+ if notnan(val):
+ nobs[0] = nobs[0] - 1
+ sum_x[0] = sum_x[0] - val
+ if signbit(val):
+ neg_ct[0] = neg_ct[0] - 1
+
+
+def roll_mean(ndarray[float64_t] values, int64_t win, int64_t minp,
+ object index, object closed):
+ cdef:
+ float64_t val, prev_x, result, sum_x = 0
+ int64_t s, e
+ bint is_variable
+ Py_ssize_t nobs = 0, i, j, neg_ct = 0, N
+ ndarray[int64_t] start, end
+ ndarray[float64_t] output
+
+ start, end, N, win, minp, is_variable = get_window_indexer(values, win,
+ minp, index,
+ closed)
+ output = np.empty(N, dtype=float)
+
+ # for performance we are going to iterate
+ # fixed windows separately, makes the code more complex as we have 2 paths
+ # but is faster
+
+ if is_variable:
+
+ with nogil:
+
+ for i in range(0, N):
+ s = start[i]
+ e = end[i]
+
+ if i == 0:
+
+ # setup
+ sum_x = 0.0
+ nobs = 0
+ for j in range(s, e):
+ val = values[j]
+ add_mean(val, &nobs, &sum_x, &neg_ct)
+
+ else:
+
+ # calculate deletes
+ for j in range(start[i - 1], s):
+ val = values[j]
+ remove_mean(val, &nobs, &sum_x, &neg_ct)
+
+ # calculate adds
+ for j in range(end[i - 1], e):
+ val = values[j]
+ add_mean(val, &nobs, &sum_x, &neg_ct)
+
+ output[i] = calc_mean(minp, nobs, neg_ct, sum_x)
+
+ else:
+
+ with nogil:
+ for i in range(minp - 1):
+ val = values[i]
+ add_mean(val, &nobs, &sum_x, &neg_ct)
+ output[i] = NaN
+
+ for i in range(minp - 1, N):
+ val = values[i]
+ add_mean(val, &nobs, &sum_x, &neg_ct)
+
+ if i > win - 1:
+ prev_x = values[i - win]
+ remove_mean(prev_x, &nobs, &sum_x, &neg_ct)
+
+ output[i] = calc_mean(minp, nobs, neg_ct, sum_x)
+
+ return output
+
+
+# ----------------------------------------------------------------------
+# Rolling variance
+
+
+cdef inline float64_t calc_var(int64_t minp, int ddof, float64_t nobs,
+ float64_t ssqdm_x) nogil:
+ cdef:
+ float64_t result
+
+ # Variance is unchanged if no observation is added or removed
+ if (nobs >= minp) and (nobs > ddof):
+
+ # pathological case
+ if nobs == 1:
+ result = 0
+ else:
+ result = ssqdm_x / (nobs - <float64_t>ddof)
+ if result < 0:
+ result = 0
+ else:
+ result = NaN
+
+ return result
+
+
+cdef inline void add_var(float64_t val, float64_t *nobs, float64_t *mean_x,
+ float64_t *ssqdm_x) nogil:
+ """ add a value from the var calc """
+ cdef:
+ float64_t delta
+
+ # `isnan` instead of equality as fix for GH-21813, msvc 2017 bug
+ if isnan(val):
+ return
+
+ nobs[0] = nobs[0] + 1
+ # a part of Welford's method for the online variance-calculation
+ # https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance
+ delta = val - mean_x[0]
+ mean_x[0] = mean_x[0] + delta / nobs[0]
+ ssqdm_x[0] = ssqdm_x[0] + ((nobs[0] - 1) * delta ** 2) / nobs[0]
+
+
+cdef inline void remove_var(float64_t val, float64_t *nobs, float64_t *mean_x,
+ float64_t *ssqdm_x) nogil:
+ """ remove a value from the var calc """
+ cdef:
+ float64_t delta
+
+ if notnan(val):
+ nobs[0] = nobs[0] - 1
+ if nobs[0]:
+ # a part of Welford's method for the online variance-calculation
+ # https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance
+ delta = val - mean_x[0]
+ mean_x[0] = mean_x[0] - delta / nobs[0]
+ ssqdm_x[0] = ssqdm_x[0] - ((nobs[0] + 1) * delta ** 2) / nobs[0]
+ else:
+ mean_x[0] = 0
+ ssqdm_x[0] = 0
+
+
+def roll_var(ndarray[float64_t] values, int64_t win, int64_t minp,
+ object index, object closed, int ddof=1):
+ """
+ Numerically stable implementation using Welford's method.
+ """
+ cdef:
+ float64_t mean_x = 0, ssqdm_x = 0, nobs = 0,
+ float64_t val, prev, delta, mean_x_old
+ int64_t s, e
+ bint is_variable
+ Py_ssize_t i, j, N
+ ndarray[int64_t] start, end
+ ndarray[float64_t] output
+
+ start, end, N, win, minp, is_variable = get_window_indexer(values, win,
+ minp, index,
+ closed)
+ output = np.empty(N, dtype=float)
+
+ # Check for windows larger than array, addresses #7297
+ win = min(win, N)
+
+ # for performance we are going to iterate
+ # fixed windows separately, makes the code more complex as we
+ # have 2 paths but is faster
+
+ if is_variable:
+
+ with nogil:
+
+ for i in range(0, N):
+
+ s = start[i]
+ e = end[i]
+
+ # Over the first window, observations can only be added
+ # never removed
+ if i == 0:
+
+ for j in range(s, e):
+ add_var(values[j], &nobs, &mean_x, &ssqdm_x)
+
+ else:
+
+ # After the first window, observations can both be added
+ # and removed
+
+ # calculate adds
+ for j in range(end[i - 1], e):
+ add_var(values[j], &nobs, &mean_x, &ssqdm_x)
+
+ # calculate deletes
+ for j in range(start[i - 1], s):
+ remove_var(values[j], &nobs, &mean_x, &ssqdm_x)
+
+ output[i] = calc_var(minp, ddof, nobs, ssqdm_x)
+
+ else:
+
+ with nogil:
+
+ # Over the first window, observations can only be added, never
+ # removed
+ for i in range(win):
+ add_var(values[i], &nobs, &mean_x, &ssqdm_x)
+ output[i] = calc_var(minp, ddof, nobs, ssqdm_x)
+
+ # a part of Welford's method for the online variance-calculation
+ # https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance
+
+ # After the first window, observations can both be added and
+ # removed
+ for i in range(win, N):
+ val = values[i]
+ prev = values[i - win]
+
+ if notnan(val):
+ if prev == prev:
+
+ # Adding one observation and removing another one
+ delta = val - prev
+ mean_x_old = mean_x
+
+ mean_x += delta / nobs
+ ssqdm_x += ((nobs - 1) * val
+ + (nobs + 1) * prev
+ - 2 * nobs * mean_x_old) * delta / nobs
+
+ else:
+ add_var(val, &nobs, &mean_x, &ssqdm_x)
+ elif prev == prev:
+ remove_var(prev, &nobs, &mean_x, &ssqdm_x)
+
+ output[i] = calc_var(minp, ddof, nobs, ssqdm_x)
+
+ return output
+
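+
+# Illustrative note: Welford's update keeps a running mean and a running sum
+# of squared deviations from it (ssqdm_x), so the variance of the current
+# window is simply ssqdm_x / (nobs - ddof).  A minimal pure-Python sketch of
+# the add step, assuming float inputs without NaNs:
+#
+#     def welford_add(val, nobs, mean, ssqdm):
+#         nobs += 1
+#         delta = val - mean
+#         mean += delta / nobs
+#         ssqdm += (nobs - 1) * delta ** 2 / nobs
+#         return nobs, mean, ssqdm
+#
+#     nobs = mean = ssqdm = 0.0
+#     for v in [1.0, 2.0, 3.0]:
+#         nobs, mean, ssqdm = welford_add(v, nobs, mean, ssqdm)
+#     # ssqdm / (nobs - 1) == 1.0, matching np.var([1, 2, 3], ddof=1)
+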
+
+# ----------------------------------------------------------------------
+# Rolling skewness
+
+cdef inline float64_t calc_skew(int64_t minp, int64_t nobs,
+ float64_t x, float64_t xx,
+ float64_t xxx) nogil:
+ cdef:
+ float64_t result, dnobs
+ float64_t A, B, C, R
+
+ if nobs >= minp:
+ dnobs = <float64_t>nobs
+ A = x / dnobs
+ B = xx / dnobs - A * A
+ C = xxx / dnobs - A * A * A - 3 * A * B
+
+        # #18044: with a uniform distribution, floating-point error can
+        #         leave B slightly non-zero and make the result blow up
+        #         to a very large number.
+        #
+        #         in core/nanops.py, nanskew/nankurt call
+        #         _zero_out_fperr(m2) to correct for this floating error;
+        #         if the variance is below 1e-14 it can be treated as
+        #         zero, so we follow the original skew/kurt behaviour
+        #         and check B <= 1e-14
+ if B <= 1e-14 or nobs < 3:
+ result = NaN
+ else:
+ R = sqrt(B)
+ result = ((sqrt(dnobs * (dnobs - 1.)) * C) /
+ ((dnobs - 2) * R * R * R))
+ else:
+ result = NaN
+
+ return result
+
+
+cdef inline void add_skew(float64_t val, int64_t *nobs,
+ float64_t *x, float64_t *xx,
+ float64_t *xxx) nogil:
+ """ add a value from the skew calc """
+
+ # Not NaN
+ if notnan(val):
+ nobs[0] = nobs[0] + 1
+
+ # seriously don't ask me why this is faster
+ x[0] = x[0] + val
+ xx[0] = xx[0] + val * val
+ xxx[0] = xxx[0] + val * val * val
+
+
+cdef inline void remove_skew(float64_t val, int64_t *nobs,
+ float64_t *x, float64_t *xx,
+ float64_t *xxx) nogil:
+ """ remove a value from the skew calc """
+
+ # Not NaN
+ if notnan(val):
+ nobs[0] = nobs[0] - 1
+
+ # seriously don't ask me why this is faster
+ x[0] = x[0] - val
+ xx[0] = xx[0] - val * val
+ xxx[0] = xxx[0] - val * val * val
+
+
+def roll_skew(ndarray[float64_t] values, int64_t win, int64_t minp,
+ object index, object closed):
+ cdef:
+ float64_t val, prev
+ float64_t x = 0, xx = 0, xxx = 0
+ int64_t nobs = 0, i, j, N
+ int64_t s, e
+ bint is_variable
+ ndarray[int64_t] start, end
+ ndarray[float64_t] output
+
+ start, end, N, win, minp, is_variable = get_window_indexer(values, win,
+ minp, index,
+ closed)
+ output = np.empty(N, dtype=float)
+
+ if is_variable:
+
+ with nogil:
+
+ for i in range(0, N):
+
+ s = start[i]
+ e = end[i]
+
+ # Over the first window, observations can only be added
+ # never removed
+ if i == 0:
+
+ for j in range(s, e):
+ val = values[j]
+ add_skew(val, &nobs, &x, &xx, &xxx)
+
+ else:
+
+ # After the first window, observations can both be added
+ # and removed
+
+ # calculate adds
+ for j in range(end[i - 1], e):
+ val = values[j]
+ add_skew(val, &nobs, &x, &xx, &xxx)
+
+ # calculate deletes
+ for j in range(start[i - 1], s):
+ val = values[j]
+ remove_skew(val, &nobs, &x, &xx, &xxx)
+
+ output[i] = calc_skew(minp, nobs, x, xx, xxx)
+
+ else:
+
+ with nogil:
+ for i in range(minp - 1):
+ val = values[i]
+ add_skew(val, &nobs, &x, &xx, &xxx)
+ output[i] = NaN
+
+ for i in range(minp - 1, N):
+ val = values[i]
+ add_skew(val, &nobs, &x, &xx, &xxx)
+
+ if i > win - 1:
+ prev = values[i - win]
+ remove_skew(prev, &nobs, &x, &xx, &xxx)
+
+ output[i] = calc_skew(minp, nobs, x, xx, xxx)
+
+ return output
+
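+
+# Illustrative note: calc_skew above works from the raw power sums x, xx and
+# xxx.  With n = nobs, A = x/n is the mean, B = xx/n - A**2 the biased
+# variance and C = xxx/n - A**3 - 3*A*B the third central moment, so the
+# returned value is the adjusted Fisher-Pearson coefficient
+#
+#     g1 = sqrt(n * (n - 1)) * C / ((n - 2) * B ** 1.5)
+#
+# e.g. for a window of [0.0, 1.0, 2.0] the data are symmetric, C == 0 and the
+# rolling skew is 0.
+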
+
+# ----------------------------------------------------------------------
+# Rolling kurtosis
+
+
+cdef inline float64_t calc_kurt(int64_t minp, int64_t nobs,
+ float64_t x, float64_t xx,
+ float64_t xxx, float64_t xxxx) nogil:
+ cdef:
+ float64_t result, dnobs
+ float64_t A, B, C, D, R, K
+
+ if nobs >= minp:
+ dnobs = <float64_t>nobs
+ A = x / dnobs
+ R = A * A
+ B = xx / dnobs - R
+ R = R * A
+ C = xxx / dnobs - R - 3 * A * B
+ R = R * A
+ D = xxxx / dnobs - R - 6 * B * A * A - 4 * C * A
+
+        # #18044: with a uniform distribution, floating-point error can
+        #         leave B slightly non-zero and make the result blow up
+        #         to a very large number.
+        #
+        #         in core/nanops.py, nanskew/nankurt call
+        #         _zero_out_fperr(m2) to correct for this floating error;
+        #         if the variance is below 1e-14 it can be treated as
+        #         zero, so we follow the original skew/kurt behaviour
+        #         and check B <= 1e-14
+ if B <= 1e-14 or nobs < 4:
+ result = NaN
+ else:
+ K = (dnobs * dnobs - 1.) * D / (B * B) - 3 * ((dnobs - 1.) ** 2)
+ result = K / ((dnobs - 2.) * (dnobs - 3.))
+ else:
+ result = NaN
+
+ return result
+
+
+cdef inline void add_kurt(float64_t val, int64_t *nobs,
+ float64_t *x, float64_t *xx,
+ float64_t *xxx, float64_t *xxxx) nogil:
+ """ add a value from the kurotic calc """
+
+ # Not NaN
+ if notnan(val):
+ nobs[0] = nobs[0] + 1
+
+ # seriously don't ask me why this is faster
+ x[0] = x[0] + val
+ xx[0] = xx[0] + val * val
+ xxx[0] = xxx[0] + val * val * val
+ xxxx[0] = xxxx[0] + val * val * val * val
+
+
+cdef inline void remove_kurt(float64_t val, int64_t *nobs,
+ float64_t *x, float64_t *xx,
+ float64_t *xxx, float64_t *xxxx) nogil:
+ """ remove a value from the kurotic calc """
+
+ # Not NaN
+ if notnan(val):
+ nobs[0] = nobs[0] - 1
+
+ # seriously don't ask me why this is faster
+ x[0] = x[0] - val
+ xx[0] = xx[0] - val * val
+ xxx[0] = xxx[0] - val * val * val
+ xxxx[0] = xxxx[0] - val * val * val * val
+
+
+def roll_kurt(ndarray[float64_t] values, int64_t win, int64_t minp,
+ object index, object closed):
+ cdef:
+ float64_t val, prev
+ float64_t x = 0, xx = 0, xxx = 0, xxxx = 0
+ int64_t nobs = 0, i, j, N
+ int64_t s, e
+ bint is_variable
+ ndarray[int64_t] start, end
+ ndarray[float64_t] output
+
+ start, end, N, win, minp, is_variable = get_window_indexer(values, win,
+ minp, index,
+ closed)
+ output = np.empty(N, dtype=float)
+
+ if is_variable:
+
+ with nogil:
+
+ for i in range(0, N):
+
+ s = start[i]
+ e = end[i]
+
+ # Over the first window, observations can only be added
+ # never removed
+ if i == 0:
+
+ for j in range(s, e):
+ add_kurt(values[j], &nobs, &x, &xx, &xxx, &xxxx)
+
+ else:
+
+ # After the first window, observations can both be added
+ # and removed
+
+ # calculate adds
+ for j in range(end[i - 1], e):
+ add_kurt(values[j], &nobs, &x, &xx, &xxx, &xxxx)
+
+ # calculate deletes
+ for j in range(start[i - 1], s):
+ remove_kurt(values[j], &nobs, &x, &xx, &xxx, &xxxx)
+
+ output[i] = calc_kurt(minp, nobs, x, xx, xxx, xxxx)
+
+ else:
+
+ with nogil:
+
+ for i in range(minp - 1):
+ add_kurt(values[i], &nobs, &x, &xx, &xxx, &xxxx)
+ output[i] = NaN
+
+ for i in range(minp - 1, N):
+ add_kurt(values[i], &nobs, &x, &xx, &xxx, &xxxx)
+
+ if i > win - 1:
+ prev = values[i - win]
+ remove_kurt(prev, &nobs, &x, &xx, &xxx, &xxxx)
+
+ output[i] = calc_kurt(minp, nobs, x, xx, xxx, xxxx)
+
+ return output
+
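+
+# Illustrative note: calc_kurt uses the same raw power sums plus xxxx.  With
+# D = xxxx/n - A**4 - 6*B*A**2 - 4*C*A as the fourth central moment, the
+# returned value is the sample excess kurtosis
+#
+#     G2 = ((n*n - 1) * D / B**2 - 3 * (n - 1)**2) / ((n - 2) * (n - 3))
+#
+# which is only defined for at least four observations (nobs < 4 -> NaN).
+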
+
+# ----------------------------------------------------------------------
+# Rolling median, min, max
+
+
+def roll_median_c(ndarray[float64_t] values, int64_t win, int64_t minp,
+ object index, object closed):
+ cdef:
+ float64_t val, res, prev
+ bint err = 0, is_variable
+ int ret = 0
+ skiplist_t *sl
+ Py_ssize_t i, j
+ int64_t nobs = 0, N, s, e
+ int midpoint
+ ndarray[int64_t] start, end
+ ndarray[float64_t] output
+
+ # we use the Fixed/Variable Indexer here as the
+ # actual skiplist ops outweigh any window computation costs
+ start, end, N, win, minp, is_variable = get_window_indexer(
+ values, win,
+ minp, index, closed,
+ use_mock=False)
+ output = np.empty(N, dtype=float)
+
+ sl = skiplist_init(<int>win)
+ if sl == NULL:
+ raise MemoryError("skiplist_init failed")
+
+ with nogil:
+
+ for i in range(0, N):
+ s = start[i]
+ e = end[i]
+
+ if i == 0:
+
+ # setup
+ val = values[i]
+ if notnan(val):
+ nobs += 1
+ err = skiplist_insert(sl, val) != 1
+ if err:
+ break
+
+ else:
+
+ # calculate deletes
+ for j in range(start[i - 1], s):
+ val = values[j]
+ if notnan(val):
+ skiplist_remove(sl, val)
+ nobs -= 1
+
+ # calculate adds
+ for j in range(end[i - 1], e):
+ val = values[j]
+ if notnan(val):
+ nobs += 1
+ err = skiplist_insert(sl, val) != 1
+ if err:
+ break
+
+ if nobs >= minp:
+ midpoint = <int>(nobs / 2)
+ if nobs % 2:
+ res = skiplist_get(sl, midpoint, &ret)
+ else:
+ res = (skiplist_get(sl, midpoint, &ret) +
+ skiplist_get(sl, (midpoint - 1), &ret)) / 2
+ else:
+ res = NaN
+
+ output[i] = res
+
+ skiplist_destroy(sl)
+ if err:
+ raise MemoryError("skiplist_insert failed")
+ return output
+
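+
+# Illustrative note: the skiplist keeps the window's non-NaN values in sorted
+# order, so the median is read off by rank: for an odd count the single
+# middle element is returned, for an even count the two middle ranks are
+# averaged.  E.g. a window holding [5, 1, 3, 7] (nobs == 4, midpoint == 2)
+# yields (skiplist_get(rank 2) + skiplist_get(rank 1)) / 2 == (5 + 3) / 2 == 4.
+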
+
+# ----------------------------------------------------------------------
+
+# Moving maximum / minimum code taken from Bottleneck under the terms
+# of its Simplified BSD license
+# https://github.com/kwgoodman/bottleneck
+
+
+cdef inline numeric init_mm(numeric ai, Py_ssize_t *nobs, bint is_max) nogil:
+
+ if numeric in cython.floating:
+ if ai == ai:
+ nobs[0] = nobs[0] + 1
+ elif is_max:
+ if numeric == cython.float:
+ ai = MINfloat32
+ else:
+ ai = MINfloat64
+ else:
+ if numeric == cython.float:
+ ai = MAXfloat32
+ else:
+ ai = MAXfloat64
+
+ else:
+ nobs[0] = nobs[0] + 1
+
+ return ai
+
+
+cdef inline void remove_mm(numeric aold, Py_ssize_t *nobs) nogil:
+ """ remove a value from the mm calc """
+ if numeric in cython.floating and aold == aold:
+ nobs[0] = nobs[0] - 1
+
+
+cdef inline numeric calc_mm(int64_t minp, Py_ssize_t nobs,
+ numeric value) nogil:
+ cdef:
+ numeric result
+
+ if numeric in cython.floating:
+ if nobs >= minp:
+ result = value
+ else:
+ result = NaN
+ else:
+ result = value
+
+ return result
+
+
+def roll_max(ndarray[numeric] values, int64_t win, int64_t minp,
+ object index, object closed):
+ """
+ Moving max of 1d array of any numeric type along axis=0 ignoring NaNs.
+
+ Parameters
+ ----------
+ values: numpy array
+ window: int, size of rolling window
+ minp: if number of observations in window
+ is below this, output a NaN
+ index: ndarray, optional
+ index for window computation
+ closed: 'right', 'left', 'both', 'neither'
+ make the interval closed on the right, left,
+ both or neither endpoints
+ """
+ return _roll_min_max(values, win, minp, index, closed=closed, is_max=1)
+
+
+def roll_min(ndarray[numeric] values, int64_t win, int64_t minp,
+ object index, object closed):
+ """
+    Moving min of 1d array of any numeric type along axis=0 ignoring NaNs.
+
+ Parameters
+ ----------
+ values: numpy array
+ window: int, size of rolling window
+ minp: if number of observations in window
+ is below this, output a NaN
+ index: ndarray, optional
+ index for window computation
+ """
+ return _roll_min_max(values, win, minp, index, is_max=0, closed=closed)
+
+
+cdef _roll_min_max(ndarray[numeric] values, int64_t win, int64_t minp,
+ object index, object closed, bint is_max):
+ """
+ Moving min/max of 1d array of any numeric type along axis=0
+ ignoring NaNs.
+ """
+ cdef:
+ ndarray[int64_t] starti, endi
+ int64_t N
+ bint is_variable
+
+ starti, endi, N, win, minp, is_variable = get_window_indexer(
+ values, win,
+ minp, index, closed)
+
+ if is_variable:
+ return _roll_min_max_variable(values, starti, endi, N, win, minp,
+ is_max)
+ else:
+ return _roll_min_max_fixed(values, starti, endi, N, win, minp, is_max)
+
+
+cdef _roll_min_max_variable(ndarray[numeric] values,
+ ndarray[int64_t] starti,
+ ndarray[int64_t] endi,
+ int64_t N,
+ int64_t win,
+ int64_t minp,
+ bint is_max):
+ cdef:
+ numeric ai
+ int64_t i, close_offset, curr_win_size
+ Py_ssize_t nobs = 0
+ deque Q[int64_t] # min/max always the front
+ deque W[int64_t] # track the whole window for nobs compute
+ ndarray[float64_t, ndim=1] output
+
+ output = np.empty(N, dtype=float)
+ Q = deque[int64_t]()
+ W = deque[int64_t]()
+
+ with nogil:
+
+ # This is using a modified version of the C++ code in this
+ # SO post: http://bit.ly/2nOoHlY
+ # The original impl didn't deal with variable window sizes
+ # So the code was optimized for that
+
+ for i from starti[0] <= i < endi[0]:
+ ai = init_mm(values[i], &nobs, is_max)
+
+ # Discard previous entries if we find new min or max
+ if is_max:
+ while not Q.empty() and ((ai >= values[Q.back()]) or
+ values[Q.back()] != values[Q.back()]):
+ Q.pop_back()
+ else:
+ while not Q.empty() and ((ai <= values[Q.back()]) or
+ values[Q.back()] != values[Q.back()]):
+ Q.pop_back()
+ Q.push_back(i)
+ W.push_back(i)
+
+ # if right is open then the first window is empty
+ close_offset = 0 if endi[0] > starti[0] else 1
+
+ for i in range(endi[0], endi[N-1]):
+ if not Q.empty():
+ output[i-1+close_offset] = calc_mm(
+ minp, nobs, values[Q.front()])
+ else:
+ output[i-1+close_offset] = NaN
+
+ ai = init_mm(values[i], &nobs, is_max)
+
+ # Discard previous entries if we find new min or max
+ if is_max:
+ while not Q.empty() and ((ai >= values[Q.back()]) or
+ values[Q.back()] != values[Q.back()]):
+ Q.pop_back()
+ else:
+ while not Q.empty() and ((ai <= values[Q.back()]) or
+ values[Q.back()] != values[Q.back()]):
+ Q.pop_back()
+
+ # Maintain window/nobs retention
+ curr_win_size = endi[i + close_offset] - starti[i + close_offset]
+ while not Q.empty() and Q.front() <= i - curr_win_size:
+ Q.pop_front()
+ while not W.empty() and W.front() <= i - curr_win_size:
+ remove_mm(values[W.front()], &nobs)
+ W.pop_front()
+
+ Q.push_back(i)
+ W.push_back(i)
+
+ if not Q.empty():
+ output[N-1] = calc_mm(minp, nobs, values[Q.front()])
+ else:
+ output[N-1] = NaN
+
+ return output
+
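+
+# Illustrative note: the variable-window path above is a monotonic-deque
+# algorithm: Q holds indices whose values are kept in decreasing (max) or
+# increasing (min) order, so the current extremum is always values[Q.front()].
+# A minimal pure-Python sketch of the same idea for a fixed window, assuming
+# no NaNs:
+#
+#     from collections import deque
+#
+#     def rolling_max(vals, win):
+#         q, out = deque(), []
+#         for i, v in enumerate(vals):
+#             while q and vals[q[-1]] <= v:
+#                 q.pop()                  # drop dominated candidates
+#             q.append(i)
+#             if q[0] <= i - win:
+#                 q.popleft()              # drop the index leaving the window
+#             out.append(vals[q[0]] if i >= win - 1 else float('nan'))
+#         return out
+#
+#     # rolling_max([1, 3, 2, 5, 4], 3) -> [nan, nan, 3, 5, 5]
+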
+
+cdef _roll_min_max_fixed(ndarray[numeric] values,
+ ndarray[int64_t] starti,
+ ndarray[int64_t] endi,
+ int64_t N,
+ int64_t win,
+ int64_t minp,
+ bint is_max):
+ cdef:
+ numeric ai
+ bint should_replace
+ int64_t i, removed, window_i,
+ Py_ssize_t nobs = 0
+ int64_t* death
+ numeric* ring
+ numeric* minvalue
+ numeric* end
+ numeric* last
+ ndarray[float64_t, ndim=1] output
+
+ output = np.empty(N, dtype=float)
+ # setup the rings of death!
+ ring = <numeric *>malloc(win * sizeof(numeric))
+ death = <int64_t *>malloc(win * sizeof(int64_t))
+
+ end = ring + win
+ last = ring
+ minvalue = ring
+ ai = values[0]
+ minvalue[0] = init_mm(values[0], &nobs, is_max)
+ death[0] = win
+ nobs = 0
+
+ with nogil:
+
+ for i in range(N):
+ ai = init_mm(values[i], &nobs, is_max)
+
+ if i >= win:
+ remove_mm(values[i - win], &nobs)
+
+ if death[minvalue - ring] == i:
+ minvalue = minvalue + 1
+ if minvalue >= end:
+ minvalue = ring
+
+ if is_max:
+ should_replace = ai >= minvalue[0]
+ else:
+ should_replace = ai <= minvalue[0]
+ if should_replace:
+
+ minvalue[0] = ai
+ death[minvalue - ring] = i + win
+ last = minvalue
+
+ else:
+
+ if is_max:
+ should_replace = last[0] <= ai
+ else:
+ should_replace = last[0] >= ai
+ while should_replace:
+ if last == ring:
+ last = end
+ last -= 1
+ if is_max:
+ should_replace = last[0] <= ai
+ else:
+ should_replace = last[0] >= ai
+
+ last += 1
+ if last == end:
+ last = ring
+ last[0] = ai
+ death[last - ring] = i + win
+
+ output[i] = calc_mm(minp, nobs, minvalue[0])
+
+ for i in range(minp - 1):
+ if numeric in cython.floating:
+ output[i] = NaN
+ else:
+ output[i] = 0
+
+ free(ring)
+ free(death)
+
+ return output
+
+
+cdef enum InterpolationType:
+ LINEAR,
+ LOWER,
+ HIGHER,
+ NEAREST,
+ MIDPOINT
+
+
+interpolation_types = {
+ 'linear': LINEAR,
+ 'lower': LOWER,
+ 'higher': HIGHER,
+ 'nearest': NEAREST,
+ 'midpoint': MIDPOINT,
+}
+
+
+def roll_quantile(ndarray[float64_t, cast=True] values, int64_t win,
+ int64_t minp, object index, object closed,
+ float64_t quantile, str interpolation):
+ """
+ O(N log(window)) implementation using skip list
+ """
+ cdef:
+ float64_t val, prev, midpoint, idx_with_fraction
+ skiplist_t *skiplist
+ int64_t nobs = 0, i, j, s, e, N
+ Py_ssize_t idx
+ bint is_variable
+ ndarray[int64_t] start, end
+ ndarray[float64_t] output
+ float64_t vlow, vhigh
+ InterpolationType interpolation_type
+ int ret = 0
+
+ if quantile <= 0.0 or quantile >= 1.0:
+ raise ValueError("quantile value {0} not in [0, 1]".format(quantile))
+
+ try:
+ interpolation_type = interpolation_types[interpolation]
+ except KeyError:
+ raise ValueError("Interpolation '{interp}' is not supported"
+ .format(interp=interpolation))
+
+ # we use the Fixed/Variable Indexer here as the
+ # actual skiplist ops outweigh any window computation costs
+ start, end, N, win, minp, is_variable = get_window_indexer(
+ values, win,
+ minp, index, closed,
+ use_mock=False)
+ output = np.empty(N, dtype=float)
+ skiplist = skiplist_init(<int>win)
+ if skiplist == NULL:
+ raise MemoryError("skiplist_init failed")
+
+ with nogil:
+ for i in range(0, N):
+ s = start[i]
+ e = end[i]
+
+ if i == 0:
+
+ # setup
+ val = values[i]
+ if notnan(val):
+ nobs += 1
+ skiplist_insert(skiplist, val)
+
+ else:
+
+ # calculate deletes
+ for j in range(start[i - 1], s):
+ val = values[j]
+ if notnan(val):
+ skiplist_remove(skiplist, val)
+ nobs -= 1
+
+ # calculate adds
+ for j in range(end[i - 1], e):
+ val = values[j]
+ if notnan(val):
+ nobs += 1
+ skiplist_insert(skiplist, val)
+
+ if nobs >= minp:
+ if nobs == 1:
+ # Single value in skip list
+ output[i] = skiplist_get(skiplist, 0, &ret)
+ else:
+ idx_with_fraction = quantile * (nobs - 1)
+ idx = <int>idx_with_fraction
+
+ if idx_with_fraction == idx:
+ # no need to interpolate
+ output[i] = skiplist_get(skiplist, idx, &ret)
+ continue
+
+ if interpolation_type == LINEAR:
+ vlow = skiplist_get(skiplist, idx, &ret)
+ vhigh = skiplist_get(skiplist, idx + 1, &ret)
+ output[i] = ((vlow + (vhigh - vlow) *
+ (idx_with_fraction - idx)))
+ elif interpolation_type == LOWER:
+ output[i] = skiplist_get(skiplist, idx, &ret)
+ elif interpolation_type == HIGHER:
+ output[i] = skiplist_get(skiplist, idx + 1, &ret)
+ elif interpolation_type == NEAREST:
+ # the same behaviour as round()
+ if idx_with_fraction - idx == 0.5:
+ if idx % 2 == 0:
+ output[i] = skiplist_get(skiplist, idx, &ret)
+ else:
+ output[i] = skiplist_get(
+ skiplist, idx + 1, &ret)
+ elif idx_with_fraction - idx < 0.5:
+ output[i] = skiplist_get(skiplist, idx, &ret)
+ else:
+ output[i] = skiplist_get(skiplist, idx + 1, &ret)
+ elif interpolation_type == MIDPOINT:
+ vlow = skiplist_get(skiplist, idx, &ret)
+ vhigh = skiplist_get(skiplist, idx + 1, &ret)
+ output[i] = <float64_t>(vlow + vhigh) / 2
+ else:
+ output[i] = NaN
+
+ skiplist_destroy(skiplist)
+
+ return output
+
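+
+# Illustrative note: for LINEAR interpolation the target rank is
+# idx_with_fraction = quantile * (nobs - 1) and the result interpolates
+# between the neighbouring order statistics.  E.g. with the sorted window
+# [1.0, 2.0, 4.0] and quantile=0.75, idx_with_fraction == 1.5, so LINEAR
+# gives 2.0 + 0.5 * (4.0 - 2.0) == 3.0, LOWER gives 2.0, HIGHER 4.0,
+# MIDPOINT 3.0 and NEAREST 4.0.
+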
+
+def roll_generic(object obj,
+ int64_t win, int64_t minp, object index, object closed,
+ int offset, object func, bint raw,
+ object args, object kwargs):
+ cdef:
+ ndarray[float64_t] output, counts, bufarr
+ ndarray[float64_t, cast=True] arr
+ float64_t *buf
+ float64_t *oldbuf
+ int64_t nobs = 0, i, j, s, e, N
+ bint is_variable
+ ndarray[int64_t] start, end
+
+ n = len(obj)
+ if n == 0:
+ return obj
+
+ arr = np.asarray(obj)
+
+ # ndarray input
+ if raw:
+ if not arr.flags.c_contiguous:
+ arr = arr.copy('C')
+
+ counts = roll_sum(np.concatenate([np.isfinite(arr).astype(float),
+ np.array([0.] * offset)]),
+ win, minp, index, closed)[offset:]
+
+ start, end, N, win, minp, is_variable = get_window_indexer(arr, win,
+ minp, index,
+ closed,
+ floor=0)
+
+ output = np.empty(N, dtype=float)
+
+ if is_variable:
+ # variable window arr or series
+
+ if offset != 0:
+ raise ValueError("unable to roll_generic with a non-zero offset")
+
+ for i in range(0, N):
+ s = start[i]
+ e = end[i]
+
+ if counts[i] >= minp:
+ if raw:
+ output[i] = func(arr[s:e], *args, **kwargs)
+ else:
+ output[i] = func(obj.iloc[s:e], *args, **kwargs)
+ else:
+ output[i] = NaN
+
+ elif not raw:
+ # series
+ for i in range(N):
+ if counts[i] >= minp:
+ sl = slice(int_max(i + offset - win + 1, 0),
+ int_min(i + offset + 1, N))
+ output[i] = func(obj.iloc[sl], *args, **kwargs)
+ else:
+ output[i] = NaN
+
+ else:
+
+ # truncated windows at the beginning, through first full-length window
+ for i from 0 <= i < (int_min(win, N) - offset):
+ if counts[i] >= minp:
+ output[i] = func(arr[0: (i + offset + 1)], *args, **kwargs)
+ else:
+ output[i] = NaN
+
+ # remaining full-length windows
+ buf = <float64_t *>arr.data
+ bufarr = np.empty(win, dtype=float)
+ oldbuf = <float64_t *>bufarr.data
+ for i from (win - offset) <= i < (N - offset):
+ buf = buf + 1
+ bufarr.data = <char *>buf
+ if counts[i] >= minp:
+ output[i] = func(bufarr, *args, **kwargs)
+ else:
+ output[i] = NaN
+ bufarr.data = <char *>oldbuf
+
+ # truncated windows at the end
+ for i from int_max(N - offset, 0) <= i < N:
+ if counts[i] >= minp:
+ output[i] = func(arr[int_max(i + offset - win + 1, 0): N],
+ *args,
+ **kwargs)
+ else:
+ output[i] = NaN
+
+ return output
+
+
+def roll_window(ndarray[float64_t, ndim=1, cast=True] values,
+ ndarray[float64_t, ndim=1, cast=True] weights,
+ int minp, bint avg=True):
+ """
+ Assume len(weights) << len(values)
+ """
+ cdef:
+ ndarray[float64_t] output, tot_wgt, counts
+ Py_ssize_t in_i, win_i, win_n, win_k, in_n, in_k
+ float64_t val_in, val_win, c, w
+
+ in_n = len(values)
+ win_n = len(weights)
+ output = np.zeros(in_n, dtype=float)
+ counts = np.zeros(in_n, dtype=float)
+ if avg:
+ tot_wgt = np.zeros(in_n, dtype=float)
+
+ minp = _check_minp(len(weights), minp, in_n)
+
+ if avg:
+ for win_i in range(win_n):
+ val_win = weights[win_i]
+ if val_win != val_win:
+ continue
+
+ for in_i from 0 <= in_i < in_n - (win_n - win_i) + 1:
+ val_in = values[in_i]
+ if val_in == val_in:
+ output[in_i + (win_n - win_i) - 1] += val_in * val_win
+ counts[in_i + (win_n - win_i) - 1] += 1
+ tot_wgt[in_i + (win_n - win_i) - 1] += val_win
+
+ for in_i in range(in_n):
+ c = counts[in_i]
+ if c < minp:
+ output[in_i] = NaN
+ else:
+ w = tot_wgt[in_i]
+ if w == 0:
+ output[in_i] = NaN
+ else:
+ output[in_i] /= tot_wgt[in_i]
+
+ else:
+ for win_i in range(win_n):
+ val_win = weights[win_i]
+ if val_win != val_win:
+ continue
+
+ for in_i from 0 <= in_i < in_n - (win_n - win_i) + 1:
+ val_in = values[in_i]
+
+ if val_in == val_in:
+ output[in_i + (win_n - win_i) - 1] += val_in * val_win
+ counts[in_i + (win_n - win_i) - 1] += 1
+
+ for in_i in range(in_n):
+ c = counts[in_i]
+ if c < minp:
+ output[in_i] = NaN
+
+ return output
+
+# ----------------------------------------------------------------------
+# Exponentially weighted moving average
+
+
+def ewma(float64_t[:] vals, float64_t com,
+ int adjust, int ignore_na, int minp):
+ """
+ Compute exponentially-weighted moving average using center-of-mass.
+
+ Parameters
+ ----------
+ vals : ndarray (float64 type)
+ com : float64
+ adjust: int
+ ignore_na: int
+ minp: int
+
+ Returns
+ -------
+ y : ndarray
+ """
+
+ cdef:
+ Py_ssize_t N = len(vals)
+ ndarray[float64_t] output = np.empty(N, dtype=float)
+ float64_t alpha, old_wt_factor, new_wt, weighted_avg, old_wt, cur
+ Py_ssize_t i, nobs
+
+ if N == 0:
+ return output
+
+ minp = max(minp, 1)
+
+ alpha = 1. / (1. + com)
+ old_wt_factor = 1. - alpha
+ new_wt = 1. if adjust else alpha
+
+ weighted_avg = vals[0]
+ is_observation = (weighted_avg == weighted_avg)
+ nobs = int(is_observation)
+ output[0] = weighted_avg if (nobs >= minp) else NaN
+ old_wt = 1.
+
+ for i in range(1, N):
+ cur = vals[i]
+ is_observation = (cur == cur)
+ nobs += int(is_observation)
+ if weighted_avg == weighted_avg:
+
+ if is_observation or (not ignore_na):
+
+ old_wt *= old_wt_factor
+ if is_observation:
+
+ # avoid numerical errors on constant series
+ if weighted_avg != cur:
+ weighted_avg = ((old_wt * weighted_avg) +
+ (new_wt * cur)) / (old_wt + new_wt)
+ if adjust:
+ old_wt += new_wt
+ else:
+ old_wt = 1.
+ elif is_observation:
+ weighted_avg = cur
+
+ output[i] = weighted_avg if (nobs >= minp) else NaN
+
+ return output
+
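+
+# Illustrative note: with center-of-mass com the decay is
+# alpha = 1 / (1 + com).  For adjust=True the value at step t is the
+# weighted average
+#
+#     y_t = (x_t + (1-alpha)*x_{t-1} + ... + (1-alpha)**t * x_0)
+#           / (1 + (1-alpha) + ... + (1-alpha)**t)
+#
+# which the loop reproduces by growing old_wt by new_wt each step; for
+# adjust=False it reduces to the recursion y_t = (1-alpha)*y_{t-1} + alpha*x_t.
+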
+
+# ----------------------------------------------------------------------
+# Exponentially weighted moving covariance
+
+
+def ewmcov(float64_t[:] input_x, float64_t[:] input_y,
+ float64_t com, int adjust, int ignore_na, int minp, int bias):
+ """
+    Compute exponentially-weighted moving covariance using center-of-mass.
+
+ Parameters
+ ----------
+ input_x : ndarray (float64 type)
+ input_y : ndarray (float64 type)
+ com : float64
+ adjust: int
+ ignore_na: int
+ minp: int
+ bias: int
+
+ Returns
+ -------
+ y : ndarray
+ """
+
+ cdef:
+ Py_ssize_t N = len(input_x)
+ float64_t alpha, old_wt_factor, new_wt, mean_x, mean_y, cov
+ float64_t sum_wt, sum_wt2, old_wt, cur_x, cur_y, old_mean_x, old_mean_y
+ Py_ssize_t i, nobs
+ ndarray[float64_t] output
+
+ if len(input_y) != N:
+ raise ValueError("arrays are of different lengths "
+ "({N} and {len_y})".format(N=N, len_y=len(input_y)))
+
+ output = np.empty(N, dtype=float)
+ if N == 0:
+ return output
+
+ minp = max(minp, 1)
+
+ alpha = 1. / (1. + com)
+ old_wt_factor = 1. - alpha
+ new_wt = 1. if adjust else alpha
+
+ mean_x = input_x[0]
+ mean_y = input_y[0]
+ is_observation = ((mean_x == mean_x) and (mean_y == mean_y))
+ nobs = int(is_observation)
+ if not is_observation:
+ mean_x = NaN
+ mean_y = NaN
+ output[0] = (0. if bias else NaN) if (nobs >= minp) else NaN
+ cov = 0.
+ sum_wt = 1.
+ sum_wt2 = 1.
+ old_wt = 1.
+
+ for i in range(1, N):
+ cur_x = input_x[i]
+ cur_y = input_y[i]
+ is_observation = ((cur_x == cur_x) and (cur_y == cur_y))
+ nobs += int(is_observation)
+ if mean_x == mean_x:
+ if is_observation or (not ignore_na):
+ sum_wt *= old_wt_factor
+ sum_wt2 *= (old_wt_factor * old_wt_factor)
+ old_wt *= old_wt_factor
+ if is_observation:
+ old_mean_x = mean_x
+ old_mean_y = mean_y
+
+ # avoid numerical errors on constant series
+ if mean_x != cur_x:
+ mean_x = ((old_wt * old_mean_x) +
+ (new_wt * cur_x)) / (old_wt + new_wt)
+
+ # avoid numerical errors on constant series
+ if mean_y != cur_y:
+ mean_y = ((old_wt * old_mean_y) +
+ (new_wt * cur_y)) / (old_wt + new_wt)
+ cov = ((old_wt * (cov + ((old_mean_x - mean_x) *
+ (old_mean_y - mean_y)))) +
+ (new_wt * ((cur_x - mean_x) *
+ (cur_y - mean_y)))) / (old_wt + new_wt)
+ sum_wt += new_wt
+ sum_wt2 += (new_wt * new_wt)
+ old_wt += new_wt
+ if not adjust:
+ sum_wt /= old_wt
+ sum_wt2 /= (old_wt * old_wt)
+ old_wt = 1.
+ elif is_observation:
+ mean_x = cur_x
+ mean_y = cur_y
+
+ if nobs >= minp:
+ if not bias:
+ numerator = sum_wt * sum_wt
+ denominator = numerator - sum_wt2
+ if (denominator > 0.):
+ output[i] = ((numerator / denominator) * cov)
+ else:
+ output[i] = NaN
+ else:
+ output[i] = cov
+ else:
+ output[i] = NaN
+
+ return output
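+
+
+# Illustrative note: the bias correction applied above is the standard one
+# for reliability weights: the unbiased covariance is the biased estimate
+# multiplied by V1**2 / (V1**2 - V2), where V1 = sum_wt is the sum of weights
+# and V2 = sum_wt2 the sum of squared weights (both renormalised by old_wt
+# each step when adjust=False).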
diff --git a/contrib/python/pandas/py2/pandas/_libs/writers.pyx b/contrib/python/pandas/py2/pandas/_libs/writers.pyx
new file mode 100644
index 00000000000..6449a331689
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/_libs/writers.pyx
@@ -0,0 +1,167 @@
+# -*- coding: utf-8 -*-
+
+import cython
+from cython import Py_ssize_t
+
+from cpython cimport PyBytes_GET_SIZE, PyUnicode_GET_SIZE
+
+try:
+ from cpython cimport PyString_GET_SIZE
+except ImportError:
+ from cpython cimport PyUnicode_GET_SIZE as PyString_GET_SIZE
+
+import numpy as np
+from numpy cimport ndarray, uint8_t
+
+
+ctypedef fused pandas_string:
+ str
+ unicode
+ bytes
+
+
+def write_csv_rows(list data, ndarray data_index,
+ Py_ssize_t nlevels, ndarray cols, object writer):
+ """
+ Write the given data to the writer object, pre-allocating where possible
+ for performance improvements.
+
+ Parameters
+ ----------
+ data : list
+ data_index : ndarray
+ nlevels : int
+ cols : ndarray
+ writer : object
+ """
+ # In crude testing, N>100 yields little marginal improvement
+ cdef:
+ Py_ssize_t i, j, k = len(data_index), N = 100, ncols = len(cols)
+ list rows
+
+ # pre-allocate rows
+ rows = [[None] * (nlevels + ncols) for _ in range(N)]
+
+ if nlevels == 1:
+ for j in range(k):
+ row = rows[j % N]
+ row[0] = data_index[j]
+ for i in range(ncols):
+ row[1 + i] = data[i][j]
+
+ if j >= N - 1 and j % N == N - 1:
+ writer.writerows(rows)
+ elif nlevels > 1:
+ for j in range(k):
+ row = rows[j % N]
+ row[:nlevels] = list(data_index[j])
+ for i in range(ncols):
+ row[nlevels + i] = data[i][j]
+
+ if j >= N - 1 and j % N == N - 1:
+ writer.writerows(rows)
+ else:
+ for j in range(k):
+ row = rows[j % N]
+ for i in range(ncols):
+ row[i] = data[i][j]
+
+ if j >= N - 1 and j % N == N - 1:
+ writer.writerows(rows)
+
+ if j >= 0 and (j < N - 1 or (j % N) != N - 1):
+ writer.writerows(rows[:((j + 1) % N)])
+
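+
+# Illustrative note: rows are recycled in chunks of N == 100 and flushed with
+# writer.writerows whenever a chunk fills up; any remaining partial chunk is
+# flushed after the loops.  A sketch of a call, assuming any object exposing
+# a csv.writer-like ``writerows`` and numpy imported as np:
+#
+#     class ListWriter(object):
+#         def __init__(self):
+#             self.rows = []
+#         def writerows(self, rows):
+#             self.rows.extend(list(r) for r in rows)
+#
+#     w = ListWriter()
+#     write_csv_rows([np.array([1, 2]), np.array([3, 4])],
+#                    np.array(['r0', 'r1']), 1, np.array(['a', 'b']), w)
+#     # w.rows -> [['r0', 1, 3], ['r1', 2, 4]]
+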
+
+def convert_json_to_lines(object arr):
+ """
+ replace comma separated json with line feeds, paying special attention
+ to quotes & brackets
+ """
+ cdef:
+ Py_ssize_t i = 0, num_open_brackets_seen = 0, length
+ bint in_quotes = 0, is_escaping = 0
+ ndarray[uint8_t, ndim=1] narr
+ unsigned char val, newline, comma, left_bracket, right_bracket, quote
+ unsigned char backslash
+
+ newline = ord('\n')
+ comma = ord(',')
+ left_bracket = ord('{')
+ right_bracket = ord('}')
+ quote = ord('"')
+ backslash = ord('\\')
+
+ narr = np.frombuffer(arr.encode('utf-8'), dtype='u1').copy()
+ length = narr.shape[0]
+ for i in range(length):
+ val = narr[i]
+ if val == quote and i > 0 and not is_escaping:
+ in_quotes = ~in_quotes
+ if val == backslash or is_escaping:
+ is_escaping = ~is_escaping
+ if val == comma: # commas that should be \n
+ if num_open_brackets_seen == 0 and not in_quotes:
+ narr[i] = newline
+ elif val == left_bracket:
+ if not in_quotes:
+ num_open_brackets_seen += 1
+ elif val == right_bracket:
+ if not in_quotes:
+ num_open_brackets_seen -= 1
+
+ return narr.tostring().decode('utf-8')
+
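+
+# Illustrative note: only commas at bracket depth zero and outside quoted
+# strings are rewritten, e.g.
+#
+#     convert_json_to_lines('{"a": 1, "b": "x,y"},{"a": 2}')
+#     # -> '{"a": 1, "b": "x,y"}\n{"a": 2}'
+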
+
+# stata, pytables
+def max_len_string_array(pandas_string[:] arr) -> Py_ssize_t:
+ """ return the maximum size of elements in a 1-dim string array """
+ cdef:
+ Py_ssize_t i, m = 0, l = 0, length = arr.shape[0]
+ pandas_string val
+
+ for i in range(length):
+ val = arr[i]
+ if isinstance(val, str):
+ l = PyString_GET_SIZE(val)
+ elif isinstance(val, bytes):
+ l = PyBytes_GET_SIZE(val)
+ elif isinstance(val, unicode):
+ l = PyUnicode_GET_SIZE(val)
+
+ if l > m:
+ m = l
+
+ return m
+
+
+# ------------------------------------------------------------------
+# PyTables Helpers
+
+
+def string_array_replace_from_nan_rep(
+ ndarray[object, ndim=1] arr, object nan_rep,
+ object replace=None):
+ """
+    Replace the values in the array with 'replace' if
+ they are 'nan_rep'. Return the same array.
+ """
+ cdef:
+ Py_ssize_t length = len(arr), i = 0
+
+ if replace is None:
+ replace = np.nan
+
+ for i in range(length):
+ if arr[i] == nan_rep:
+ arr[i] = replace
+
+ return arr
diff --git a/contrib/python/pandas/py2/pandas/_version.py b/contrib/python/pandas/py2/pandas/_version.py
new file mode 100644
index 00000000000..843359bd56e
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/_version.py
@@ -0,0 +1,23 @@
+
+# This file was generated by 'versioneer.py' (0.15) from
+# revision-control system data, or from the parent directory name of an
+# unpacked source archive. Distribution tarballs contain a pre-generated copy
+# of this file.
+
+from warnings import catch_warnings
+with catch_warnings(record=True):
+ import json
+import sys
+
+version_json = '''
+{
+ "dirty": false,
+ "error": null,
+ "full-revisionid": "cb00deb94500205fcb27a33cc1d0df79a9727f8b",
+ "version": "0.24.2"
+}
+''' # END VERSION_JSON
+
+
+def get_versions():
+ return json.loads(version_json)
diff --git a/contrib/python/pandas/py2/pandas/api/__init__.py b/contrib/python/pandas/py2/pandas/api/__init__.py
new file mode 100644
index 00000000000..afff059e7b6
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/api/__init__.py
@@ -0,0 +1,2 @@
+""" public toolkit API """
+from . import types, extensions # noqa
diff --git a/contrib/python/pandas/py2/pandas/api/extensions/__init__.py b/contrib/python/pandas/py2/pandas/api/extensions/__init__.py
new file mode 100644
index 00000000000..cb6241016d8
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/api/extensions/__init__.py
@@ -0,0 +1,10 @@
+"""Public API for extending pandas objects."""
+from pandas.core.accessor import (register_dataframe_accessor, # noqa
+ register_index_accessor,
+ register_series_accessor)
+from pandas.core.algorithms import take # noqa
+from pandas.core.arrays import (ExtensionArray, # noqa
+ ExtensionScalarOpsMixin)
+from pandas.core.dtypes.dtypes import ( # noqa
+ ExtensionDtype, register_extension_dtype
+)
diff --git a/contrib/python/pandas/py2/pandas/api/types/__init__.py b/contrib/python/pandas/py2/pandas/api/types/__init__.py
new file mode 100644
index 00000000000..438e4afa3f5
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/api/types/__init__.py
@@ -0,0 +1,9 @@
+""" public toolkit API """
+
+from pandas.core.dtypes.api import * # noqa
+from pandas.core.dtypes.dtypes import (CategoricalDtype, # noqa
+ DatetimeTZDtype,
+ PeriodDtype,
+ IntervalDtype)
+from pandas.core.dtypes.concat import union_categoricals # noqa
+from pandas._libs.lib import infer_dtype # noqa
diff --git a/contrib/python/pandas/py2/pandas/arrays/__init__.py b/contrib/python/pandas/py2/pandas/arrays/__init__.py
new file mode 100644
index 00000000000..7d9b1b7c7a6
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/arrays/__init__.py
@@ -0,0 +1,23 @@
+"""
+All of pandas' ExtensionArrays.
+
+See :ref:`extending.extension-types` for more.
+"""
+from pandas.core.arrays import (
+ IntervalArray, PeriodArray, Categorical, SparseArray, IntegerArray,
+ PandasArray,
+ DatetimeArray,
+ TimedeltaArray,
+)
+
+
+__all__ = [
+ 'Categorical',
+ 'DatetimeArray',
+ 'IntegerArray',
+ 'IntervalArray',
+ 'PandasArray',
+ 'PeriodArray',
+ 'SparseArray',
+ 'TimedeltaArray',
+]
diff --git a/contrib/python/pandas/py2/pandas/compat/__init__.py b/contrib/python/pandas/py2/pandas/compat/__init__.py
new file mode 100644
index 00000000000..f9c659106a5
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/compat/__init__.py
@@ -0,0 +1,470 @@
+"""
+compat
+======
+
+Cross-compatible functions for Python 2 and 3.
+
+Key items to import for 2/3 compatible code:
+* iterators: range(), map(), zip(), filter(), reduce()
+* lists: lrange(), lmap(), lzip(), lfilter()
+* unicode: u() [no unicode builtin in Python 3]
+* longs: long (int in Python 3)
+* callable
+* iterable method compatibility: iteritems, iterkeys, itervalues
+ * Uses the original method if available, otherwise uses items, keys, values.
+* types:
+ * text_type: unicode in Python 2, str in Python 3
+ * binary_type: str in Python 2, bytes in Python 3
+ * string_types: basestring in Python 2, str in Python 3
+* bind_method: binds functions to classes
+* add_metaclass(metaclass) - class decorator that recreates class with the
+ given metaclass instead (and avoids intermediary class creation)
+
+Other items:
+* platform checker
+"""
+# pylint disable=W0611
+# flake8: noqa
+
+import re
+import functools
+import itertools
+from distutils.version import LooseVersion
+from itertools import product
+import sys
+import platform
+import types
+from unicodedata import east_asian_width
+import struct
+import inspect
+from collections import namedtuple
+import collections
+
+PY2 = sys.version_info[0] == 2
+PY3 = sys.version_info[0] >= 3
+PY35 = sys.version_info >= (3, 5)
+PY36 = sys.version_info >= (3, 6)
+PY37 = sys.version_info >= (3, 7)
+PYPY = platform.python_implementation() == 'PyPy'
+
+try:
+ import __builtin__ as builtins
+ # not writeable when instantiated with string, doesn't handle unicode well
+ from cStringIO import StringIO as cStringIO
+ # always writeable
+ from StringIO import StringIO
+ BytesIO = StringIO
+ import cPickle
+ import httplib
+except ImportError:
+ import builtins
+ from io import StringIO, BytesIO
+ cStringIO = StringIO
+ import pickle as cPickle
+ import http.client as httplib
+
+from pandas.compat.chainmap import DeepChainMap
+
+
+if PY3:
+ def isidentifier(s):
+ return s.isidentifier()
+
+ def str_to_bytes(s, encoding=None):
+ return s.encode(encoding or 'ascii')
+
+ def bytes_to_str(b, encoding=None):
+ return b.decode(encoding or 'utf-8')
+
+ # The signature version below is directly copied from Django,
+ # https://github.com/django/django/pull/4846
+ def signature(f):
+ sig = inspect.signature(f)
+ args = [
+ p.name for p in sig.parameters.values()
+ if p.kind == inspect.Parameter.POSITIONAL_OR_KEYWORD
+ ]
+ varargs = [
+ p.name for p in sig.parameters.values()
+ if p.kind == inspect.Parameter.VAR_POSITIONAL
+ ]
+ varargs = varargs[0] if varargs else None
+ keywords = [
+ p.name for p in sig.parameters.values()
+ if p.kind == inspect.Parameter.VAR_KEYWORD
+ ]
+ keywords = keywords[0] if keywords else None
+ defaults = [
+ p.default for p in sig.parameters.values()
+ if p.kind == inspect.Parameter.POSITIONAL_OR_KEYWORD
+ and p.default is not p.empty
+ ] or None
+ argspec = namedtuple('Signature', ['args', 'defaults',
+ 'varargs', 'keywords'])
+ return argspec(args, defaults, varargs, keywords)
+
+ def get_range_parameters(data):
+ """Gets the start, stop, and step parameters from a range object"""
+ return data.start, data.stop, data.step
+
+ # have to explicitly put builtins into the namespace
+ range = range
+ map = map
+ zip = zip
+ filter = filter
+ intern = sys.intern
+ reduce = functools.reduce
+ long = int
+ unichr = chr
+
+ # This was introduced in Python 3.3, but we don't support
+ # Python 3.x < 3.5, so checking PY3 is safe.
+ FileNotFoundError = FileNotFoundError
+
+ # list-producing versions of the major Python iterating functions
+ def lrange(*args, **kwargs):
+ return list(range(*args, **kwargs))
+
+ def lzip(*args, **kwargs):
+ return list(zip(*args, **kwargs))
+
+ def lmap(*args, **kwargs):
+ return list(map(*args, **kwargs))
+
+ def lfilter(*args, **kwargs):
+ return list(filter(*args, **kwargs))
+
+ from importlib import reload
+ reload = reload
+ Hashable = collections.abc.Hashable
+ Iterable = collections.abc.Iterable
+ Mapping = collections.abc.Mapping
+ MutableMapping = collections.abc.MutableMapping
+ Sequence = collections.abc.Sequence
+ Sized = collections.abc.Sized
+ Set = collections.abc.Set
+
+else:
+ # Python 2
+ _name_re = re.compile(r"[a-zA-Z_][a-zA-Z0-9_]*$")
+
+ FileNotFoundError = IOError
+
+ def isidentifier(s, dotted=False):
+ return bool(_name_re.match(s))
+
+ def str_to_bytes(s, encoding='ascii'):
+ return s
+
+ def bytes_to_str(b, encoding='ascii'):
+ return b
+
+ def signature(f):
+ return inspect.getargspec(f)
+
+ def get_range_parameters(data):
+ """Gets the start, stop, and step parameters from a range object"""
+ # seems we only have indexing ops to infer
+ # rather than direct accessors
+ if len(data) > 1:
+ step = data[1] - data[0]
+ stop = data[-1] + step
+ start = data[0]
+ elif len(data):
+ start = data[0]
+ stop = data[0] + 1
+ step = 1
+ else:
+ start = stop = 0
+ step = 1
+
+ return start, stop, step
+
+ # import iterator versions of these functions
+ range = xrange
+ intern = intern
+ zip = itertools.izip
+ filter = itertools.ifilter
+ map = itertools.imap
+ reduce = reduce
+ long = long
+ unichr = unichr
+
+ # Python 2-builtin ranges produce lists
+ lrange = builtins.range
+ lzip = builtins.zip
+ lmap = builtins.map
+ lfilter = builtins.filter
+
+ reload = builtins.reload
+
+ Hashable = collections.Hashable
+ Iterable = collections.Iterable
+ Mapping = collections.Mapping
+ MutableMapping = collections.MutableMapping
+ Sequence = collections.Sequence
+ Sized = collections.Sized
+ Set = collections.Set
+
+if PY2:
+ def iteritems(obj, **kw):
+ return obj.iteritems(**kw)
+
+ def iterkeys(obj, **kw):
+ return obj.iterkeys(**kw)
+
+ def itervalues(obj, **kw):
+ return obj.itervalues(**kw)
+
+ next = lambda it: it.next()
+else:
+ def iteritems(obj, **kw):
+ return iter(obj.items(**kw))
+
+ def iterkeys(obj, **kw):
+ return iter(obj.keys(**kw))
+
+ def itervalues(obj, **kw):
+ return iter(obj.values(**kw))
+
+ next = next
+
+
+def bind_method(cls, name, func):
+ """Bind a method to class, python 2 and python 3 compatible.
+
+ Parameters
+ ----------
+
+ cls : type
+ class to receive bound method
+ name : basestring
+ name of method on class instance
+ func : function
+ function to be bound as method
+
+
+ Returns
+ -------
+ None
+ """
+ # only python 2 has bound/unbound method issue
+ if not PY3:
+ setattr(cls, name, types.MethodType(func, None, cls))
+ else:
+ setattr(cls, name, func)
+# ----------------------------------------------------------------------------
+# functions largely based / taken from the six module
+
+# Much of the code in this module comes from Benjamin Peterson's six library.
+# The license for this library can be found in LICENSES/SIX and the code can be
+# found at https://bitbucket.org/gutworth/six
+
+# Definition of East Asian Width
+# http://unicode.org/reports/tr11/
+# Ambiguous width can be changed by option
+_EAW_MAP = {'Na': 1, 'N': 1, 'W': 2, 'F': 2, 'H': 1}
+
+if PY3:
+ string_types = str,
+ integer_types = int,
+ class_types = type,
+ text_type = str
+ binary_type = bytes
+
+ def u(s):
+ return s
+
+ def u_safe(s):
+ return s
+
+ def to_str(s):
+ """
+ Convert bytes and non-string into Python 3 str
+ """
+ if isinstance(s, binary_type):
+ s = bytes_to_str(s)
+ elif not isinstance(s, string_types):
+ s = str(s)
+ return s
+
+ def strlen(data, encoding=None):
+ # encoding is for compat with PY2
+ return len(data)
+
+ def east_asian_len(data, encoding=None, ambiguous_width=1):
+ """
+ Calculate display width considering unicode East Asian Width
+ """
+ if isinstance(data, text_type):
+ return sum(_EAW_MAP.get(east_asian_width(c), ambiguous_width) for c in data)
+ else:
+ return len(data)
+
+ def import_lzma():
+ """ import lzma from the std library """
+ import lzma
+ return lzma
+
+ def set_function_name(f, name, cls):
+ """ Bind the name/qualname attributes of the function """
+ f.__name__ = name
+ f.__qualname__ = '{klass}.{name}'.format(
+ klass=cls.__name__,
+ name=name)
+ f.__module__ = cls.__module__
+ return f
+
+ ResourceWarning = ResourceWarning
+
+else:
+ string_types = basestring,
+ integer_types = (int, long)
+ class_types = (type, types.ClassType)
+ text_type = unicode
+ binary_type = str
+
+ def u(s):
+ return unicode(s, "unicode_escape")
+
+ def u_safe(s):
+ try:
+ return unicode(s, "unicode_escape")
+ except:
+ return s
+
+ def to_str(s):
+ """
+ Convert unicode and non-string into Python 2 str
+ """
+ if not isinstance(s, string_types):
+ s = str(s)
+ return s
+
+ def strlen(data, encoding=None):
+ try:
+ data = data.decode(encoding)
+ except UnicodeError:
+ pass
+ return len(data)
+
+ def east_asian_len(data, encoding=None, ambiguous_width=1):
+ """
+ Calculate display width considering unicode East Asian Width
+ """
+ if isinstance(data, text_type):
+ try:
+ data = data.decode(encoding)
+ except UnicodeError:
+ pass
+ return sum(_EAW_MAP.get(east_asian_width(c), ambiguous_width) for c in data)
+ else:
+ return len(data)
+
+ def import_lzma():
+ """ import the backported lzma library
+ or raise ImportError if not available """
+ from backports import lzma
+ return lzma
+
+ def set_function_name(f, name, cls):
+ """ Bind the name attributes of the function """
+ f.__name__ = name
+ return f
+
+ class ResourceWarning(Warning):
+ pass
+
+string_and_binary_types = string_types + (binary_type,)
+
+
+try:
+ # callable reintroduced in later versions of Python
+ callable = callable
+except NameError:
+ def callable(obj):
+ return any("__call__" in klass.__dict__ for klass in type(obj).__mro__)
+
+
+if PY2:
+ # In PY2 functools.wraps doesn't provide metadata pytest needs to generate
+ # decorated tests using parametrization. See pytest GH issue #2782
+ def wraps(wrapped, assigned=functools.WRAPPER_ASSIGNMENTS,
+ updated=functools.WRAPPER_UPDATES):
+ def wrapper(f):
+ f = functools.wraps(wrapped, assigned, updated)(f)
+ f.__wrapped__ = wrapped
+ return f
+ return wrapper
+else:
+ wraps = functools.wraps
+
+
+def add_metaclass(metaclass):
+ """Class decorator for creating a class with a metaclass."""
+ def wrapper(cls):
+ orig_vars = cls.__dict__.copy()
+ orig_vars.pop('__dict__', None)
+ orig_vars.pop('__weakref__', None)
+ for slots_var in orig_vars.get('__slots__', ()):
+ orig_vars.pop(slots_var)
+ return metaclass(cls.__name__, cls.__bases__, orig_vars)
+ return wrapper
+
+from collections import OrderedDict, Counter
+
+if PY3:
+ def raise_with_traceback(exc, traceback=Ellipsis):
+ if traceback == Ellipsis:
+ _, _, traceback = sys.exc_info()
+ raise exc.with_traceback(traceback)
+else:
+ # this version of raise is a syntax error in Python 3
+ exec("""
+def raise_with_traceback(exc, traceback=Ellipsis):
+ if traceback == Ellipsis:
+ _, _, traceback = sys.exc_info()
+ raise exc, None, traceback
+""")
+
+raise_with_traceback.__doc__ = """Raise exception with existing traceback.
+If traceback is not passed, uses sys.exc_info() to get traceback."""
+
+
+# dateutil minimum version
+import dateutil
+
+if LooseVersion(dateutil.__version__) < LooseVersion('2.5'):
+ raise ImportError('dateutil 2.5.0 is the minimum required version')
+from dateutil import parser as _date_parser
+parse_date = _date_parser.parse
+
+
+# In Python 3.7, the private re._pattern_type is removed.
+# Python 3.5+ has typing.re.Pattern
+if PY36:
+ import typing
+ re_type = typing.re.Pattern
+else:
+ re_type = type(re.compile(''))
+
+# https://github.com/pandas-dev/pandas/pull/9123
+def is_platform_little_endian():
+ """ am I little endian """
+ return sys.byteorder == 'little'
+
+
+def is_platform_windows():
+ return sys.platform == 'win32' or sys.platform == 'cygwin'
+
+
+def is_platform_linux():
+ return sys.platform == 'linux2'
+
+
+def is_platform_mac():
+ return sys.platform == 'darwin'
+
+
+def is_platform_32bit():
+ return struct.calcsize("P") * 8 < 64
diff --git a/contrib/python/pandas/py2/pandas/compat/chainmap.py b/contrib/python/pandas/py2/pandas/compat/chainmap.py
new file mode 100644
index 00000000000..cf1cad56945
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/compat/chainmap.py
@@ -0,0 +1,27 @@
+try:
+ from collections import ChainMap
+except ImportError:
+ from pandas.compat.chainmap_impl import ChainMap
+
+
+class DeepChainMap(ChainMap):
+
+ def __setitem__(self, key, value):
+ for mapping in self.maps:
+ if key in mapping:
+ mapping[key] = value
+ return
+ self.maps[0][key] = value
+
+ def __delitem__(self, key):
+ for mapping in self.maps:
+ if key in mapping:
+ del mapping[key]
+ return
+ raise KeyError(key)
+
+ # override because the m parameter is introduced in Python 3.4
+ def new_child(self, m=None):
+ if m is None:
+ m = {}
+ return self.__class__(m, *self.maps)
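+
+
+# Illustrative note: unlike a plain ChainMap, assignment and deletion act on
+# whichever underlying mapping already holds the key, e.g.
+#
+#     d = DeepChainMap({'a': 1}, {'b': 2})
+#     d['b'] = 3     # updates the second map rather than the first
+#     d.maps         # [{'a': 1}, {'b': 3}]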
diff --git a/contrib/python/pandas/py2/pandas/compat/chainmap_impl.py b/contrib/python/pandas/py2/pandas/compat/chainmap_impl.py
new file mode 100644
index 00000000000..3ea5414cc41
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/compat/chainmap_impl.py
@@ -0,0 +1,157 @@
+import sys
+
+PY3 = sys.version_info[0] >= 3
+
+if PY3:
+ from collections.abc import MutableMapping
+else:
+ from collections import MutableMapping
+
+try:
+ from thread import get_ident
+except ImportError:
+ from _thread import get_ident
+
+
+def recursive_repr(fillvalue='...'):
+ 'Decorator to make a repr function return fillvalue for a recursive call'
+
+ def decorating_function(user_function):
+ repr_running = set()
+
+ def wrapper(self):
+ key = id(self), get_ident()
+ if key in repr_running:
+ return fillvalue
+ repr_running.add(key)
+ try:
+ result = user_function(self)
+ finally:
+ repr_running.discard(key)
+ return result
+
+ # Can't use functools.wraps() here because of bootstrap issues
+ wrapper.__module__ = getattr(user_function, '__module__')
+ wrapper.__doc__ = getattr(user_function, '__doc__')
+ wrapper.__name__ = getattr(user_function, '__name__')
+ return wrapper
+
+ return decorating_function
+
+
+class ChainMap(MutableMapping):
+ """ A ChainMap groups multiple dicts (or other mappings) together
+ to create a single, updatable view.
+
+ The underlying mappings are stored in a list. That list is public and can
+ be accessed / updated using the *maps* attribute. There is no other state.
+
+ Lookups search the underlying mappings successively until a key is found.
+ In contrast, writes, updates, and deletions only operate on the first
+ mapping.
+
+ """
+
+ def __init__(self, *maps):
+ """Initialize a ChainMap by setting *maps* to the given mappings.
+ If no mappings are provided, a single empty dictionary is used.
+
+ """
+ self.maps = list(maps) or [{}] # always at least one map
+
+ def __missing__(self, key):
+ raise KeyError(key)
+
+ def __getitem__(self, key):
+ for mapping in self.maps:
+ try:
+ # can't use 'key in mapping' with defaultdict
+ return mapping[key]
+ except KeyError:
+ pass
+ # support subclasses that define __missing__
+ return self.__missing__(key)
+
+ def get(self, key, default=None):
+ return self[key] if key in self else default
+
+ def __len__(self):
+ # reuses stored hash values if possible
+ return len(set().union(*self.maps))
+
+ def __iter__(self):
+ return iter(set().union(*self.maps))
+
+ def __contains__(self, key):
+ return any(key in m for m in self.maps)
+
+ def __bool__(self):
+ return any(self.maps)
+
+ @recursive_repr()
+ def __repr__(self):
+ return '{0.__class__.__name__}({1})'.format(
+ self, ', '.join(repr(m) for m in self.maps))
+
+ @classmethod
+ def fromkeys(cls, iterable, *args):
+ 'Create a ChainMap with a single dict created from the iterable.'
+ return cls(dict.fromkeys(iterable, *args))
+
+ def copy(self):
+ """
+ New ChainMap or subclass with a new copy of maps[0] and refs to
+ maps[1:]
+ """
+ return self.__class__(self.maps[0].copy(), *self.maps[1:])
+
+ __copy__ = copy
+
+ def new_child(self, m=None): # like Django's Context.push()
+ """
+ New ChainMap with a new map followed by all previous maps. If no
+ map is provided, an empty dict is used.
+ """
+ if m is None:
+ m = {}
+ return self.__class__(m, *self.maps)
+
+ @property
+ def parents(self): # like Django's Context.pop()
+ 'New ChainMap from maps[1:].'
+ return self.__class__(*self.maps[1:])
+
+ def __setitem__(self, key, value):
+ self.maps[0][key] = value
+
+ def __delitem__(self, key):
+ try:
+ del self.maps[0][key]
+ except KeyError:
+ raise KeyError('Key not found in the first mapping: {!r}'
+ .format(key))
+
+ def popitem(self):
+ """
+        Remove and return an item pair from maps[0]. Raise KeyError if maps[0]
+ is empty.
+ """
+ try:
+ return self.maps[0].popitem()
+ except KeyError:
+ raise KeyError('No keys found in the first mapping.')
+
+ def pop(self, key, *args):
+ """
+ Remove *key* from maps[0] and return its value. Raise KeyError if
+ *key* not in maps[0].
+ """
+ try:
+ return self.maps[0].pop(key, *args)
+ except KeyError:
+ raise KeyError('Key not found in the first mapping: {!r}'
+ .format(key))
+
+ def clear(self):
+ 'Clear maps[0], leaving maps[1:] intact.'
+ self.maps[0].clear()
diff --git a/contrib/python/pandas/py2/pandas/compat/numpy/__init__.py b/contrib/python/pandas/py2/pandas/compat/numpy/__init__.py
new file mode 100644
index 00000000000..6e9f768d8bd
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/compat/numpy/__init__.py
@@ -0,0 +1,72 @@
+""" support numpy compatiblitiy across versions """
+
+import re
+import numpy as np
+from distutils.version import LooseVersion
+from pandas.compat import string_types, string_and_binary_types
+
+
+# numpy versioning
+_np_version = np.__version__
+_nlv = LooseVersion(_np_version)
+_np_version_under1p13 = _nlv < LooseVersion('1.13')
+_np_version_under1p14 = _nlv < LooseVersion('1.14')
+_np_version_under1p15 = _nlv < LooseVersion('1.15')
+_np_version_under1p16 = _nlv < LooseVersion('1.16')
+_np_version_under1p17 = _nlv < LooseVersion('1.17')
+
+
+if _nlv < '1.12':
+ raise ImportError('this version of pandas is incompatible with '
+ 'numpy < 1.12.0\n'
+ 'your numpy version is {0}.\n'
+ 'Please upgrade numpy to >= 1.12.0 to use '
+ 'this pandas version'.format(_np_version))
+
+
+_tz_regex = re.compile('[+-]0000$')
+
+
+def tz_replacer(s):
+ if isinstance(s, string_types):
+ if s.endswith('Z'):
+ s = s[:-1]
+ elif _tz_regex.search(s):
+ s = s[:-5]
+ return s
+
+
+def np_datetime64_compat(s, *args, **kwargs):
+ """
+    provide compat for constructing numpy datetime64's from strings; the
+    tz-changes in numpy 1.11 make '2015-01-01 09:00:00Z' show a deprecation
+    warning, in which case '2015-01-01 09:00:00' needs to be passed instead
+ """
+ s = tz_replacer(s)
+ return np.datetime64(s, *args, **kwargs)
+
+
+def np_array_datetime64_compat(arr, *args, **kwargs):
+ """
+    provide compat for constructing an array of strings as a
+    np.array(..., dtype=np.datetime64(..)); the tz-changes in numpy 1.11
+    make '2015-01-01 09:00:00Z' show a deprecation warning, in which case
+    '2015-01-01 09:00:00' needs to be passed instead
+ """
+ # is_list_like
+ if (hasattr(arr, '__iter__')
+ and not isinstance(arr, string_and_binary_types)):
+ arr = [tz_replacer(s) for s in arr]
+ else:
+ arr = tz_replacer(arr)
+
+ return np.array(arr, *args, **kwargs)
+
+
+__all__ = ['np',
+ '_np_version_under1p13',
+ '_np_version_under1p14',
+ '_np_version_under1p15',
+ '_np_version_under1p16',
+ '_np_version_under1p17'
+ ]
diff --git a/contrib/python/pandas/py2/pandas/compat/numpy/function.py b/contrib/python/pandas/py2/pandas/compat/numpy/function.py
new file mode 100644
index 00000000000..417ddd0d8af
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/compat/numpy/function.py
@@ -0,0 +1,402 @@
+"""
+For compatibility with numpy libraries, pandas functions or
+methods have to accept '*args' and '**kwargs' parameters to
+accommodate numpy arguments that are not actually used or
+respected in the pandas implementation.
+
+To ensure that users do not abuse these parameters, validation
+is performed in 'validators.py' to make sure that any extra
+parameters passed correspond ONLY to those in the numpy signature.
+Part of that validation includes whether or not the user attempted
+to pass in non-default values for these extraneous parameters. As we
+want to discourage users from relying on these parameters when calling
+the pandas implementation, we want them only to pass in the default values
+for these parameters.
+
+This module provides a set of commonly used default arguments for functions
+and methods that are spread throughout the codebase. This module will make it
+easier to adjust to future upstream changes in the analogous numpy signatures.
+"""
+
+from numpy import ndarray
+
+from pandas.compat import OrderedDict
+from pandas.errors import UnsupportedFunctionCall
+from pandas.util._validators import (
+ validate_args, validate_args_and_kwargs, validate_kwargs)
+
+from pandas.core.dtypes.common import is_bool, is_integer
+
+
+class CompatValidator(object):
+
+ def __init__(self, defaults, fname=None, method=None,
+ max_fname_arg_count=None):
+ self.fname = fname
+ self.method = method
+ self.defaults = defaults
+ self.max_fname_arg_count = max_fname_arg_count
+
+ def __call__(self, args, kwargs, fname=None,
+ max_fname_arg_count=None, method=None):
+ if args or kwargs:
+ fname = self.fname if fname is None else fname
+ max_fname_arg_count = (self.max_fname_arg_count if
+ max_fname_arg_count is None
+ else max_fname_arg_count)
+ method = self.method if method is None else method
+
+ if method == 'args':
+ validate_args(fname, args, max_fname_arg_count, self.defaults)
+ elif method == 'kwargs':
+ validate_kwargs(fname, kwargs, self.defaults)
+ elif method == 'both':
+ validate_args_and_kwargs(fname, args, kwargs,
+ max_fname_arg_count,
+ self.defaults)
+ else:
+ raise ValueError("invalid validation method "
+ "'{method}'".format(method=method))
+
+
+ARGMINMAX_DEFAULTS = dict(out=None)
+validate_argmin = CompatValidator(ARGMINMAX_DEFAULTS, fname='argmin',
+ method='both', max_fname_arg_count=1)
+validate_argmax = CompatValidator(ARGMINMAX_DEFAULTS, fname='argmax',
+ method='both', max_fname_arg_count=1)
+
+
+def process_skipna(skipna, args):
+ if isinstance(skipna, ndarray) or skipna is None:
+ args = (skipna,) + args
+ skipna = True
+
+ return skipna, args
+
+
+def validate_argmin_with_skipna(skipna, args, kwargs):
+ """
+ If 'Series.argmin' is called via the 'numpy' library,
+ the third parameter in its signature is 'out', which
+ takes either an ndarray or 'None', so check if the
+ 'skipna' parameter is either an instance of ndarray or
+ is None, since 'skipna' itself should be a boolean
+ """
+
+ skipna, args = process_skipna(skipna, args)
+ validate_argmin(args, kwargs)
+ return skipna
+
+
+def validate_argmax_with_skipna(skipna, args, kwargs):
+ """
+ If 'Series.argmax' is called via the 'numpy' library,
+ the third parameter in its signature is 'out', which
+ takes either an ndarray or 'None', so check if the
+ 'skipna' parameter is either an instance of ndarray or
+ is None, since 'skipna' itself should be a boolean
+ """
+
+ skipna, args = process_skipna(skipna, args)
+ validate_argmax(args, kwargs)
+ return skipna
+
+
+ARGSORT_DEFAULTS = OrderedDict()
+ARGSORT_DEFAULTS['axis'] = -1
+ARGSORT_DEFAULTS['kind'] = 'quicksort'
+ARGSORT_DEFAULTS['order'] = None
+validate_argsort = CompatValidator(ARGSORT_DEFAULTS, fname='argsort',
+ max_fname_arg_count=0, method='both')
+
+# two different signatures of argsort, this second validation
+# for when the `kind` param is supported
+ARGSORT_DEFAULTS_KIND = OrderedDict()
+ARGSORT_DEFAULTS_KIND['axis'] = -1
+ARGSORT_DEFAULTS_KIND['order'] = None
+validate_argsort_kind = CompatValidator(ARGSORT_DEFAULTS_KIND, fname='argsort',
+ max_fname_arg_count=0, method='both')
+
+
+def validate_argsort_with_ascending(ascending, args, kwargs):
+ """
+ If 'Categorical.argsort' is called via the 'numpy' library, the
+ first parameter in its signature is 'axis', which takes either
+ an integer or 'None', so check if the 'ascending' parameter has
+ either integer type or is None, since 'ascending' itself should
+ be a boolean
+ """
+
+ if is_integer(ascending) or ascending is None:
+ args = (ascending,) + args
+ ascending = True
+
+ validate_argsort_kind(args, kwargs, max_fname_arg_count=3)
+ return ascending
+
+
+CLIP_DEFAULTS = dict(out=None)
+validate_clip = CompatValidator(CLIP_DEFAULTS, fname='clip',
+ method='both', max_fname_arg_count=3)
+
+
+def validate_clip_with_axis(axis, args, kwargs):
+ """
+ If 'NDFrame.clip' is called via the numpy library, the third
+    parameter in its signature is 'out', which can take an ndarray,
+ so check if the 'axis' parameter is an instance of ndarray, since
+ 'axis' itself should either be an integer or None
+ """
+
+ if isinstance(axis, ndarray):
+ args = (axis,) + args
+ axis = None
+
+ validate_clip(args, kwargs)
+ return axis
+
+
+COMPRESS_DEFAULTS = OrderedDict()
+COMPRESS_DEFAULTS['axis'] = None
+COMPRESS_DEFAULTS['out'] = None
+validate_compress = CompatValidator(COMPRESS_DEFAULTS, fname='compress',
+ method='both', max_fname_arg_count=1)
+
+CUM_FUNC_DEFAULTS = OrderedDict()
+CUM_FUNC_DEFAULTS['dtype'] = None
+CUM_FUNC_DEFAULTS['out'] = None
+validate_cum_func = CompatValidator(CUM_FUNC_DEFAULTS, method='both',
+ max_fname_arg_count=1)
+validate_cumsum = CompatValidator(CUM_FUNC_DEFAULTS, fname='cumsum',
+ method='both', max_fname_arg_count=1)
+
+
+def validate_cum_func_with_skipna(skipna, args, kwargs, name):
+ """
+ If this function is called via the 'numpy' library, the third
+ parameter in its signature is 'dtype', which takes either a
+ 'numpy' dtype or 'None', so check if the 'skipna' parameter is
+ a boolean or not
+ """
+ if not is_bool(skipna):
+ args = (skipna,) + args
+ skipna = True
+
+ validate_cum_func(args, kwargs, fname=name)
+ return skipna
+
+
+ALLANY_DEFAULTS = OrderedDict()
+ALLANY_DEFAULTS['dtype'] = None
+ALLANY_DEFAULTS['out'] = None
+ALLANY_DEFAULTS['keepdims'] = False
+validate_all = CompatValidator(ALLANY_DEFAULTS, fname='all',
+ method='both', max_fname_arg_count=1)
+validate_any = CompatValidator(ALLANY_DEFAULTS, fname='any',
+ method='both', max_fname_arg_count=1)
+
+LOGICAL_FUNC_DEFAULTS = dict(out=None, keepdims=False)
+validate_logical_func = CompatValidator(LOGICAL_FUNC_DEFAULTS, method='kwargs')
+
+MINMAX_DEFAULTS = dict(out=None, keepdims=False)
+validate_min = CompatValidator(MINMAX_DEFAULTS, fname='min',
+ method='both', max_fname_arg_count=1)
+validate_max = CompatValidator(MINMAX_DEFAULTS, fname='max',
+ method='both', max_fname_arg_count=1)
+
+RESHAPE_DEFAULTS = dict(order='C')
+validate_reshape = CompatValidator(RESHAPE_DEFAULTS, fname='reshape',
+ method='both', max_fname_arg_count=1)
+
+REPEAT_DEFAULTS = dict(axis=None)
+validate_repeat = CompatValidator(REPEAT_DEFAULTS, fname='repeat',
+ method='both', max_fname_arg_count=1)
+
+ROUND_DEFAULTS = dict(out=None)
+validate_round = CompatValidator(ROUND_DEFAULTS, fname='round',
+ method='both', max_fname_arg_count=1)
+
+SORT_DEFAULTS = OrderedDict()
+SORT_DEFAULTS['axis'] = -1
+SORT_DEFAULTS['kind'] = 'quicksort'
+SORT_DEFAULTS['order'] = None
+validate_sort = CompatValidator(SORT_DEFAULTS, fname='sort',
+ method='kwargs')
+
+STAT_FUNC_DEFAULTS = OrderedDict()
+STAT_FUNC_DEFAULTS['dtype'] = None
+STAT_FUNC_DEFAULTS['out'] = None
+
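+# NOTE: the chained assignment below binds PROD_DEFAULTS and SUM_DEFAULTS to
+# the same dict object, so the 'keepdims' and 'initial' keys added to
+# SUM_DEFAULTS apply to validate_prod as well.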
+PROD_DEFAULTS = SUM_DEFAULTS = STAT_FUNC_DEFAULTS.copy()
+SUM_DEFAULTS['keepdims'] = False
+SUM_DEFAULTS['initial'] = None
+
+MEDIAN_DEFAULTS = STAT_FUNC_DEFAULTS.copy()
+MEDIAN_DEFAULTS['overwrite_input'] = False
+MEDIAN_DEFAULTS['keepdims'] = False
+
+STAT_FUNC_DEFAULTS['keepdims'] = False
+
+validate_stat_func = CompatValidator(STAT_FUNC_DEFAULTS,
+ method='kwargs')
+validate_sum = CompatValidator(SUM_DEFAULTS, fname='sum',
+ method='both', max_fname_arg_count=1)
+validate_prod = CompatValidator(PROD_DEFAULTS, fname="prod",
+ method="both", max_fname_arg_count=1)
+validate_mean = CompatValidator(STAT_FUNC_DEFAULTS, fname='mean',
+ method='both', max_fname_arg_count=1)
+validate_median = CompatValidator(MEDIAN_DEFAULTS, fname='median',
+ method='both', max_fname_arg_count=1)
+
+STAT_DDOF_FUNC_DEFAULTS = OrderedDict()
+STAT_DDOF_FUNC_DEFAULTS['dtype'] = None
+STAT_DDOF_FUNC_DEFAULTS['out'] = None
+STAT_DDOF_FUNC_DEFAULTS['keepdims'] = False
+validate_stat_ddof_func = CompatValidator(STAT_DDOF_FUNC_DEFAULTS,
+ method='kwargs')
+
+TAKE_DEFAULTS = OrderedDict()
+TAKE_DEFAULTS['out'] = None
+TAKE_DEFAULTS['mode'] = 'raise'
+validate_take = CompatValidator(TAKE_DEFAULTS, fname='take',
+ method='kwargs')
+
+
+def validate_take_with_convert(convert, args, kwargs):
+ """
+ If this function is called via the 'numpy' library, the third
+ parameter in its signature is 'axis', which takes either an
+ ndarray or 'None', so check if the 'convert' parameter is either
+ an instance of ndarray or is None
+ """
+
+ if isinstance(convert, ndarray) or convert is None:
+ args = (convert,) + args
+ convert = True
+
+ validate_take(args, kwargs, max_fname_arg_count=3, method='both')
+ return convert
+
+
+TRANSPOSE_DEFAULTS = dict(axes=None)
+validate_transpose = CompatValidator(TRANSPOSE_DEFAULTS, fname='transpose',
+ method='both', max_fname_arg_count=0)
+
+
+def validate_transpose_for_generic(inst, kwargs):
+ try:
+ validate_transpose(tuple(), kwargs)
+ except ValueError as e:
+ klass = type(inst).__name__
+ msg = str(e)
+
+        # the Panel class actually relies on the 'axes' parameter if called
+ # via the 'numpy' library, so let's make sure the error is specific
+ # about saying that the parameter is not supported for particular
+ # implementations of 'transpose'
+ if "the 'axes' parameter is not supported" in msg:
+ msg += " for {klass} instances".format(klass=klass)
+
+ raise ValueError(msg)
+
+
+def validate_window_func(name, args, kwargs):
+ numpy_args = ('axis', 'dtype', 'out')
+ msg = ("numpy operations are not "
+ "valid with window objects. "
+ "Use .{func}() directly instead ".format(func=name))
+
+ if len(args) > 0:
+ raise UnsupportedFunctionCall(msg)
+
+ for arg in numpy_args:
+ if arg in kwargs:
+ raise UnsupportedFunctionCall(msg)
+
+
+def validate_rolling_func(name, args, kwargs):
+ numpy_args = ('axis', 'dtype', 'out')
+ msg = ("numpy operations are not "
+ "valid with window objects. "
+ "Use .rolling(...).{func}() instead ".format(func=name))
+
+ if len(args) > 0:
+ raise UnsupportedFunctionCall(msg)
+
+ for arg in numpy_args:
+ if arg in kwargs:
+ raise UnsupportedFunctionCall(msg)
+
+
+def validate_expanding_func(name, args, kwargs):
+ numpy_args = ('axis', 'dtype', 'out')
+ msg = ("numpy operations are not "
+ "valid with window objects. "
+ "Use .expanding(...).{func}() instead ".format(func=name))
+
+ if len(args) > 0:
+ raise UnsupportedFunctionCall(msg)
+
+ for arg in numpy_args:
+ if arg in kwargs:
+ raise UnsupportedFunctionCall(msg)
+
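+# For illustration: numpy reductions such as np.sum(rolling_obj) forward
+# numpy-only keywords ('axis', 'dtype', 'out') into the pandas method; the
+# validators above reject any positional args or those keywords with
+# UnsupportedFunctionCall and point users at .rolling(...).sum() /
+# .expanding(...).sum() instead.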
+
+def validate_groupby_func(name, args, kwargs, allowed=None):
+ """
+    'args' and 'kwargs' should be empty, except for allowed kwargs, because
+    all of their necessary parameters are explicitly listed in the function
+    signature.
+ """
+ if allowed is None:
+ allowed = []
+
+ kwargs = set(kwargs) - set(allowed)
+
+ if len(args) + len(kwargs) > 0:
+ raise UnsupportedFunctionCall((
+ "numpy operations are not valid "
+ "with groupby. Use .groupby(...)."
+ "{func}() instead".format(func=name)))
+
+
+RESAMPLER_NUMPY_OPS = ('min', 'max', 'sum', 'prod',
+ 'mean', 'std', 'var')
+
+
+def validate_resampler_func(method, args, kwargs):
+ """
+ 'args' and 'kwargs' should be empty because all of
+ their necessary parameters are explicitly listed in
+ the function signature
+ """
+ if len(args) + len(kwargs) > 0:
+ if method in RESAMPLER_NUMPY_OPS:
+ raise UnsupportedFunctionCall((
+ "numpy operations are not valid "
+ "with resample. Use .resample(...)."
+ "{func}() instead".format(func=method)))
+ else:
+ raise TypeError("too many arguments passed in")
+
+
+def validate_minmax_axis(axis):
+ """
+ Ensure that the axis argument passed to min, max, argmin, or argmax is
+ zero or None, as otherwise it will be incorrectly ignored.
+
+ Parameters
+ ----------
+ axis : int or None
+
+ Raises
+ ------
+ ValueError
+ """
+ ndim = 1 # hard-coded for Index
+ if axis is None:
+ return
+ if axis >= ndim or (axis < 0 and ndim + axis < 0):
+ raise ValueError("`axis` must be fewer than the number of "
+ "dimensions ({ndim})".format(ndim=ndim))
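+
+
+# For illustration: validate_minmax_axis(None) and validate_minmax_axis(0)
+# pass silently, while validate_minmax_axis(1) raises ValueError because the
+# Index-like callers here are one-dimensional (ndim is hard-coded to 1).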
diff --git a/contrib/python/pandas/py2/pandas/compat/pickle_compat.py b/contrib/python/pandas/py2/pandas/compat/pickle_compat.py
new file mode 100644
index 00000000000..61295b8249f
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/compat/pickle_compat.py
@@ -0,0 +1,229 @@
+"""
+Support pre-0.12 series pickle compatibility.
+"""
+
+import copy
+import pickle as pkl
+import sys
+
+from pandas.compat import string_types, u # noqa
+
+import pandas # noqa
+from pandas import Index, compat
+
+
+def load_reduce(self):
+ stack = self.stack
+ args = stack.pop()
+ func = stack[-1]
+
+ if len(args) and type(args[0]) is type:
+ n = args[0].__name__ # noqa
+
+ try:
+ stack[-1] = func(*args)
+ return
+ except Exception as e:
+
+ # If we have a deprecated function,
+ # try to replace and try again.
+
+ msg = '_reconstruct: First argument must be a sub-type of ndarray'
+
+ if msg in str(e):
+ try:
+ cls = args[0]
+ stack[-1] = object.__new__(cls)
+ return
+ except TypeError:
+ pass
+
+ # try to re-encode the arguments
+ if getattr(self, 'encoding', None) is not None:
+ args = tuple(arg.encode(self.encoding)
+ if isinstance(arg, string_types)
+ else arg for arg in args)
+ try:
+ stack[-1] = func(*args)
+ return
+ except TypeError:
+ pass
+
+ # unknown exception, re-raise
+ if getattr(self, 'is_verbose', None):
+ print(sys.exc_info())
+ print(func, args)
+ raise
+
+
+# If classes are moved, provide compat here.
+_class_locations_map = {
+ ('pandas.core.sparse.array', 'SparseArray'):
+ ('pandas.core.arrays', 'SparseArray'),
+
+ # 15477
+ #
+ # TODO: When FrozenNDArray is removed, add
+ # the following lines for compat:
+ #
+ # ('pandas.core.base', 'FrozenNDArray'):
+ # ('numpy', 'ndarray'),
+ # ('pandas.core.indexes.frozen', 'FrozenNDArray'):
+ # ('numpy', 'ndarray'),
+ #
+ # Afterwards, remove the current entry
+ # for `pandas.core.base.FrozenNDArray`.
+ ('pandas.core.base', 'FrozenNDArray'):
+ ('pandas.core.indexes.frozen', 'FrozenNDArray'),
+ ('pandas.core.base', 'FrozenList'):
+ ('pandas.core.indexes.frozen', 'FrozenList'),
+
+ # 10890
+ ('pandas.core.series', 'TimeSeries'):
+ ('pandas.core.series', 'Series'),
+ ('pandas.sparse.series', 'SparseTimeSeries'):
+ ('pandas.core.sparse.series', 'SparseSeries'),
+
+ # 12588, extensions moving
+ ('pandas._sparse', 'BlockIndex'):
+ ('pandas._libs.sparse', 'BlockIndex'),
+ ('pandas.tslib', 'Timestamp'):
+ ('pandas._libs.tslib', 'Timestamp'),
+
+ # 18543 moving period
+ ('pandas._period', 'Period'): ('pandas._libs.tslibs.period', 'Period'),
+ ('pandas._libs.period', 'Period'):
+ ('pandas._libs.tslibs.period', 'Period'),
+
+ # 18014 moved __nat_unpickle from _libs.tslib-->_libs.tslibs.nattype
+ ('pandas.tslib', '__nat_unpickle'):
+ ('pandas._libs.tslibs.nattype', '__nat_unpickle'),
+ ('pandas._libs.tslib', '__nat_unpickle'):
+ ('pandas._libs.tslibs.nattype', '__nat_unpickle'),
+
+ # 15998 top-level dirs moving
+ ('pandas.sparse.array', 'SparseArray'):
+ ('pandas.core.arrays.sparse', 'SparseArray'),
+ ('pandas.sparse.series', 'SparseSeries'):
+ ('pandas.core.sparse.series', 'SparseSeries'),
+ ('pandas.sparse.frame', 'SparseDataFrame'):
+ ('pandas.core.sparse.frame', 'SparseDataFrame'),
+ ('pandas.indexes.base', '_new_Index'):
+ ('pandas.core.indexes.base', '_new_Index'),
+ ('pandas.indexes.base', 'Index'):
+ ('pandas.core.indexes.base', 'Index'),
+ ('pandas.indexes.numeric', 'Int64Index'):
+ ('pandas.core.indexes.numeric', 'Int64Index'),
+ ('pandas.indexes.range', 'RangeIndex'):
+ ('pandas.core.indexes.range', 'RangeIndex'),
+ ('pandas.indexes.multi', 'MultiIndex'):
+ ('pandas.core.indexes.multi', 'MultiIndex'),
+ ('pandas.tseries.index', '_new_DatetimeIndex'):
+ ('pandas.core.indexes.datetimes', '_new_DatetimeIndex'),
+ ('pandas.tseries.index', 'DatetimeIndex'):
+ ('pandas.core.indexes.datetimes', 'DatetimeIndex'),
+ ('pandas.tseries.period', 'PeriodIndex'):
+ ('pandas.core.indexes.period', 'PeriodIndex'),
+
+ # 19269, arrays moving
+ ('pandas.core.categorical', 'Categorical'):
+ ('pandas.core.arrays', 'Categorical'),
+
+ # 19939, add timedeltaindex, float64index compat from 15998 move
+ ('pandas.tseries.tdi', 'TimedeltaIndex'):
+ ('pandas.core.indexes.timedeltas', 'TimedeltaIndex'),
+ ('pandas.indexes.numeric', 'Float64Index'):
+ ('pandas.core.indexes.numeric', 'Float64Index'),
+}
+
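+# For illustration: an old pickle that refers to
+# ('pandas.indexes.base', 'Index') is transparently resolved to
+# ('pandas.core.indexes.base', 'Index') by the Unpickler subclasses below via
+# this mapping.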
+
+# our Unpickler sub-class to override methods and some dispatcher
+# functions for compat
+
+if compat.PY3:
+ class Unpickler(pkl._Unpickler):
+
+ def find_class(self, module, name):
+ # override superclass
+ key = (module, name)
+ module, name = _class_locations_map.get(key, key)
+ return super(Unpickler, self).find_class(module, name)
+
+else:
+
+ class Unpickler(pkl.Unpickler):
+
+ def find_class(self, module, name):
+ # override superclass
+ key = (module, name)
+ module, name = _class_locations_map.get(key, key)
+ __import__(module)
+ mod = sys.modules[module]
+ klass = getattr(mod, name)
+ return klass
+
+Unpickler.dispatch = copy.copy(Unpickler.dispatch)
+Unpickler.dispatch[pkl.REDUCE[0]] = load_reduce
+
+
+def load_newobj(self):
+ args = self.stack.pop()
+ cls = self.stack[-1]
+
+ # compat
+ if issubclass(cls, Index):
+ obj = object.__new__(cls)
+ else:
+ obj = cls.__new__(cls, *args)
+
+ self.stack[-1] = obj
+
+
+Unpickler.dispatch[pkl.NEWOBJ[0]] = load_newobj
+
+
+def load_newobj_ex(self):
+ kwargs = self.stack.pop()
+ args = self.stack.pop()
+ cls = self.stack.pop()
+
+ # compat
+ if issubclass(cls, Index):
+ obj = object.__new__(cls)
+ else:
+ obj = cls.__new__(cls, *args, **kwargs)
+ self.append(obj)
+
+
+try:
+ Unpickler.dispatch[pkl.NEWOBJ_EX[0]] = load_newobj_ex
+except (AttributeError, KeyError):
+ pass
+
+
+def load(fh, encoding=None, compat=False, is_verbose=False):
+ """load a pickle, with a provided encoding
+
+ if compat is True:
+ fake the old class hierarchy
+ if it works, then return the new type objects
+
+ Parameters
+ ----------
+ fh : a filelike object
+ encoding : an optional encoding
+ compat : provide Series compatibility mode, boolean, default False
+ is_verbose : show exception output
+ """
+
+ try:
+ fh.seek(0)
+ if encoding is not None:
+ up = Unpickler(fh, encoding=encoding)
+ else:
+ up = Unpickler(fh)
+ up.is_verbose = is_verbose
+
+ return up.load()
+ except (ValueError, TypeError):
+ raise
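+
+
+# Rough usage sketch (hypothetical file name and encoding): this compat loader
+# is used when reading pickles written by older pandas versions, e.g.
+#
+#   with open('old_frame.pkl', 'rb') as fh:
+#       obj = load(fh, encoding='latin-1', is_verbose=False)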
diff --git a/contrib/python/pandas/py2/pandas/core/__init__.py b/contrib/python/pandas/py2/pandas/core/__init__.py
new file mode 100644
index 00000000000..e69de29bb2d
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/core/__init__.py
diff --git a/contrib/python/pandas/py2/pandas/core/accessor.py b/contrib/python/pandas/py2/pandas/core/accessor.py
new file mode 100644
index 00000000000..961488ff12e
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/core/accessor.py
@@ -0,0 +1,281 @@
+# -*- coding: utf-8 -*-
+"""
+
+accessor.py contains base classes for implementing accessor properties
+that can be mixed into or pinned onto other pandas classes.
+
+"""
+import warnings
+
+from pandas.util._decorators import Appender
+
+
+class DirNamesMixin(object):
+ _accessors = frozenset()
+ _deprecations = frozenset(
+ ['asobject', 'base', 'data', 'flags', 'itemsize', 'strides'])
+
+ def _dir_deletions(self):
+ """ delete unwanted __dir__ for this object """
+ return self._accessors | self._deprecations
+
+ def _dir_additions(self):
+ """ add additional __dir__ for this object """
+ rv = set()
+ for accessor in self._accessors:
+ try:
+ getattr(self, accessor)
+ rv.add(accessor)
+ except AttributeError:
+ pass
+ return rv
+
+ def __dir__(self):
+ """
+ Provide method name lookup and completion
+ Only provide 'public' methods
+ """
+ rv = set(dir(type(self)))
+ rv = (rv - self._dir_deletions()) | self._dir_additions()
+ return sorted(rv)
+
+
+class PandasDelegate(object):
+ """
+ an abstract base class for delegating methods/properties
+ """
+
+ def _delegate_property_get(self, name, *args, **kwargs):
+ raise TypeError("You cannot access the "
+ "property {name}".format(name=name))
+
+ def _delegate_property_set(self, name, value, *args, **kwargs):
+ raise TypeError("The property {name} cannot be set".format(name=name))
+
+ def _delegate_method(self, name, *args, **kwargs):
+ raise TypeError("You cannot call method {name}".format(name=name))
+
+ @classmethod
+ def _add_delegate_accessors(cls, delegate, accessors, typ,
+ overwrite=False):
+ """
+ Add accessors to cls from the delegate class.
+
+ Parameters
+ ----------
+ cls : the class to add the methods/properties to
+ delegate : the class to get methods/properties & doc-strings
+        accessors : string list of accessors to add
+ typ : 'property' or 'method'
+ overwrite : boolean, default False
+ overwrite the method/property in the target class if it exists
+ """
+
+ def _create_delegator_property(name):
+
+ def _getter(self):
+ return self._delegate_property_get(name)
+
+ def _setter(self, new_values):
+ return self._delegate_property_set(name, new_values)
+
+ _getter.__name__ = name
+ _setter.__name__ = name
+
+ return property(fget=_getter, fset=_setter,
+ doc=getattr(delegate, name).__doc__)
+
+ def _create_delegator_method(name):
+
+ def f(self, *args, **kwargs):
+ return self._delegate_method(name, *args, **kwargs)
+
+ f.__name__ = name
+ f.__doc__ = getattr(delegate, name).__doc__
+
+ return f
+
+ for name in accessors:
+
+ if typ == 'property':
+ f = _create_delegator_property(name)
+ else:
+ f = _create_delegator_method(name)
+
+ # don't overwrite existing methods/properties
+ if overwrite or not hasattr(cls, name):
+ setattr(cls, name, f)
+
+
+def delegate_names(delegate, accessors, typ, overwrite=False):
+ """
+ Add delegated names to a class using a class decorator. This provides
+ an alternative usage to directly calling `_add_delegate_accessors`
+ below a class definition.
+
+ Parameters
+ ----------
+ delegate : object
+ the class to get methods/properties & doc-strings
+    accessors : Sequence[str]
+        List of accessors to add
+ typ : {'property', 'method'}
+ overwrite : boolean, default False
+ overwrite the method/property in the target class if it exists
+
+ Returns
+ -------
+ callable
+ A class decorator.
+
+ Examples
+ --------
+ @delegate_names(Categorical, ["categories", "ordered"], "property")
+ class CategoricalAccessor(PandasDelegate):
+ [...]
+ """
+ def add_delegate_accessors(cls):
+ cls._add_delegate_accessors(delegate, accessors, typ,
+ overwrite=overwrite)
+ return cls
+
+ return add_delegate_accessors
+
+
+# Ported with modifications from xarray
+# https://github.com/pydata/xarray/blob/master/xarray/core/extensions.py
+# 1. We don't need to catch and re-raise AttributeErrors as RuntimeErrors
+# 2. We use a UserWarning instead of a custom Warning
+
+class CachedAccessor(object):
+ """
+ Custom property-like object (descriptor) for caching accessors.
+
+ Parameters
+ ----------
+ name : str
+ The namespace this will be accessed under, e.g. ``df.foo``
+ accessor : cls
+ The class with the extension methods. The class' __init__ method
+ should expect one of a ``Series``, ``DataFrame`` or ``Index`` as
+ the single argument ``data``
+ """
+ def __init__(self, name, accessor):
+ self._name = name
+ self._accessor = accessor
+
+ def __get__(self, obj, cls):
+ if obj is None:
+            # we're accessing the attribute of the class, e.g., DataFrame.geo
+ return self._accessor
+ accessor_obj = self._accessor(obj)
+ # Replace the property with the accessor object. Inspired by:
+ # http://www.pydanny.com/cached-property.html
+ # We need to use object.__setattr__ because we overwrite __setattr__ on
+ # NDFrame
+ object.__setattr__(obj, self._name, accessor_obj)
+ return accessor_obj
+
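+# For illustration: after the first access of `obj.<name>`, the accessor
+# instance cached on `obj` shadows the CachedAccessor descriptor above, so the
+# accessor class is constructed at most once per Series/DataFrame/Index.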
+
+def _register_accessor(name, cls):
+ def decorator(accessor):
+ if hasattr(cls, name):
+ warnings.warn(
+ 'registration of accessor {!r} under name {!r} for type '
+ '{!r} is overriding a preexisting attribute with the same '
+ 'name.'.format(accessor, name, cls),
+ UserWarning,
+ stacklevel=2)
+ setattr(cls, name, CachedAccessor(name, accessor))
+ cls._accessors.add(name)
+ return accessor
+ return decorator
+
+
+_doc = """\
+Register a custom accessor on %(klass)s objects.
+
+Parameters
+----------
+name : str
+ Name under which the accessor should be registered. A warning is issued
+ if this name conflicts with a preexisting attribute.
+
+See Also
+--------
+%(others)s
+
+Notes
+-----
+When accessed, your accessor will be initialized with the pandas object
+the user is interacting with. So the signature must be
+
+.. code-block:: python
+
+ def __init__(self, pandas_object): # noqa: E999
+ ...
+
+For consistency with pandas methods, you should raise an ``AttributeError``
+if the data passed to your accessor has an incorrect dtype.
+
+>>> pd.Series(['a', 'b']).dt
+Traceback (most recent call last):
+...
+AttributeError: Can only use .dt accessor with datetimelike values
+
+Examples
+--------
+
+In your library code::
+
+ import pandas as pd
+
+ @pd.api.extensions.register_dataframe_accessor("geo")
+ class GeoAccessor(object):
+ def __init__(self, pandas_obj):
+ self._obj = pandas_obj
+
+ @property
+ def center(self):
+ # return the geographic center point of this DataFrame
+ lat = self._obj.latitude
+ lon = self._obj.longitude
+ return (float(lon.mean()), float(lat.mean()))
+
+ def plot(self):
+ # plot this array's data on a map, e.g., using Cartopy
+ pass
+
+Back in an interactive IPython session:
+
+ >>> ds = pd.DataFrame({'longitude': np.linspace(0, 10),
+ ... 'latitude': np.linspace(0, 20)})
+ >>> ds.geo.center
+ (5.0, 10.0)
+ >>> ds.geo.plot()
+ # plots data on a map
+"""
+
+
+@Appender(_doc % dict(klass="DataFrame",
+ others=("register_series_accessor, "
+ "register_index_accessor")))
+def register_dataframe_accessor(name):
+ from pandas import DataFrame
+ return _register_accessor(name, DataFrame)
+
+
+@Appender(_doc % dict(klass="Series",
+ others=("register_dataframe_accessor, "
+ "register_index_accessor")))
+def register_series_accessor(name):
+ from pandas import Series
+ return _register_accessor(name, Series)
+
+
+@Appender(_doc % dict(klass="Index",
+ others=("register_dataframe_accessor, "
+ "register_series_accessor")))
+def register_index_accessor(name):
+ from pandas import Index
+ return _register_accessor(name, Index)
diff --git a/contrib/python/pandas/py2/pandas/core/algorithms.py b/contrib/python/pandas/py2/pandas/core/algorithms.py
new file mode 100644
index 00000000000..f35a117706d
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/core/algorithms.py
@@ -0,0 +1,1826 @@
+"""
+Generic data algorithms. This module is experimental at the moment and not
+intended for public consumption
+"""
+from __future__ import division
+
+from textwrap import dedent
+from warnings import catch_warnings, simplefilter, warn
+
+import numpy as np
+
+from pandas._libs import algos, hashtable as htable, lib
+from pandas._libs.tslib import iNaT
+from pandas.util._decorators import Appender, Substitution, deprecate_kwarg
+
+from pandas.core.dtypes.cast import (
+ construct_1d_object_array_from_listlike, maybe_promote)
+from pandas.core.dtypes.common import (
+ ensure_float64, ensure_int64, ensure_object, ensure_platform_int,
+ ensure_uint64, is_array_like, is_bool_dtype, is_categorical_dtype,
+ is_complex_dtype, is_datetime64_any_dtype, is_datetime64tz_dtype,
+ is_datetimelike, is_extension_array_dtype, is_float_dtype,
+ is_integer_dtype, is_interval_dtype, is_list_like, is_numeric_dtype,
+ is_object_dtype, is_period_dtype, is_scalar, is_signed_integer_dtype,
+ is_sparse, is_timedelta64_dtype, is_unsigned_integer_dtype,
+ needs_i8_conversion)
+from pandas.core.dtypes.generic import ABCIndex, ABCIndexClass, ABCSeries
+from pandas.core.dtypes.missing import isna, na_value_for_dtype
+
+from pandas.core import common as com
+
+_shared_docs = {}
+
+
+# --------------- #
+# dtype access #
+# --------------- #
+def _ensure_data(values, dtype=None):
+ """
+ routine to ensure that our data is of the correct
+ input dtype for lower-level routines
+
+ This will coerce:
+ - ints -> int64
+ - uint -> uint64
+ - bool -> uint64 (TODO this should be uint8)
+ - datetimelike -> i8
+ - datetime64tz -> i8 (in local tz)
+ - categorical -> codes
+
+ Parameters
+ ----------
+ values : array-like
+ dtype : pandas_dtype, optional
+ coerce to this dtype
+
+ Returns
+ -------
+ (ndarray, pandas_dtype, algo dtype as a string)
+
+ """
+
+ # we check some simple dtypes first
+ try:
+ if is_object_dtype(dtype):
+ return ensure_object(np.asarray(values)), 'object', 'object'
+ if is_bool_dtype(values) or is_bool_dtype(dtype):
+ # we are actually coercing to uint64
+ # until our algos support uint8 directly (see TODO)
+ return np.asarray(values).astype('uint64'), 'bool', 'uint64'
+ elif is_signed_integer_dtype(values) or is_signed_integer_dtype(dtype):
+ return ensure_int64(values), 'int64', 'int64'
+ elif (is_unsigned_integer_dtype(values) or
+ is_unsigned_integer_dtype(dtype)):
+ return ensure_uint64(values), 'uint64', 'uint64'
+ elif is_float_dtype(values) or is_float_dtype(dtype):
+ return ensure_float64(values), 'float64', 'float64'
+ elif is_object_dtype(values) and dtype is None:
+ return ensure_object(np.asarray(values)), 'object', 'object'
+ elif is_complex_dtype(values) or is_complex_dtype(dtype):
+
+ # ignore the fact that we are casting to float
+ # which discards complex parts
+ with catch_warnings():
+ simplefilter("ignore", np.ComplexWarning)
+ values = ensure_float64(values)
+ return values, 'float64', 'float64'
+
+ except (TypeError, ValueError, OverflowError):
+ # if we are trying to coerce to a dtype
+ # and it is incompat this will fall thru to here
+ return ensure_object(values), 'object', 'object'
+
+ # datetimelike
+ if (needs_i8_conversion(values) or
+ is_period_dtype(dtype) or
+ is_datetime64_any_dtype(dtype) or
+ is_timedelta64_dtype(dtype)):
+ if is_period_dtype(values) or is_period_dtype(dtype):
+ from pandas import PeriodIndex
+ values = PeriodIndex(values)
+ dtype = values.dtype
+ elif is_timedelta64_dtype(values) or is_timedelta64_dtype(dtype):
+ from pandas import TimedeltaIndex
+ values = TimedeltaIndex(values)
+ dtype = values.dtype
+ else:
+ # Datetime
+ from pandas import DatetimeIndex
+ values = DatetimeIndex(values)
+ dtype = values.dtype
+
+ return values.asi8, dtype, 'int64'
+
+ elif (is_categorical_dtype(values) and
+ (is_categorical_dtype(dtype) or dtype is None)):
+ values = getattr(values, 'values', values)
+ values = values.codes
+ dtype = 'category'
+
+ # we are actually coercing to int64
+ # until our algos support int* directly (not all do)
+ values = ensure_int64(values)
+
+ return values, dtype, 'int64'
+
+ # we have failed, return object
+ values = np.asarray(values, dtype=np.object)
+ return ensure_object(values), 'object', 'object'
+
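+# For illustration (indicative values), per the coercion table in the
+# docstring above:
+#   _ensure_data(np.array([True, False]))  -> (uint64 ndarray, 'bool', 'uint64')
+#   _ensure_data(np.array([1.5, 2.0]))     -> (float64 ndarray, 'float64',
+#                                              'float64')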
+
+def _reconstruct_data(values, dtype, original):
+ """
+ reverse of _ensure_data
+
+ Parameters
+ ----------
+ values : ndarray
+ dtype : pandas_dtype
+ original : ndarray-like
+
+ Returns
+ -------
+ Index for extension types, otherwise ndarray casted to dtype
+ """
+ from pandas import Index
+ if is_extension_array_dtype(dtype):
+ values = dtype.construct_array_type()._from_sequence(values)
+ elif is_datetime64tz_dtype(dtype) or is_period_dtype(dtype):
+ values = Index(original)._shallow_copy(values, name=None)
+ elif is_bool_dtype(dtype):
+ values = values.astype(dtype)
+
+ # we only support object dtypes bool Index
+ if isinstance(original, Index):
+ values = values.astype(object)
+ elif dtype is not None:
+ values = values.astype(dtype)
+
+ return values
+
+
+def _ensure_arraylike(values):
+ """
+ ensure that we are arraylike if not already
+ """
+ if not is_array_like(values):
+ inferred = lib.infer_dtype(values, skipna=False)
+ if inferred in ['mixed', 'string', 'unicode']:
+ if isinstance(values, tuple):
+ values = list(values)
+ values = construct_1d_object_array_from_listlike(values)
+ else:
+ values = np.asarray(values)
+ return values
+
+
+_hashtables = {
+ 'float64': (htable.Float64HashTable, htable.Float64Vector),
+ 'uint64': (htable.UInt64HashTable, htable.UInt64Vector),
+ 'int64': (htable.Int64HashTable, htable.Int64Vector),
+ 'string': (htable.StringHashTable, htable.ObjectVector),
+ 'object': (htable.PyObjectHashTable, htable.ObjectVector)
+}
+
+
+def _get_hashtable_algo(values):
+ """
+ Parameters
+ ----------
+ values : arraylike
+
+ Returns
+ -------
+ tuples(hashtable class,
+ vector class,
+ values,
+ dtype,
+ ndtype)
+ """
+ values, dtype, ndtype = _ensure_data(values)
+
+ if ndtype == 'object':
+
+ # it's cheaper to use a String Hash Table than Object; we infer
+ # including nulls because that is the only difference between
+ # StringHashTable and ObjectHashtable
+ if lib.infer_dtype(values, skipna=False) in ['string']:
+ ndtype = 'string'
+ else:
+ ndtype = 'object'
+
+ htable, table = _hashtables[ndtype]
+ return (htable, table, values, dtype, ndtype)
+
+
+def _get_data_algo(values, func_map):
+
+ if is_categorical_dtype(values):
+ values = values._values_for_rank()
+
+ values, dtype, ndtype = _ensure_data(values)
+ if ndtype == 'object':
+
+ # it's cheaper to use a String Hash Table than Object; we infer
+ # including nulls because that is the only difference between
+ # StringHashTable and ObjectHashtable
+ if lib.infer_dtype(values, skipna=False) in ['string']:
+ ndtype = 'string'
+
+ f = func_map.get(ndtype, func_map['object'])
+
+ return f, values
+
+
+# --------------- #
+# top-level algos #
+# --------------- #
+
+def match(to_match, values, na_sentinel=-1):
+ """
+ Compute locations of to_match into values
+
+ Parameters
+ ----------
+ to_match : array-like
+ values to find positions of
+ values : array-like
+ Unique set of values
+ na_sentinel : int, default -1
+ Value to mark "not found"
+
+ Returns
+ -------
+ match : ndarray of integers
+ """
+ values = com.asarray_tuplesafe(values)
+ htable, _, values, dtype, ndtype = _get_hashtable_algo(values)
+ to_match, _, _ = _ensure_data(to_match, dtype)
+ table = htable(min(len(to_match), 1000000))
+ table.map_locations(values)
+ result = table.lookup(to_match)
+
+ if na_sentinel != -1:
+
+ # replace but return a numpy array
+ # use a Series because it handles dtype conversions properly
+ from pandas import Series
+ result = Series(result.ravel()).replace(-1, na_sentinel)
+ result = result.values.reshape(result.shape)
+
+ return result
+
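+# For illustration: match([0, 2, 9], [0, 1, 2]) gives array([0, 2, -1]), with
+# the na_sentinel (-1 by default) marking values not found in `values`.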
+
+def unique(values):
+ """
+ Hash table-based unique. Uniques are returned in order
+ of appearance. This does NOT sort.
+
+ Significantly faster than numpy.unique. Includes NA values.
+
+ Parameters
+ ----------
+ values : 1d array-like
+
+ Returns
+ -------
+ unique values.
+ - If the input is an Index, the return is an Index
+ - If the input is a Categorical dtype, the return is a Categorical
+ - If the input is a Series/ndarray, the return will be an ndarray
+
+ See Also
+ --------
+ pandas.Index.unique
+ pandas.Series.unique
+
+ Examples
+ --------
+ >>> pd.unique(pd.Series([2, 1, 3, 3]))
+ array([2, 1, 3])
+
+ >>> pd.unique(pd.Series([2] + [1] * 5))
+ array([2, 1])
+
+ >>> pd.unique(pd.Series([pd.Timestamp('20160101'),
+ ... pd.Timestamp('20160101')]))
+ array(['2016-01-01T00:00:00.000000000'], dtype='datetime64[ns]')
+
+ >>> pd.unique(pd.Series([pd.Timestamp('20160101', tz='US/Eastern'),
+ ... pd.Timestamp('20160101', tz='US/Eastern')]))
+ array([Timestamp('2016-01-01 00:00:00-0500', tz='US/Eastern')],
+ dtype=object)
+
+ >>> pd.unique(pd.Index([pd.Timestamp('20160101', tz='US/Eastern'),
+ ... pd.Timestamp('20160101', tz='US/Eastern')]))
+ DatetimeIndex(['2016-01-01 00:00:00-05:00'],
+ ... dtype='datetime64[ns, US/Eastern]', freq=None)
+
+ >>> pd.unique(list('baabc'))
+ array(['b', 'a', 'c'], dtype=object)
+
+ An unordered Categorical will return categories in the
+ order of appearance.
+
+ >>> pd.unique(pd.Series(pd.Categorical(list('baabc'))))
+ [b, a, c]
+ Categories (3, object): [b, a, c]
+
+ >>> pd.unique(pd.Series(pd.Categorical(list('baabc'),
+ ... categories=list('abc'))))
+ [b, a, c]
+ Categories (3, object): [b, a, c]
+
+ An ordered Categorical preserves the category ordering.
+
+ >>> pd.unique(pd.Series(pd.Categorical(list('baabc'),
+ ... categories=list('abc'),
+ ... ordered=True)))
+ [b, a, c]
+ Categories (3, object): [a < b < c]
+
+ An array of tuples
+
+ >>> pd.unique([('a', 'b'), ('b', 'a'), ('a', 'c'), ('b', 'a')])
+ array([('a', 'b'), ('b', 'a'), ('a', 'c')], dtype=object)
+ """
+
+ values = _ensure_arraylike(values)
+
+ if is_extension_array_dtype(values):
+ # Dispatch to extension dtype's unique.
+ return values.unique()
+
+ original = values
+ htable, _, values, dtype, ndtype = _get_hashtable_algo(values)
+
+ table = htable(len(values))
+ uniques = table.unique(values)
+ uniques = _reconstruct_data(uniques, dtype, original)
+ return uniques
+
+
+unique1d = unique
+
+
+def isin(comps, values):
+ """
+ Compute the isin boolean array
+
+ Parameters
+ ----------
+ comps : array-like
+ values : array-like
+
+ Returns
+ -------
+ boolean array same length as comps
+ """
+
+ if not is_list_like(comps):
+ raise TypeError("only list-like objects are allowed to be passed"
+ " to isin(), you passed a [{comps_type}]"
+ .format(comps_type=type(comps).__name__))
+ if not is_list_like(values):
+ raise TypeError("only list-like objects are allowed to be passed"
+ " to isin(), you passed a [{values_type}]"
+ .format(values_type=type(values).__name__))
+
+ if not isinstance(values, (ABCIndex, ABCSeries, np.ndarray)):
+ values = construct_1d_object_array_from_listlike(list(values))
+
+ if is_categorical_dtype(comps):
+ # TODO(extension)
+ # handle categoricals
+ return comps._values.isin(values)
+
+ comps = com.values_from_object(comps)
+
+ comps, dtype, _ = _ensure_data(comps)
+ values, _, _ = _ensure_data(values, dtype=dtype)
+
+ # faster for larger cases to use np.in1d
+ f = lambda x, y: htable.ismember_object(x, values)
+
+ # GH16012
+ # Ensure np.in1d doesn't get object types or it *may* throw an exception
+ if len(comps) > 1000000 and not is_object_dtype(comps):
+ f = lambda x, y: np.in1d(x, y)
+ elif is_integer_dtype(comps):
+ try:
+ values = values.astype('int64', copy=False)
+ comps = comps.astype('int64', copy=False)
+ f = lambda x, y: htable.ismember_int64(x, y)
+ except (TypeError, ValueError, OverflowError):
+ values = values.astype(object)
+ comps = comps.astype(object)
+
+ elif is_float_dtype(comps):
+ try:
+ values = values.astype('float64', copy=False)
+ comps = comps.astype('float64', copy=False)
+ f = lambda x, y: htable.ismember_float64(x, y)
+ except (TypeError, ValueError):
+ values = values.astype(object)
+ comps = comps.astype(object)
+
+ return f(comps, values)
+
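+# For illustration: isin(np.array([2, 3, 5]), [3]) gives
+# array([False, True, False]); for very large non-object `comps` (more than
+# 1,000,000 elements) the np.in1d branch above is taken instead of the
+# hashtable membership check.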
+
+def _factorize_array(values, na_sentinel=-1, size_hint=None,
+ na_value=None):
+ """Factorize an array-like to labels and uniques.
+
+ This doesn't do any coercion of types or unboxing before factorization.
+
+ Parameters
+ ----------
+ values : ndarray
+ na_sentinel : int, default -1
+ size_hint : int, optional
+        Passed through to the hashtable's 'get_labels' method
+ na_value : object, optional
+ A value in `values` to consider missing. Note: only use this
+ parameter when you know that you don't have any values pandas would
+ consider missing in the array (NaN for float data, iNaT for
+ datetimes, etc.).
+
+ Returns
+ -------
+ labels, uniques : ndarray
+ """
+ (hash_klass, _), values = _get_data_algo(values, _hashtables)
+
+ table = hash_klass(size_hint or len(values))
+ uniques, labels = table.factorize(values, na_sentinel=na_sentinel,
+ na_value=na_value)
+
+ labels = ensure_platform_int(labels)
+ return labels, uniques
+
+
+_shared_docs['factorize'] = """
+ Encode the object as an enumerated type or categorical variable.
+
+ This method is useful for obtaining a numeric representation of an
+ array when all that matters is identifying distinct values. `factorize`
+ is available as both a top-level function :func:`pandas.factorize`,
+ and as a method :meth:`Series.factorize` and :meth:`Index.factorize`.
+
+ Parameters
+ ----------
+ %(values)s%(sort)s%(order)s
+ na_sentinel : int, default -1
+ Value to mark "not found".
+ %(size_hint)s\
+
+ Returns
+ -------
+ labels : ndarray
+ An integer ndarray that's an indexer into `uniques`.
+ ``uniques.take(labels)`` will have the same values as `values`.
+ uniques : ndarray, Index, or Categorical
+ The unique valid values. When `values` is Categorical, `uniques`
+ is a Categorical. When `values` is some other pandas object, an
+ `Index` is returned. Otherwise, a 1-D ndarray is returned.
+
+ .. note ::
+
+ Even if there's a missing value in `values`, `uniques` will
+ *not* contain an entry for it.
+
+ See Also
+ --------
+ cut : Discretize continuous-valued array.
+    unique : Find the unique values in an array.
+
+ Examples
+ --------
+ These examples all show factorize as a top-level method like
+ ``pd.factorize(values)``. The results are identical for methods like
+ :meth:`Series.factorize`.
+
+ >>> labels, uniques = pd.factorize(['b', 'b', 'a', 'c', 'b'])
+ >>> labels
+ array([0, 0, 1, 2, 0])
+ >>> uniques
+ array(['b', 'a', 'c'], dtype=object)
+
+ With ``sort=True``, the `uniques` will be sorted, and `labels` will be
+    shuffled so that the relationship is maintained.
+
+ >>> labels, uniques = pd.factorize(['b', 'b', 'a', 'c', 'b'], sort=True)
+ >>> labels
+ array([1, 1, 0, 2, 1])
+ >>> uniques
+ array(['a', 'b', 'c'], dtype=object)
+
+ Missing values are indicated in `labels` with `na_sentinel`
+ (``-1`` by default). Note that missing values are never
+ included in `uniques`.
+
+ >>> labels, uniques = pd.factorize(['b', None, 'a', 'c', 'b'])
+ >>> labels
+ array([ 0, -1, 1, 2, 0])
+ >>> uniques
+ array(['b', 'a', 'c'], dtype=object)
+
+ Thus far, we've only factorized lists (which are internally coerced to
+ NumPy arrays). When factorizing pandas objects, the type of `uniques`
+ will differ. For Categoricals, a `Categorical` is returned.
+
+ >>> cat = pd.Categorical(['a', 'a', 'c'], categories=['a', 'b', 'c'])
+ >>> labels, uniques = pd.factorize(cat)
+ >>> labels
+ array([0, 0, 1])
+ >>> uniques
+ [a, c]
+ Categories (3, object): [a, b, c]
+
+ Notice that ``'b'`` is in ``uniques.categories``, despite not being
+ present in ``cat.values``.
+
+ For all other pandas objects, an Index of the appropriate type is
+ returned.
+
+ >>> cat = pd.Series(['a', 'a', 'c'])
+ >>> labels, uniques = pd.factorize(cat)
+ >>> labels
+ array([0, 0, 1])
+ >>> uniques
+ Index(['a', 'c'], dtype='object')
+ """
+
+
+@Substitution(
+ values=dedent("""\
+ values : sequence
+ A 1-D sequence. Sequences that aren't pandas objects are
+ coerced to ndarrays before factorization.
+ """),
+ order=dedent("""\
+ order
+ .. deprecated:: 0.23.0
+
+ This parameter has no effect and is deprecated.
+ """),
+ sort=dedent("""\
+ sort : bool, default False
+ Sort `uniques` and shuffle `labels` to maintain the
+ relationship.
+ """),
+ size_hint=dedent("""\
+ size_hint : int, optional
+ Hint to the hashtable sizer.
+ """),
+)
+@Appender(_shared_docs['factorize'])
+@deprecate_kwarg(old_arg_name='order', new_arg_name=None)
+def factorize(values, sort=False, order=None, na_sentinel=-1, size_hint=None):
+ # Implementation notes: This method is responsible for 3 things
+ # 1.) coercing data to array-like (ndarray, Index, extension array)
+ # 2.) factorizing labels and uniques
+ # 3.) Maybe boxing the output in an Index
+ #
+ # Step 2 is dispatched to extension types (like Categorical). They are
+ # responsible only for factorization. All data coercion, sorting and boxing
+ # should happen here.
+
+ values = _ensure_arraylike(values)
+ original = values
+
+ if is_extension_array_dtype(values):
+ values = getattr(values, '_values', values)
+ labels, uniques = values.factorize(na_sentinel=na_sentinel)
+ dtype = original.dtype
+ else:
+ values, dtype, _ = _ensure_data(values)
+
+ if (is_datetime64_any_dtype(original) or
+ is_timedelta64_dtype(original) or
+ is_period_dtype(original)):
+ na_value = na_value_for_dtype(original.dtype)
+ else:
+ na_value = None
+
+ labels, uniques = _factorize_array(values,
+ na_sentinel=na_sentinel,
+ size_hint=size_hint,
+ na_value=na_value)
+
+ if sort and len(uniques) > 0:
+ from pandas.core.sorting import safe_sort
+ if na_sentinel == -1:
+ # GH-25409 take_1d only works for na_sentinels of -1
+ try:
+ order = uniques.argsort()
+ order2 = order.argsort()
+ labels = take_1d(order2, labels, fill_value=na_sentinel)
+ uniques = uniques.take(order)
+ except TypeError:
+ # Mixed types, where uniques.argsort fails.
+ uniques, labels = safe_sort(uniques, labels,
+ na_sentinel=na_sentinel,
+ assume_unique=True)
+ else:
+ uniques, labels = safe_sort(uniques, labels,
+ na_sentinel=na_sentinel,
+ assume_unique=True)
+
+ uniques = _reconstruct_data(uniques, dtype, original)
+
+    # return uniques in the original container type where appropriate
+ if isinstance(original, ABCIndexClass):
+ uniques = original._shallow_copy(uniques, name=None)
+ elif isinstance(original, ABCSeries):
+ from pandas import Index
+ uniques = Index(uniques)
+
+ return labels, uniques
+
+
+def value_counts(values, sort=True, ascending=False, normalize=False,
+ bins=None, dropna=True):
+ """
+ Compute a histogram of the counts of non-null values.
+
+ Parameters
+ ----------
+ values : ndarray (1-d)
+ sort : boolean, default True
+ Sort by values
+ ascending : boolean, default False
+ Sort in ascending order
+    normalize : boolean, default False
+ If True then compute a relative histogram
+ bins : integer, optional
+ Rather than count values, group them into half-open bins,
+ convenience for pd.cut, only works with numeric data
+ dropna : boolean, default True
+ Don't include counts of NaN
+
+ Returns
+ -------
+ value_counts : Series
+
+ """
+ from pandas.core.series import Series, Index
+ name = getattr(values, 'name', None)
+
+ if bins is not None:
+ try:
+ from pandas.core.reshape.tile import cut
+ values = Series(values)
+ ii = cut(values, bins, include_lowest=True)
+ except TypeError:
+ raise TypeError("bins argument only works with numeric data.")
+
+        # count, remove nulls (from the index), and convert the bin index
+        # to intervals
+ result = ii.value_counts(dropna=dropna)
+ result = result[result.index.notna()]
+ result.index = result.index.astype('interval')
+ result = result.sort_index()
+
+        # if dropna is True and all counts are zero, return an empty result
+ if dropna and (result.values == 0).all():
+ result = result.iloc[0:0]
+
+ # normalizing is by len of all (regardless of dropna)
+ counts = np.array([len(ii)])
+
+ else:
+
+ if is_extension_array_dtype(values) or is_sparse(values):
+
+ # handle Categorical and sparse,
+ result = Series(values)._values.value_counts(dropna=dropna)
+ result.name = name
+ counts = result.values
+
+ else:
+ keys, counts = _value_counts_arraylike(values, dropna)
+
+ if not isinstance(keys, Index):
+ keys = Index(keys)
+ result = Series(counts, index=keys, name=name)
+
+ if sort:
+ result = result.sort_values(ascending=ascending)
+
+ if normalize:
+ result = result / float(counts.sum())
+
+ return result
+
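+# For illustration: value_counts(np.array([1, 1, 2]), normalize=True) yields
+# a Series indexed by [1, 2] with values [2/3, 1/3], sorted by count
+# descending (sort=True is the default).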
+
+def _value_counts_arraylike(values, dropna):
+ """
+ Parameters
+ ----------
+ values : arraylike
+ dropna : boolean
+
+ Returns
+ -------
+ (uniques, counts)
+
+ """
+ values = _ensure_arraylike(values)
+ original = values
+ values, dtype, ndtype = _ensure_data(values)
+
+ if needs_i8_conversion(dtype):
+ # i8
+
+ keys, counts = htable.value_count_int64(values, dropna)
+
+ if dropna:
+ msk = keys != iNaT
+ keys, counts = keys[msk], counts[msk]
+
+ else:
+ # ndarray like
+
+ # TODO: handle uint8
+ f = getattr(htable, "value_count_{dtype}".format(dtype=ndtype))
+ keys, counts = f(values, dropna)
+
+ mask = isna(values)
+ if not dropna and mask.any():
+ if not isna(keys).any():
+ keys = np.insert(keys, 0, np.NaN)
+ counts = np.insert(counts, 0, mask.sum())
+
+ keys = _reconstruct_data(keys, original.dtype, original)
+
+ return keys, counts
+
+
+def duplicated(values, keep='first'):
+ """
+ Return boolean ndarray denoting duplicate values.
+
+ .. versionadded:: 0.19.0
+
+ Parameters
+ ----------
+ values : ndarray-like
+ Array over which to check for duplicate values.
+ keep : {'first', 'last', False}, default 'first'
+ - ``first`` : Mark duplicates as ``True`` except for the first
+ occurrence.
+ - ``last`` : Mark duplicates as ``True`` except for the last
+ occurrence.
+ - False : Mark all duplicates as ``True``.
+
+ Returns
+ -------
+ duplicated : ndarray
+ """
+
+ values, dtype, ndtype = _ensure_data(values)
+ f = getattr(htable, "duplicated_{dtype}".format(dtype=ndtype))
+ return f(values, keep=keep)
+
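+# For illustration: duplicated(np.array([1, 2, 1, 3, 1])) with the default
+# keep='first' gives array([False, False, True, False, True]).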
+
+def mode(values, dropna=True):
+ """
+ Returns the mode(s) of an array.
+
+ Parameters
+ ----------
+ values : array-like
+ Array over which to check for duplicate values.
+ dropna : boolean, default True
+ Don't consider counts of NaN/NaT.
+
+ .. versionadded:: 0.24.0
+
+ Returns
+ -------
+ mode : Series
+ """
+ from pandas import Series
+
+ values = _ensure_arraylike(values)
+ original = values
+
+ # categorical is a fast-path
+ if is_categorical_dtype(values):
+ if isinstance(values, Series):
+ return Series(values.values.mode(dropna=dropna), name=values.name)
+ return values.mode(dropna=dropna)
+
+ if dropna and is_datetimelike(values):
+ mask = values.isnull()
+ values = values[~mask]
+
+ values, dtype, ndtype = _ensure_data(values)
+
+ f = getattr(htable, "mode_{dtype}".format(dtype=ndtype))
+ result = f(values, dropna=dropna)
+ try:
+ result = np.sort(result)
+ except TypeError as e:
+ warn("Unable to sort modes: {error}".format(error=e))
+
+ result = _reconstruct_data(result, original.dtype, original)
+ return Series(result)
+
+
+def rank(values, axis=0, method='average', na_option='keep',
+ ascending=True, pct=False):
+ """
+ Rank the values along a given axis.
+
+ Parameters
+ ----------
+ values : array-like
+ Array whose values will be ranked. The number of dimensions in this
+ array must not exceed 2.
+ axis : int, default 0
+ Axis over which to perform rankings.
+ method : {'average', 'min', 'max', 'first', 'dense'}, default 'average'
+ The method by which tiebreaks are broken during the ranking.
+ na_option : {'keep', 'top'}, default 'keep'
+ The method by which NaNs are placed in the ranking.
+ - ``keep``: rank each NaN value with a NaN ranking
+        - ``top``: replace each NaN with either +/- inf so that they
+          are ranked at the top
+ ascending : boolean, default True
+ Whether or not the elements should be ranked in ascending order.
+ pct : boolean, default False
+        Whether or not to display the returned rankings in integer form
+ (e.g. 1, 2, 3) or in percentile form (e.g. 0.333..., 0.666..., 1).
+ """
+ if values.ndim == 1:
+ f, values = _get_data_algo(values, _rank1d_functions)
+ ranks = f(values, ties_method=method, ascending=ascending,
+ na_option=na_option, pct=pct)
+ elif values.ndim == 2:
+ f, values = _get_data_algo(values, _rank2d_functions)
+ ranks = f(values, axis=axis, ties_method=method,
+ ascending=ascending, na_option=na_option, pct=pct)
+ else:
+        raise TypeError("Arrays with ndim > 2 are not supported.")
+
+ return ranks
+
+
+def checked_add_with_arr(arr, b, arr_mask=None, b_mask=None):
+ """
+ Perform array addition that checks for underflow and overflow.
+
+ Performs the addition of an int64 array and an int64 integer (or array)
+ but checks that they do not result in overflow first. For elements that
+ are indicated to be NaN, whether or not there is overflow for that element
+ is automatically ignored.
+
+ Parameters
+ ----------
+ arr : array addend.
+ b : array or scalar addend.
+ arr_mask : boolean array or None
+ array indicating which elements to exclude from checking
+ b_mask : boolean array or boolean or None
+ array or scalar indicating which element(s) to exclude from checking
+
+ Returns
+ -------
+ sum : An array for elements x + b for each element x in arr if b is
+ a scalar or an array for elements x + y for each element pair
+ (x, y) in (arr, b).
+
+ Raises
+ ------
+ OverflowError if any x + y exceeds the maximum or minimum int64 value.
+ """
+ # For performance reasons, we broadcast 'b' to the new array 'b2'
+ # so that it has the same size as 'arr'.
+ b2 = np.broadcast_to(b, arr.shape)
+ if b_mask is not None:
+ # We do the same broadcasting for b_mask as well.
+ b2_mask = np.broadcast_to(b_mask, arr.shape)
+ else:
+ b2_mask = None
+
+ # For elements that are NaN, regardless of their value, we should
+ # ignore whether they overflow or not when doing the checked add.
+ if arr_mask is not None and b2_mask is not None:
+ not_nan = np.logical_not(arr_mask | b2_mask)
+ elif arr_mask is not None:
+ not_nan = np.logical_not(arr_mask)
+ elif b_mask is not None:
+ not_nan = np.logical_not(b2_mask)
+ else:
+ not_nan = np.empty(arr.shape, dtype=bool)
+ not_nan.fill(True)
+
+ # gh-14324: For each element in 'arr' and its corresponding element
+ # in 'b2', we check the sign of the element in 'b2'. If it is positive,
+ # we then check whether its sum with the element in 'arr' exceeds
+ # np.iinfo(np.int64).max. If so, we have an overflow error. If it
+ # it is negative, we then check whether its sum with the element in
+ # 'arr' exceeds np.iinfo(np.int64).min. If so, we have an overflow
+ # error as well.
+ mask1 = b2 > 0
+ mask2 = b2 < 0
+
+ if not mask1.any():
+ to_raise = ((np.iinfo(np.int64).min - b2 > arr) & not_nan).any()
+ elif not mask2.any():
+ to_raise = ((np.iinfo(np.int64).max - b2 < arr) & not_nan).any()
+ else:
+ to_raise = (((np.iinfo(np.int64).max -
+ b2[mask1] < arr[mask1]) & not_nan[mask1]).any() or
+ ((np.iinfo(np.int64).min -
+ b2[mask2] > arr[mask2]) & not_nan[mask2]).any())
+
+ if to_raise:
+ raise OverflowError("Overflow in int64 addition")
+ return arr + b
+
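+# For illustration: checked_add_with_arr(np.array([1, 2], dtype='int64'),
+# np.iinfo(np.int64).max) raises OverflowError instead of silently wrapping
+# around.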
+
+_rank1d_functions = {
+ 'float64': algos.rank_1d_float64,
+ 'int64': algos.rank_1d_int64,
+ 'uint64': algos.rank_1d_uint64,
+ 'object': algos.rank_1d_object
+}
+
+_rank2d_functions = {
+ 'float64': algos.rank_2d_float64,
+ 'int64': algos.rank_2d_int64,
+ 'uint64': algos.rank_2d_uint64,
+ 'object': algos.rank_2d_object
+}
+
+
+def quantile(x, q, interpolation_method='fraction'):
+ """
+ Compute sample quantile or quantiles of the input array. For example, q=0.5
+ computes the median.
+
+ The `interpolation_method` parameter supports three values, namely
+ `fraction` (default), `lower` and `higher`. Interpolation is done only,
+ if the desired quantile lies between two data points `i` and `j`. For
+ `fraction`, the result is an interpolated value between `i` and `j`;
+ for `lower`, the result is `i`, for `higher` the result is `j`.
+
+ Parameters
+ ----------
+ x : ndarray
+ Values from which to extract score.
+ q : scalar or array
+ Percentile at which to extract score.
+ interpolation_method : {'fraction', 'lower', 'higher'}, optional
+ This optional parameter specifies the interpolation method to use,
+ when the desired quantile lies between two data points `i` and `j`:
+
+ - fraction: `i + (j - i)*fraction`, where `fraction` is the
+ fractional part of the index surrounded by `i` and `j`.
+        - lower: `i`.
+ - higher: `j`.
+
+ Returns
+ -------
+ score : float
+ Score at percentile.
+
+ Examples
+ --------
+ >>> from scipy import stats
+ >>> a = np.arange(100)
+ >>> stats.scoreatpercentile(a, 50)
+ 49.5
+
+ """
+ x = np.asarray(x)
+ mask = isna(x)
+
+ x = x[~mask]
+
+ values = np.sort(x)
+
+ def _interpolate(a, b, fraction):
+ """Returns the point at the given fraction between a and b, where
+ 'fraction' must be between 0 and 1.
+ """
+ return a + (b - a) * fraction
+
+ def _get_score(at):
+ if len(values) == 0:
+ return np.nan
+
+ idx = at * (len(values) - 1)
+ if idx % 1 == 0:
+ score = values[int(idx)]
+ else:
+ if interpolation_method == 'fraction':
+ score = _interpolate(values[int(idx)], values[int(idx) + 1],
+ idx % 1)
+ elif interpolation_method == 'lower':
+ score = values[np.floor(idx)]
+ elif interpolation_method == 'higher':
+ score = values[np.ceil(idx)]
+ else:
+                raise ValueError("interpolation_method can only be "
+                                 "'fraction', 'lower' or 'higher'")
+
+ return score
+
+ if is_scalar(q):
+ return _get_score(q)
+ else:
+ q = np.asarray(q, np.float64)
+ return algos.arrmap_float64(q, _get_score)
+
+
+# --------------- #
+# select n #
+# --------------- #
+
+class SelectN(object):
+
+ def __init__(self, obj, n, keep):
+ self.obj = obj
+ self.n = n
+ self.keep = keep
+
+ if self.keep not in ('first', 'last', 'all'):
+ raise ValueError('keep must be either "first", "last" or "all"')
+
+ def nlargest(self):
+ return self.compute('nlargest')
+
+ def nsmallest(self):
+ return self.compute('nsmallest')
+
+ @staticmethod
+ def is_valid_dtype_n_method(dtype):
+ """
+ Helper function to determine if dtype is valid for
+ nsmallest/nlargest methods
+ """
+ return ((is_numeric_dtype(dtype) and not is_complex_dtype(dtype)) or
+ needs_i8_conversion(dtype))
+
+
+class SelectNSeries(SelectN):
+ """
+ Implement n largest/smallest for Series
+
+ Parameters
+ ----------
+ obj : Series
+ n : int
+ keep : {'first', 'last'}, default 'first'
+
+ Returns
+ -------
+ nordered : Series
+ """
+
+ def compute(self, method):
+
+ n = self.n
+ dtype = self.obj.dtype
+ if not self.is_valid_dtype_n_method(dtype):
+ raise TypeError("Cannot use method '{method}' with "
+ "dtype {dtype}".format(method=method,
+ dtype=dtype))
+
+ if n <= 0:
+ return self.obj[[]]
+
+ dropped = self.obj.dropna()
+
+ # slow method
+ if n >= len(self.obj):
+
+ reverse_it = (self.keep == 'last' or method == 'nlargest')
+ ascending = method == 'nsmallest'
+ slc = np.s_[::-1] if reverse_it else np.s_[:]
+ return dropped[slc].sort_values(ascending=ascending).head(n)
+
+ # fast method
+ arr, pandas_dtype, _ = _ensure_data(dropped.values)
+ if method == 'nlargest':
+ arr = -arr
+ if is_integer_dtype(pandas_dtype):
+ # GH 21426: ensure reverse ordering at boundaries
+ arr -= 1
+
+ if self.keep == 'last':
+ arr = arr[::-1]
+
+ narr = len(arr)
+ n = min(n, narr)
+
+ kth_val = algos.kth_smallest(arr.copy(), n - 1)
+ ns, = np.nonzero(arr <= kth_val)
+ inds = ns[arr[ns].argsort(kind='mergesort')]
+
+ if self.keep != 'all':
+ inds = inds[:n]
+
+ if self.keep == 'last':
+ # reverse indices
+ inds = narr - 1 - inds
+
+ return dropped.iloc[inds]
+
+
+class SelectNFrame(SelectN):
+ """
+ Implement n largest/smallest for DataFrame
+
+ Parameters
+ ----------
+ obj : DataFrame
+ n : int
+ keep : {'first', 'last'}, default 'first'
+ columns : list or str
+
+ Returns
+ -------
+ nordered : DataFrame
+ """
+
+ def __init__(self, obj, n, keep, columns):
+ super(SelectNFrame, self).__init__(obj, n, keep)
+ if not is_list_like(columns) or isinstance(columns, tuple):
+ columns = [columns]
+ columns = list(columns)
+ self.columns = columns
+
+ def compute(self, method):
+
+ from pandas import Int64Index
+ n = self.n
+ frame = self.obj
+ columns = self.columns
+
+ for column in columns:
+ dtype = frame[column].dtype
+ if not self.is_valid_dtype_n_method(dtype):
+ raise TypeError((
+ "Column {column!r} has dtype {dtype}, cannot use method "
+ "{method!r} with this dtype"
+ ).format(column=column, dtype=dtype, method=method))
+
+ def get_indexer(current_indexer, other_indexer):
+ """Helper function to concat `current_indexer` and `other_indexer`
+ depending on `method`
+ """
+ if method == 'nsmallest':
+ return current_indexer.append(other_indexer)
+ else:
+ return other_indexer.append(current_indexer)
+
+ # Below we save and reset the index in case index contains duplicates
+ original_index = frame.index
+ cur_frame = frame = frame.reset_index(drop=True)
+ cur_n = n
+ indexer = Int64Index([])
+
+ for i, column in enumerate(columns):
+ # For each column we apply method to cur_frame[column].
+ # If it's the last column or if we have the number of
+ # results desired we are done.
+ # Otherwise there are duplicates of the largest/smallest
+ # value and we need to look at the rest of the columns
+ # to determine which of the rows with the largest/smallest
+ # value in the column to keep.
+ series = cur_frame[column]
+ is_last_column = len(columns) - 1 == i
+ values = getattr(series, method)(
+ cur_n,
+ keep=self.keep if is_last_column else 'all')
+
+ if is_last_column or len(values) <= cur_n:
+ indexer = get_indexer(indexer, values.index)
+ break
+
+ # Now find all values which are equal to
+            # the (nsmallest: largest)/(nlargest: smallest)
+ # from our series.
+ border_value = values == values[values.index[-1]]
+
+ # Some of these values are among the top-n
+ # some aren't.
+ unsafe_values = values[border_value]
+
+ # These values are definitely among the top-n
+ safe_values = values[~border_value]
+ indexer = get_indexer(indexer, safe_values.index)
+
+ # Go on and separate the unsafe_values on the remaining
+ # columns.
+ cur_frame = cur_frame.loc[unsafe_values.index]
+ cur_n = n - len(indexer)
+
+ frame = frame.take(indexer)
+
+ # Restore the index on frame
+ frame.index = original_index.take(indexer)
+
+ # If there is only one column, the frame is already sorted.
+ if len(columns) == 1:
+ return frame
+
+ ascending = method == 'nsmallest'
+
+ return frame.sort_values(
+ columns,
+ ascending=ascending,
+ kind='mergesort')
+
+
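+# A minimal usage sketch of the column-by-column tie resolution above via the
+# public ``DataFrame.nlargest`` API (assuming ``import pandas as pd``): ties
+# in the first requested column are broken by the next one.
+#
+#     >>> df = pd.DataFrame({'a': [1, 2, 2], 'b': [3, 1, 2]})
+#     >>> df.nlargest(2, ['a', 'b'])
+#        a  b
+#     2  2  2
+#     1  2  1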
+# ---- #
+# take #
+# ---- #
+
+
+def _view_wrapper(f, arr_dtype=None, out_dtype=None, fill_wrap=None):
+ def wrapper(arr, indexer, out, fill_value=np.nan):
+ if arr_dtype is not None:
+ arr = arr.view(arr_dtype)
+ if out_dtype is not None:
+ out = out.view(out_dtype)
+ if fill_wrap is not None:
+ fill_value = fill_wrap(fill_value)
+ f(arr, indexer, out, fill_value=fill_value)
+
+ return wrapper
+
+
+def _convert_wrapper(f, conv_dtype):
+ def wrapper(arr, indexer, out, fill_value=np.nan):
+ arr = arr.astype(conv_dtype)
+ f(arr, indexer, out, fill_value=fill_value)
+
+ return wrapper
+
+
+def _take_2d_multi_object(arr, indexer, out, fill_value, mask_info):
+ # this is not ideal, performance-wise, but it's better than raising
+ # an exception (best to optimize in Cython to avoid getting here)
+ row_idx, col_idx = indexer
+ if mask_info is not None:
+ (row_mask, col_mask), (row_needs, col_needs) = mask_info
+ else:
+ row_mask = row_idx == -1
+ col_mask = col_idx == -1
+ row_needs = row_mask.any()
+ col_needs = col_mask.any()
+ if fill_value is not None:
+ if row_needs:
+ out[row_mask, :] = fill_value
+ if col_needs:
+ out[:, col_mask] = fill_value
+ for i in range(len(row_idx)):
+ u_ = row_idx[i]
+ for j in range(len(col_idx)):
+ v = col_idx[j]
+ out[i, j] = arr[u_, v]
+
+
+def _take_nd_object(arr, indexer, out, axis, fill_value, mask_info):
+ if mask_info is not None:
+ mask, needs_masking = mask_info
+ else:
+ mask = indexer == -1
+ needs_masking = mask.any()
+ if arr.dtype != out.dtype:
+ arr = arr.astype(out.dtype)
+ if arr.shape[axis] > 0:
+ arr.take(ensure_platform_int(indexer), axis=axis, out=out)
+ if needs_masking:
+ outindexer = [slice(None)] * arr.ndim
+ outindexer[axis] = mask
+ out[tuple(outindexer)] = fill_value
+
+
+_take_1d_dict = {
+ ('int8', 'int8'): algos.take_1d_int8_int8,
+ ('int8', 'int32'): algos.take_1d_int8_int32,
+ ('int8', 'int64'): algos.take_1d_int8_int64,
+ ('int8', 'float64'): algos.take_1d_int8_float64,
+ ('int16', 'int16'): algos.take_1d_int16_int16,
+ ('int16', 'int32'): algos.take_1d_int16_int32,
+ ('int16', 'int64'): algos.take_1d_int16_int64,
+ ('int16', 'float64'): algos.take_1d_int16_float64,
+ ('int32', 'int32'): algos.take_1d_int32_int32,
+ ('int32', 'int64'): algos.take_1d_int32_int64,
+ ('int32', 'float64'): algos.take_1d_int32_float64,
+ ('int64', 'int64'): algos.take_1d_int64_int64,
+ ('int64', 'float64'): algos.take_1d_int64_float64,
+ ('float32', 'float32'): algos.take_1d_float32_float32,
+ ('float32', 'float64'): algos.take_1d_float32_float64,
+ ('float64', 'float64'): algos.take_1d_float64_float64,
+ ('object', 'object'): algos.take_1d_object_object,
+ ('bool', 'bool'): _view_wrapper(algos.take_1d_bool_bool, np.uint8,
+ np.uint8),
+ ('bool', 'object'): _view_wrapper(algos.take_1d_bool_object, np.uint8,
+ None),
+ ('datetime64[ns]', 'datetime64[ns]'): _view_wrapper(
+ algos.take_1d_int64_int64, np.int64, np.int64, np.int64)
+}
+
+_take_2d_axis0_dict = {
+ ('int8', 'int8'): algos.take_2d_axis0_int8_int8,
+ ('int8', 'int32'): algos.take_2d_axis0_int8_int32,
+ ('int8', 'int64'): algos.take_2d_axis0_int8_int64,
+ ('int8', 'float64'): algos.take_2d_axis0_int8_float64,
+ ('int16', 'int16'): algos.take_2d_axis0_int16_int16,
+ ('int16', 'int32'): algos.take_2d_axis0_int16_int32,
+ ('int16', 'int64'): algos.take_2d_axis0_int16_int64,
+ ('int16', 'float64'): algos.take_2d_axis0_int16_float64,
+ ('int32', 'int32'): algos.take_2d_axis0_int32_int32,
+ ('int32', 'int64'): algos.take_2d_axis0_int32_int64,
+ ('int32', 'float64'): algos.take_2d_axis0_int32_float64,
+ ('int64', 'int64'): algos.take_2d_axis0_int64_int64,
+ ('int64', 'float64'): algos.take_2d_axis0_int64_float64,
+ ('float32', 'float32'): algos.take_2d_axis0_float32_float32,
+ ('float32', 'float64'): algos.take_2d_axis0_float32_float64,
+ ('float64', 'float64'): algos.take_2d_axis0_float64_float64,
+ ('object', 'object'): algos.take_2d_axis0_object_object,
+ ('bool', 'bool'): _view_wrapper(algos.take_2d_axis0_bool_bool, np.uint8,
+ np.uint8),
+ ('bool', 'object'): _view_wrapper(algos.take_2d_axis0_bool_object,
+ np.uint8, None),
+ ('datetime64[ns]', 'datetime64[ns]'):
+ _view_wrapper(algos.take_2d_axis0_int64_int64, np.int64, np.int64,
+ fill_wrap=np.int64)
+}
+
+_take_2d_axis1_dict = {
+ ('int8', 'int8'): algos.take_2d_axis1_int8_int8,
+ ('int8', 'int32'): algos.take_2d_axis1_int8_int32,
+ ('int8', 'int64'): algos.take_2d_axis1_int8_int64,
+ ('int8', 'float64'): algos.take_2d_axis1_int8_float64,
+ ('int16', 'int16'): algos.take_2d_axis1_int16_int16,
+ ('int16', 'int32'): algos.take_2d_axis1_int16_int32,
+ ('int16', 'int64'): algos.take_2d_axis1_int16_int64,
+ ('int16', 'float64'): algos.take_2d_axis1_int16_float64,
+ ('int32', 'int32'): algos.take_2d_axis1_int32_int32,
+ ('int32', 'int64'): algos.take_2d_axis1_int32_int64,
+ ('int32', 'float64'): algos.take_2d_axis1_int32_float64,
+ ('int64', 'int64'): algos.take_2d_axis1_int64_int64,
+ ('int64', 'float64'): algos.take_2d_axis1_int64_float64,
+ ('float32', 'float32'): algos.take_2d_axis1_float32_float32,
+ ('float32', 'float64'): algos.take_2d_axis1_float32_float64,
+ ('float64', 'float64'): algos.take_2d_axis1_float64_float64,
+ ('object', 'object'): algos.take_2d_axis1_object_object,
+ ('bool', 'bool'): _view_wrapper(algos.take_2d_axis1_bool_bool, np.uint8,
+ np.uint8),
+ ('bool', 'object'): _view_wrapper(algos.take_2d_axis1_bool_object,
+ np.uint8, None),
+ ('datetime64[ns]', 'datetime64[ns]'):
+ _view_wrapper(algos.take_2d_axis1_int64_int64, np.int64, np.int64,
+ fill_wrap=np.int64)
+}
+
+_take_2d_multi_dict = {
+ ('int8', 'int8'): algos.take_2d_multi_int8_int8,
+ ('int8', 'int32'): algos.take_2d_multi_int8_int32,
+ ('int8', 'int64'): algos.take_2d_multi_int8_int64,
+ ('int8', 'float64'): algos.take_2d_multi_int8_float64,
+ ('int16', 'int16'): algos.take_2d_multi_int16_int16,
+ ('int16', 'int32'): algos.take_2d_multi_int16_int32,
+ ('int16', 'int64'): algos.take_2d_multi_int16_int64,
+ ('int16', 'float64'): algos.take_2d_multi_int16_float64,
+ ('int32', 'int32'): algos.take_2d_multi_int32_int32,
+ ('int32', 'int64'): algos.take_2d_multi_int32_int64,
+ ('int32', 'float64'): algos.take_2d_multi_int32_float64,
+ ('int64', 'int64'): algos.take_2d_multi_int64_int64,
+ ('int64', 'float64'): algos.take_2d_multi_int64_float64,
+ ('float32', 'float32'): algos.take_2d_multi_float32_float32,
+ ('float32', 'float64'): algos.take_2d_multi_float32_float64,
+ ('float64', 'float64'): algos.take_2d_multi_float64_float64,
+ ('object', 'object'): algos.take_2d_multi_object_object,
+ ('bool', 'bool'): _view_wrapper(algos.take_2d_multi_bool_bool, np.uint8,
+ np.uint8),
+ ('bool', 'object'): _view_wrapper(algos.take_2d_multi_bool_object,
+ np.uint8, None),
+ ('datetime64[ns]', 'datetime64[ns]'):
+ _view_wrapper(algos.take_2d_multi_int64_int64, np.int64, np.int64,
+ fill_wrap=np.int64)
+}
+
+
+def _get_take_nd_function(ndim, arr_dtype, out_dtype, axis=0, mask_info=None):
+ if ndim <= 2:
+ tup = (arr_dtype.name, out_dtype.name)
+ if ndim == 1:
+ func = _take_1d_dict.get(tup, None)
+ elif ndim == 2:
+ if axis == 0:
+ func = _take_2d_axis0_dict.get(tup, None)
+ else:
+ func = _take_2d_axis1_dict.get(tup, None)
+ if func is not None:
+ return func
+
+ tup = (out_dtype.name, out_dtype.name)
+ if ndim == 1:
+ func = _take_1d_dict.get(tup, None)
+ elif ndim == 2:
+ if axis == 0:
+ func = _take_2d_axis0_dict.get(tup, None)
+ else:
+ func = _take_2d_axis1_dict.get(tup, None)
+ if func is not None:
+ func = _convert_wrapper(func, out_dtype)
+ return func
+
+ def func(arr, indexer, out, fill_value=np.nan):
+ indexer = ensure_int64(indexer)
+ _take_nd_object(arr, indexer, out, axis=axis, fill_value=fill_value,
+ mask_info=mask_info)
+
+ return func
+
+
+def take(arr, indices, axis=0, allow_fill=False, fill_value=None):
+ """
+ Take elements from an array.
+
+ .. versionadded:: 0.23.0
+
+ Parameters
+ ----------
+ arr : sequence
+ Non array-likes (sequences without a dtype) are coerced
+ to an ndarray.
+ indices : sequence of integers
+ Indices to be taken.
+ axis : int, default 0
+ The axis over which to select values.
+ allow_fill : bool, default False
+ How to handle negative values in `indices`.
+
+ * False: negative values in `indices` indicate positional indices
+ from the right (the default). This is similar to :func:`numpy.take`.
+
+ * True: negative values in `indices` indicate
+          missing values. These values are set to `fill_value`. Any other
+          negative values raise a ``ValueError``.
+
+ fill_value : any, optional
+ Fill value to use for NA-indices when `allow_fill` is True.
+ This may be ``None``, in which case the default NA value for
+ the type (``self.dtype.na_value``) is used.
+
+ For multi-dimensional `arr`, each *element* is filled with
+ `fill_value`.
+
+ Returns
+ -------
+ ndarray or ExtensionArray
+ Same type as the input.
+
+ Raises
+ ------
+ IndexError
+ When `indices` is out of bounds for the array.
+ ValueError
+ When the indexer contains negative values other than ``-1``
+ and `allow_fill` is True.
+
+ Notes
+ -----
+ When `allow_fill` is False, `indices` may be whatever dimensionality
+ is accepted by NumPy for `arr`.
+
+ When `allow_fill` is True, `indices` should be 1-D.
+
+ See Also
+ --------
+ numpy.take
+
+ Examples
+ --------
+ >>> from pandas.api.extensions import take
+
+ With the default ``allow_fill=False``, negative numbers indicate
+ positional indices from the right.
+
+ >>> take(np.array([10, 20, 30]), [0, 0, -1])
+ array([10, 10, 30])
+
+ Setting ``allow_fill=True`` will place `fill_value` in those positions.
+
+ >>> take(np.array([10, 20, 30]), [0, 0, -1], allow_fill=True)
+ array([10., 10., nan])
+
+ >>> take(np.array([10, 20, 30]), [0, 0, -1], allow_fill=True,
+ ... fill_value=-10)
+ array([ 10, 10, -10])
+ """
+ from pandas.core.indexing import validate_indices
+
+ if not is_array_like(arr):
+ arr = np.asarray(arr)
+
+ indices = np.asarray(indices, dtype=np.intp)
+
+ if allow_fill:
+ # Pandas style, -1 means NA
+ validate_indices(indices, len(arr))
+ result = take_1d(arr, indices, axis=axis, allow_fill=True,
+ fill_value=fill_value)
+ else:
+ # NumPy style
+ result = arr.take(indices, axis=axis)
+ return result
+
+
+def take_nd(arr, indexer, axis=0, out=None, fill_value=np.nan, mask_info=None,
+ allow_fill=True):
+ """
+ Specialized Cython take which sets NaN values in one pass
+
+ This dispatches to ``take`` defined on ExtensionArrays. It does not
+ currently dispatch to ``SparseArray.take`` for sparse ``arr``.
+
+ Parameters
+ ----------
+ arr : array-like
+ Input array.
+ indexer : ndarray
+ 1-D array of indices to take, subarrays corresponding to -1 value
+        indices are filled with fill_value
+ axis : int, default 0
+ Axis to take from
+ out : ndarray or None, default None
+ Optional output array, must be appropriate type to hold input and
+ fill_value together, if indexer has any -1 value entries; call
+ _maybe_promote to determine this type for any fill_value
+ fill_value : any, default np.nan
+ Fill value to replace -1 values with
+ mask_info : tuple of (ndarray, boolean)
+ If provided, value should correspond to:
+            (indexer == -1, (indexer == -1).any())
+ If not provided, it will be computed internally if necessary
+ allow_fill : boolean, default True
+ If False, indexer is assumed to contain no -1 values so no filling
+ will be done. This short-circuits computation of a mask. Result is
+ undefined if allow_fill == False and -1 is present in indexer.
+
+ Returns
+ -------
+ subarray : array-like
+ May be the same type as the input, or cast to an ndarray.
+ """
+
+ # TODO(EA): Remove these if / elifs as datetimeTZ, interval, become EAs
+ # dispatch to internal type takes
+ if is_extension_array_dtype(arr):
+ return arr.take(indexer, fill_value=fill_value, allow_fill=allow_fill)
+ elif is_datetime64tz_dtype(arr):
+ return arr.take(indexer, fill_value=fill_value, allow_fill=allow_fill)
+ elif is_interval_dtype(arr):
+ return arr.take(indexer, fill_value=fill_value, allow_fill=allow_fill)
+
+ if is_sparse(arr):
+ arr = arr.get_values()
+ elif isinstance(arr, (ABCIndexClass, ABCSeries)):
+ arr = arr.values
+
+ arr = np.asarray(arr)
+
+ if indexer is None:
+ indexer = np.arange(arr.shape[axis], dtype=np.int64)
+ dtype, fill_value = arr.dtype, arr.dtype.type()
+ else:
+ indexer = ensure_int64(indexer, copy=False)
+ if not allow_fill:
+ dtype, fill_value = arr.dtype, arr.dtype.type()
+ mask_info = None, False
+ else:
+ # check for promotion based on types only (do this first because
+ # it's faster than computing a mask)
+ dtype, fill_value = maybe_promote(arr.dtype, fill_value)
+ if dtype != arr.dtype and (out is None or out.dtype != dtype):
+ # check if promotion is actually required based on indexer
+ if mask_info is not None:
+ mask, needs_masking = mask_info
+ else:
+ mask = indexer == -1
+ needs_masking = mask.any()
+ mask_info = mask, needs_masking
+ if needs_masking:
+ if out is not None and out.dtype != dtype:
+ raise TypeError('Incompatible type for fill_value')
+ else:
+ # if not, then depromote, set fill_value to dummy
+ # (it won't be used but we don't want the cython code
+ # to crash when trying to cast it to dtype)
+ dtype, fill_value = arr.dtype, arr.dtype.type()
+
+ flip_order = False
+ if arr.ndim == 2:
+ if arr.flags.f_contiguous:
+ flip_order = True
+
+ if flip_order:
+ arr = arr.T
+ axis = arr.ndim - axis - 1
+ if out is not None:
+ out = out.T
+
+ # at this point, it's guaranteed that dtype can hold both the arr values
+ # and the fill_value
+ if out is None:
+ out_shape = list(arr.shape)
+ out_shape[axis] = len(indexer)
+ out_shape = tuple(out_shape)
+ if arr.flags.f_contiguous and axis == arr.ndim - 1:
+ # minor tweak that can make an order-of-magnitude difference
+ # for dataframes initialized directly from 2-d ndarrays
+ # (s.t. df.values is c-contiguous and df._data.blocks[0] is its
+ # f-contiguous transpose)
+ out = np.empty(out_shape, dtype=dtype, order='F')
+ else:
+ out = np.empty(out_shape, dtype=dtype)
+
+ func = _get_take_nd_function(arr.ndim, arr.dtype, out.dtype, axis=axis,
+ mask_info=mask_info)
+ func(arr, indexer, out, fill_value)
+
+ if flip_order:
+ out = out.T
+ return out
+
+
+take_1d = take_nd
+
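+# A minimal sketch of how a caller can precompute ``mask_info`` for
+# ``take_nd`` (assumes ``import numpy as np``); positions where the indexer
+# is -1 receive ``fill_value``:
+#
+#     >>> indexer = np.array([0, -1, 2], dtype=np.int64)
+#     >>> mask = indexer == -1
+#     >>> take_nd(np.array([10., 20., 30.]), indexer,
+#     ...         fill_value=np.nan, mask_info=(mask, mask.any()))
+#     array([10., nan, 30.])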
+
+def take_2d_multi(arr, indexer, out=None, fill_value=np.nan, mask_info=None,
+ allow_fill=True):
+ """
+ Specialized Cython take which sets NaN values in one pass
+ """
+ if indexer is None or (indexer[0] is None and indexer[1] is None):
+ row_idx = np.arange(arr.shape[0], dtype=np.int64)
+ col_idx = np.arange(arr.shape[1], dtype=np.int64)
+ indexer = row_idx, col_idx
+ dtype, fill_value = arr.dtype, arr.dtype.type()
+ else:
+ row_idx, col_idx = indexer
+ if row_idx is None:
+ row_idx = np.arange(arr.shape[0], dtype=np.int64)
+ else:
+ row_idx = ensure_int64(row_idx)
+ if col_idx is None:
+ col_idx = np.arange(arr.shape[1], dtype=np.int64)
+ else:
+ col_idx = ensure_int64(col_idx)
+ indexer = row_idx, col_idx
+ if not allow_fill:
+ dtype, fill_value = arr.dtype, arr.dtype.type()
+ mask_info = None, False
+ else:
+ # check for promotion based on types only (do this first because
+ # it's faster than computing a mask)
+ dtype, fill_value = maybe_promote(arr.dtype, fill_value)
+ if dtype != arr.dtype and (out is None or out.dtype != dtype):
+ # check if promotion is actually required based on indexer
+ if mask_info is not None:
+ (row_mask, col_mask), (row_needs, col_needs) = mask_info
+ else:
+ row_mask = row_idx == -1
+ col_mask = col_idx == -1
+ row_needs = row_mask.any()
+ col_needs = col_mask.any()
+ mask_info = (row_mask, col_mask), (row_needs, col_needs)
+ if row_needs or col_needs:
+ if out is not None and out.dtype != dtype:
+ raise TypeError('Incompatible type for fill_value')
+ else:
+ # if not, then depromote, set fill_value to dummy
+ # (it won't be used but we don't want the cython code
+ # to crash when trying to cast it to dtype)
+ dtype, fill_value = arr.dtype, arr.dtype.type()
+
+ # at this point, it's guaranteed that dtype can hold both the arr values
+ # and the fill_value
+ if out is None:
+ out_shape = len(row_idx), len(col_idx)
+ out = np.empty(out_shape, dtype=dtype)
+
+ func = _take_2d_multi_dict.get((arr.dtype.name, out.dtype.name), None)
+ if func is None and arr.dtype != out.dtype:
+ func = _take_2d_multi_dict.get((out.dtype.name, out.dtype.name), None)
+ if func is not None:
+ func = _convert_wrapper(func, out.dtype)
+ if func is None:
+
+ def func(arr, indexer, out, fill_value=np.nan):
+ _take_2d_multi_object(arr, indexer, out, fill_value=fill_value,
+ mask_info=mask_info)
+
+ func(arr, indexer, out=out, fill_value=fill_value)
+ return out
+
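+# A minimal usage sketch of ``take_2d_multi`` (assumes ``import numpy as
+# np``): rows or columns indexed with -1 are filled with ``fill_value``.
+#
+#     >>> arr = np.arange(6.).reshape(2, 3)
+#     >>> take_2d_multi(arr, (np.array([0, -1]), np.array([2, 0])))
+#     array([[ 2.,  0.],
+#            [nan, nan]])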
+
+# ---- #
+# diff #
+# ---- #
+
+_diff_special = {
+ 'float64': algos.diff_2d_float64,
+ 'float32': algos.diff_2d_float32,
+ 'int64': algos.diff_2d_int64,
+ 'int32': algos.diff_2d_int32,
+ 'int16': algos.diff_2d_int16,
+ 'int8': algos.diff_2d_int8,
+}
+
+
+def diff(arr, n, axis=0):
+ """
+    difference over n periods between arr and a shifted copy of itself,
+    analogous to ``s - s.shift(n)``
+
+ Parameters
+ ----------
+ arr : ndarray
+ n : int
+ number of periods
+ axis : int
+ axis to shift on
+
+ Returns
+ -------
+    shifted : ndarray
+
+ """
+
+ n = int(n)
+ na = np.nan
+ dtype = arr.dtype
+
+ is_timedelta = False
+ if needs_i8_conversion(arr):
+ dtype = np.float64
+ arr = arr.view('i8')
+ na = iNaT
+ is_timedelta = True
+
+ elif is_bool_dtype(dtype):
+ dtype = np.object_
+
+ elif is_integer_dtype(dtype):
+ dtype = np.float64
+
+ dtype = np.dtype(dtype)
+ out_arr = np.empty(arr.shape, dtype=dtype)
+
+ na_indexer = [slice(None)] * arr.ndim
+ na_indexer[axis] = slice(None, n) if n >= 0 else slice(n, None)
+ out_arr[tuple(na_indexer)] = na
+
+ if arr.ndim == 2 and arr.dtype.name in _diff_special:
+ f = _diff_special[arr.dtype.name]
+ f(arr, out_arr, n, axis)
+ else:
+ res_indexer = [slice(None)] * arr.ndim
+ res_indexer[axis] = slice(n, None) if n >= 0 else slice(None, n)
+ res_indexer = tuple(res_indexer)
+
+ lag_indexer = [slice(None)] * arr.ndim
+ lag_indexer[axis] = slice(None, -n) if n > 0 else slice(-n, None)
+ lag_indexer = tuple(lag_indexer)
+
+ # need to make sure that we account for na for datelike/timedelta
+ # we don't actually want to subtract these i8 numbers
+ if is_timedelta:
+ res = arr[res_indexer]
+ lag = arr[lag_indexer]
+
+ mask = (arr[res_indexer] == na) | (arr[lag_indexer] == na)
+ if mask.any():
+ res = res.copy()
+ res[mask] = 0
+ lag = lag.copy()
+ lag[mask] = 0
+
+ result = res - lag
+ result[mask] = na
+ out_arr[res_indexer] = result
+ else:
+ out_arr[res_indexer] = arr[res_indexer] - arr[lag_indexer]
+
+ if is_timedelta:
+ from pandas import TimedeltaIndex
+ out_arr = TimedeltaIndex(out_arr.ravel().astype('int64')).asi8.reshape(
+ out_arr.shape).astype('timedelta64[ns]')
+
+ return out_arr
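+
+
+# A minimal usage sketch of ``diff`` (assumes ``import numpy as np``); the
+# result matches ``s - s.shift(n)`` on the corresponding Series:
+#
+#     >>> diff(np.array([1, 3, 6, 10]), 1)
+#     array([nan, 2., 3., 4.])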
diff --git a/contrib/python/pandas/py2/pandas/core/api.py b/contrib/python/pandas/py2/pandas/core/api.py
new file mode 100644
index 00000000000..8c92287e212
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/core/api.py
@@ -0,0 +1,64 @@
+
+# pylint: disable=W0614,W0401,W0611
+# flake8: noqa
+
+import numpy as np
+
+from pandas.core.arrays.integer import (
+ Int8Dtype,
+ Int16Dtype,
+ Int32Dtype,
+ Int64Dtype,
+ UInt8Dtype,
+ UInt16Dtype,
+ UInt32Dtype,
+ UInt64Dtype,
+)
+from pandas.core.algorithms import factorize, unique, value_counts
+from pandas.core.dtypes.missing import isna, isnull, notna, notnull
+from pandas.core.dtypes.dtypes import (
+ CategoricalDtype,
+ PeriodDtype,
+ IntervalDtype,
+ DatetimeTZDtype,
+)
+from pandas.core.arrays import Categorical, array
+from pandas.core.groupby import Grouper
+from pandas.io.formats.format import set_eng_float_format
+from pandas.core.index import (Index, CategoricalIndex, Int64Index,
+ UInt64Index, RangeIndex, Float64Index,
+ MultiIndex, IntervalIndex,
+ TimedeltaIndex, DatetimeIndex,
+ PeriodIndex, NaT)
+from pandas.core.indexes.period import Period, period_range
+from pandas.core.indexes.timedeltas import Timedelta, timedelta_range
+from pandas.core.indexes.datetimes import Timestamp, date_range, bdate_range
+from pandas.core.indexes.interval import Interval, interval_range
+
+from pandas.core.series import Series
+from pandas.core.frame import DataFrame
+from pandas.core.panel import Panel
+
+# TODO: Remove import when statsmodels updates #18264
+from pandas.core.reshape.reshape import get_dummies
+
+from pandas.core.indexing import IndexSlice
+from pandas.core.tools.numeric import to_numeric
+from pandas.tseries.offsets import DateOffset
+from pandas.core.tools.datetimes import to_datetime
+from pandas.core.tools.timedeltas import to_timedelta
+
+from pandas.core.config import (get_option, set_option, reset_option,
+ describe_option, option_context, options)
+
+
+# Deprecation: xref gh-16747
+class TimeGrouper(object):
+
+ def __new__(cls, *args, **kwargs):
+ from pandas.core.resample import TimeGrouper
+ import warnings
+ warnings.warn("pd.TimeGrouper is deprecated and will be removed; "
+ "Please use pd.Grouper(freq=...)",
+ FutureWarning, stacklevel=2)
+ return TimeGrouper(*args, **kwargs)
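+
+
+# A minimal sketch of the recommended replacement (assuming ``import pandas
+# as pd`` and a datetime-indexed frame): group by a frequency with
+# ``pd.Grouper`` instead of the deprecated ``pd.TimeGrouper``.
+#
+#     >>> df = pd.DataFrame({'x': [1, 2]},
+#     ...                   index=pd.to_datetime(['2000-01-01', '2000-02-01']))
+#     >>> df.groupby(pd.Grouper(freq='M')).sum()
+#                 x
+#     2000-01-31  1
+#     2000-02-29  2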
diff --git a/contrib/python/pandas/py2/pandas/core/apply.py b/contrib/python/pandas/py2/pandas/core/apply.py
new file mode 100644
index 00000000000..5658094ec36
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/core/apply.py
@@ -0,0 +1,411 @@
+import warnings
+
+import numpy as np
+
+from pandas._libs import reduction
+import pandas.compat as compat
+from pandas.util._decorators import cache_readonly
+
+from pandas.core.dtypes.common import (
+ is_dict_like, is_extension_type, is_list_like, is_sequence)
+from pandas.core.dtypes.generic import ABCSeries
+
+from pandas.io.formats.printing import pprint_thing
+
+
+def frame_apply(obj, func, axis=0, broadcast=None,
+ raw=False, reduce=None, result_type=None,
+ ignore_failures=False,
+ args=None, kwds=None):
+    """ construct and return a row- or column-based frame apply object """
+
+ axis = obj._get_axis_number(axis)
+ if axis == 0:
+ klass = FrameRowApply
+ elif axis == 1:
+ klass = FrameColumnApply
+
+ return klass(obj, func, broadcast=broadcast,
+ raw=raw, reduce=reduce, result_type=result_type,
+ ignore_failures=ignore_failures,
+ args=args, kwds=kwds)
+
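+# A minimal usage sketch of the public entry point that dispatches here,
+# ``DataFrame.apply`` (assuming ``import pandas as pd`` and ``numpy as np``):
+# ``axis=0`` hands each column to the function (FrameRowApply), ``axis=1``
+# each row (FrameColumnApply).
+#
+#     >>> df = pd.DataFrame({'a': [1, 2], 'b': [3, 4]})
+#     >>> df.apply(np.sum, axis=0)
+#     a    3
+#     b    7
+#     dtype: int64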
+
+class FrameApply(object):
+
+ def __init__(self, obj, func, broadcast, raw, reduce, result_type,
+ ignore_failures, args, kwds):
+ self.obj = obj
+ self.raw = raw
+ self.ignore_failures = ignore_failures
+ self.args = args or ()
+ self.kwds = kwds or {}
+
+ if result_type not in [None, 'reduce', 'broadcast', 'expand']:
+ raise ValueError("invalid value for result_type, must be one "
+ "of {None, 'reduce', 'broadcast', 'expand'}")
+
+ if broadcast is not None:
+ warnings.warn("The broadcast argument is deprecated and will "
+ "be removed in a future version. You can specify "
+ "result_type='broadcast' to broadcast the result "
+ "to the original dimensions",
+ FutureWarning, stacklevel=4)
+ if broadcast:
+ result_type = 'broadcast'
+
+ if reduce is not None:
+ warnings.warn("The reduce argument is deprecated and will "
+ "be removed in a future version. You can specify "
+ "result_type='reduce' to try to reduce the result "
+ "to the original dimensions",
+ FutureWarning, stacklevel=4)
+ if reduce:
+
+ if result_type is not None:
+ raise ValueError(
+ "cannot pass both reduce=True and result_type")
+
+ result_type = 'reduce'
+
+ self.result_type = result_type
+
+ # curry if needed
+ if ((kwds or args) and
+ not isinstance(func, (np.ufunc, compat.string_types))):
+
+ def f(x):
+ return func(x, *args, **kwds)
+ else:
+ f = func
+
+ self.f = f
+
+ # results
+ self.result = None
+ self.res_index = None
+ self.res_columns = None
+
+ @property
+ def columns(self):
+ return self.obj.columns
+
+ @property
+ def index(self):
+ return self.obj.index
+
+ @cache_readonly
+ def values(self):
+ return self.obj.values
+
+ @cache_readonly
+ def dtypes(self):
+ return self.obj.dtypes
+
+ @property
+ def agg_axis(self):
+ return self.obj._get_agg_axis(self.axis)
+
+ def get_result(self):
+ """ compute the results """
+
+ # dispatch to agg
+ if is_list_like(self.f) or is_dict_like(self.f):
+ return self.obj.aggregate(self.f, axis=self.axis,
+ *self.args, **self.kwds)
+
+ # all empty
+ if len(self.columns) == 0 and len(self.index) == 0:
+ return self.apply_empty_result()
+
+ # string dispatch
+ if isinstance(self.f, compat.string_types):
+ # Support for `frame.transform('method')`
+ # Some methods (shift, etc.) require the axis argument, others
+ # don't, so inspect and insert if necessary.
+ func = getattr(self.obj, self.f)
+ sig = compat.signature(func)
+ if 'axis' in sig.args:
+ self.kwds['axis'] = self.axis
+ return func(*self.args, **self.kwds)
+
+ # ufunc
+ elif isinstance(self.f, np.ufunc):
+ with np.errstate(all='ignore'):
+ results = self.obj._data.apply('apply', func=self.f)
+ return self.obj._constructor(data=results, index=self.index,
+ columns=self.columns, copy=False)
+
+ # broadcasting
+ if self.result_type == 'broadcast':
+ return self.apply_broadcast()
+
+ # one axis empty
+ elif not all(self.obj.shape):
+ return self.apply_empty_result()
+
+ # raw
+ elif self.raw and not self.obj._is_mixed_type:
+ return self.apply_raw()
+
+ return self.apply_standard()
+
+ def apply_empty_result(self):
+ """
+ we have an empty result; at least 1 axis is 0
+
+ we will try to apply the function to an empty
+ series in order to see if this is a reduction function
+ """
+
+ # we are not asked to reduce or infer reduction
+ # so just return a copy of the existing object
+ if self.result_type not in ['reduce', None]:
+ return self.obj.copy()
+
+ # we may need to infer
+ reduce = self.result_type == 'reduce'
+
+ from pandas import Series
+ if not reduce:
+
+ EMPTY_SERIES = Series([])
+ try:
+ r = self.f(EMPTY_SERIES, *self.args, **self.kwds)
+ reduce = not isinstance(r, Series)
+ except Exception:
+ pass
+
+ if reduce:
+ return self.obj._constructor_sliced(np.nan, index=self.agg_axis)
+ else:
+ return self.obj.copy()
+
+ def apply_raw(self):
+ """ apply to the values as a numpy array """
+
+ try:
+ result = reduction.reduce(self.values, self.f, axis=self.axis)
+ except Exception:
+ result = np.apply_along_axis(self.f, self.axis, self.values)
+
+ # TODO: mixed type case
+ if result.ndim == 2:
+ return self.obj._constructor(result,
+ index=self.index,
+ columns=self.columns)
+ else:
+ return self.obj._constructor_sliced(result,
+ index=self.agg_axis)
+
+ def apply_broadcast(self, target):
+ result_values = np.empty_like(target.values)
+
+        # the length every 1-d result must match (number of rows in `target`)
+ result_compare = target.shape[0]
+
+ for i, col in enumerate(target.columns):
+ res = self.f(target[col])
+ ares = np.asarray(res).ndim
+
+ # must be a scalar or 1d
+ if ares > 1:
+ raise ValueError("too many dims to broadcast")
+ elif ares == 1:
+
+ # must match return dim
+ if result_compare != len(res):
+ raise ValueError("cannot broadcast result")
+
+ result_values[:, i] = res
+
+ # we *always* preserve the original index / columns
+ result = self.obj._constructor(result_values,
+ index=target.index,
+ columns=target.columns)
+ return result
+
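+    # A minimal sketch of the broadcast behaviour above through the public
+    # API (``result_type='broadcast'``, assuming ``import pandas as pd``):
+    # the original index and columns are always preserved.
+    #
+    #     >>> df = pd.DataFrame({'a': [1, 2], 'b': [3, 4]})
+    #     >>> df.apply(lambda row: [0, 0], axis=1, result_type='broadcast')
+    #        a  b
+    #     0  0  0
+    #     1  0  0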
+ def apply_standard(self):
+
+ # try to reduce first (by default)
+ # this only matters if the reduction in values is of different dtype
+ # e.g. if we want to apply to a SparseFrame, then can't directly reduce
+
+ # we cannot reduce using non-numpy dtypes,
+ # as demonstrated in gh-12244
+ if (self.result_type in ['reduce', None] and
+ not self.dtypes.apply(is_extension_type).any()):
+
+ # Create a dummy Series from an empty array
+ from pandas import Series
+ values = self.values
+ index = self.obj._get_axis(self.axis)
+ labels = self.agg_axis
+ empty_arr = np.empty(len(index), dtype=values.dtype)
+ dummy = Series(empty_arr, index=index, dtype=values.dtype)
+
+ try:
+ result = reduction.reduce(values, self.f,
+ axis=self.axis,
+ dummy=dummy,
+ labels=labels)
+ return self.obj._constructor_sliced(result, index=labels)
+ except Exception:
+ pass
+
+ # compute the result using the series generator
+ self.apply_series_generator()
+
+ # wrap results
+ return self.wrap_results()
+
+ def apply_series_generator(self):
+ series_gen = self.series_generator
+ res_index = self.result_index
+
+ i = None
+ keys = []
+ results = {}
+ if self.ignore_failures:
+ successes = []
+ for i, v in enumerate(series_gen):
+ try:
+ results[i] = self.f(v)
+ keys.append(v.name)
+ successes.append(i)
+ except Exception:
+ pass
+
+ # so will work with MultiIndex
+ if len(successes) < len(res_index):
+ res_index = res_index.take(successes)
+
+ else:
+ try:
+ for i, v in enumerate(series_gen):
+ results[i] = self.f(v)
+ keys.append(v.name)
+ except Exception as e:
+ if hasattr(e, 'args'):
+
+ # make sure i is defined
+ if i is not None:
+ k = res_index[i]
+ e.args = e.args + ('occurred at index %s' %
+ pprint_thing(k), )
+ raise
+
+ self.results = results
+ self.res_index = res_index
+ self.res_columns = self.result_columns
+
+ def wrap_results(self):
+ results = self.results
+
+ # see if we can infer the results
+ if len(results) > 0 and is_sequence(results[0]):
+
+ return self.wrap_results_for_axis()
+
+ # dict of scalars
+ result = self.obj._constructor_sliced(results)
+ result.index = self.res_index
+
+ return result
+
+
+class FrameRowApply(FrameApply):
+ axis = 0
+
+ def apply_broadcast(self):
+ return super(FrameRowApply, self).apply_broadcast(self.obj)
+
+ @property
+ def series_generator(self):
+ return (self.obj._ixs(i, axis=1)
+ for i in range(len(self.columns)))
+
+ @property
+ def result_index(self):
+ return self.columns
+
+ @property
+ def result_columns(self):
+ return self.index
+
+ def wrap_results_for_axis(self):
+ """ return the results for the rows """
+
+ results = self.results
+ result = self.obj._constructor(data=results)
+
+ if not isinstance(results[0], ABCSeries):
+ try:
+ result.index = self.res_columns
+ except ValueError:
+ pass
+
+ try:
+ result.columns = self.res_index
+ except ValueError:
+ pass
+
+ return result
+
+
+class FrameColumnApply(FrameApply):
+ axis = 1
+
+ def apply_broadcast(self):
+ result = super(FrameColumnApply, self).apply_broadcast(self.obj.T)
+ return result.T
+
+ @property
+ def series_generator(self):
+ constructor = self.obj._constructor_sliced
+ return (constructor(arr, index=self.columns, name=name)
+ for i, (arr, name) in enumerate(zip(self.values,
+ self.index)))
+
+ @property
+ def result_index(self):
+ return self.index
+
+ @property
+ def result_columns(self):
+ return self.columns
+
+ def wrap_results_for_axis(self):
+ """ return the results for the columns """
+ results = self.results
+
+ # we have requested to expand
+ if self.result_type == 'expand':
+ result = self.infer_to_same_shape()
+
+ # we have a non-series and don't want inference
+ elif not isinstance(results[0], ABCSeries):
+ from pandas import Series
+ result = Series(results)
+ result.index = self.res_index
+
+ # we may want to infer results
+ else:
+ result = self.infer_to_same_shape()
+
+ return result
+
+ def infer_to_same_shape(self):
+ """ infer the results to the same shape as the input object """
+ results = self.results
+
+ result = self.obj._constructor(data=results)
+ result = result.T
+
+ # set the index
+ result.index = self.res_index
+
+ # infer dtypes
+ result = result.infer_objects()
+
+ return result
diff --git a/contrib/python/pandas/py2/pandas/core/arrays/__init__.py b/contrib/python/pandas/py2/pandas/core/arrays/__init__.py
new file mode 100644
index 00000000000..1033ce78404
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/core/arrays/__init__.py
@@ -0,0 +1,13 @@
+from .array_ import array # noqa
+from .base import (ExtensionArray, # noqa
+ ExtensionOpsMixin,
+ ExtensionScalarOpsMixin)
+from .categorical import Categorical # noqa
+from .datetimes import DatetimeArray # noqa
+from .interval import IntervalArray # noqa
+from .period import PeriodArray, period_array # noqa
+from .timedeltas import TimedeltaArray # noqa
+from .integer import ( # noqa
+ IntegerArray, integer_array)
+from .sparse import SparseArray # noqa
+from .numpy_ import PandasArray, PandasDtype # noqa
diff --git a/contrib/python/pandas/py2/pandas/core/arrays/_ranges.py b/contrib/python/pandas/py2/pandas/core/arrays/_ranges.py
new file mode 100644
index 00000000000..66c1b8e1586
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/core/arrays/_ranges.py
@@ -0,0 +1,188 @@
+# -*- coding: utf-8 -*-
+"""
+Helper functions to generate range-like data for DatetimeArray
+(and possibly TimedeltaArray/PeriodArray)
+"""
+
+import numpy as np
+
+from pandas._libs.tslibs import OutOfBoundsDatetime, Timestamp
+
+from pandas.tseries.offsets import Tick, generate_range
+
+
+def generate_regular_range(start, end, periods, freq):
+ """
+ Generate a range of dates with the spans between dates described by
+ the given `freq` DateOffset.
+
+ Parameters
+ ----------
+ start : Timestamp or None
+ first point of produced date range
+ end : Timestamp or None
+ last point of produced date range
+ periods : int
+ number of periods in produced date range
+ freq : DateOffset
+ describes space between dates in produced date range
+
+ Returns
+ -------
+    values : ndarray[np.int64]
+        Nanosecond unix timestamps.
+    tz : tzinfo or None
+        The timezone taken from `start` or `end`, if any.
+ """
+ if isinstance(freq, Tick):
+ stride = freq.nanos
+ if periods is None:
+ b = Timestamp(start).value
+ # cannot just use e = Timestamp(end) + 1 because arange breaks when
+ # stride is too large, see GH10887
+ e = (b + (Timestamp(end).value - b) // stride * stride +
+ stride // 2 + 1)
+ # end.tz == start.tz by this point due to _generate implementation
+ tz = start.tz
+ elif start is not None:
+ b = Timestamp(start).value
+ e = _generate_range_overflow_safe(b, periods, stride, side='start')
+ tz = start.tz
+ elif end is not None:
+ e = Timestamp(end).value + stride
+ b = _generate_range_overflow_safe(e, periods, stride, side='end')
+ tz = end.tz
+ else:
+            raise ValueError("at least 'start' or 'end' should be specified "
+                             "if 'periods' is given.")
+
+ with np.errstate(over="raise"):
+ # If the range is sufficiently large, np.arange may overflow
+ # and incorrectly return an empty array if not caught.
+ try:
+ values = np.arange(b, e, stride, dtype=np.int64)
+ except FloatingPointError:
+ xdr = [b]
+ while xdr[-1] != e:
+ xdr.append(xdr[-1] + stride)
+ values = np.array(xdr[:-1], dtype=np.int64)
+
+ else:
+ tz = None
+ # start and end should have the same timezone by this point
+ if start is not None:
+ tz = start.tz
+ elif end is not None:
+ tz = end.tz
+
+ xdr = generate_range(start=start, end=end,
+ periods=periods, offset=freq)
+
+ values = np.array([x.value for x in xdr], dtype=np.int64)
+
+ return values, tz
+
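+
+# A worked example of the Tick branch above (all values in nanoseconds):
+# with start=2000-01-01 00:00:00, end=2000-01-01 00:00:02 and a 1-second
+# Tick, stride = 10**9, b = start.value and
+#     e = b + ((end.value - b) // stride) * stride + stride // 2 + 1
+#       = b + 2 * stride + stride // 2 + 1,
+# so ``np.arange(b, e, stride)`` yields exactly the three timestamps
+# 00:00:00, 00:00:01 and 00:00:02; the extra half stride guarantees the end
+# point is included without ever producing an additional step.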
+
+def _generate_range_overflow_safe(endpoint, periods, stride, side='start'):
+ """
+ Calculate the second endpoint for passing to np.arange, checking
+ to avoid an integer overflow. Catch OverflowError and re-raise
+ as OutOfBoundsDatetime.
+
+ Parameters
+ ----------
+ endpoint : int
+ nanosecond timestamp of the known endpoint of the desired range
+ periods : int
+ number of periods in the desired range
+ stride : int
+ nanoseconds between periods in the desired range
+ side : {'start', 'end'}
+ which end of the range `endpoint` refers to
+
+ Returns
+ -------
+ other_end : int
+
+ Raises
+ ------
+ OutOfBoundsDatetime
+ """
+ # GH#14187 raise instead of incorrectly wrapping around
+ assert side in ['start', 'end']
+
+ i64max = np.uint64(np.iinfo(np.int64).max)
+ msg = ('Cannot generate range with {side}={endpoint} and '
+ 'periods={periods}'
+ .format(side=side, endpoint=endpoint, periods=periods))
+
+ with np.errstate(over="raise"):
+        # if periods * stride cannot be multiplied within the *uint64* bounds,
+ # we cannot salvage the operation by recursing, so raise
+ try:
+ addend = np.uint64(periods) * np.uint64(np.abs(stride))
+ except FloatingPointError:
+ raise OutOfBoundsDatetime(msg)
+
+ if np.abs(addend) <= i64max:
+ # relatively easy case without casting concerns
+ return _generate_range_overflow_safe_signed(
+ endpoint, periods, stride, side)
+
+ elif ((endpoint > 0 and side == 'start' and stride > 0) or
+ (endpoint < 0 and side == 'end' and stride > 0)):
+ # no chance of not-overflowing
+ raise OutOfBoundsDatetime(msg)
+
+ elif (side == 'end' and endpoint > i64max and endpoint - stride <= i64max):
+        # in generate_regular_range we added `stride`, thereby overflowing
+ # the bounds. Adjust to fix this.
+ return _generate_range_overflow_safe(endpoint - stride,
+ periods - 1, stride, side)
+
+ # split into smaller pieces
+ mid_periods = periods // 2
+ remaining = periods - mid_periods
+ assert 0 < remaining < periods, (remaining, periods, endpoint, stride)
+
+ midpoint = _generate_range_overflow_safe(endpoint, mid_periods,
+ stride, side)
+ return _generate_range_overflow_safe(midpoint, remaining, stride, side)
+
+
+def _generate_range_overflow_safe_signed(endpoint, periods, stride, side):
+ """
+ A special case for _generate_range_overflow_safe where `periods * stride`
+ can be calculated without overflowing int64 bounds.
+ """
+ assert side in ['start', 'end']
+ if side == 'end':
+ stride *= -1
+
+ with np.errstate(over="raise"):
+ addend = np.int64(periods) * np.int64(stride)
+ try:
+ # easy case with no overflows
+ return np.int64(endpoint) + addend
+ except (FloatingPointError, OverflowError):
+ # with endpoint negative and addend positive we risk
+            # FloatingPointError; with the signs reversed we risk OverflowError
+ pass
+
+ # if stride and endpoint had opposite signs, then endpoint + addend
+ # should never overflow. so they must have the same signs
+ assert (stride > 0 and endpoint >= 0) or (stride < 0 and endpoint <= 0)
+
+ if stride > 0:
+ # watch out for very special case in which we just slightly
+ # exceed implementation bounds, but when passing the result to
+ # np.arange will get a result slightly within the bounds
+ assert endpoint >= 0
+ result = np.uint64(endpoint) + np.uint64(addend)
+ i64max = np.uint64(np.iinfo(np.int64).max)
+ assert result > i64max
+ if result <= i64max + np.uint64(stride):
+ return result
+
+ raise OutOfBoundsDatetime('Cannot generate range with '
+ '{side}={endpoint} and '
+ 'periods={periods}'
+ .format(side=side, endpoint=endpoint,
+ periods=periods))
diff --git a/contrib/python/pandas/py2/pandas/core/arrays/array_.py b/contrib/python/pandas/py2/pandas/core/arrays/array_.py
new file mode 100644
index 00000000000..41d623c7efd
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/core/arrays/array_.py
@@ -0,0 +1,274 @@
+from pandas._libs import lib, tslibs
+
+from pandas.core.dtypes.common import (
+ is_datetime64_ns_dtype, is_extension_array_dtype, is_timedelta64_ns_dtype)
+from pandas.core.dtypes.dtypes import registry
+
+from pandas import compat
+
+
+def array(data, # type: Sequence[object]
+ dtype=None, # type: Optional[Union[str, np.dtype, ExtensionDtype]]
+ copy=True, # type: bool
+ ):
+ # type: (...) -> ExtensionArray
+ """
+ Create an array.
+
+ .. versionadded:: 0.24.0
+
+ Parameters
+ ----------
+ data : Sequence of objects
+ The scalars inside `data` should be instances of the
+ scalar type for `dtype`. It's expected that `data`
+ represents a 1-dimensional array of data.
+
+ When `data` is an Index or Series, the underlying array
+ will be extracted from `data`.
+
+ dtype : str, np.dtype, or ExtensionDtype, optional
+ The dtype to use for the array. This may be a NumPy
+ dtype or an extension type registered with pandas using
+ :meth:`pandas.api.extensions.register_extension_dtype`.
+
+ If not specified, there are two possibilities:
+
+ 1. When `data` is a :class:`Series`, :class:`Index`, or
+ :class:`ExtensionArray`, the `dtype` will be taken
+ from the data.
+ 2. Otherwise, pandas will attempt to infer the `dtype`
+ from the data.
+
+ Note that when `data` is a NumPy array, ``data.dtype`` is
+ *not* used for inferring the array type. This is because
+ NumPy cannot represent all the types of data that can be
+ held in extension arrays.
+
+ Currently, pandas will infer an extension dtype for sequences of
+
+ ============================== =====================================
+ Scalar Type Array Type
+ ============================== =====================================
+ :class:`pandas.Interval` :class:`pandas.arrays.IntervalArray`
+ :class:`pandas.Period` :class:`pandas.arrays.PeriodArray`
+ :class:`datetime.datetime` :class:`pandas.arrays.DatetimeArray`
+ :class:`datetime.timedelta` :class:`pandas.arrays.TimedeltaArray`
+ ============================== =====================================
+
+ For all other cases, NumPy's usual inference rules will be used.
+
+ copy : bool, default True
+ Whether to copy the data, even if not necessary. Depending
+ on the type of `data`, creating the new array may require
+ copying data, even if ``copy=False``.
+
+ Returns
+ -------
+ ExtensionArray
+ The newly created array.
+
+ Raises
+ ------
+ ValueError
+ When `data` is not 1-dimensional.
+
+ See Also
+ --------
+ numpy.array : Construct a NumPy array.
+ Series : Construct a pandas Series.
+ Index : Construct a pandas Index.
+ arrays.PandasArray : ExtensionArray wrapping a NumPy array.
+ Series.array : Extract the array stored within a Series.
+
+ Notes
+ -----
+ Omitting the `dtype` argument means pandas will attempt to infer the
+ best array type from the values in the data. As new array types are
+ added by pandas and 3rd party libraries, the "best" array type may
+ change. We recommend specifying `dtype` to ensure that
+
+ 1. the correct array type for the data is returned
+ 2. the returned array type doesn't change as new extension types
+ are added by pandas and third-party libraries
+
+ Additionally, if the underlying memory representation of the returned
+ array matters, we recommend specifying the `dtype` as a concrete object
+ rather than a string alias or allowing it to be inferred. For example,
+ a future version of pandas or a 3rd-party library may include a
+ dedicated ExtensionArray for string data. In this event, the following
+ would no longer return a :class:`arrays.PandasArray` backed by a NumPy
+ array.
+
+ >>> pd.array(['a', 'b'], dtype=str)
+ <PandasArray>
+ ['a', 'b']
+ Length: 2, dtype: str32
+
+ This would instead return the new ExtensionArray dedicated for string
+ data. If you really need the new array to be backed by a NumPy array,
+ specify that in the dtype.
+
+ >>> pd.array(['a', 'b'], dtype=np.dtype("<U1"))
+ <PandasArray>
+ ['a', 'b']
+ Length: 2, dtype: str32
+
+ Or use the dedicated constructor for the array you're expecting, and
+ wrap that in a PandasArray
+
+ >>> pd.array(np.array(['a', 'b'], dtype='<U1'))
+ <PandasArray>
+ ['a', 'b']
+ Length: 2, dtype: str32
+
+ Finally, Pandas has arrays that mostly overlap with NumPy
+
+ * :class:`arrays.DatetimeArray`
+ * :class:`arrays.TimedeltaArray`
+
+ When data with a ``datetime64[ns]`` or ``timedelta64[ns]`` dtype is
+ passed, pandas will always return a ``DatetimeArray`` or ``TimedeltaArray``
+ rather than a ``PandasArray``. This is for symmetry with the case of
+ timezone-aware data, which NumPy does not natively support.
+
+ >>> pd.array(['2015', '2016'], dtype='datetime64[ns]')
+ <DatetimeArray>
+ ['2015-01-01 00:00:00', '2016-01-01 00:00:00']
+ Length: 2, dtype: datetime64[ns]
+
+ >>> pd.array(["1H", "2H"], dtype='timedelta64[ns]')
+ <TimedeltaArray>
+ ['01:00:00', '02:00:00']
+ Length: 2, dtype: timedelta64[ns]
+
+ Examples
+ --------
+ If a dtype is not specified, `data` is passed through to
+ :meth:`numpy.array`, and a :class:`arrays.PandasArray` is returned.
+
+ >>> pd.array([1, 2])
+ <PandasArray>
+ [1, 2]
+ Length: 2, dtype: int64
+
+ Or the NumPy dtype can be specified
+
+ >>> pd.array([1, 2], dtype=np.dtype("int32"))
+ <PandasArray>
+ [1, 2]
+ Length: 2, dtype: int32
+
+ You can use the string alias for `dtype`
+
+ >>> pd.array(['a', 'b', 'a'], dtype='category')
+ [a, b, a]
+ Categories (2, object): [a, b]
+
+ Or specify the actual dtype
+
+ >>> pd.array(['a', 'b', 'a'],
+ ... dtype=pd.CategoricalDtype(['a', 'b', 'c'], ordered=True))
+ [a, b, a]
+ Categories (3, object): [a < b < c]
+
+ Because omitting the `dtype` passes the data through to NumPy,
+ a mixture of valid integers and NA will return a floating-point
+ NumPy array.
+
+ >>> pd.array([1, 2, np.nan])
+ <PandasArray>
+ [1.0, 2.0, nan]
+ Length: 3, dtype: float64
+
+ To use pandas' nullable :class:`pandas.arrays.IntegerArray`, specify
+ the dtype:
+
+ >>> pd.array([1, 2, np.nan], dtype='Int64')
+ <IntegerArray>
+ [1, 2, NaN]
+ Length: 3, dtype: Int64
+
+ Pandas will infer an ExtensionArray for some types of data:
+
+ >>> pd.array([pd.Period('2000', freq="D"), pd.Period("2000", freq="D")])
+ <PeriodArray>
+ ['2000-01-01', '2000-01-01']
+ Length: 2, dtype: period[D]
+
+ `data` must be 1-dimensional. A ValueError is raised when the input
+ has the wrong dimensionality.
+
+ >>> pd.array(1)
+ Traceback (most recent call last):
+ ...
+ ValueError: Cannot pass scalar '1' to 'pandas.array'.
+ """
+ from pandas.core.arrays import (
+ period_array, ExtensionArray, IntervalArray, PandasArray,
+ DatetimeArray,
+ TimedeltaArray,
+ )
+ from pandas.core.internals.arrays import extract_array
+
+ if lib.is_scalar(data):
+ msg = (
+ "Cannot pass scalar '{}' to 'pandas.array'."
+ )
+ raise ValueError(msg.format(data))
+
+ data = extract_array(data, extract_numpy=True)
+
+ if dtype is None and isinstance(data, ExtensionArray):
+ dtype = data.dtype
+
+ # this returns None for not-found dtypes.
+ if isinstance(dtype, compat.string_types):
+ dtype = registry.find(dtype) or dtype
+
+ if is_extension_array_dtype(dtype):
+ cls = dtype.construct_array_type()
+ return cls._from_sequence(data, dtype=dtype, copy=copy)
+
+ if dtype is None:
+ inferred_dtype = lib.infer_dtype(data, skipna=False)
+ if inferred_dtype == 'period':
+ try:
+ return period_array(data, copy=copy)
+ except tslibs.IncompatibleFrequency:
+ # We may have a mixture of frequencies.
+ # We choose to return an ndarray, rather than raising.
+ pass
+ elif inferred_dtype == 'interval':
+ try:
+ return IntervalArray(data, copy=copy)
+ except ValueError:
+ # We may have a mixture of `closed` here.
+ # We choose to return an ndarray, rather than raising.
+ pass
+
+ elif inferred_dtype.startswith('datetime'):
+ # datetime, datetime64
+ try:
+ return DatetimeArray._from_sequence(data, copy=copy)
+ except ValueError:
+ # Mixture of timezones, fall back to PandasArray
+ pass
+
+ elif inferred_dtype.startswith('timedelta'):
+ # timedelta, timedelta64
+ return TimedeltaArray._from_sequence(data, copy=copy)
+
+ # TODO(BooleanArray): handle this type
+
+ # Pandas overrides NumPy for
+ # 1. datetime64[ns]
+ # 2. timedelta64[ns]
+ # so that a DatetimeArray is returned.
+ if is_datetime64_ns_dtype(dtype):
+ return DatetimeArray._from_sequence(data, dtype=dtype, copy=copy)
+ elif is_timedelta64_ns_dtype(dtype):
+ return TimedeltaArray._from_sequence(data, dtype=dtype, copy=copy)
+
+ result = PandasArray._from_sequence(data, dtype=dtype, copy=copy)
+ return result
diff --git a/contrib/python/pandas/py2/pandas/core/arrays/base.py b/contrib/python/pandas/py2/pandas/core/arrays/base.py
new file mode 100644
index 00000000000..7aaefef3d03
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/core/arrays/base.py
@@ -0,0 +1,1120 @@
+"""An interface for extending pandas with custom arrays.
+
+.. warning::
+
+ This is an experimental API and subject to breaking changes
+ without warning.
+"""
+import operator
+
+import numpy as np
+
+from pandas.compat import PY3, set_function_name
+from pandas.compat.numpy import function as nv
+from pandas.errors import AbstractMethodError
+from pandas.util._decorators import Appender, Substitution
+
+from pandas.core.dtypes.common import is_list_like
+from pandas.core.dtypes.generic import ABCIndexClass, ABCSeries
+from pandas.core.dtypes.missing import isna
+
+from pandas.core import ops
+
+_not_implemented_message = "{} does not implement {}."
+
+_extension_array_shared_docs = dict()
+
+
+class ExtensionArray(object):
+ """
+ Abstract base class for custom 1-D array types.
+
+ pandas will recognize instances of this class as proper arrays
+ with a custom type and will not attempt to coerce them to objects. They
+ may be stored directly inside a :class:`DataFrame` or :class:`Series`.
+
+ .. versionadded:: 0.23.0
+
+ Notes
+ -----
+ The interface includes the following abstract methods that must be
+ implemented by subclasses:
+
+ * _from_sequence
+ * _from_factorized
+ * __getitem__
+ * __len__
+ * dtype
+ * nbytes
+ * isna
+ * take
+ * copy
+ * _concat_same_type
+
+ A default repr displaying the type, (truncated) data, length,
+ and dtype is provided. It can be customized or replaced by
+    overriding:
+
+ * __repr__ : A default repr for the ExtensionArray.
+ * _formatter : Print scalars inside a Series or DataFrame.
+
+ Some methods require casting the ExtensionArray to an ndarray of Python
+ objects with ``self.astype(object)``, which may be expensive. When
+ performance is a concern, we highly recommend overriding the following
+ methods:
+
+ * fillna
+ * dropna
+ * unique
+ * factorize / _values_for_factorize
+ * argsort / _values_for_argsort
+ * searchsorted
+
+ The remaining methods implemented on this class should be performant,
+ as they only compose abstract methods. Still, a more efficient
+ implementation may be available, and these methods can be overridden.
+
+ One can implement methods to handle array reductions.
+
+ * _reduce
+
+ One can implement methods to handle parsing from strings that will be used
+ in methods such as ``pandas.io.parsers.read_csv``.
+
+ * _from_sequence_of_strings
+
+ This class does not inherit from 'abc.ABCMeta' for performance reasons.
+ Methods and properties required by the interface raise
+ ``pandas.errors.AbstractMethodError`` and no ``register`` method is
+ provided for registering virtual subclasses.
+
+ ExtensionArrays are limited to 1 dimension.
+
+ They may be backed by none, one, or many NumPy arrays. For example,
+ ``pandas.Categorical`` is an extension array backed by two arrays,
+    one for codes and one for categories. An array of IPv6 addresses may
+ be backed by a NumPy structured array with two fields, one for the
+ lower 64 bits and one for the upper 64 bits. Or they may be backed
+ by some other storage type, like Python lists. Pandas makes no
+ assumptions on how the data are stored, just that it can be converted
+ to a NumPy array.
+ The ExtensionArray interface does not impose any rules on how this data
+ is stored. However, currently, the backing data cannot be stored in
+ attributes called ``.values`` or ``._values`` to ensure full compatibility
+    with pandas internals. But other names such as ``.data``, ``._data``,
+ ``._items``, ... can be freely used.
+ """
+ # '_typ' is for pandas.core.dtypes.generic.ABCExtensionArray.
+ # Don't override this.
+ _typ = 'extension'
+
+ # ------------------------------------------------------------------------
+ # Constructors
+ # ------------------------------------------------------------------------
+ @classmethod
+ def _from_sequence(cls, scalars, dtype=None, copy=False):
+ """
+ Construct a new ExtensionArray from a sequence of scalars.
+
+ Parameters
+ ----------
+ scalars : Sequence
+ Each element will be an instance of the scalar type for this
+ array, ``cls.dtype.type``.
+ dtype : dtype, optional
+ Construct for this particular dtype. This should be a Dtype
+ compatible with the ExtensionArray.
+ copy : boolean, default False
+ If True, copy the underlying data.
+
+ Returns
+ -------
+ ExtensionArray
+ """
+ raise AbstractMethodError(cls)
+
+ @classmethod
+ def _from_sequence_of_strings(cls, strings, dtype=None, copy=False):
+ """Construct a new ExtensionArray from a sequence of strings.
+
+ .. versionadded:: 0.24.0
+
+ Parameters
+ ----------
+ strings : Sequence
+ Each element will be an instance of the scalar type for this
+ array, ``cls.dtype.type``.
+ dtype : dtype, optional
+ Construct for this particular dtype. This should be a Dtype
+ compatible with the ExtensionArray.
+ copy : boolean, default False
+ If True, copy the underlying data.
+
+ Returns
+ -------
+ ExtensionArray
+
+ """
+ raise AbstractMethodError(cls)
+
+ @classmethod
+ def _from_factorized(cls, values, original):
+ """
+ Reconstruct an ExtensionArray after factorization.
+
+ Parameters
+ ----------
+ values : ndarray
+ An integer ndarray with the factorized values.
+ original : ExtensionArray
+ The original ExtensionArray that factorize was called on.
+
+ See Also
+ --------
+ pandas.factorize
+ ExtensionArray.factorize
+ """
+ raise AbstractMethodError(cls)
+
+ # ------------------------------------------------------------------------
+ # Must be a Sequence
+ # ------------------------------------------------------------------------
+
+ def __getitem__(self, item):
+        # type: (Any) -> Any
+ """
+ Select a subset of self.
+
+ Parameters
+ ----------
+ item : int, slice, or ndarray
+ * int: The position in 'self' to get.
+
+ * slice: A slice object, where 'start', 'stop', and 'step' are
+ integers or None
+
+ * ndarray: A 1-d boolean NumPy ndarray the same length as 'self'
+
+ Returns
+ -------
+ item : scalar or ExtensionArray
+
+ Notes
+ -----
+ For scalar ``item``, return a scalar value suitable for the array's
+ type. This should be an instance of ``self.dtype.type``.
+
+ For slice ``key``, return an instance of ``ExtensionArray``, even
+ if the slice is length 0 or 1.
+
+ For a boolean mask, return an instance of ``ExtensionArray``, filtered
+ to the values where ``item`` is True.
+ """
+ raise AbstractMethodError(self)
+
+ def __setitem__(self, key, value):
+ # type: (Union[int, np.ndarray], Any) -> None
+ """
+ Set one or more values inplace.
+
+ This method is not required to satisfy the pandas extension array
+ interface.
+
+ Parameters
+ ----------
+ key : int, ndarray, or slice
+ When called from, e.g. ``Series.__setitem__``, ``key`` will be
+ one of
+
+ * scalar int
+ * ndarray of integers.
+ * boolean ndarray
+ * slice object
+
+ value : ExtensionDtype.type, Sequence[ExtensionDtype.type], or object
+ value or values to be set of ``key``.
+
+ Returns
+ -------
+ None
+ """
+ # Some notes to the ExtensionArray implementor who may have ended up
+ # here. While this method is not required for the interface, if you
+ # *do* choose to implement __setitem__, then some semantics should be
+ # observed:
+ #
+ # * Setting multiple values : ExtensionArrays should support setting
+ # multiple values at once, 'key' will be a sequence of integers and
+ # 'value' will be a same-length sequence.
+ #
+ # * Broadcasting : For a sequence 'key' and a scalar 'value',
+ # each position in 'key' should be set to 'value'.
+ #
+ # * Coercion : Most users will expect basic coercion to work. For
+ # example, a string like '2018-01-01' is coerced to a datetime
+ # when setting on a datetime64ns array. In general, if the
+ # __init__ method coerces that value, then so should __setitem__
+ # Note, also, that Series/DataFrame.where internally use __setitem__
+ # on a copy of the data.
+ raise NotImplementedError(_not_implemented_message.format(
+ type(self), '__setitem__')
+ )
+
+ def __len__(self):
+ # type: () -> int
+ """
+ Length of this array
+
+ Returns
+ -------
+ length : int
+ """
+ raise AbstractMethodError(self)
+
+ def __iter__(self):
+ """
+ Iterate over elements of the array.
+ """
+ # This needs to be implemented so that pandas recognizes extension
+ # arrays as list-like. The default implementation makes successive
+ # calls to ``__getitem__``, which may be slower than necessary.
+ for i in range(len(self)):
+ yield self[i]
+
+ # ------------------------------------------------------------------------
+ # Required attributes
+ # ------------------------------------------------------------------------
+ @property
+ def dtype(self):
+ # type: () -> ExtensionDtype
+ """
+ An instance of 'ExtensionDtype'.
+ """
+ raise AbstractMethodError(self)
+
+ @property
+ def shape(self):
+ # type: () -> Tuple[int, ...]
+ """
+ Return a tuple of the array dimensions.
+ """
+ return (len(self),)
+
+ @property
+ def ndim(self):
+ # type: () -> int
+ """
+ Extension Arrays are only allowed to be 1-dimensional.
+ """
+ return 1
+
+ @property
+ def nbytes(self):
+ # type: () -> int
+ """
+ The number of bytes needed to store this object in memory.
+ """
+ # If this is expensive to compute, return an approximate lower bound
+ # on the number of bytes needed.
+ raise AbstractMethodError(self)
+
+ # ------------------------------------------------------------------------
+ # Additional Methods
+ # ------------------------------------------------------------------------
+ def astype(self, dtype, copy=True):
+ """
+ Cast to a NumPy array with 'dtype'.
+
+ Parameters
+ ----------
+ dtype : str or dtype
+ Typecode or data-type to which the array is cast.
+ copy : bool, default True
+ Whether to copy the data, even if not necessary. If False,
+ a copy is made only if the old dtype does not match the
+ new dtype.
+
+ Returns
+ -------
+ array : ndarray
+ NumPy ndarray with 'dtype' for its dtype.
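+
+ Examples
+ --------
+ A small illustration using :class:`~pandas.Categorical`; the default
+ implementation simply round-trips through :func:`numpy.array`:
+
+ >>> cat = pd.Categorical(['a', 'b', 'a'])
+ >>> cat.astype(object)
+ array(['a', 'b', 'a'], dtype=object)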
+ """
+ return np.array(self, dtype=dtype, copy=copy)
+
+ def isna(self):
+ # type: () -> Union[ExtensionArray, np.ndarray]
+ """
+ A 1-D array indicating if each value is missing.
+
+ Returns
+ -------
+ na_values : Union[np.ndarray, ExtensionArray]
+ In most cases, this should return a NumPy ndarray. For
+ exceptional cases like ``SparseArray``, where returning
+ an ndarray would be expensive, an ExtensionArray may be
+ returned.
+
+ Notes
+ -----
+ If returning an ExtensionArray, then
+
+ * ``na_values._is_boolean`` should be True
+ * ``na_values`` should implement :meth:`ExtensionArray._reduce`
+ * ``na_values.any`` and ``na_values.all`` should be implemented
+ """
+ raise AbstractMethodError(self)
+
+ def _values_for_argsort(self):
+ # type: () -> ndarray
+ """
+ Return values for sorting.
+
+ Returns
+ -------
+ ndarray
+ The transformed values should maintain the ordering between values
+ within the array.
+
+ See Also
+ --------
+ ExtensionArray.argsort
+ """
+ # Note: this is used in `ExtensionArray.argsort`.
+ return np.array(self)
+
+ def argsort(self, ascending=True, kind='quicksort', *args, **kwargs):
+ """
+ Return the indices that would sort this array.
+
+ Parameters
+ ----------
+ ascending : bool, default True
+ Whether the indices should result in an ascending
+ or descending sort.
+ kind : {'quicksort', 'mergesort', 'heapsort'}, optional
+ Sorting algorithm.
+ *args, **kwargs:
+ passed through to :func:`numpy.argsort`.
+
+ Returns
+ -------
+ index_array : ndarray
+ Array of indices that sort ``self``.
+
+ See Also
+ --------
+ numpy.argsort : Sorting implementation used internally.
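+
+ Examples
+ --------
+ An informal sketch with an ordered :class:`~pandas.Categorical`, whose
+ sort order follows its categories (result noted in a comment rather
+ than shown verbatim):
+
+ .. code-block:: python
+
+ cat = pd.Categorical(['b', 'a', 'c'], ordered=True,
+ categories=['c', 'b', 'a'])
+ order = cat.argsort() # array([2, 0, 1]): 'c' first, then 'b', then 'a'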
+ """
+ # Implementor note: You have two places to override the behavior of
+ # argsort.
+ # 1. _values_for_argsort : construct the values passed to np.argsort
+ # 2. argsort : total control over sorting.
+ ascending = nv.validate_argsort_with_ascending(ascending, args, kwargs)
+ values = self._values_for_argsort()
+ result = np.argsort(values, kind=kind, **kwargs)
+ if not ascending:
+ result = result[::-1]
+ return result
+
+ def fillna(self, value=None, method=None, limit=None):
+ """
+ Fill NA/NaN values using the specified method.
+
+ Parameters
+ ----------
+ value : scalar, array-like
+ If a scalar value is passed it is used to fill all missing values.
+ Alternatively, an array-like 'value' can be given. It's expected
+ that the array-like have the same length as 'self'.
+ method : {'backfill', 'bfill', 'pad', 'ffill', None}, default None
+ Method to use for filling holes in reindexed Series:
+ pad / ffill: propagate last valid observation forward to next valid.
+ backfill / bfill: use NEXT valid observation to fill gap.
+ limit : int, default None
+ If method is specified, this is the maximum number of consecutive
+ NaN values to forward/backward fill. In other words, if there is
+ a gap with more than this number of consecutive NaNs, it will only
+ be partially filled. If method is not specified, this is the
+ maximum number of entries along the entire axis where NaNs will be
+ filled.
+
+ Returns
+ -------
+ filled : ExtensionArray with NA/NaN filled
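+
+ Examples
+ --------
+ A short illustration using :class:`~pandas.Categorical`, one concrete
+ ``ExtensionArray`` with missing values:
+
+ >>> cat = pd.Categorical(['a', None, 'b'])
+ >>> cat.fillna('a')
+ [a, a, b]
+ Categories (2, object): [a, b]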
+ """
+ from pandas.api.types import is_array_like
+ from pandas.util._validators import validate_fillna_kwargs
+ from pandas.core.missing import pad_1d, backfill_1d
+
+ value, method = validate_fillna_kwargs(value, method)
+
+ mask = self.isna()
+
+ if is_array_like(value):
+ if len(value) != len(self):
+ raise ValueError("Length of 'value' does not match. Got ({}) "
+ " expected {}".format(len(value), len(self)))
+ value = value[mask]
+
+ if mask.any():
+ if method is not None:
+ func = pad_1d if method == 'pad' else backfill_1d
+ new_values = func(self.astype(object), limit=limit,
+ mask=mask)
+ new_values = self._from_sequence(new_values, dtype=self.dtype)
+ else:
+ # fill with value
+ new_values = self.copy()
+ new_values[mask] = value
+ else:
+ new_values = self.copy()
+ return new_values
+
+ def dropna(self):
+ """
+ Return ExtensionArray without NA values
+
+ Returns
+ -------
+ valid : ExtensionArray
+ """
+ return self[~self.isna()]
+
+ def shift(self, periods=1, fill_value=None):
+ # type: (int, object) -> ExtensionArray
+ """
+ Shift values by desired number.
+
+ Newly introduced missing values are filled with
+ ``self.dtype.na_value``.
+
+ .. versionadded:: 0.24.0
+
+ Parameters
+ ----------
+ periods : int, default 1
+ The number of periods to shift. Negative values are allowed
+ for shifting backwards.
+
+ fill_value : object, optional
+ The scalar value to use for newly introduced missing values.
+ The default is ``self.dtype.na_value``
+
+ .. versionadded:: 0.24.0
+
+ Returns
+ -------
+ shifted : ExtensionArray
+
+ Notes
+ -----
+ If ``self`` is empty or ``periods`` is 0, a copy of ``self`` is
+ returned.
+
+ If ``periods > len(self)``, then an array of size
+ len(self) is returned, with all values filled with
+ ``self.dtype.na_value``.
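+
+ Examples
+ --------
+ An informal example using :class:`~pandas.Categorical`; the vacated
+ positions are filled with the dtype's NA value:
+
+ >>> cat = pd.Categorical(['a', 'b', 'c'])
+ >>> cat.shift(1)
+ [NaN, a, b]
+ Categories (3, object): [a, b, c]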
+ """
+ # Note: this implementation assumes that `self.dtype.na_value` can be
+ # stored in an instance of your ExtensionArray with `self.dtype`.
+ if not len(self) or periods == 0:
+ return self.copy()
+
+ if isna(fill_value):
+ fill_value = self.dtype.na_value
+
+ empty = self._from_sequence(
+ [fill_value] * min(abs(periods), len(self)),
+ dtype=self.dtype
+ )
+ if periods > 0:
+ a = empty
+ b = self[:-periods]
+ else:
+ a = self[abs(periods):]
+ b = empty
+ return self._concat_same_type([a, b])
+
+ def unique(self):
+ """
+ Compute the ExtensionArray of unique values.
+
+ Returns
+ -------
+ uniques : ExtensionArray
+ """
+ from pandas import unique
+
+ uniques = unique(self.astype(object))
+ return self._from_sequence(uniques, dtype=self.dtype)
+
+ def searchsorted(self, value, side="left", sorter=None):
+ """
+ Find indices where elements should be inserted to maintain order.
+
+ .. versionadded:: 0.24.0
+
+ Find the indices into the sorted array `self` such that, if the
+ corresponding elements in `value` (`v` below) were inserted before the
+ indices, the order of `self` would be preserved.
+
+ Assuming that `self` is sorted:
+
+ ====== ============================
+ `side` returned index `i` satisfies
+ ====== ============================
+ left ``self[i-1] < v <= self[i]``
+ right ``self[i-1] <= v < self[i]``
+ ====== ============================
+
+ Parameters
+ ----------
+ value : array_like
+ Values to insert into `self`.
+ side : {'left', 'right'}, optional
+ If 'left', the index of the first suitable location found is given.
+ If 'right', return the last such index. If there is no suitable
+ index, return either 0 or N (where N is the length of `self`).
+ sorter : 1-D array_like, optional
+ Optional array of integer indices that sort `self` into ascending
+ order. They are typically the result of ``argsort``.
+
+ Returns
+ -------
+ indices : array of ints
+ Array of insertion points with the same shape as `value`.
+
+ See Also
+ --------
+ numpy.searchsorted : Similar method from NumPy.
+ """
+ # Note: the base tests provided by pandas only test the basics.
+ # We do not test
+ # 1. Values outside the range of the `data_for_sorting` fixture
+ # 2. Values between the values in the `data_for_sorting` fixture
+ # 3. Missing values.
+ arr = self.astype(object)
+ return arr.searchsorted(value, side=side, sorter=sorter)
+
+ def _values_for_factorize(self):
+ # type: () -> Tuple[ndarray, Any]
+ """
+ Return an array and missing value suitable for factorization.
+
+ Returns
+ -------
+ values : ndarray
+ An array suitable for factorization. This should maintain order
+ and be a supported dtype (Float64, Int64, UInt64, String, Object).
+ By default, the extension array is cast to object dtype.
+ na_value : object
+ The value in `values` to consider missing. This will be treated
+ as NA in the factorization routines, so it will be coded as
+ `na_sentinel` and not included in `uniques`. By default,
+ ``np.nan`` is used.
+
+ Notes
+ -----
+ The values returned by this method are also used in
+ :func:`pandas.util.hash_pandas_object`.
+ """
+ return self.astype(object), np.nan
+
+ def factorize(self, na_sentinel=-1):
+ # type: (int) -> Tuple[ndarray, ExtensionArray]
+ """
+ Encode the extension array as an enumerated type.
+
+ Parameters
+ ----------
+ na_sentinel : int, default -1
+ Value to use in the `labels` array to indicate missing values.
+
+ Returns
+ -------
+ labels : ndarray
+ An integer NumPy array that's an indexer into the original
+ ExtensionArray.
+ uniques : ExtensionArray
+ An ExtensionArray containing the unique values of `self`.
+
+ .. note::
+
+ uniques will *not* contain an entry for the NA value of
+ the ExtensionArray if there are any missing values present
+ in `self`.
+
+ See Also
+ --------
+ pandas.factorize : Top-level factorize method that dispatches here.
+
+ Notes
+ -----
+ :meth:`pandas.factorize` offers a `sort` keyword as well.
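+
+ Examples
+ --------
+ A rough sketch using :class:`~pandas.Categorical` (results indicated
+ in comments rather than shown as output):
+
+ .. code-block:: python
+
+ cat = pd.Categorical(['a', 'b', 'a'])
+ labels, uniques = cat.factorize()
+ # labels -> array([0, 1, 0])
+ # uniques -> a Categorical containing ['a', 'b'] with cat's dtype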
+ """
+ # Implementor note: There are two ways to override the behavior of
+ # pandas.factorize
+ # 1. _values_for_factorize and _from_factorize.
+ # Specify the values passed to pandas' internal factorization
+ # routines, and how to convert from those values back to the
+ # original ExtensionArray.
+ # 2. ExtensionArray.factorize.
+ # Complete control over factorization.
+ from pandas.core.algorithms import _factorize_array
+
+ arr, na_value = self._values_for_factorize()
+
+ labels, uniques = _factorize_array(arr, na_sentinel=na_sentinel,
+ na_value=na_value)
+
+ uniques = self._from_factorized(uniques, self)
+ return labels, uniques
+
+ _extension_array_shared_docs['repeat'] = """
+ Repeat elements of a %(klass)s.
+
+ Returns a new %(klass)s where each element of the current %(klass)s
+ is repeated consecutively a given number of times.
+
+ Parameters
+ ----------
+ repeats : int or array of ints
+ The number of repetitions for each element. This should be a
+ non-negative integer. Repeating 0 times will return an empty
+ %(klass)s.
+ axis : None
+ Must be ``None``. Has no effect but is accepted for compatibility
+ with numpy.
+
+ Returns
+ -------
+ repeated_array : %(klass)s
+ Newly created %(klass)s with repeated elements.
+
+ See Also
+ --------
+ Series.repeat : Equivalent function for Series.
+ Index.repeat : Equivalent function for Index.
+ numpy.repeat : Similar method for :class:`numpy.ndarray`.
+ ExtensionArray.take : Take arbitrary positions.
+
+ Examples
+ --------
+ >>> cat = pd.Categorical(['a', 'b', 'c'])
+ >>> cat
+ [a, b, c]
+ Categories (3, object): [a, b, c]
+ >>> cat.repeat(2)
+ [a, a, b, b, c, c]
+ Categories (3, object): [a, b, c]
+ >>> cat.repeat([1, 2, 3])
+ [a, b, b, c, c, c]
+ Categories (3, object): [a, b, c]
+ """
+
+ @Substitution(klass='ExtensionArray')
+ @Appender(_extension_array_shared_docs['repeat'])
+ def repeat(self, repeats, axis=None):
+ nv.validate_repeat(tuple(), dict(axis=axis))
+ ind = np.arange(len(self)).repeat(repeats)
+ return self.take(ind)
+
+ # ------------------------------------------------------------------------
+ # Indexing methods
+ # ------------------------------------------------------------------------
+
+ def take(self, indices, allow_fill=False, fill_value=None):
+ # type: (Sequence[int], bool, Optional[Any]) -> ExtensionArray
+ """
+ Take elements from an array.
+
+ Parameters
+ ----------
+ indices : sequence of integers
+ Indices to be taken.
+ allow_fill : bool, default False
+ How to handle negative values in `indices`.
+
+ * False: negative values in `indices` indicate positional indices
+ from the right (the default). This is similar to
+ :func:`numpy.take`.
+
+ * True: negative values in `indices` indicate
+ missing values. These values are set to `fill_value`. Any other
+ negative values raise a ``ValueError``.
+
+ fill_value : any, optional
+ Fill value to use for NA-indices when `allow_fill` is True.
+ This may be ``None``, in which case the default NA value for
+ the type, ``self.dtype.na_value``, is used.
+
+ For many ExtensionArrays, there will be two representations of
+ `fill_value`: a user-facing "boxed" scalar, and a low-level
+ physical NA value. `fill_value` should be the user-facing version,
+ and the implementation should handle translating that to the
+ physical version for processing the take if necessary.
+
+ Returns
+ -------
+ ExtensionArray
+
+ Raises
+ ------
+ IndexError
+ When the indices are out of bounds for the array.
+ ValueError
+ When `indices` contains negative values other than ``-1``
+ and `allow_fill` is True.
+
+ Notes
+ -----
+ ExtensionArray.take is called by ``Series.__getitem__``, ``.loc``,
+ ``iloc``, when `indices` is a sequence of values. Additionally,
+ it's called by :meth:`Series.reindex`, or any other method
+ that causes realignment, with a `fill_value`.
+
+ See Also
+ --------
+ numpy.take
+ pandas.api.extensions.take
+
+ Examples
+ --------
+ Here's an example implementation, which relies on casting the
+ extension array to object dtype. This uses the helper method
+ :func:`pandas.api.extensions.take`.
+
+ .. code-block:: python
+
+ def take(self, indices, allow_fill=False, fill_value=None):
+ from pandas.core.algorithms import take
+
+ # If the ExtensionArray is backed by an ndarray, then
+ # just pass that here instead of coercing to object.
+ data = self.astype(object)
+
+ if allow_fill and fill_value is None:
+ fill_value = self.dtype.na_value
+
+ # fill value should always be translated from the scalar
+ # type for the array, to the physical storage type for
+ # the data, before passing to take.
+
+ result = take(data, indices, fill_value=fill_value,
+ allow_fill=allow_fill)
+ return self._from_sequence(result, dtype=self.dtype)
+ """
+ # Implementer note: The `fill_value` parameter should be a user-facing
+ # value, an instance of self.dtype.type. When passed `fill_value=None`,
+ # the default of `self.dtype.na_value` should be used.
+ # This may differ from the physical storage type your ExtensionArray
+ # uses. In this case, your implementation is responsible for casting
+ # the user-facing type to the storage type, before using
+ # pandas.api.extensions.take
+ raise AbstractMethodError(self)
+
+ def copy(self, deep=False):
+ # type: (bool) -> ExtensionArray
+ """
+ Return a copy of the array.
+
+ Parameters
+ ----------
+ deep : bool, default False
+ Also copy the underlying data backing this array.
+
+ Returns
+ -------
+ ExtensionArray
+ """
+ raise AbstractMethodError(self)
+
+ # ------------------------------------------------------------------------
+ # Printing
+ # ------------------------------------------------------------------------
+ def __repr__(self):
+ from pandas.io.formats.printing import format_object_summary
+
+ template = (
+ u'{class_name}'
+ u'{data}\n'
+ u'Length: {length}, dtype: {dtype}'
+ )
+ # the short repr has no trailing newline, while the truncated
+ # repr does. So we include a newline in our template, and strip
+ # any trailing newlines from format_object_summary
+ data = format_object_summary(self, self._formatter(),
+ indent_for_name=False).rstrip(', \n')
+ class_name = u'<{}>\n'.format(self.__class__.__name__)
+ return template.format(class_name=class_name, data=data,
+ length=len(self),
+ dtype=self.dtype)
+
+ def _formatter(self, boxed=False):
+ # type: (bool) -> Callable[[Any], Optional[str]]
+ """Formatting function for scalar values.
+
+ This is used in the default '__repr__'. The returned formatting
+ function receives instances of your scalar type.
+
+ Parameters
+ ----------
+ boxed : bool, default False
+ An indicator of whether or not your array is being printed
+ within a Series, DataFrame, or Index (True), or just by
+ itself (False). This may be useful if you want scalar values
+ to appear differently within a Series versus on its own (e.g.
+ quoted or not).
+
+ Returns
+ -------
+ Callable[[Any], str]
+ A callable that gets instances of the scalar type and
+ returns a string. By default, :func:`repr` is used
+ when ``boxed=False`` and :func:`str` is used when
+ ``boxed=True``.
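+
+ Examples
+ --------
+ A minimal sketch of an override in a hypothetical subclass (the quoting
+ behaviour shown is purely illustrative):
+
+ .. code-block:: python
+
+ def _formatter(self, boxed=False):
+ # quote scalars only when rendered inside a Series/DataFrame/Index
+ return (lambda x: "'{}'".format(x)) if boxed else repr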
+ """
+ if boxed:
+ return str
+ return repr
+
+ def _formatting_values(self):
+ # type: () -> np.ndarray
+ # At the moment, this has to be an array since we use result.dtype
+ """
+ An array of values to be printed in, e.g. the Series repr
+
+ .. deprecated:: 0.24.0
+
+ Use :meth:`ExtensionArray._formatter` instead.
+ """
+ return np.array(self)
+
+ # ------------------------------------------------------------------------
+ # Reshaping
+ # ------------------------------------------------------------------------
+
+ @classmethod
+ def _concat_same_type(cls, to_concat):
+ # type: (Sequence[ExtensionArray]) -> ExtensionArray
+ """
+ Concatenate multiple arrays.
+
+ Parameters
+ ----------
+ to_concat : sequence of this type
+
+ Returns
+ -------
+ ExtensionArray
+ """
+ raise AbstractMethodError(cls)
+
+ # The _can_hold_na attribute is set to True so that pandas internals
+ # will use the ExtensionDtype.na_value as the NA value in operations
+ # such as take(), reindex(), shift(), etc. In addition, those results
+ # will then be of the ExtensionArray subclass rather than an array
+ # of objects
+ _can_hold_na = True
+
+ @property
+ def _ndarray_values(self):
+ # type: () -> np.ndarray
+ """
+ Internal pandas method for lossy conversion to a NumPy ndarray.
+
+ This method is not part of the pandas interface.
+
+ The expectation is that this is cheap to compute, and is primarily
+ used for interacting with our indexers.
+ """
+ return np.array(self)
+
+ def _reduce(self, name, skipna=True, **kwargs):
+ """
+ Return a scalar result of performing the reduction operation.
+
+ Parameters
+ ----------
+ name : str
+ Name of the function, supported values are:
+ { any, all, min, max, sum, mean, median, prod,
+ std, var, sem, kurt, skew }.
+ skipna : bool, default True
+ If True, skip NaN values.
+ **kwargs
+ Additional keyword arguments passed to the reduction function.
+ Currently, `ddof` is the only supported kwarg.
+
+ Returns
+ -------
+ scalar
+
+ Raises
+ ------
+ TypeError : subclass does not define reductions
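+
+ Examples
+ --------
+ A hedged sketch of an override for a NumPy-backed subclass; it only
+ covers reductions that map directly onto a NumPy function:
+
+ .. code-block:: python
+
+ def _reduce(self, name, skipna=True, **kwargs):
+ data = np.asarray(self)
+ if skipna:
+ data = data[~self.isna()]
+ # e.g. name='sum' -> np.sum; 'sem', 'kurt', 'skew' need extra handling
+ return getattr(np, name)(data, **kwargs)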
+ """
+ raise TypeError("cannot perform {name} with type {dtype}".format(
+ name=name, dtype=self.dtype))
+
+
+class ExtensionOpsMixin(object):
+ """
+ A base class for linking the operators to their dunder names.
+
+ .. note::
+
+ You may want to set ``__array_priority__`` if you want your
+ implementation to be called when involved in binary operations
+ with NumPy arrays.
+ """
+
+ @classmethod
+ def _add_arithmetic_ops(cls):
+ cls.__add__ = cls._create_arithmetic_method(operator.add)
+ cls.__radd__ = cls._create_arithmetic_method(ops.radd)
+ cls.__sub__ = cls._create_arithmetic_method(operator.sub)
+ cls.__rsub__ = cls._create_arithmetic_method(ops.rsub)
+ cls.__mul__ = cls._create_arithmetic_method(operator.mul)
+ cls.__rmul__ = cls._create_arithmetic_method(ops.rmul)
+ cls.__pow__ = cls._create_arithmetic_method(operator.pow)
+ cls.__rpow__ = cls._create_arithmetic_method(ops.rpow)
+ cls.__mod__ = cls._create_arithmetic_method(operator.mod)
+ cls.__rmod__ = cls._create_arithmetic_method(ops.rmod)
+ cls.__floordiv__ = cls._create_arithmetic_method(operator.floordiv)
+ cls.__rfloordiv__ = cls._create_arithmetic_method(ops.rfloordiv)
+ cls.__truediv__ = cls._create_arithmetic_method(operator.truediv)
+ cls.__rtruediv__ = cls._create_arithmetic_method(ops.rtruediv)
+ if not PY3:
+ cls.__div__ = cls._create_arithmetic_method(operator.div)
+ cls.__rdiv__ = cls._create_arithmetic_method(ops.rdiv)
+
+ cls.__divmod__ = cls._create_arithmetic_method(divmod)
+ cls.__rdivmod__ = cls._create_arithmetic_method(ops.rdivmod)
+
+ @classmethod
+ def _add_comparison_ops(cls):
+ cls.__eq__ = cls._create_comparison_method(operator.eq)
+ cls.__ne__ = cls._create_comparison_method(operator.ne)
+ cls.__lt__ = cls._create_comparison_method(operator.lt)
+ cls.__gt__ = cls._create_comparison_method(operator.gt)
+ cls.__le__ = cls._create_comparison_method(operator.le)
+ cls.__ge__ = cls._create_comparison_method(operator.ge)
+
+
+class ExtensionScalarOpsMixin(ExtensionOpsMixin):
+ """
+ A mixin for defining ops on an ExtensionArray.
+
+ It is assumed that the underlying scalar objects have the operators
+ already defined.
+
+ Notes
+ -----
+ If you want a subclass MyExtensionArray to have the arithmetic
+ operators, define it as MyExtensionArray(ExtensionArray,
+ ExtensionScalarOpsMixin). After the definition of MyExtensionArray,
+ insert the lines
+
+ MyExtensionArray._add_arithmetic_ops()
+ MyExtensionArray._add_comparison_ops()
+
+ to link the operators to your class.
+
+ .. note::
+
+ You may want to set ``__array_priority__`` if you want your
+ implementation to be called when involved in binary operations
+ with NumPy arrays.
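+
+ Examples
+ --------
+ A minimal sketch of the wiring described above (``MyExtensionArray`` is
+ a hypothetical subclass that still has to implement the abstract
+ ExtensionArray methods):
+
+ .. code-block:: python
+
+ class MyExtensionArray(ExtensionArray, ExtensionScalarOpsMixin):
+ pass
+
+ MyExtensionArray._add_arithmetic_ops()
+ MyExtensionArray._add_comparison_ops()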
+ """
+
+ @classmethod
+ def _create_method(cls, op, coerce_to_dtype=True):
+ """
+ A class method that returns a method that will correspond to an
+ operator for an ExtensionArray subclass, by dispatching to the
+ relevant operator defined on the individual elements of the
+ ExtensionArray.
+
+ Parameters
+ ----------
+ op : function
+ An operator that takes arguments op(a, b)
+ coerce_to_dtype : bool, default True
+ boolean indicating whether to attempt to convert
+ the result to the underlying ExtensionArray dtype.
+ If it's not possible to create a new ExtensionArray with the
+ values, an ndarray is returned instead.
+
+ Returns
+ -------
+ Callable[[Any, Any], Union[ndarray, ExtensionArray]]
+ A method that can be bound to a class. When used, the method
+ receives the two arguments, one of which is the instance of
+ this class, and should return an ExtensionArray or an ndarray.
+
+ Returning an ndarray may be necessary when the result of the
+ `op` cannot be stored in the ExtensionArray. The dtype of the
+ ndarray uses NumPy's normal inference rules.
+
+ Examples
+ --------
+ Given an ExtensionArray subclass called MyExtensionArray, use
+
+ >>> __add__ = cls._create_method(operator.add)
+
+ in the class definition of MyExtensionArray to create the operator
+ for addition, which will be based on the operator implementation
+ of the underlying elements of the ExtensionArray.
+ """
+
+ def _binop(self, other):
+ def convert_values(param):
+ if isinstance(param, ExtensionArray) or is_list_like(param):
+ ovalues = param
+ else: # Assume it's an object
+ ovalues = [param] * len(self)
+ return ovalues
+
+ if isinstance(other, (ABCSeries, ABCIndexClass)):
+ # rely on pandas to unbox and dispatch to us
+ return NotImplemented
+
+ lvalues = self
+ rvalues = convert_values(other)
+
+ # If the operator is not defined for the underlying objects,
+ # a TypeError should be raised
+ res = [op(a, b) for (a, b) in zip(lvalues, rvalues)]
+
+ def _maybe_convert(arr):
+ if coerce_to_dtype:
+ # https://github.com/pandas-dev/pandas/issues/22850
+ # We catch all regular exceptions here, and fall back
+ # to an ndarray.
+ try:
+ res = self._from_sequence(arr)
+ except Exception:
+ res = np.asarray(arr)
+ else:
+ res = np.asarray(arr)
+ return res
+
+ if op.__name__ in {'divmod', 'rdivmod'}:
+ a, b = zip(*res)
+ res = _maybe_convert(a), _maybe_convert(b)
+ else:
+ res = _maybe_convert(res)
+ return res
+
+ op_name = ops._get_op_name(op, True)
+ return set_function_name(_binop, op_name, cls)
+
+ @classmethod
+ def _create_arithmetic_method(cls, op):
+ return cls._create_method(op)
+
+ @classmethod
+ def _create_comparison_method(cls, op):
+ return cls._create_method(op, coerce_to_dtype=False)
diff --git a/contrib/python/pandas/py2/pandas/core/arrays/categorical.py b/contrib/python/pandas/py2/pandas/core/arrays/categorical.py
new file mode 100644
index 00000000000..73a03b4f71b
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/core/arrays/categorical.py
@@ -0,0 +1,2708 @@
+# pylint: disable=E1101,W0232
+
+import textwrap
+from warnings import warn
+
+import numpy as np
+
+from pandas._libs import algos as libalgos, lib
+import pandas.compat as compat
+from pandas.compat import lzip, u
+from pandas.compat.numpy import function as nv
+from pandas.util._decorators import (
+ Appender, Substitution, cache_readonly, deprecate_kwarg)
+from pandas.util._validators import validate_bool_kwarg, validate_fillna_kwargs
+
+from pandas.core.dtypes.cast import (
+ coerce_indexer_dtype, maybe_infer_to_datetimelike)
+from pandas.core.dtypes.common import (
+ ensure_int64, ensure_object, ensure_platform_int, is_categorical,
+ is_categorical_dtype, is_datetime64_dtype, is_datetimelike, is_dict_like,
+ is_dtype_equal, is_extension_array_dtype, is_float_dtype, is_integer_dtype,
+ is_iterator, is_list_like, is_object_dtype, is_scalar, is_sequence,
+ is_timedelta64_dtype)
+from pandas.core.dtypes.dtypes import CategoricalDtype
+from pandas.core.dtypes.generic import (
+ ABCCategoricalIndex, ABCDataFrame, ABCIndexClass, ABCSeries)
+from pandas.core.dtypes.inference import is_hashable
+from pandas.core.dtypes.missing import isna, notna
+
+from pandas.core.accessor import PandasDelegate, delegate_names
+import pandas.core.algorithms as algorithms
+from pandas.core.algorithms import factorize, take, take_1d, unique1d
+from pandas.core.base import NoNewAttributesMixin, PandasObject, _shared_docs
+import pandas.core.common as com
+from pandas.core.config import get_option
+from pandas.core.missing import interpolate_2d
+from pandas.core.sorting import nargsort
+
+from pandas.io.formats import console
+from pandas.io.formats.terminal import get_terminal_size
+
+from .base import ExtensionArray, _extension_array_shared_docs
+
+_take_msg = textwrap.dedent("""\
+ Interpreting negative values in 'indexer' as missing values.
+ In the future, this will change to meaning positional indices
+ from the right.
+
+ Use 'allow_fill=True' to retain the previous behavior and silence this
+ warning.
+
+ Use 'allow_fill=False' to accept the new behavior.""")
+
+
+def _cat_compare_op(op):
+ def f(self, other):
+ # On python2, you can usually compare any type to any type, and
+ # Categoricals can be seen as a custom type, but having different
+ # results depending on whether categories are the same or not is kind of
+ # insane, so be a bit stricter here and use the python3 idea of
+ # comparing only things of equal type.
+ if isinstance(other, (ABCDataFrame, ABCSeries, ABCIndexClass)):
+ return NotImplemented
+
+ other = lib.item_from_zerodim(other)
+
+ if not self.ordered:
+ if op in ['__lt__', '__gt__', '__le__', '__ge__']:
+ raise TypeError("Unordered Categoricals can only compare "
+ "equality or not")
+ if isinstance(other, Categorical):
+ # Two Categoricals can only be compared if the categories are
+ # the same (maybe up to ordering, depending on ordered)
+
+ msg = ("Categoricals can only be compared if "
+ "'categories' are the same.")
+ if len(self.categories) != len(other.categories):
+ raise TypeError(msg + " Categories are different lengths")
+ elif (self.ordered and not (self.categories ==
+ other.categories).all()):
+ raise TypeError(msg)
+ elif not set(self.categories) == set(other.categories):
+ raise TypeError(msg)
+
+ if not (self.ordered == other.ordered):
+ raise TypeError("Categoricals can only be compared if "
+ "'ordered' is the same")
+ if not self.ordered and not self.categories.equals(
+ other.categories):
+ # both unordered and different order
+ other_codes = _get_codes_for_values(other, self.categories)
+ else:
+ other_codes = other._codes
+
+ na_mask = (self._codes == -1) | (other_codes == -1)
+ f = getattr(self._codes, op)
+ ret = f(other_codes)
+ if na_mask.any():
+ # In other Series, this leads to False, so do that here too
+ ret[na_mask] = False
+ return ret
+
+ # Numpy < 1.13 may convert a scalar to a zerodim array during
+ # comparison operation when second arg has higher priority, e.g.
+ #
+ # cat[0] < cat
+ #
+ # With cat[0], for example, what starts as ``np.int64(1)`` may have
+ # become ``np.array(1)`` by the time it gets into this function.
+ if is_scalar(other):
+ if other in self.categories:
+ i = self.categories.get_loc(other)
+ return getattr(self._codes, op)(i)
+ else:
+ if op == '__eq__':
+ return np.repeat(False, len(self))
+ elif op == '__ne__':
+ return np.repeat(True, len(self))
+ else:
+ msg = ("Cannot compare a Categorical for op {op} with a "
+ "scalar, which is not a category.")
+ raise TypeError(msg.format(op=op))
+ else:
+
+ # allow categorical vs object dtype array comparisons for equality
+ # these are only positional comparisons
+ if op in ['__eq__', '__ne__']:
+ return getattr(np.array(self), op)(np.array(other))
+
+ msg = ("Cannot compare a Categorical for op {op} with type {typ}."
+ "\nIf you want to compare values, use 'np.asarray(cat) "
+ "<op> other'.")
+ raise TypeError(msg.format(op=op, typ=type(other)))
+
+ f.__name__ = op
+
+ return f
+
+
+def _maybe_to_categorical(array):
+ """
+ Coerce to a categorical if a series is given.
+
+ Internal use ONLY.
+ """
+ if isinstance(array, (ABCSeries, ABCCategoricalIndex)):
+ return array._values
+ elif isinstance(array, np.ndarray):
+ return Categorical(array)
+ return array
+
+
+def contains(cat, key, container):
+ """
+ Helper for membership check for ``key`` in ``cat``.
+
+ This is a helper method for :meth:`Categorical.__contains__`
+ and :meth:`CategoricalIndex.__contains__`.
+
+ Returns True if ``key`` is in ``cat.categories`` and the
+ location of ``key`` in ``categories`` is in ``container``.
+
+ Parameters
+ ----------
+ cat : :class:`Categorical` or :class:`CategoricalIndex`
+ key : a hashable object
+ The key to check membership for.
+ container : Container (e.g. list-like or mapping)
+ The container to check for membership in.
+
+ Returns
+ -------
+ is_in : bool
+ True if ``key`` is in ``self.categories`` and location of
+ ``key`` in ``categories`` is in ``container``, else False.
+
+ Notes
+ -----
+ This method does not check for NaN values. Do that separately
+ before calling this method.
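+
+ Examples
+ --------
+ An informal example; the container is typically the array of codes:
+
+ >>> cat = pd.Categorical(['a'], categories=['a', 'b'])
+ >>> contains(cat, 'a', container=cat._codes)
+ True
+ >>> contains(cat, 'b', container=cat._codes)
+ False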
+ """
+ hash(key)
+
+ # get location of key in categories.
+ # If a KeyError, the key isn't in categories, so logically
+ # can't be in container either.
+ try:
+ loc = cat.categories.get_loc(key)
+ except KeyError:
+ return False
+
+ # loc is the location of key in categories, but also the *value*
+ # for key in container. So, `key` may be in categories,
+ # but still not in `container`. Example ('b' in categories,
+ # but not in values):
+ # 'b' in Categorical(['a'], categories=['a', 'b']) # False
+ if is_scalar(loc):
+ return loc in container
+ else:
+ # if categories is an IntervalIndex, loc is an array.
+ return any(loc_ in container for loc_ in loc)
+
+
+_codes_doc = """\
+The category codes of this categorical.
+
+ Level codes are an array of integers which are the positions of the
+ real values in the categories array.
+
+ There is no setter; use the other categorical methods and the normal
+ item setter to change values in the categorical.
+"""
+
+
+class Categorical(ExtensionArray, PandasObject):
+ """
+ Represents a categorical variable in classic R / S-plus fashion.
+
+ `Categoricals` can only take on a limited, and usually fixed, number
+ of possible values (`categories`). In contrast to statistical categorical
+ variables, a `Categorical` might have an order, but numerical operations
+ (additions, divisions, ...) are not possible.
+
+ All values of the `Categorical` are either in `categories` or `np.nan`.
+ Assigning values outside of `categories` will raise a `ValueError`. Order
+ is defined by the order of the `categories`, not lexical order of the
+ values.
+
+ Parameters
+ ----------
+ values : list-like
+ The values of the categorical. If categories are given, values not in
+ categories will be replaced with NaN.
+ categories : Index-like (unique), optional
+ The unique categories for this categorical. If not given, the
+ categories are assumed to be the unique values of `values` (sorted, if
+ possible, otherwise in the order in which they appear).
+ ordered : boolean, default False
+ Whether or not this categorical is treated as an ordered categorical.
+ If True, the resulting categorical will be ordered.
+ An ordered categorical respects, when sorted, the order of its
+ `categories` attribute (which in turn is the `categories` argument, if
+ provided).
+ dtype : CategoricalDtype
+ An instance of ``CategoricalDtype`` to use for this categorical
+
+ .. versionadded:: 0.21.0
+
+ Attributes
+ ----------
+ categories : Index
+ The categories of this categorical
+ codes : ndarray
+ The codes (integer positions, which point to the categories) of this
+ categorical, read only.
+ ordered : boolean
+ Whether or not this Categorical is ordered.
+ dtype : CategoricalDtype
+ The instance of ``CategoricalDtype`` storing the ``categories``
+ and ``ordered``.
+
+ .. versionadded:: 0.21.0
+
+ Methods
+ -------
+ from_codes
+ __array__
+
+ Raises
+ ------
+ ValueError
+ If the categories do not validate.
+ TypeError
+ If an explicit ``ordered=True`` is given but no `categories` and the
+ `values` are not sortable.
+
+ See Also
+ --------
+ pandas.api.types.CategoricalDtype : Type for categorical data.
+ CategoricalIndex : An Index with an underlying ``Categorical``.
+
+ Notes
+ -----
+ See the `user guide
+ <http://pandas.pydata.org/pandas-docs/stable/categorical.html>`_ for more.
+
+ Examples
+ --------
+ >>> pd.Categorical([1, 2, 3, 1, 2, 3])
+ [1, 2, 3, 1, 2, 3]
+ Categories (3, int64): [1, 2, 3]
+
+ >>> pd.Categorical(['a', 'b', 'c', 'a', 'b', 'c'])
+ [a, b, c, a, b, c]
+ Categories (3, object): [a, b, c]
+
+ Ordered `Categoricals` can be sorted according to the custom order
+ of the categories and can have a min and max value.
+
+ >>> c = pd.Categorical(['a','b','c','a','b','c'], ordered=True,
+ ... categories=['c', 'b', 'a'])
+ >>> c
+ [a, b, c, a, b, c]
+ Categories (3, object): [c < b < a]
+ >>> c.min()
+ 'c'
+ """
+
+ # For comparisons, so that numpy uses our implementation of the compare
+ # ops, which raise
+ __array_priority__ = 1000
+ _dtype = CategoricalDtype(ordered=False)
+ # tolist is not actually deprecated, just suppressed in the __dir__
+ _deprecations = frozenset(['labels', 'tolist'])
+ _typ = 'categorical'
+
+ def __init__(self, values, categories=None, ordered=None, dtype=None,
+ fastpath=False):
+
+ dtype = CategoricalDtype._from_values_or_dtype(values, categories,
+ ordered, dtype)
+ # At this point, dtype is always a CategoricalDtype, but
+ # we may have dtype.categories be None, and we need to
+ # infer categories in a factorization step further below
+
+ if fastpath:
+ self._codes = coerce_indexer_dtype(values, dtype.categories)
+ self._dtype = self._dtype.update_dtype(dtype)
+ return
+
+ # null_mask indicates missing values we want to exclude from inference.
+ # This means: only missing values in list-likes (not arrays/ndframes).
+ null_mask = np.array(False)
+
+ # sanitize input
+ if is_categorical_dtype(values):
+ if dtype.categories is None:
+ dtype = CategoricalDtype(values.categories, dtype.ordered)
+ elif not isinstance(values, (ABCIndexClass, ABCSeries)):
+ # sanitize_array coerces np.nan to a string under certain versions
+ # of numpy
+ values = maybe_infer_to_datetimelike(values, convert_dates=True)
+ if not isinstance(values, np.ndarray):
+ values = _convert_to_list_like(values)
+ from pandas.core.internals.construction import sanitize_array
+ # By convention, empty lists result in object dtype:
+ if len(values) == 0:
+ sanitize_dtype = 'object'
+ else:
+ sanitize_dtype = None
+ null_mask = isna(values)
+ if null_mask.any():
+ values = [values[idx] for idx in np.where(~null_mask)[0]]
+ values = sanitize_array(values, None, dtype=sanitize_dtype)
+
+ if dtype.categories is None:
+ try:
+ codes, categories = factorize(values, sort=True)
+ except TypeError:
+ codes, categories = factorize(values, sort=False)
+ if dtype.ordered:
+ # raise, as we don't have a sortable data structure and so
+ # the user should give us one by specifying categories
+ raise TypeError("'values' is not ordered, please "
+ "explicitly specify the categories order "
+ "by passing in a categories argument.")
+ except ValueError:
+
+ # FIXME
+ raise NotImplementedError("> 1 ndim Categorical are not "
+ "supported at this time")
+
+ # we're inferring from values
+ dtype = CategoricalDtype(categories, dtype.ordered)
+
+ elif is_categorical_dtype(values):
+ old_codes = (values._values.codes if isinstance(values, ABCSeries)
+ else values.codes)
+ codes = _recode_for_categories(old_codes, values.dtype.categories,
+ dtype.categories)
+
+ else:
+ codes = _get_codes_for_values(values, dtype.categories)
+
+ if null_mask.any():
+ # Reinsert -1 placeholders for previously removed missing values
+ full_codes = - np.ones(null_mask.shape, dtype=codes.dtype)
+ full_codes[~null_mask] = codes
+ codes = full_codes
+
+ self._dtype = self._dtype.update_dtype(dtype)
+ self._codes = coerce_indexer_dtype(codes, dtype.categories)
+
+ @property
+ def categories(self):
+ """
+ The categories of this categorical.
+
+ Setting assigns new values to each category (effectively a rename of
+ each individual category).
+
+ The assigned value has to be a list-like object. All items must be
+ unique and the number of items in the new categories must be the same
+ as the number of items in the old categories.
+
+ Assigning to `categories` is an inplace operation!
+
+ Raises
+ ------
+ ValueError
+ If the new categories do not validate as categories or if the
+ number of new categories is unequal to the number of old categories.
+
+ See Also
+ --------
+ rename_categories
+ reorder_categories
+ add_categories
+ remove_categories
+ remove_unused_categories
+ set_categories
+ """
+ return self.dtype.categories
+
+ @categories.setter
+ def categories(self, categories):
+ new_dtype = CategoricalDtype(categories, ordered=self.ordered)
+ if (self.dtype.categories is not None and
+ len(self.dtype.categories) != len(new_dtype.categories)):
+ raise ValueError("new categories need to have the same number of "
+ "items as the old categories!")
+ self._dtype = new_dtype
+
+ @property
+ def ordered(self):
+ """
+ Whether the categories have an ordered relationship.
+ """
+ return self.dtype.ordered
+
+ @property
+ def dtype(self):
+ """
+ The :class:`~pandas.api.types.CategoricalDtype` for this instance
+ """
+ return self._dtype
+
+ @property
+ def _ndarray_values(self):
+ return self.codes
+
+ @property
+ def _constructor(self):
+ return Categorical
+
+ @classmethod
+ def _from_sequence(cls, scalars, dtype=None, copy=False):
+ return Categorical(scalars, dtype=dtype)
+
+ def _formatter(self, boxed=False):
+ # Defer to CategoricalFormatter's formatter.
+ return None
+
+ def copy(self):
+ """
+ Copy constructor.
+ """
+ return self._constructor(values=self._codes.copy(),
+ dtype=self.dtype,
+ fastpath=True)
+
+ def astype(self, dtype, copy=True):
+ """
+ Coerce this type to another dtype
+
+ Parameters
+ ----------
+ dtype : numpy dtype or pandas type
+ copy : bool, default True
+ By default, astype always returns a newly allocated object.
+ If copy is set to False and dtype is categorical, the original
+ object is returned.
+
+ .. versionadded:: 0.19.0
+
+ """
+ if is_categorical_dtype(dtype):
+ # GH 10696/18593
+ dtype = self.dtype.update_dtype(dtype)
+ self = self.copy() if copy else self
+ if dtype == self.dtype:
+ return self
+ return self._set_dtype(dtype)
+ return np.array(self, dtype=dtype, copy=copy)
+
+ @cache_readonly
+ def ndim(self):
+ """
+ Number of dimensions of the Categorical
+ """
+ return self._codes.ndim
+
+ @cache_readonly
+ def size(self):
+ """
+ Return the length of the Categorical.
+ """
+ return len(self)
+
+ @cache_readonly
+ def itemsize(self):
+ """
+ Return the size of a single category.
+ """
+ return self.categories.itemsize
+
+ def tolist(self):
+ """
+ Return a list of the values.
+
+ These are each a scalar type, which is a Python scalar
+ (for str, int, float) or a pandas scalar
+ (for Timestamp/Timedelta/Interval/Period)
+ """
+ return list(self)
+
+ to_list = tolist
+
+ @property
+ def base(self):
+ """
+ Compat; we are always our own object.
+ """
+ return None
+
+ @classmethod
+ def _from_inferred_categories(cls, inferred_categories, inferred_codes,
+ dtype, true_values=None):
+ """
+ Construct a Categorical from inferred values.
+
+ For inferred categories (`dtype` is None) the categories are sorted.
+ For explicit `dtype`, the `inferred_categories` are cast to the
+ appropriate type.
+
+ Parameters
+ ----------
+ inferred_categories : Index
+ inferred_codes : Index
+ dtype : CategoricalDtype or 'category'
+ true_values : list, optional
+ If none are provided, the default ones are
+ "True", "TRUE", and "true".
+
+ Returns
+ -------
+ Categorical
+ """
+ from pandas import Index, to_numeric, to_datetime, to_timedelta
+
+ cats = Index(inferred_categories)
+ known_categories = (isinstance(dtype, CategoricalDtype) and
+ dtype.categories is not None)
+
+ if known_categories:
+ # Convert to a specialized type with `dtype` if specified.
+ if dtype.categories.is_numeric():
+ cats = to_numeric(inferred_categories, errors="coerce")
+ elif is_datetime64_dtype(dtype.categories):
+ cats = to_datetime(inferred_categories, errors="coerce")
+ elif is_timedelta64_dtype(dtype.categories):
+ cats = to_timedelta(inferred_categories, errors="coerce")
+ elif dtype.categories.is_boolean():
+ if true_values is None:
+ true_values = ["True", "TRUE", "true"]
+
+ cats = cats.isin(true_values)
+
+ if known_categories:
+ # Recode from observation order to dtype.categories order.
+ categories = dtype.categories
+ codes = _recode_for_categories(inferred_codes, cats, categories)
+ elif not cats.is_monotonic_increasing:
+ # Sort categories and recode for unknown categories.
+ unsorted = cats.copy()
+ categories = cats.sort_values()
+
+ codes = _recode_for_categories(inferred_codes, unsorted,
+ categories)
+ dtype = CategoricalDtype(categories, ordered=False)
+ else:
+ dtype = CategoricalDtype(cats, ordered=False)
+ codes = inferred_codes
+
+ return cls(codes, dtype=dtype, fastpath=True)
+
+ @classmethod
+ def from_codes(cls, codes, categories=None, ordered=None, dtype=None):
+ """
+ Make a Categorical type from codes and categories or dtype.
+
+ This constructor is useful if you already have codes and
+ categories/dtype and so do not need the (computationally intensive)
+ factorization step, which is usually done in the constructor.
+
+ If your data does not follow this convention, please use the normal
+ constructor.
+
+ Parameters
+ ----------
+ codes : array-like, integers
+ An integer array, where each integer points to a category in
+ categories or dtype.categories, or else is -1 for NaN
+ categories : index-like, optional
+ The categories for the categorical. Items need to be unique.
+ If the categories are not given here, then they must be provided
+ in `dtype`.
+ ordered : bool, optional
+ Whether or not this categorical is treated as an ordered
+ categorical. If not given here or in `dtype`, the resulting
+ categorical will be unordered.
+ dtype : CategoricalDtype or the string "category", optional
+ If :class:`CategoricalDtype`, cannot be used together with
+ `categories` or `ordered`.
+
+ .. versionadded:: 0.24.0
+
+ When `dtype` is provided, neither `categories` nor `ordered`
+ should be provided.
+
+ Examples
+ --------
+ >>> dtype = pd.CategoricalDtype(['a', 'b'], ordered=True)
+ >>> pd.Categorical.from_codes(codes=[0, 1, 0, 1], dtype=dtype)
+ [a, b, a, b]
+ Categories (2, object): [a < b]
+ """
+ dtype = CategoricalDtype._from_values_or_dtype(categories=categories,
+ ordered=ordered,
+ dtype=dtype)
+ if dtype.categories is None:
+ msg = ("The categories must be provided in 'categories' or "
+ "'dtype'. Both were None.")
+ raise ValueError(msg)
+
+ codes = np.asarray(codes) # #21767
+ if not is_integer_dtype(codes):
+ msg = "codes need to be array-like integers"
+ if is_float_dtype(codes):
+ icodes = codes.astype('i8')
+ if (icodes == codes).all():
+ msg = None
+ codes = icodes
+ warn(("float codes will be disallowed in the future and "
+ "raise a ValueError"), FutureWarning, stacklevel=2)
+ if msg:
+ raise ValueError(msg)
+
+ if len(codes) and (
+ codes.max() >= len(dtype.categories) or codes.min() < -1):
+ raise ValueError("codes need to be between -1 and "
+ "len(categories)-1")
+
+ return cls(codes, dtype=dtype, fastpath=True)
+
+ _codes = None
+
+ def _get_codes(self):
+ """
+ Get the codes.
+
+ Returns
+ -------
+ codes : integer array view
+ A non-writable view of the `codes` array.
+ """
+ v = self._codes.view()
+ v.flags.writeable = False
+ return v
+
+ def _set_codes(self, codes):
+ """
+ Not settable by the user directly
+ """
+ raise ValueError("cannot set Categorical codes directly")
+
+ codes = property(fget=_get_codes, fset=_set_codes, doc=_codes_doc)
+
+ def _set_categories(self, categories, fastpath=False):
+ """
+ Sets new categories inplace
+
+ Parameters
+ ----------
+ fastpath : boolean (default: False)
+ Don't perform validation of the categories for uniqueness or nulls
+
+ Examples
+ --------
+ >>> c = pd.Categorical(['a', 'b'])
+ >>> c
+ [a, b]
+ Categories (2, object): [a, b]
+
+ >>> c._set_categories(pd.Index(['a', 'c']))
+ >>> c
+ [a, c]
+ Categories (2, object): [a, c]
+ """
+
+ if fastpath:
+ new_dtype = CategoricalDtype._from_fastpath(categories,
+ self.ordered)
+ else:
+ new_dtype = CategoricalDtype(categories, ordered=self.ordered)
+ if (not fastpath and self.dtype.categories is not None and
+ len(new_dtype.categories) != len(self.dtype.categories)):
+ raise ValueError("new categories need to have the same number of "
+ "items than the old categories!")
+
+ self._dtype = new_dtype
+
+ def _set_dtype(self, dtype):
+ """
+ Internal method for directly updating the CategoricalDtype
+
+ Parameters
+ ----------
+ dtype : CategoricalDtype
+
+ Notes
+ -----
+ We don't do any validation here. It's assumed that the dtype is
+ a (valid) instance of `CategoricalDtype`.
+ """
+ codes = _recode_for_categories(self.codes, self.categories,
+ dtype.categories)
+ return type(self)(codes, dtype=dtype, fastpath=True)
+
+ def set_ordered(self, value, inplace=False):
+ """
+ Sets the ordered attribute to the boolean value
+
+ Parameters
+ ----------
+ value : boolean
+ Whether this categorical is ordered (True) or not (False).
+ inplace : boolean (default: False)
+ Whether or not to set the ordered attribute inplace or return a copy
+ of this categorical with ordered set to the value
+ """
+ inplace = validate_bool_kwarg(inplace, 'inplace')
+ new_dtype = CategoricalDtype(self.categories, ordered=value)
+ cat = self if inplace else self.copy()
+ cat._dtype = new_dtype
+ if not inplace:
+ return cat
+
+ def as_ordered(self, inplace=False):
+ """
+ Set the Categorical to be ordered.
+
+ Parameters
+ ----------
+ inplace : boolean (default: False)
+ Whether or not to set the ordered attribute inplace or return a copy
+ of this categorical with ordered set to True
+ """
+ inplace = validate_bool_kwarg(inplace, 'inplace')
+ return self.set_ordered(True, inplace=inplace)
+
+ def as_unordered(self, inplace=False):
+ """
+ Set the Categorical to be unordered.
+
+ Parameters
+ ----------
+ inplace : boolean (default: False)
+ Whether or not to set the ordered attribute inplace or return a copy
+ of this categorical with ordered set to False
+ """
+ inplace = validate_bool_kwarg(inplace, 'inplace')
+ return self.set_ordered(False, inplace=inplace)
+
+ def set_categories(self, new_categories, ordered=None, rename=False,
+ inplace=False):
+ """
+ Sets the categories to the specified new_categories.
+
+ `new_categories` can include new categories (which will result in
+ unused categories) or remove old categories (which results in values
+ set to NaN). If `rename==True`, the categories will simply be renamed
+ (fewer or more items than in the old categories will result in values
+ set to NaN or in unused categories, respectively).
+
+ This method can be used to perform more than one action of adding,
+ removing, and reordering simultaneously and is therefore faster than
+ performing the individual steps via the more specialised methods.
+
+ On the other hand this method does not do checks (e.g., whether the
+ old categories are included in the new categories on a reorder), which
+ can result in surprising changes, for example when using special string
+ dtypes on python3, which do not consider an S1 string equal to a
+ single-char python string.
+
+ Parameters
+ ----------
+ new_categories : Index-like
+ The categories in new order.
+ ordered : boolean, optional
+ Whether or not the categorical is treated as an ordered categorical.
+ If not given, do not change the ordered information.
+ rename : boolean (default: False)
+ Whether or not the new_categories should be considered as a rename
+ of the old categories or as reordered categories.
+ inplace : boolean (default: False)
+ Whether or not to reorder the categories inplace or return a copy of
+ this categorical with reordered categories.
+
+ Returns
+ -------
+ cat : Categorical with reordered categories or None if inplace.
+
+ Raises
+ ------
+ ValueError
+ If new_categories does not validate as categories
+
+ See Also
+ --------
+ rename_categories
+ reorder_categories
+ add_categories
+ remove_categories
+ remove_unused_categories
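+
+ Examples
+ --------
+ For example, adding an unused category while reordering leaves the
+ values themselves unchanged:
+
+ >>> c = pd.Categorical(['a', 'b', 'c'])
+ >>> c.set_categories(['c', 'b', 'a', 'd'])
+ [a, b, c]
+ Categories (4, object): [c, b, a, d]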
+ """
+ inplace = validate_bool_kwarg(inplace, 'inplace')
+ if ordered is None:
+ ordered = self.dtype.ordered
+ new_dtype = CategoricalDtype(new_categories, ordered=ordered)
+
+ cat = self if inplace else self.copy()
+ if rename:
+ if (cat.dtype.categories is not None and
+ len(new_dtype.categories) < len(cat.dtype.categories)):
+ # remove all _codes which are larger and set to -1/NaN
+ cat._codes[cat._codes >= len(new_dtype.categories)] = -1
+ else:
+ codes = _recode_for_categories(cat.codes, cat.categories,
+ new_dtype.categories)
+ cat._codes = codes
+ cat._dtype = new_dtype
+
+ if not inplace:
+ return cat
+
+ def rename_categories(self, new_categories, inplace=False):
+ """
+ Renames categories.
+
+ Parameters
+ ----------
+ new_categories : list-like, dict-like or callable
+
+ * list-like: all items must be unique and the number of items in
+ the new categories must match the existing number of categories.
+
+ * dict-like: specifies a mapping from
+ old categories to new. Categories not contained in the mapping
+ are passed through and extra categories in the mapping are
+ ignored.
+
+ .. versionadded:: 0.21.0
+
+ * callable : a callable that is called on all items in the old
+ categories and whose return values comprise the new categories.
+
+ .. versionadded:: 0.23.0
+
+ .. warning::
+
+ Currently, Series are considered list like. In a future version
+ of pandas they'll be considered dict-like.
+
+ inplace : boolean (default: False)
+ Whether or not to rename the categories inplace or return a copy of
+ this categorical with renamed categories.
+
+ Returns
+ -------
+ cat : Categorical or None
+ With ``inplace=False``, the new categorical is returned.
+ With ``inplace=True``, there is no return value.
+
+ Raises
+ ------
+ ValueError
+ If new categories are list-like and do not have the same number of
+ items as the current categories or do not validate as categories
+
+ See Also
+ --------
+ reorder_categories
+ add_categories
+ remove_categories
+ remove_unused_categories
+ set_categories
+
+ Examples
+ --------
+ >>> c = pd.Categorical(['a', 'a', 'b'])
+ >>> c.rename_categories([0, 1])
+ [0, 0, 1]
+ Categories (2, int64): [0, 1]
+
+ For dict-like ``new_categories``, extra keys are ignored and
+ categories not in the dictionary are passed through
+
+ >>> c.rename_categories({'a': 'A', 'c': 'C'})
+ [A, A, b]
+ Categories (2, object): [A, b]
+
+ You may also provide a callable to create the new categories
+
+ >>> c.rename_categories(lambda x: x.upper())
+ [A, A, B]
+ Categories (2, object): [A, B]
+ """
+ inplace = validate_bool_kwarg(inplace, 'inplace')
+ cat = self if inplace else self.copy()
+
+ if isinstance(new_categories, ABCSeries):
+ msg = ("Treating Series 'new_categories' as a list-like and using "
+ "the values. In a future version, 'rename_categories' will "
+ "treat Series like a dictionary.\n"
+ "For dict-like, use 'new_categories.to_dict()'\n"
+ "For list-like, use 'new_categories.values'.")
+ warn(msg, FutureWarning, stacklevel=2)
+ new_categories = list(new_categories)
+
+ if is_dict_like(new_categories):
+ cat.categories = [new_categories.get(item, item)
+ for item in cat.categories]
+ elif callable(new_categories):
+ cat.categories = [new_categories(item) for item in cat.categories]
+ else:
+ cat.categories = new_categories
+ if not inplace:
+ return cat
+
+ def reorder_categories(self, new_categories, ordered=None, inplace=False):
+ """
+ Reorders categories as specified in new_categories.
+
+ `new_categories` need to include all old categories and no new category
+ items.
+
+ Parameters
+ ----------
+ new_categories : Index-like
+ The categories in new order.
+ ordered : boolean, optional
+ Whether or not the categorical is treated as an ordered categorical.
+ If not given, do not change the ordered information.
+ inplace : boolean (default: False)
+ Whether or not to reorder the categories inplace or return a copy of
+ this categorical with reordered categories.
+
+ Returns
+ -------
+ cat : Categorical with reordered categories or None if inplace.
+
+ Raises
+ ------
+ ValueError
+ If the new categories do not contain all old category items or any
+ new ones
+
+ See Also
+ --------
+ rename_categories
+ add_categories
+ remove_categories
+ remove_unused_categories
+ set_categories
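+
+ Examples
+ --------
+ For example, reordering the categories and marking them as ordered:
+
+ >>> c = pd.Categorical(['a', 'b', 'c'])
+ >>> c.reorder_categories(['c', 'b', 'a'], ordered=True)
+ [a, b, c]
+ Categories (3, object): [c < b < a]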
+ """
+ inplace = validate_bool_kwarg(inplace, 'inplace')
+ if set(self.dtype.categories) != set(new_categories):
+ raise ValueError("items in new_categories are not the same as in "
+ "old categories")
+ return self.set_categories(new_categories, ordered=ordered,
+ inplace=inplace)
+
+ def add_categories(self, new_categories, inplace=False):
+ """
+ Add new categories.
+
+ `new_categories` will be included at the last/highest place in the
+ categories and will be unused directly after this call.
+
+ Parameters
+ ----------
+ new_categories : category or list-like of category
+ The new categories to be included.
+ inplace : boolean (default: False)
+ Whether or not to add the categories inplace or return a copy of
+ this categorical with added categories.
+
+ Returns
+ -------
+ cat : Categorical with new categories added or None if inplace.
+
+ Raises
+ ------
+ ValueError
+ If the new categories include old categories or do not validate as
+ categories
+
+ See Also
+ --------
+ rename_categories
+ reorder_categories
+ remove_categories
+ remove_unused_categories
+ set_categories
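+
+ Examples
+ --------
+ For example, the added category is appended and initially unused:
+
+ >>> c = pd.Categorical(['a', 'b'])
+ >>> c.add_categories(['c'])
+ [a, b]
+ Categories (3, object): [a, b, c]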
+ """
+ inplace = validate_bool_kwarg(inplace, 'inplace')
+ if not is_list_like(new_categories):
+ new_categories = [new_categories]
+ already_included = set(new_categories) & set(self.dtype.categories)
+ if len(already_included) != 0:
+ msg = ("new categories must not include old categories: "
+ "{already_included!s}")
+ raise ValueError(msg.format(already_included=already_included))
+ new_categories = list(self.dtype.categories) + list(new_categories)
+ new_dtype = CategoricalDtype(new_categories, self.ordered)
+
+ cat = self if inplace else self.copy()
+ cat._dtype = new_dtype
+ cat._codes = coerce_indexer_dtype(cat._codes, new_dtype.categories)
+ if not inplace:
+ return cat
+
+ def remove_categories(self, removals, inplace=False):
+ """
+ Removes the specified categories.
+
+ `removals` must be included in the old categories. Values which were in
+ the removed categories will be set to NaN
+
+ Parameters
+ ----------
+ removals : category or list of categories
+ The categories which should be removed.
+ inplace : boolean (default: False)
+ Whether or not to remove the categories inplace or return a copy of
+ this categorical with removed categories.
+
+ Returns
+ -------
+ cat : Categorical with removed categories or None if inplace.
+
+ Raises
+ ------
+ ValueError
+ If the removals are not contained in the categories
+
+ See Also
+ --------
+ rename_categories
+ reorder_categories
+ add_categories
+ remove_unused_categories
+ set_categories
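+
+ Examples
+ --------
+ For example, values in a removed category become NaN:
+
+ >>> c = pd.Categorical(['a', 'b', 'c'])
+ >>> c.remove_categories(['c'])
+ [a, b, NaN]
+ Categories (2, object): [a, b]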
+ """
+ inplace = validate_bool_kwarg(inplace, 'inplace')
+ if not is_list_like(removals):
+ removals = [removals]
+
+ removal_set = set(list(removals))
+ not_included = removal_set - set(self.dtype.categories)
+ new_categories = [c for c in self.dtype.categories
+ if c not in removal_set]
+
+ # GH 10156
+ if any(isna(removals)):
+ not_included = [x for x in not_included if notna(x)]
+ new_categories = [x for x in new_categories if notna(x)]
+
+ if len(not_included) != 0:
+ msg = "removals must all be in old categories: {not_included!s}"
+ raise ValueError(msg.format(not_included=not_included))
+
+ return self.set_categories(new_categories, ordered=self.ordered,
+ rename=False, inplace=inplace)
+
+ def remove_unused_categories(self, inplace=False):
+ """
+ Removes categories which are not used.
+
+ Parameters
+ ----------
+ inplace : boolean (default: False)
+ Whether or not to drop unused categories inplace or return a copy of
+ this categorical with unused categories dropped.
+
+ Returns
+ -------
+ cat : Categorical with unused categories dropped or None if inplace.
+
+ See Also
+ --------
+ rename_categories
+ reorder_categories
+ add_categories
+ remove_categories
+ set_categories
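+
+ Examples
+ --------
+ A small sketch dropping a category that never appears in the values:
+
+ >>> c = pd.Categorical(['a', 'b'], categories=['a', 'b', 'c'])
+ >>> c.remove_unused_categories()
+ [a, b]
+ Categories (2, object): [a, b]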
+ """
+ inplace = validate_bool_kwarg(inplace, 'inplace')
+ cat = self if inplace else self.copy()
+ idx, inv = np.unique(cat._codes, return_inverse=True)
+
+ if idx.size != 0 and idx[0] == -1: # na sentinel
+ idx, inv = idx[1:], inv - 1
+
+ new_categories = cat.dtype.categories.take(idx)
+ new_dtype = CategoricalDtype._from_fastpath(new_categories,
+ ordered=self.ordered)
+ cat._dtype = new_dtype
+ cat._codes = coerce_indexer_dtype(inv, new_dtype.categories)
+
+ if not inplace:
+ return cat
+
+ def map(self, mapper):
+ """
+ Map categories using input correspondence (dict, Series, or function).
+
+ Maps the categories to new categories. If the mapping correspondence is
+ one-to-one the result is a :class:`~pandas.Categorical` which has the
+ same order property as the original, otherwise a :class:`~pandas.Index`
+ is returned. NaN values are unaffected.
+
+ If a `dict` or :class:`~pandas.Series` is used any unmapped category is
+ mapped to `NaN`. Note that if this happens an :class:`~pandas.Index`
+ will be returned.
+
+ Parameters
+ ----------
+ mapper : function, dict, or Series
+ Mapping correspondence.
+
+ Returns
+ -------
+ pandas.Categorical or pandas.Index
+ Mapped categorical.
+
+ See Also
+ --------
+ CategoricalIndex.map : Apply a mapping correspondence on a
+ :class:`~pandas.CategoricalIndex`.
+ Index.map : Apply a mapping correspondence on an
+ :class:`~pandas.Index`.
+ Series.map : Apply a mapping correspondence on a
+ :class:`~pandas.Series`.
+ Series.apply : Apply more complex functions on a
+ :class:`~pandas.Series`.
+
+ Examples
+ --------
+ >>> cat = pd.Categorical(['a', 'b', 'c'])
+ >>> cat
+ [a, b, c]
+ Categories (3, object): [a, b, c]
+ >>> cat.map(lambda x: x.upper())
+ [A, B, C]
+ Categories (3, object): [A, B, C]
+ >>> cat.map({'a': 'first', 'b': 'second', 'c': 'third'})
+ [first, second, third]
+ Categories (3, object): [first, second, third]
+
+ If the mapping is one-to-one the ordering of the categories is
+ preserved:
+
+ >>> cat = pd.Categorical(['a', 'b', 'c'], ordered=True)
+ >>> cat
+ [a, b, c]
+ Categories (3, object): [a < b < c]
+ >>> cat.map({'a': 3, 'b': 2, 'c': 1})
+ [3, 2, 1]
+ Categories (3, int64): [3 < 2 < 1]
+
+ If the mapping is not one-to-one an :class:`~pandas.Index` is returned:
+
+ >>> cat.map({'a': 'first', 'b': 'second', 'c': 'first'})
+ Index(['first', 'second', 'first'], dtype='object')
+
+ If a `dict` is used, all unmapped categories are mapped to `NaN` and
+ the result is an :class:`~pandas.Index`:
+
+ >>> cat.map({'a': 'first', 'b': 'second'})
+ Index(['first', 'second', nan], dtype='object')
+ """
+ new_categories = self.categories.map(mapper)
+ try:
+ return self.from_codes(self._codes.copy(),
+ categories=new_categories,
+ ordered=self.ordered)
+ except ValueError:
+ # NA values are represented in self._codes with -1
+ # np.take causes NA values to take final element in new_categories
+ if np.any(self._codes == -1):
+ new_categories = new_categories.insert(len(new_categories),
+ np.nan)
+ return np.take(new_categories, self._codes)
+
+ __eq__ = _cat_compare_op('__eq__')
+ __ne__ = _cat_compare_op('__ne__')
+ __lt__ = _cat_compare_op('__lt__')
+ __gt__ = _cat_compare_op('__gt__')
+ __le__ = _cat_compare_op('__le__')
+ __ge__ = _cat_compare_op('__ge__')
+
+ # for Series/ndarray like compat
+ @property
+ def shape(self):
+ """
+ Shape of the Categorical.
+
+ For internal compatibility with numpy arrays.
+
+ Returns
+ -------
+ shape : tuple
+ """
+
+ return tuple([len(self._codes)])
+
+ def shift(self, periods, fill_value=None):
+ """
+ Shift Categorical by desired number of periods.
+
+ Parameters
+ ----------
+ periods : int
+ Number of periods to move; can be positive or negative.
+ fill_value : object, optional
+ The scalar value to use for newly introduced missing values.
+
+ .. versionadded:: 0.24.0
+
+ Returns
+ -------
+ shifted : Categorical
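+
+ Examples
+ --------
+ For illustration, a positive shift introduces missing values at the
+ start when no ``fill_value`` is given:
+
+ >>> c = pd.Categorical(['a', 'b', 'c'])
+ >>> c.shift(1)
+ [NaN, a, b]
+ Categories (3, object): [a, b, c]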
+ """
+ # since categoricals always have ndim == 1, an axis parameter
+ # doesn't make any sense here.
+ codes = self.codes
+ if codes.ndim > 1:
+ raise NotImplementedError("Categorical with ndim > 1.")
+ if np.prod(codes.shape) and (periods != 0):
+ codes = np.roll(codes, ensure_platform_int(periods), axis=0)
+ if isna(fill_value):
+ fill_value = -1
+ elif fill_value in self.categories:
+ fill_value = self.categories.get_loc(fill_value)
+ else:
+ raise ValueError("'fill_value={}' is not present "
+ "in this Categorical's "
+ "categories".format(fill_value))
+ if periods > 0:
+ codes[:periods] = fill_value
+ else:
+ codes[periods:] = fill_value
+
+ return self.from_codes(codes, dtype=self.dtype)
+
+ def __array__(self, dtype=None):
+ """
+ The numpy array interface.
+
+ Returns
+ -------
+ values : numpy array
+ A numpy array of either the specified dtype or,
+ if dtype==None (default), the same dtype as
+ categorical.categories.dtype
+ """
+ ret = take_1d(self.categories.values, self._codes)
+ if dtype and not is_dtype_equal(dtype, self.categories.dtype):
+ return np.asarray(ret, dtype)
+ if is_extension_array_dtype(ret):
+ # When we're a Categorical[ExtensionArray], like Interval,
+ # we need to ensure __array__ get's all the way to an
+ # ndarray.
+ ret = np.asarray(ret)
+ return ret
+
+ def __setstate__(self, state):
+ """Necessary for making this object picklable"""
+ if not isinstance(state, dict):
+ raise Exception('invalid pickle state')
+
+ # Provide compatibility with pre-0.15.0 Categoricals.
+ if '_categories' not in state and '_levels' in state:
+ state['_categories'] = self.dtype.validate_categories(state.pop(
+ '_levels'))
+ if '_codes' not in state and 'labels' in state:
+ state['_codes'] = coerce_indexer_dtype(
+ state.pop('labels'), state['_categories'])
+
+ # 0.16.0 ordered change
+ if '_ordered' not in state:
+
+ # >=15.0 < 0.16.0
+ if 'ordered' in state:
+ state['_ordered'] = state.pop('ordered')
+ else:
+ state['_ordered'] = False
+
+ # 0.21.0 CategoricalDtype change
+ if '_dtype' not in state:
+ state['_dtype'] = CategoricalDtype(state['_categories'],
+ state['_ordered'])
+
+ for k, v in compat.iteritems(state):
+ setattr(self, k, v)
+
+ @property
+ def T(self):
+ """
+ Return transposed numpy array.
+ """
+ return self
+
+ @property
+ def nbytes(self):
+ return self._codes.nbytes + self.dtype.categories.values.nbytes
+
+ def memory_usage(self, deep=False):
+ """
+ Memory usage of my values
+
+ Parameters
+ ----------
+ deep : bool
+ Introspect the data deeply, interrogate
+ `object` dtypes for system-level memory consumption
+
+ Returns
+ -------
+ bytes used
+
+ Notes
+ -----
+ Memory usage does not include memory consumed by elements that
+ are not components of the array if deep=False
+
+ See Also
+ --------
+ numpy.ndarray.nbytes
+ """
+ return self._codes.nbytes + self.dtype.categories.memory_usage(
+ deep=deep)
+
+ @Substitution(klass='Categorical')
+ @Appender(_shared_docs['searchsorted'])
+ def searchsorted(self, value, side='left', sorter=None):
+ if not self.ordered:
+ raise ValueError("Categorical not ordered\nyou can use "
+ ".as_ordered() to change the Categorical to an "
+ "ordered one")
+
+ from pandas.core.series import Series
+ codes = _get_codes_for_values(Series(value).values, self.categories)
+ if -1 in codes:
+ raise KeyError("Value(s) to be inserted must be in categories.")
+
+ codes = codes[0] if is_scalar(value) else codes
+
+ return self.codes.searchsorted(codes, side=side, sorter=sorter)
+
+ def isna(self):
+ """
+ Detect missing values
+
+ Missing values (-1 in .codes) are detected.
+
+ Returns
+ -------
+ a boolean array of whether my values are null
+
+ See Also
+ --------
+ isna : Top-level isna.
+ isnull : Alias of isna.
+ Categorical.notna : Boolean inverse of Categorical.isna.
+
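+ Examples
+ --------
+ A small illustrative sketch:
+
+ >>> pd.Categorical(['a', np.nan, 'b']).isna()
+ array([False,  True, False])
+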
+ """
+
+ ret = self._codes == -1
+ return ret
+ isnull = isna
+
+ def notna(self):
+ """
+ Inverse of isna
+
+ Missing values (-1 in .codes) are detected as null.
+
+ Returns
+ -------
+ a boolean array of whether my values are not null
+
+ See Also
+ --------
+ notna : Top-level notna.
+ notnull : Alias of notna.
+ Categorical.isna : Boolean inverse of Categorical.notna.
+
+ """
+ return ~self.isna()
+ notnull = notna
+
+ def put(self, *args, **kwargs):
+ """
+ Replace specific elements in the Categorical with given values.
+ """
+ raise NotImplementedError(("'put' is not yet implemented "
+ "for Categorical"))
+
+ def dropna(self):
+ """
+ Return the Categorical without null values.
+
+ Missing values (-1 in .codes) are detected.
+
+ Returns
+ -------
+ valid : Categorical
+ """
+ result = self[self.notna()]
+
+ return result
+
+ def value_counts(self, dropna=True):
+ """
+ Returns a Series containing counts of each category.
+
+ Every category will have an entry, even those with a count of 0.
+
+ Parameters
+ ----------
+ dropna : boolean, default True
+ Don't include counts of NaN.
+
+ Returns
+ -------
+ counts : Series
+
+ See Also
+ --------
+ Series.value_counts
+
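+ Examples
+ --------
+ For illustration, unused categories still get an entry with count 0:
+
+ >>> c = pd.Categorical(['a', 'b', 'a'], categories=['a', 'b', 'c'])
+ >>> c.value_counts()
+ a    2
+ b    1
+ c    0
+ dtype: int64
+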
+ """
+ from numpy import bincount
+ from pandas import Series, CategoricalIndex
+
+ code, cat = self._codes, self.categories
+ ncat, mask = len(cat), 0 <= code
+ ix, clean = np.arange(ncat), mask.all()
+
+ if dropna or clean:
+ obs = code if clean else code[mask]
+ count = bincount(obs, minlength=ncat or None)
+ else:
+ count = bincount(np.where(mask, code, ncat))
+ ix = np.append(ix, -1)
+
+ ix = self._constructor(ix, dtype=self.dtype,
+ fastpath=True)
+
+ return Series(count, index=CategoricalIndex(ix), dtype='int64')
+
+ def get_values(self):
+ """
+ Return the values.
+
+ For internal compatibility with pandas formatting.
+
+ Returns
+ -------
+ values : numpy array
+ A numpy array of the same dtype as categorical.categories.dtype or
+ Index if datetime / periods
+ """
+ # if we are a datetime and period index, return Index to keep metadata
+ if is_datetimelike(self.categories):
+ return self.categories.take(self._codes, fill_value=np.nan)
+ elif is_integer_dtype(self.categories) and -1 in self._codes:
+ return self.categories.astype("object").take(self._codes,
+ fill_value=np.nan)
+ return np.array(self)
+
+ def check_for_ordered(self, op):
+ """ assert that we are ordered """
+ if not self.ordered:
+ raise TypeError("Categorical is not ordered for operation {op}\n"
+ "you can use .as_ordered() to change the "
+ "Categorical to an ordered one\n".format(op=op))
+
+ def _values_for_argsort(self):
+ return self._codes.copy()
+
+ def argsort(self, *args, **kwargs):
+ # TODO(PY2): use correct signature
+ # We have to do *args, **kwargs to avoid a py2-only signature
+ # issue since np.argsort differs from argsort.
+ """
+ Return the indices that would sort the Categorical.
+
+ Parameters
+ ----------
+ ascending : bool, default True
+ Whether the indices should result in an ascending
+ or descending sort.
+ kind : {'quicksort', 'mergesort', 'heapsort'}, optional
+ Sorting algorithm.
+ *args, **kwargs:
+ passed through to :func:`numpy.argsort`.
+
+ Returns
+ -------
+ argsorted : numpy array
+
+ See Also
+ --------
+ numpy.ndarray.argsort
+
+ Notes
+ -----
+ While an ordering is applied to the category values, arg-sorting
+ in this context refers more to organizing and grouping together
+ based on matching category values. Thus, this function can be
+ called on an unordered Categorical instance unlike the functions
+ 'Categorical.min' and 'Categorical.max'.
+
+ Examples
+ --------
+ >>> pd.Categorical(['b', 'b', 'a', 'c']).argsort()
+ array([2, 0, 1, 3])
+
+ >>> cat = pd.Categorical(['b', 'b', 'a', 'c'],
+ ... categories=['c', 'b', 'a'],
+ ... ordered=True)
+ >>> cat.argsort()
+ array([3, 0, 1, 2])
+ """
+ # Keep the implementation here just for the docstring.
+ return super(Categorical, self).argsort(*args, **kwargs)
+
+ def sort_values(self, inplace=False, ascending=True, na_position='last'):
+ """
+ Sorts the Categorical by category value, returning a new
+ Categorical by default.
+
+ While an ordering is applied to the category values, sorting in this
+ context refers more to organizing and grouping together based on
+ matching category values. Thus, this function can be called on an
+ unordered Categorical instance unlike the functions 'Categorical.min'
+ and 'Categorical.max'.
+
+ Parameters
+ ----------
+ inplace : boolean, default False
+ Do operation in place.
+ ascending : boolean, default True
+ Order ascending. Passing False orders descending. The
+ ordering parameter provides the method by which the
+ category values are organized.
+ na_position : {'first', 'last'} (optional, default='last')
+ 'first' puts NaNs at the beginning
+ 'last' puts NaNs at the end
+
+ Returns
+ -------
+ y : Categorical or None
+
+ See Also
+ --------
+ Categorical.sort
+ Series.sort_values
+
+ Examples
+ --------
+ >>> c = pd.Categorical([1, 2, 2, 1, 5])
+ >>> c
+ [1, 2, 2, 1, 5]
+ Categories (3, int64): [1, 2, 5]
+ >>> c.sort_values()
+ [1, 1, 2, 2, 5]
+ Categories (3, int64): [1, 2, 5]
+ >>> c.sort_values(ascending=False)
+ [5, 2, 2, 1, 1]
+ Categories (3, int64): [1, 2, 5]
+
+ Inplace sorting can be done as well:
+
+ >>> c.sort_values(inplace=True)
+ >>> c
+ [1, 1, 2, 2, 5]
+ Categories (3, int64): [1, 2, 5]
+ >>>
+ >>> c = pd.Categorical([1, 2, 2, 1, 5])
+
+ 'sort_values' behaviour with NaNs. Note that 'na_position'
+ is independent of the 'ascending' parameter:
+
+ >>> c = pd.Categorical([np.nan, 2, 2, np.nan, 5])
+ >>> c
+ [NaN, 2.0, 2.0, NaN, 5.0]
+ Categories (2, int64): [2, 5]
+ >>> c.sort_values()
+ [2.0, 2.0, 5.0, NaN, NaN]
+ Categories (2, int64): [2, 5]
+ >>> c.sort_values(ascending=False)
+ [5.0, 2.0, 2.0, NaN, NaN]
+ Categories (2, int64): [2, 5]
+ >>> c.sort_values(na_position='first')
+ [NaN, NaN, 2.0, 2.0, 5.0]
+ Categories (2, int64): [2, 5]
+ >>> c.sort_values(ascending=False, na_position='first')
+ [NaN, NaN, 5.0, 2.0, 2.0]
+ Categories (2, int64): [2, 5]
+ """
+ inplace = validate_bool_kwarg(inplace, 'inplace')
+ if na_position not in ['last', 'first']:
+ msg = 'invalid na_position: {na_position!r}'
+ raise ValueError(msg.format(na_position=na_position))
+
+ sorted_idx = nargsort(self,
+ ascending=ascending,
+ na_position=na_position)
+
+ if inplace:
+ self._codes = self._codes[sorted_idx]
+ else:
+ return self._constructor(values=self._codes[sorted_idx],
+ dtype=self.dtype,
+ fastpath=True)
+
+ def _values_for_rank(self):
+ """
+ For correctly ranking ordered categorical data. See GH#15420
+
+ Ordered categorical data should be ranked on the basis of
+ codes with -1 translated to NaN.
+
+ Returns
+ -------
+ numpy array
+
+ """
+ from pandas import Series
+ if self.ordered:
+ values = self.codes
+ mask = values == -1
+ if mask.any():
+ values = values.astype('float64')
+ values[mask] = np.nan
+ elif self.categories.is_numeric():
+ values = np.array(self)
+ else:
+ # reorder the categories (so rank can use the float codes)
+ # instead of passing an object array to rank
+ values = np.array(
+ self.rename_categories(Series(self.categories).rank().values)
+ )
+ return values
+
+ def ravel(self, order='C'):
+ """
+ Return a flattened (numpy) array.
+
+ For internal compatibility with numpy arrays.
+
+ Returns
+ -------
+ raveled : numpy array
+ """
+ return np.array(self)
+
+ def view(self):
+ """
+ Return a view of myself.
+
+ For internal compatibility with numpy arrays.
+
+ Returns
+ -------
+ view : Categorical
+ Returns `self`!
+ """
+ return self
+
+ def to_dense(self):
+ """
+ Return my 'dense' representation
+
+ For internal compatibility with numpy arrays.
+
+ Returns
+ -------
+ dense : array
+ """
+ return np.asarray(self)
+
+ @deprecate_kwarg(old_arg_name='fill_value', new_arg_name='value')
+ def fillna(self, value=None, method=None, limit=None):
+ """
+ Fill NA/NaN values using the specified method.
+
+ Parameters
+ ----------
+ value : scalar, dict, Series
+ If a scalar value is passed it is used to fill all missing values.
+ Alternatively, a Series or dict can be used to fill in different
+ values for each index. The value should not be a list. The
+ value(s) passed should either be in the categories or should be
+ NaN.
+ method : {'backfill', 'bfill', 'pad', 'ffill', None}, default None
+ Method to use for filling holes in reindexed Series
+ pad / ffill: propagate last valid observation forward to next valid
+ backfill / bfill: use NEXT valid observation to fill gap
+ limit : int, default None
+ (Not implemented yet for Categorical!)
+ If method is specified, this is the maximum number of consecutive
+ NaN values to forward/backward fill. In other words, if there is
+ a gap with more than this number of consecutive NaNs, it will only
+ be partially filled. If method is not specified, this is the
+ maximum number of entries along the entire axis where NaNs will be
+ filled.
+
+ Returns
+ -------
+ filled : Categorical with NA/NaN filled
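+
+ Examples
+ --------
+ For illustration, filling with a value that is already a category:
+
+ >>> c = pd.Categorical(['a', np.nan, 'b'])
+ >>> c.fillna('a')
+ [a, a, b]
+ Categories (2, object): [a, b]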
+ """
+ value, method = validate_fillna_kwargs(
+ value, method, validate_scalar_dict_value=False
+ )
+
+ if value is None:
+ value = np.nan
+ if limit is not None:
+ raise NotImplementedError("specifying a limit for fillna has not "
+ "been implemented yet")
+
+ codes = self._codes
+
+ # pad / bfill
+ if method is not None:
+
+ values = self.to_dense().reshape(-1, len(self))
+ values = interpolate_2d(values, method, 0, None,
+ value).astype(self.categories.dtype)[0]
+ codes = _get_codes_for_values(values, self.categories)
+
+ else:
+
+ # If value is a dict or a Series (a dict value has already
+ # been converted to a Series)
+ if isinstance(value, ABCSeries):
+ if not value[~value.isin(self.categories)].isna().all():
+ raise ValueError("fill value must be in categories")
+
+ values_codes = _get_codes_for_values(value, self.categories)
+ indexer = np.where(values_codes != -1)
+ codes[indexer] = values_codes[values_codes != -1]
+
+ # If value is not a dict or Series it should be a scalar
+ elif is_hashable(value):
+ if not isna(value) and value not in self.categories:
+ raise ValueError("fill value must be in categories")
+
+ mask = codes == -1
+ if mask.any():
+ codes = codes.copy()
+ if isna(value):
+ codes[mask] = -1
+ else:
+ codes[mask] = self.categories.get_loc(value)
+
+ else:
+ raise TypeError('"value" parameter must be a scalar, dict '
+ 'or Series, but you passed a '
+ '"{0}"'.format(type(value).__name__))
+
+ return self._constructor(codes, dtype=self.dtype, fastpath=True)
+
+ def take_nd(self, indexer, allow_fill=None, fill_value=None):
+ """
+ Take elements from the Categorical.
+
+ Parameters
+ ----------
+ indexer : sequence of int
+ The indices in `self` to take. The meaning of negative values in
+ `indexer` depends on the value of `allow_fill`.
+ allow_fill : bool, default None
+ How to handle negative values in `indexer`.
+
+ * False: negative values in `indices` indicate positional indices
+ from the right. This is similar to
+ :func:`numpy.take`.
+
+ * True: negative values in `indices` indicate missing values
+ (the default). These values are set to `fill_value`. Any other
+ negative values raise a ``ValueError``.
+
+ .. versionchanged:: 0.23.0
+
+ Deprecated the default value of `allow_fill`. The deprecated
+ default is ``True``. In the future, this will change to
+ ``False``.
+
+ fill_value : object
+ The value to use for `indices` that are missing (-1), when
+ ``allow_fill=True``. This should be the category, i.e. a value
+ in ``self.categories``, not a code.
+
+ Returns
+ -------
+ Categorical
+ This Categorical will have the same categories and ordered as
+ `self`.
+
+ See Also
+ --------
+ Series.take : Similar method for Series.
+ numpy.ndarray.take : Similar method for NumPy arrays.
+
+ Examples
+ --------
+ >>> cat = pd.Categorical(['a', 'a', 'b'])
+ >>> cat
+ [a, a, b]
+ Categories (2, object): [a, b]
+
+ Specify ``allow_fill=False`` to have negative indices mean indexing
+ from the right.
+
+ >>> cat.take([0, -1, -2], allow_fill=False)
+ [a, b, a]
+ Categories (2, object): [a, b]
+
+ With ``allow_fill=True``, indices equal to ``-1`` mean "missing"
+ values that should be filled with the `fill_value`, which is
+ ``np.nan`` by default.
+
+ >>> cat.take([0, -1, -1], allow_fill=True)
+ [a, NaN, NaN]
+ Categories (2, object): [a, b]
+
+ The fill value can be specified.
+
+ >>> cat.take([0, -1, -1], allow_fill=True, fill_value='a')
+ [a, a, a]
+ Categories (2, object): [a, b]
+
+ Specifying a fill value that's not in ``self.categories``
+ will raise a ``TypeError``.
+ """
+ indexer = np.asarray(indexer, dtype=np.intp)
+ if allow_fill is None:
+ if (indexer < 0).any():
+ warn(_take_msg, FutureWarning, stacklevel=2)
+ allow_fill = True
+
+ dtype = self.dtype
+
+ if isna(fill_value):
+ fill_value = -1
+ elif allow_fill:
+ # convert user-provided `fill_value` to codes
+ if fill_value in self.categories:
+ fill_value = self.categories.get_loc(fill_value)
+ else:
+ msg = (
+ "'fill_value' ('{}') is not in this Categorical's "
+ "categories."
+ )
+ raise TypeError(msg.format(fill_value))
+
+ codes = take(self._codes, indexer, allow_fill=allow_fill,
+ fill_value=fill_value)
+ result = type(self).from_codes(codes, dtype=dtype)
+ return result
+
+ take = take_nd
+
+ def _slice(self, slicer):
+ """
+ Return a slice of myself.
+
+ For internal compatibility with numpy arrays.
+ """
+
+ # only allow 1 dimensional slicing, but can
+ # in a 2-d case be passed (slice(None), ...)
+ if isinstance(slicer, tuple) and len(slicer) == 2:
+ if not com.is_null_slice(slicer[0]):
+ raise AssertionError("invalid slicing for a 1-ndim "
+ "categorical")
+ slicer = slicer[1]
+
+ codes = self._codes[slicer]
+ return self._constructor(values=codes, dtype=self.dtype, fastpath=True)
+
+ def __len__(self):
+ """
+ The length of this Categorical.
+ """
+ return len(self._codes)
+
+ def __iter__(self):
+ """
+ Returns an Iterator over the values of this Categorical.
+ """
+ return iter(self.get_values().tolist())
+
+ def __contains__(self, key):
+ """
+ Returns True if `key` is in this Categorical.
+ """
+ # if key is a NaN, check if any NaN is in self.
+ if isna(key):
+ return self.isna().any()
+
+ return contains(self, key, container=self._codes)
+
+ def _tidy_repr(self, max_vals=10, footer=True):
+ """ a short repr displaying only max_vals and an optional (but default
+ footer)
+ """
+ num = max_vals // 2
+ head = self[:num]._get_repr(length=False, footer=False)
+ tail = self[-(max_vals - num):]._get_repr(length=False, footer=False)
+
+ result = u('{head}, ..., {tail}').format(head=head[:-1], tail=tail[1:])
+ if footer:
+ result = u('{result}\n{footer}').format(result=result,
+ footer=self._repr_footer())
+
+ return compat.text_type(result)
+
+ def _repr_categories(self):
+ """
+ return the base repr for the categories
+ """
+ max_categories = (10 if get_option("display.max_categories") == 0 else
+ get_option("display.max_categories"))
+ from pandas.io.formats import format as fmt
+ if len(self.categories) > max_categories:
+ num = max_categories // 2
+ head = fmt.format_array(self.categories[:num], None)
+ tail = fmt.format_array(self.categories[-num:], None)
+ category_strs = head + ["..."] + tail
+ else:
+ category_strs = fmt.format_array(self.categories, None)
+
+ # Strip all leading spaces, which format_array adds for columns...
+ category_strs = [x.strip() for x in category_strs]
+ return category_strs
+
+ def _repr_categories_info(self):
+ """
+ Returns a string representation of the footer.
+ """
+
+ category_strs = self._repr_categories()
+ dtype = getattr(self.categories, 'dtype_str',
+ str(self.categories.dtype))
+
+ levheader = "Categories ({length}, {dtype}): ".format(
+ length=len(self.categories), dtype=dtype)
+ width, height = get_terminal_size()
+ max_width = get_option("display.width") or width
+ if console.in_ipython_frontend():
+ # 0 = no breaks
+ max_width = 0
+ levstring = ""
+ start = True
+ cur_col_len = len(levheader) # header
+ sep_len, sep = (3, " < ") if self.ordered else (2, ", ")
+ linesep = sep.rstrip() + "\n" # remove whitespace
+ for val in category_strs:
+ if max_width != 0 and cur_col_len + sep_len + len(val) > max_width:
+ levstring += linesep + (" " * (len(levheader) + 1))
+ cur_col_len = len(levheader) + 1 # header + a whitespace
+ elif not start:
+ levstring += sep
+ cur_col_len += len(val)
+ levstring += val
+ start = False
+ # replace " < ... < " with " ... " to save space
+ return levheader + "[" + levstring.replace(" < ... < ", " ... ") + "]"
+
+ def _repr_footer(self):
+
+ return u('Length: {length}\n{info}').format(
+ length=len(self), info=self._repr_categories_info())
+
+ def _get_repr(self, length=True, na_rep='NaN', footer=True):
+ from pandas.io.formats import format as fmt
+ formatter = fmt.CategoricalFormatter(self, length=length,
+ na_rep=na_rep, footer=footer)
+ result = formatter.to_string()
+ return compat.text_type(result)
+
+ def __unicode__(self):
+ """
+ Unicode representation.
+ """
+ _maxlen = 10
+ if len(self._codes) > _maxlen:
+ result = self._tidy_repr(_maxlen)
+ elif len(self._codes) > 0:
+ result = self._get_repr(length=len(self) > _maxlen)
+ else:
+ msg = self._get_repr(length=False, footer=True).replace("\n", ", ")
+ result = ('[], {repr_msg}'.format(repr_msg=msg))
+
+ return result
+
+ def __repr__(self):
+ # We want PandasObject.__repr__, which dispatches to __unicode__
+ return super(ExtensionArray, self).__repr__()
+
+ def _maybe_coerce_indexer(self, indexer):
+ """
+ return an indexer coerced to the codes dtype
+ """
+ if isinstance(indexer, np.ndarray) and indexer.dtype.kind == 'i':
+ indexer = indexer.astype(self._codes.dtype)
+ return indexer
+
+ def __getitem__(self, key):
+ """
+ Return an item.
+ """
+ if isinstance(key, (int, np.integer)):
+ i = self._codes[key]
+ if i == -1:
+ return np.nan
+ else:
+ return self.categories[i]
+ else:
+ return self._constructor(values=self._codes[key],
+ dtype=self.dtype, fastpath=True)
+
+ def __setitem__(self, key, value):
+ """
+ Item assignment.
+
+ Raises
+ ------
+ ValueError
+ If (one or more) value is not in the categories or if an assigned
+ `Categorical` does not have the same categories
+ """
+ from pandas.core.internals.arrays import extract_array
+
+ value = extract_array(value, extract_numpy=True)
+
+ # require identical categories set
+ if isinstance(value, Categorical):
+ if not is_dtype_equal(self, value):
+ raise ValueError("Cannot set a Categorical with another, "
+ "without identical categories")
+ if not self.categories.equals(value.categories):
+ new_codes = _recode_for_categories(
+ value.codes, value.categories, self.categories
+ )
+ value = Categorical.from_codes(new_codes, dtype=self.dtype)
+
+ rvalue = value if is_list_like(value) else [value]
+
+ from pandas import Index
+ to_add = Index(rvalue).difference(self.categories)
+
+ # no assignments of values not in categories, but it's always ok to set
+ # something to np.nan
+ if len(to_add) and not isna(to_add).all():
+ raise ValueError("Cannot setitem on a Categorical with a new "
+ "category, set the categories first")
+
+ # set by position
+ if isinstance(key, (int, np.integer)):
+ pass
+
+ # tuple of indexers (dataframe)
+ elif isinstance(key, tuple):
+ # only allow 1 dimensional slicing, but can
+ # in a 2-d case be passed (slice(None), ...)
+ if len(key) == 2:
+ if not com.is_null_slice(key[0]):
+ raise AssertionError("invalid slicing for a 1-ndim "
+ "categorical")
+ key = key[1]
+ elif len(key) == 1:
+ key = key[0]
+ else:
+ raise AssertionError("invalid slicing for a 1-ndim "
+ "categorical")
+
+ # slicing in Series or Categorical
+ elif isinstance(key, slice):
+ pass
+
+ # else: array of True/False in Series or Categorical
+
+ lindexer = self.categories.get_indexer(rvalue)
+ lindexer = self._maybe_coerce_indexer(lindexer)
+ self._codes[key] = lindexer
+
+ def _reverse_indexer(self):
+ """
+ Compute the inverse of a categorical, returning
+ a dict of categories -> indexers.
+
+ *This is an internal function*
+
+ Returns
+ -------
+ dict of categories -> indexers
+
+ Example
+ -------
+ In [1]: c = pd.Categorical(list('aabca'))
+
+ In [2]: c
+ Out[2]:
+ [a, a, b, c, a]
+ Categories (3, object): [a, b, c]
+
+ In [3]: c.categories
+ Out[3]: Index([u'a', u'b', u'c'], dtype='object')
+
+ In [4]: c.codes
+ Out[4]: array([0, 0, 1, 2, 0], dtype=int8)
+
+ In [5]: c._reverse_indexer()
+ Out[5]: {'a': array([0, 1, 4]), 'b': array([2]), 'c': array([3])}
+
+ """
+ categories = self.categories
+ r, counts = libalgos.groupsort_indexer(self.codes.astype('int64'),
+ categories.size)
+ counts = counts.cumsum()
+ result = [r[counts[indexer]:counts[indexer + 1]]
+ for indexer in range(len(counts) - 1)]
+ result = dict(zip(categories, result))
+ return result
+
+ # reduction ops #
+ def _reduce(self, name, axis=0, **kwargs):
+ func = getattr(self, name, None)
+ if func is None:
+ msg = 'Categorical cannot perform the operation {op}'
+ raise TypeError(msg.format(op=name))
+ return func(**kwargs)
+
+ def min(self, numeric_only=None, **kwargs):
+ """
+ The minimum value of the object.
+
+ Only ordered `Categoricals` have a minimum!
+
+ Raises
+ ------
+ TypeError
+ If the `Categorical` is not `ordered`.
+
+ Returns
+ -------
+ min : the minimum of this `Categorical`
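+
+ Examples
+ --------
+ For illustration, on an ordered categorical:
+
+ >>> pd.Categorical(['a', 'b', 'c'], ordered=True).min()
+ 'a'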
+ """
+ self.check_for_ordered('min')
+ if numeric_only:
+ good = self._codes != -1
+ pointer = self._codes[good].min(**kwargs)
+ else:
+ pointer = self._codes.min(**kwargs)
+ if pointer == -1:
+ return np.nan
+ else:
+ return self.categories[pointer]
+
+ def max(self, numeric_only=None, **kwargs):
+ """
+ The maximum value of the object.
+
+ Only ordered `Categoricals` have a maximum!
+
+ Raises
+ ------
+ TypeError
+ If the `Categorical` is not `ordered`.
+
+ Returns
+ -------
+ max : the maximum of this `Categorical`
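+
+ Examples
+ --------
+ For illustration, on an ordered categorical:
+
+ >>> pd.Categorical(['a', 'b', 'c'], ordered=True).max()
+ 'c'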
+ """
+ self.check_for_ordered('max')
+ if numeric_only:
+ good = self._codes != -1
+ pointer = self._codes[good].max(**kwargs)
+ else:
+ pointer = self._codes.max(**kwargs)
+ if pointer == -1:
+ return np.nan
+ else:
+ return self.categories[pointer]
+
+ def mode(self, dropna=True):
+ """
+ Returns the mode(s) of the Categorical.
+
+ Always returns `Categorical` even if only one value.
+
+ Parameters
+ ----------
+ dropna : boolean, default True
+ Don't consider counts of NaN/NaT.
+
+ .. versionadded:: 0.24.0
+
+ Returns
+ -------
+ modes : `Categorical` (sorted)
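+
+ Examples
+ --------
+ A small sketch; the result keeps the original dtype's categories:
+
+ >>> pd.Categorical(['a', 'a', 'b']).mode()
+ [a]
+ Categories (2, object): [a, b]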
+ """
+
+ import pandas._libs.hashtable as htable
+ codes = self._codes
+ if dropna:
+ good = self._codes != -1
+ codes = self._codes[good]
+ codes = sorted(htable.mode_int64(ensure_int64(codes), dropna))
+ return self._constructor(values=codes, dtype=self.dtype, fastpath=True)
+
+ def unique(self):
+ """
+ Return the ``Categorical`` whose ``categories`` and ``codes`` are
+ unique. Unused categories are NOT returned.
+
+ - unordered category: values and categories are sorted by appearance
+ order.
+ - ordered category: values are sorted by appearance order, categories
+ keep their existing order.
+
+ Returns
+ -------
+ unique values : ``Categorical``
+
+ Examples
+ --------
+ An unordered Categorical will return categories in the
+ order of appearance.
+
+ >>> pd.Categorical(list('baabc')).unique()
+ [b, a, c]
+ Categories (3, object): [b, a, c]
+
+ >>> pd.Categorical(list('baabc'), categories=list('abc')).unique()
+ [b, a, c]
+ Categories (3, object): [b, a, c]
+
+ An ordered Categorical preserves the category ordering.
+
+ >>> pd.Categorical(list('baabc'),
+ ... categories=list('abc'),
+ ... ordered=True).unique()
+ [b, a, c]
+ Categories (3, object): [a < b < c]
+
+ See Also
+ --------
+ unique
+ CategoricalIndex.unique
+ Series.unique
+
+ """
+
+ # unlike np.unique, unique1d does not sort
+ unique_codes = unique1d(self.codes)
+ cat = self.copy()
+
+ # keep nan in codes
+ cat._codes = unique_codes
+
+ # exclude nan from indexer for categories
+ take_codes = unique_codes[unique_codes != -1]
+ if self.ordered:
+ take_codes = np.sort(take_codes)
+ return cat.set_categories(cat.categories.take(take_codes))
+
+ def _values_for_factorize(self):
+ codes = self.codes.astype('int64')
+ return codes, -1
+
+ @classmethod
+ def _from_factorized(cls, uniques, original):
+ return original._constructor(original.categories.take(uniques),
+ categories=original.categories,
+ ordered=original.ordered)
+
+ def equals(self, other):
+ """
+ Returns True if categorical arrays are equal.
+
+ Parameters
+ ----------
+ other : `Categorical`
+
+ Returns
+ -------
+ are_equal : boolean
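+
+ Examples
+ --------
+ A small illustrative sketch:
+
+ >>> c = pd.Categorical(['a', 'b', 'a'])
+ >>> c.equals(pd.Categorical(['a', 'b', 'a']))
+ True
+ >>> c.equals(pd.Categorical(['a', 'b', 'b']))
+ False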
+ """
+ if self.is_dtype_equal(other):
+ if self.categories.equals(other.categories):
+ # fastpath to avoid re-coding
+ other_codes = other._codes
+ else:
+ other_codes = _recode_for_categories(other.codes,
+ other.categories,
+ self.categories)
+ return np.array_equal(self._codes, other_codes)
+ return False
+
+ def is_dtype_equal(self, other):
+ """
+ Returns True if the categoricals are of the same dtype: the same
+ categories and the same ordered attribute.
+
+ Parameters
+ ----------
+ other : Categorical
+
+ Returns
+ -------
+ are_equal : boolean
+ """
+
+ try:
+ return hash(self.dtype) == hash(other.dtype)
+ except (AttributeError, TypeError):
+ return False
+
+ def describe(self):
+ """
+ Describes this Categorical
+
+ Returns
+ -------
+ description: `DataFrame`
+ A dataframe with frequency and counts by category.
+ """
+ counts = self.value_counts(dropna=False)
+ freqs = counts / float(counts.sum())
+
+ from pandas.core.reshape.concat import concat
+ result = concat([counts, freqs], axis=1)
+ result.columns = ['counts', 'freqs']
+ result.index.name = 'categories'
+
+ return result
+
+ @Substitution(klass='Categorical')
+ @Appender(_extension_array_shared_docs['repeat'])
+ def repeat(self, repeats, axis=None):
+ nv.validate_repeat(tuple(), dict(axis=axis))
+ codes = self._codes.repeat(repeats)
+ return self._constructor(values=codes, dtype=self.dtype, fastpath=True)
+
+ # Implement the ExtensionArray interface
+ @property
+ def _can_hold_na(self):
+ return True
+
+ @classmethod
+ def _concat_same_type(self, to_concat):
+ from pandas.core.dtypes.concat import _concat_categorical
+
+ return _concat_categorical(to_concat)
+
+ def isin(self, values):
+ """
+ Check whether `values` are contained in Categorical.
+
+ Return a boolean NumPy array showing whether each element in
+ the Categorical matches an element in the passed sequence of
+ `values` exactly.
+
+ Parameters
+ ----------
+ values : set or list-like
+ The sequence of values to test. Passing in a single string will
+ raise a ``TypeError``. Instead, turn a single string into a
+ list of one element.
+
+ Returns
+ -------
+ isin : numpy.ndarray (bool dtype)
+
+ Raises
+ ------
+ TypeError
+ * If `values` is not a set or list-like
+
+ See Also
+ --------
+ pandas.Series.isin : Equivalent method on Series.
+
+ Examples
+ --------
+
+ >>> s = pd.Categorical(['lama', 'cow', 'lama', 'beetle', 'lama',
+ ... 'hippo'])
+ >>> s.isin(['cow', 'lama'])
+ array([ True, True, True, False, True, False])
+
+ Passing a single string as ``s.isin('lama')`` will raise an error. Use
+ a list of one element instead:
+
+ >>> s.isin(['lama'])
+ array([ True, False, True, False, True, False])
+ """
+ from pandas.core.internals.construction import sanitize_array
+ if not is_list_like(values):
+ raise TypeError("only list-like objects are allowed to be passed"
+ " to isin(), you passed a [{values_type}]"
+ .format(values_type=type(values).__name__))
+ values = sanitize_array(values, None, None)
+ null_mask = np.asarray(isna(values))
+ code_values = self.categories.get_indexer(values)
+ code_values = code_values[null_mask | (code_values >= 0)]
+ return algorithms.isin(self.codes, code_values)
+
+
+# The Series.cat accessor
+
+
+@delegate_names(delegate=Categorical,
+ accessors=["categories", "ordered"],
+ typ="property")
+@delegate_names(delegate=Categorical,
+ accessors=["rename_categories", "reorder_categories",
+ "add_categories", "remove_categories",
+ "remove_unused_categories", "set_categories",
+ "as_ordered", "as_unordered"],
+ typ="method")
+class CategoricalAccessor(PandasDelegate, PandasObject, NoNewAttributesMixin):
+ """
+ Accessor object for categorical properties of the Series values.
+
+ Be aware that assigning to `categories` is an inplace operation, while all
+ methods return new categorical data per default (but can be called with
+ `inplace=True`).
+
+ Parameters
+ ----------
+ data : Series or CategoricalIndex
+
+ Examples
+ --------
+ >>> s.cat.categories
+ >>> s.cat.categories = list('abc')
+ >>> s.cat.rename_categories(list('cab'))
+ >>> s.cat.reorder_categories(list('cab'))
+ >>> s.cat.add_categories(['d','e'])
+ >>> s.cat.remove_categories(['d'])
+ >>> s.cat.remove_unused_categories()
+ >>> s.cat.set_categories(list('abcde'))
+ >>> s.cat.as_ordered()
+ >>> s.cat.as_unordered()
+ """
+
+ def __init__(self, data):
+ self._validate(data)
+ self._parent = data.values
+ self._index = data.index
+ self._name = data.name
+ self._freeze()
+
+ @staticmethod
+ def _validate(data):
+ if not is_categorical_dtype(data.dtype):
+ raise AttributeError("Can only use .cat accessor with a "
+ "'category' dtype")
+
+ def _delegate_property_get(self, name):
+ return getattr(self._parent, name)
+
+ def _delegate_property_set(self, name, new_values):
+ return setattr(self._parent, name, new_values)
+
+ @property
+ def codes(self):
+ """
+ Return Series of codes as well as the index.
+ """
+ from pandas import Series
+ return Series(self._parent.codes, index=self._index)
+
+ def _delegate_method(self, name, *args, **kwargs):
+ from pandas import Series
+ method = getattr(self._parent, name)
+ res = method(*args, **kwargs)
+ if res is not None:
+ return Series(res, index=self._index, name=self._name)
+
+ @property
+ def categorical(self):
+ # Note: Upon deprecation, `test_tab_completion_with_categorical` will
+ # need to be updated. `categorical` will need to be removed from
+ # `ok_for_cat`.
+ warn("`Series.cat.categorical` has been deprecated. Use the "
+ "attributes on 'Series.cat' directly instead.",
+ FutureWarning,
+ stacklevel=2)
+ return self._parent
+
+ @property
+ def name(self):
+ # Note: Upon deprecation, `test_tab_completion_with_categorical` will
+ # need to be updated. `name` will need to be removed from
+ # `ok_for_cat`.
+ warn("`Series.cat.name` has been deprecated. Use `Series.name` "
+ "instead.",
+ FutureWarning,
+ stacklevel=2)
+ return self._name
+
+ @property
+ def index(self):
+ # Note: Upon deprecation, `test_tab_completion_with_categorical` will
+ # need to be updated. `index` will need to be removed from
+ # ok_for_cat`.
+ warn("`Series.cat.index` has been deprecated. Use `Series.index` "
+ "instead.",
+ FutureWarning,
+ stacklevel=2)
+ return self._index
+
+# utility routines
+
+
+def _get_codes_for_values(values, categories):
+ """
+ utility routine to turn values into codes given the specified categories
+ """
+ from pandas.core.algorithms import _get_data_algo, _hashtables
+ dtype_equal = is_dtype_equal(values.dtype, categories.dtype)
+
+ if dtype_equal:
+ # To prevent erroneous dtype coercion in _get_data_algo, retrieve
+ # the underlying numpy array. gh-22702
+ values = getattr(values, '_ndarray_values', values)
+ categories = getattr(categories, '_ndarray_values', categories)
+ elif (is_extension_array_dtype(categories.dtype) and
+ is_object_dtype(values)):
+ # Support inferring the correct extension dtype from an array of
+ # scalar objects. e.g.
+ # Categorical(array[Period, Period], categories=PeriodIndex(...))
+ try:
+ values = (
+ categories.dtype.construct_array_type()._from_sequence(values)
+ )
+ except Exception:
+ # but that may fail for any reason, so fall back to object
+ values = ensure_object(values)
+ categories = ensure_object(categories)
+ else:
+ values = ensure_object(values)
+ categories = ensure_object(categories)
+
+ (hash_klass, vec_klass), vals = _get_data_algo(values, _hashtables)
+ (_, _), cats = _get_data_algo(categories, _hashtables)
+ t = hash_klass(len(cats))
+ t.map_locations(cats)
+ return coerce_indexer_dtype(t.lookup(vals), cats)
+
+
+def _recode_for_categories(codes, old_categories, new_categories):
+ """
+ Convert a set of codes from old categories to a new set of categories
+
+ Parameters
+ ----------
+ codes : array
+ old_categories, new_categories : Index
+
+ Returns
+ -------
+ new_codes : array
+
+ Examples
+ --------
+ >>> old_cat = pd.Index(['b', 'a', 'c'])
+ >>> new_cat = pd.Index(['a', 'b'])
+ >>> codes = np.array([0, 1, 1, 2])
+ >>> _recode_for_categories(codes, old_cat, new_cat)
+ array([ 1, 0, 0, -1])
+ """
+ from pandas.core.algorithms import take_1d
+
+ if len(old_categories) == 0:
+ # All null anyway, so just retain the nulls
+ return codes.copy()
+ elif new_categories.equals(old_categories):
+ # Same categories, so no need to actually recode
+ return codes.copy()
+ indexer = coerce_indexer_dtype(new_categories.get_indexer(old_categories),
+ new_categories)
+ new_codes = take_1d(indexer, codes.copy(), fill_value=-1)
+ return new_codes
+
+
+def _convert_to_list_like(list_like):
+ if hasattr(list_like, "dtype"):
+ return list_like
+ if isinstance(list_like, list):
+ return list_like
+ if (is_sequence(list_like) or isinstance(list_like, tuple) or
+ is_iterator(list_like)):
+ return list(list_like)
+ elif is_scalar(list_like):
+ return [list_like]
+ else:
+ # is this reached?
+ return [list_like]
+
+
+def _factorize_from_iterable(values):
+ """
+ Factorize an input `values` into `categories` and `codes`. Preserves
+ categorical dtype in `categories`.
+
+ *This is an internal function*
+
+ Parameters
+ ----------
+ values : list-like
+
+ Returns
+ -------
+ codes : ndarray
+ categories : Index
+ If `values` has a categorical dtype, then `categories` is
+ a CategoricalIndex keeping the categories and order of `values`.
+ """
+ from pandas.core.indexes.category import CategoricalIndex
+
+ if not is_list_like(values):
+ raise TypeError("Input must be list-like")
+
+ if is_categorical(values):
+ if isinstance(values, (ABCCategoricalIndex, ABCSeries)):
+ values = values._values
+ categories = CategoricalIndex(values.categories,
+ categories=values.categories,
+ ordered=values.ordered)
+ codes = values.codes
+ else:
+ # The value of ordered is irrelevant since we don't use cat as such,
+ # but only the resulting categories, the order of which is independent
+ # from ordered. Set ordered to False as default. See GH #15457
+ cat = Categorical(values, ordered=False)
+ categories = cat.categories
+ codes = cat.codes
+ return codes, categories
+
+
+def _factorize_from_iterables(iterables):
+ """
+ A higher-level wrapper over `_factorize_from_iterable`.
+
+ *This is an internal function*
+
+ Parameters
+ ----------
+ iterables : list-like of list-likes
+
+ Returns
+ -------
+ codes_list : list of ndarrays
+ categories_list : list of Indexes
+
+ Notes
+ -----
+ See `_factorize_from_iterable` for more info.
+ """
+ if len(iterables) == 0:
+ # For consistency, it should return a list of 2 lists.
+ return [[], []]
+ return map(list, lzip(*[_factorize_from_iterable(it) for it in iterables]))
diff --git a/contrib/python/pandas/py2/pandas/core/arrays/datetimelike.py b/contrib/python/pandas/py2/pandas/core/arrays/datetimelike.py
new file mode 100644
index 00000000000..73e799f9e0a
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/core/arrays/datetimelike.py
@@ -0,0 +1,1598 @@
+# -*- coding: utf-8 -*-
+from datetime import datetime, timedelta
+import operator
+import warnings
+
+import numpy as np
+
+from pandas._libs import NaT, algos, iNaT, lib
+from pandas._libs.tslibs.period import (
+ DIFFERENT_FREQ, IncompatibleFrequency, Period)
+from pandas._libs.tslibs.timedeltas import Timedelta, delta_to_nanoseconds
+from pandas._libs.tslibs.timestamps import (
+ RoundTo, maybe_integer_op_deprecated, round_nsint64)
+import pandas.compat as compat
+from pandas.compat.numpy import function as nv
+from pandas.errors import (
+ AbstractMethodError, NullFrequencyError, PerformanceWarning)
+from pandas.util._decorators import Appender, Substitution
+from pandas.util._validators import validate_fillna_kwargs
+
+from pandas.core.dtypes.common import (
+ is_categorical_dtype, is_datetime64_any_dtype, is_datetime64_dtype,
+ is_datetime64tz_dtype, is_datetime_or_timedelta_dtype, is_dtype_equal,
+ is_extension_array_dtype, is_float_dtype, is_integer_dtype, is_list_like,
+ is_object_dtype, is_offsetlike, is_period_dtype, is_string_dtype,
+ is_timedelta64_dtype, is_unsigned_integer_dtype, pandas_dtype)
+from pandas.core.dtypes.generic import ABCDataFrame, ABCIndexClass, ABCSeries
+from pandas.core.dtypes.inference import is_array_like
+from pandas.core.dtypes.missing import isna
+
+from pandas.core import missing, nanops
+from pandas.core.algorithms import (
+ checked_add_with_arr, take, unique1d, value_counts)
+import pandas.core.common as com
+
+from pandas.tseries import frequencies
+from pandas.tseries.offsets import DateOffset, Tick
+
+from .base import ExtensionArray, ExtensionOpsMixin
+
+
+class AttributesMixin(object):
+
+ @property
+ def _attributes(self):
+ # Inheriting subclass should implement _attributes as a list of strings
+ raise AbstractMethodError(self)
+
+ @classmethod
+ def _simple_new(cls, values, **kwargs):
+ raise AbstractMethodError(cls)
+
+ def _get_attributes_dict(self):
+ """
+ return an attributes dict for my class
+ """
+ return {k: getattr(self, k, None) for k in self._attributes}
+
+ @property
+ def _scalar_type(self):
+ # type: () -> Union[type, Tuple[type]]
+ """The scalar associated with this datelike
+
+ * PeriodArray : Period
+ * DatetimeArray : Timestamp
+ * TimedeltaArray : Timedelta
+ """
+ raise AbstractMethodError(self)
+
+ def _scalar_from_string(self, value):
+ # type: (str) -> Union[Period, Timestamp, Timedelta, NaTType]
+ """
+ Construct a scalar type from a string.
+
+ Parameters
+ ----------
+ value : str
+
+ Returns
+ -------
+ Period, Timestamp, or Timedelta, or NaT
+ Whatever the type of ``self._scalar_type`` is.
+
+ Notes
+ -----
+ This should call ``self._check_compatible_with`` before
+ unboxing the result.
+ """
+ raise AbstractMethodError(self)
+
+ def _unbox_scalar(self, value):
+ # type: (Union[Period, Timestamp, Timedelta, NaTType]) -> int
+ """
+ Unbox the integer value of a scalar `value`.
+
+ Parameters
+ ----------
+ value : Union[Period, Timestamp, Timedelta]
+
+ Returns
+ -------
+ int
+
+ Examples
+ --------
+ >>> self._unbox_scalar(Timedelta('10s')) # DOCTEST: +SKIP
+ 10000000000
+ """
+ raise AbstractMethodError(self)
+
+ def _check_compatible_with(self, other):
+ # type: (Union[Period, Timestamp, Timedelta, NaTType]) -> None
+ """
+ Verify that `self` and `other` are compatible.
+
+ * DatetimeArray verifies that the timezones (if any) match
+ * PeriodArray verifies that the freq matches
+ * Timedelta has no verification
+
+ In each case, NaT is considered compatible.
+
+ Parameters
+ ----------
+ other
+
+ Raises
+ ------
+ Exception
+ """
+ raise AbstractMethodError(self)
+
+
+class DatelikeOps(object):
+ """
+ Common ops for DatetimeIndex/PeriodIndex, but not TimedeltaIndex.
+ """
+
+ @Substitution(URL="https://docs.python.org/3/library/datetime.html"
+ "#strftime-and-strptime-behavior")
+ def strftime(self, date_format):
+ """
+ Convert to Index using specified date_format.
+
+ Return an Index of formatted strings specified by date_format, which
+ supports the same string format as the python standard library. Details
+ of the string format can be found in `python string format
+ doc <%(URL)s>`__
+
+ Parameters
+ ----------
+ date_format : str
+ Date format string (e.g. "%%Y-%%m-%%d").
+
+ Returns
+ -------
+ Index
+ Index of formatted strings
+
+ See Also
+ --------
+ to_datetime : Convert the given argument to datetime.
+ DatetimeIndex.normalize : Return DatetimeIndex with times to midnight.
+ DatetimeIndex.round : Round the DatetimeIndex to the specified freq.
+ DatetimeIndex.floor : Floor the DatetimeIndex to the specified freq.
+
+ Examples
+ --------
+ >>> rng = pd.date_range(pd.Timestamp("2018-03-10 09:00"),
+ ... periods=3, freq='s')
+ >>> rng.strftime('%%B %%d, %%Y, %%r')
+ Index(['March 10, 2018, 09:00:00 AM', 'March 10, 2018, 09:00:01 AM',
+ 'March 10, 2018, 09:00:02 AM'],
+ dtype='object')
+ """
+ from pandas import Index
+ return Index(self._format_native_types(date_format=date_format))
+
+
+class TimelikeOps(object):
+ """
+ Common ops for TimedeltaIndex/DatetimeIndex, but not PeriodIndex.
+ """
+
+ _round_doc = (
+ """
+ Perform {op} operation on the data to the specified `freq`.
+
+ Parameters
+ ----------
+ freq : str or Offset
+ The frequency level to {op} the index to. Must be a fixed
+ frequency like 'S' (second) not 'ME' (month end). See
+ :ref:`frequency aliases <timeseries.offset_aliases>` for
+ a list of possible `freq` values.
+ ambiguous : 'infer', bool-ndarray, 'NaT', default 'raise'
+ Only relevant for DatetimeIndex:
+
+ - 'infer' will attempt to infer fall dst-transition hours based on
+ order
+ - bool-ndarray where True signifies a DST time, False designates
+ a non-DST time (note that this flag is only applicable for
+ ambiguous times)
+ - 'NaT' will return NaT where there are ambiguous times
+ - 'raise' will raise an AmbiguousTimeError if there are ambiguous
+ times
+
+ .. versionadded:: 0.24.0
+
+ nonexistent : 'shift_forward', 'shift_backward', 'NaT', timedelta,
+ default 'raise'
+ A nonexistent time does not exist in a particular timezone
+ where clocks moved forward due to DST.
+
+ - 'shift_forward' will shift the nonexistent time forward to the
+ closest existing time
+ - 'shift_backward' will shift the nonexistent time backward to the
+ closest existing time
+ - 'NaT' will return NaT where there are nonexistent times
+ - timedelta objects will shift nonexistent times by the timedelta
+ - 'raise' will raise a NonExistentTimeError if there are
+ nonexistent times
+
+ .. versionadded:: 0.24.0
+
+ Returns
+ -------
+ DatetimeIndex, TimedeltaIndex, or Series
+ Index of the same type for a DatetimeIndex or TimedeltaIndex,
+ or a Series with the same index for a Series.
+
+ Raises
+ ------
+ ValueError if the `freq` cannot be converted.
+
+ Examples
+ --------
+ **DatetimeIndex**
+
+ >>> rng = pd.date_range('1/1/2018 11:59:00', periods=3, freq='min')
+ >>> rng
+ DatetimeIndex(['2018-01-01 11:59:00', '2018-01-01 12:00:00',
+ '2018-01-01 12:01:00'],
+ dtype='datetime64[ns]', freq='T')
+ """)
+
+ _round_example = (
+ """>>> rng.round('H')
+ DatetimeIndex(['2018-01-01 12:00:00', '2018-01-01 12:00:00',
+ '2018-01-01 12:00:00'],
+ dtype='datetime64[ns]', freq=None)
+
+ **Series**
+
+ >>> pd.Series(rng).dt.round("H")
+ 0 2018-01-01 12:00:00
+ 1 2018-01-01 12:00:00
+ 2 2018-01-01 12:00:00
+ dtype: datetime64[ns]
+ """)
+
+ _floor_example = (
+ """>>> rng.floor('H')
+ DatetimeIndex(['2018-01-01 11:00:00', '2018-01-01 12:00:00',
+ '2018-01-01 12:00:00'],
+ dtype='datetime64[ns]', freq=None)
+
+ **Series**
+
+ >>> pd.Series(rng).dt.floor("H")
+ 0 2018-01-01 11:00:00
+ 1 2018-01-01 12:00:00
+ 2 2018-01-01 12:00:00
+ dtype: datetime64[ns]
+ """
+ )
+
+ _ceil_example = (
+ """>>> rng.ceil('H')
+ DatetimeIndex(['2018-01-01 12:00:00', '2018-01-01 12:00:00',
+ '2018-01-01 13:00:00'],
+ dtype='datetime64[ns]', freq=None)
+
+ **Series**
+
+ >>> pd.Series(rng).dt.ceil("H")
+ 0 2018-01-01 12:00:00
+ 1 2018-01-01 12:00:00
+ 2 2018-01-01 13:00:00
+ dtype: datetime64[ns]
+ """
+ )
+
+ def _round(self, freq, mode, ambiguous, nonexistent):
+ # round the local times
+ values = _ensure_datetimelike_to_i8(self)
+ result = round_nsint64(values, mode, freq)
+ result = self._maybe_mask_results(result, fill_value=NaT)
+
+ dtype = self.dtype
+ if is_datetime64tz_dtype(self):
+ dtype = None
+ return self._ensure_localized(
+ self._simple_new(result, dtype=dtype), ambiguous, nonexistent
+ )
+
+ @Appender((_round_doc + _round_example).format(op="round"))
+ def round(self, freq, ambiguous='raise', nonexistent='raise'):
+ return self._round(
+ freq, RoundTo.NEAREST_HALF_EVEN, ambiguous, nonexistent
+ )
+
+ @Appender((_round_doc + _floor_example).format(op="floor"))
+ def floor(self, freq, ambiguous='raise', nonexistent='raise'):
+ return self._round(freq, RoundTo.MINUS_INFTY, ambiguous, nonexistent)
+
+ @Appender((_round_doc + _ceil_example).format(op="ceil"))
+ def ceil(self, freq, ambiguous='raise', nonexistent='raise'):
+ return self._round(freq, RoundTo.PLUS_INFTY, ambiguous, nonexistent)
+
+
+class DatetimeLikeArrayMixin(ExtensionOpsMixin,
+ AttributesMixin,
+ ExtensionArray):
+ """
+ Shared Base/Mixin class for DatetimeArray, TimedeltaArray, PeriodArray
+
+ Assumes that __new__/__init__ defines:
+ _data
+ _freq
+
+ and that the inheriting class has methods:
+ _generate_range
+ """
+
+ @property
+ def _box_func(self):
+ """
+ box function to get object from internal representation
+ """
+ raise AbstractMethodError(self)
+
+ def _box_values(self, values):
+ """
+ apply box func to passed values
+ """
+ return lib.map_infer(values, self._box_func)
+
+ def __iter__(self):
+ return (self._box_func(v) for v in self.asi8)
+
+ @property
+ def asi8(self):
+ # type: () -> ndarray
+ """
+ Integer representation of the values.
+
+ Returns
+ -------
+ ndarray
+ An ndarray with int64 dtype.
+ """
+ # do not cache or you'll create a memory leak
+ return self._data.view('i8')
+
+ @property
+ def _ndarray_values(self):
+ return self._data
+
+ # ----------------------------------------------------------------
+ # Rendering Methods
+
+ def _format_native_types(self, na_rep='NaT', date_format=None):
+ """
+ Helper method for astype when converting to strings.
+
+ Returns
+ -------
+ ndarray[str]
+ """
+ raise AbstractMethodError(self)
+
+ def _formatter(self, boxed=False):
+ # TODO: Remove Datetime & DatetimeTZ formatters.
+ return "'{}'".format
+
+ # ----------------------------------------------------------------
+ # Array-Like / EA-Interface Methods
+
+ @property
+ def nbytes(self):
+ return self._data.nbytes
+
+ def __array__(self, dtype=None):
+ # used for Timedelta/DatetimeArray, overwritten by PeriodArray
+ if is_object_dtype(dtype):
+ return np.array(list(self), dtype=object)
+ return self._data
+
+ @property
+ def shape(self):
+ return (len(self),)
+
+ @property
+ def size(self):
+ # type: () -> int
+ """The number of elements in this array."""
+ return np.prod(self.shape)
+
+ def __len__(self):
+ return len(self._data)
+
+ def __getitem__(self, key):
+ """
+ This getitem defers to the underlying array, which by-definition can
+ only handle list-likes, slices, and integer scalars
+ """
+
+ is_int = lib.is_integer(key)
+ if lib.is_scalar(key) and not is_int:
+ raise IndexError("only integers, slices (`:`), ellipsis (`...`), "
+ "numpy.newaxis (`None`) and integer or boolean "
+ "arrays are valid indices")
+
+ getitem = self._data.__getitem__
+ if is_int:
+ val = getitem(key)
+ return self._box_func(val)
+
+ if com.is_bool_indexer(key):
+ key = np.asarray(key, dtype=bool)
+ if key.all():
+ key = slice(0, None, None)
+ else:
+ key = lib.maybe_booleans_to_slice(key.view(np.uint8))
+
+ is_period = is_period_dtype(self)
+ if is_period:
+ freq = self.freq
+ else:
+ freq = None
+ if isinstance(key, slice):
+ if self.freq is not None and key.step is not None:
+ freq = key.step * self.freq
+ else:
+ freq = self.freq
+ elif key is Ellipsis:
+ # GH#21282 indexing with Ellipsis is similar to a full slice,
+ # should preserve `freq` attribute
+ freq = self.freq
+
+ result = getitem(key)
+ if result.ndim > 1:
+ # To support MPL which performs slicing with 2 dim
+ # even though it only has 1 dim by definition
+ if is_period:
+ return self._simple_new(result, dtype=self.dtype, freq=freq)
+ return result
+
+ return self._simple_new(result, dtype=self.dtype, freq=freq)
+
+ def __setitem__(
+ self,
+ key, # type: Union[int, Sequence[int], Sequence[bool], slice]
+ value, # type: Union[NaTType, Scalar, Sequence[Scalar]]
+ ):
+ # type: (...) -> None
+ # I'm fudging the types a bit here. The "Scalar" above really depends
+ # on type(self). For PeriodArray, it's Period (or stuff coercible
+ # to a period in from_sequence). For DatetimeArray, it's Timestamp...
+ # I don't know if mypy can do that, possibly with Generics.
+ # https://mypy.readthedocs.io/en/latest/generics.html
+
+ if is_list_like(value):
+ is_slice = isinstance(key, slice)
+
+ if lib.is_scalar(key):
+ raise ValueError("setting an array element with a sequence.")
+
+ if (not is_slice
+ and len(key) != len(value)
+ and not com.is_bool_indexer(key)):
+ msg = ("shape mismatch: value array of length '{}' does not "
+ "match indexing result of length '{}'.")
+                raise ValueError(msg.format(len(value), len(key)))
+ if not is_slice and len(key) == 0:
+ return
+
+ value = type(self)._from_sequence(value, dtype=self.dtype)
+ self._check_compatible_with(value)
+ value = value.asi8
+ elif isinstance(value, self._scalar_type):
+ self._check_compatible_with(value)
+ value = self._unbox_scalar(value)
+ elif isna(value) or value == iNaT:
+ value = iNaT
+ else:
+ msg = (
+ "'value' should be a '{scalar}', 'NaT', or array of those. "
+ "Got '{typ}' instead."
+ )
+ raise TypeError(msg.format(scalar=self._scalar_type.__name__,
+ typ=type(value).__name__))
+ self._data[key] = value
+ self._maybe_clear_freq()
+
+ def _maybe_clear_freq(self):
+ # inplace operations like __setitem__ may invalidate the freq of
+ # DatetimeArray and TimedeltaArray
+ pass
+
+ def astype(self, dtype, copy=True):
+ # Some notes on cases we don't have to handle here in the base class:
+ # 1. PeriodArray.astype handles period -> period
+ # 2. DatetimeArray.astype handles conversion between tz.
+ # 3. DatetimeArray.astype handles datetime -> period
+ from pandas import Categorical
+ dtype = pandas_dtype(dtype)
+
+ if is_object_dtype(dtype):
+ return self._box_values(self.asi8)
+ elif is_string_dtype(dtype) and not is_categorical_dtype(dtype):
+ return self._format_native_types()
+ elif is_integer_dtype(dtype):
+ # we deliberately ignore int32 vs. int64 here.
+ # See https://github.com/pandas-dev/pandas/issues/24381 for more.
+ values = self.asi8
+
+ if is_unsigned_integer_dtype(dtype):
+ # Again, we ignore int32 vs. int64
+ values = values.view("uint64")
+
+ if copy:
+ values = values.copy()
+ return values
+ elif (is_datetime_or_timedelta_dtype(dtype) and
+ not is_dtype_equal(self.dtype, dtype)) or is_float_dtype(dtype):
+ # disallow conversion between datetime/timedelta,
+ # and conversions for any datetimelike to float
+ msg = 'Cannot cast {name} to dtype {dtype}'
+ raise TypeError(msg.format(name=type(self).__name__, dtype=dtype))
+ elif is_categorical_dtype(dtype):
+ return Categorical(self, dtype=dtype)
+ else:
+ return np.asarray(self, dtype=dtype)
+
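+    # Illustrative sketch (not part of the vendored sources) of the astype
+    # dispatch above, assuming a tz-naive DatetimeIndex:
+    #
+    #   >>> import pandas as pd
+    #   >>> dti = pd.date_range('2019-01-01', periods=2)
+    #   >>> dti.astype(object)               # boxed Timestamps
+    #   >>> dti.astype('int64')              # raw i8 view (nanoseconds)
+    #   >>> dti.astype('timedelta64[ns]')    # raises TypeError
+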
+ def view(self, dtype=None):
+ """
+ New view on this array with the same data.
+
+ Parameters
+ ----------
+ dtype : numpy dtype, optional
+
+ Returns
+ -------
+ ndarray
+ With the specified `dtype`.
+ """
+ return self._data.view(dtype=dtype)
+
+ # ------------------------------------------------------------------
+ # ExtensionArray Interface
+
+ def unique(self):
+ result = unique1d(self.asi8)
+ return type(self)(result, dtype=self.dtype)
+
+ def _validate_fill_value(self, fill_value):
+ """
+        If a fill_value is passed to `take`, convert it to an i8
+        representation, raising ValueError if this is not possible.
+
+ Parameters
+ ----------
+ fill_value : object
+
+ Returns
+ -------
+ fill_value : np.int64
+
+ Raises
+ ------
+ ValueError
+ """
+ raise AbstractMethodError(self)
+
+ def take(self, indices, allow_fill=False, fill_value=None):
+ if allow_fill:
+ fill_value = self._validate_fill_value(fill_value)
+
+ new_values = take(self.asi8,
+ indices,
+ allow_fill=allow_fill,
+ fill_value=fill_value)
+
+ return type(self)(new_values, dtype=self.dtype)
+
+ @classmethod
+ def _concat_same_type(cls, to_concat):
+ dtypes = {x.dtype for x in to_concat}
+ assert len(dtypes) == 1
+ dtype = list(dtypes)[0]
+
+ values = np.concatenate([x.asi8 for x in to_concat])
+ return cls(values, dtype=dtype)
+
+ def copy(self, deep=False):
+ values = self.asi8.copy()
+ return type(self)._simple_new(values, dtype=self.dtype, freq=self.freq)
+
+ def _values_for_factorize(self):
+ return self.asi8, iNaT
+
+ @classmethod
+ def _from_factorized(cls, values, original):
+ return cls(values, dtype=original.dtype)
+
+ def _values_for_argsort(self):
+ return self._data
+
+ # ------------------------------------------------------------------
+ # Additional array methods
+ # These are not part of the EA API, but we implement them because
+ # pandas assumes they're there.
+
+ def searchsorted(self, value, side='left', sorter=None):
+ """
+ Find indices where elements should be inserted to maintain order.
+
+ Find the indices into a sorted array `self` such that, if the
+ corresponding elements in `value` were inserted before the indices,
+ the order of `self` would be preserved.
+
+ Parameters
+ ----------
+ value : array_like
+ Values to insert into `self`.
+ side : {'left', 'right'}, optional
+ If 'left', the index of the first suitable location found is given.
+ If 'right', return the last such index. If there is no suitable
+ index, return either 0 or N (where N is the length of `self`).
+ sorter : 1-D array_like, optional
+ Optional array of integer indices that sort `self` into ascending
+ order. They are typically the result of ``np.argsort``.
+
+ Returns
+ -------
+ indices : array of ints
+ Array of insertion points with the same shape as `value`.
+ """
+ if isinstance(value, compat.string_types):
+ value = self._scalar_from_string(value)
+
+ if not (isinstance(value, (self._scalar_type, type(self)))
+ or isna(value)):
+ raise ValueError("Unexpected type for 'value': {valtype}"
+ .format(valtype=type(value)))
+
+ self._check_compatible_with(value)
+ if isinstance(value, type(self)):
+ value = value.asi8
+ else:
+ value = self._unbox_scalar(value)
+
+ return self.asi8.searchsorted(value, side=side, sorter=sorter)
+
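+    # Illustrative sketch (not part of the vendored sources): strings are
+    # parsed via _scalar_from_string, so these two calls are equivalent.
+    #
+    #   >>> import pandas as pd
+    #   >>> dti = pd.date_range('2019-01-01', periods=3, freq='D')
+    #   >>> dti.array.searchsorted('2019-01-02')
+    #   1
+    #   >>> dti.array.searchsorted(pd.Timestamp('2019-01-02'))
+    #   1
+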
+ def repeat(self, repeats, *args, **kwargs):
+ """
+ Repeat elements of an array.
+
+ See Also
+ --------
+ numpy.ndarray.repeat
+ """
+ nv.validate_repeat(args, kwargs)
+ values = self._data.repeat(repeats)
+ return type(self)(values.view('i8'), dtype=self.dtype)
+
+ def value_counts(self, dropna=False):
+ """
+ Return a Series containing counts of unique values.
+
+ Parameters
+ ----------
+        dropna : boolean, default False
+ Don't include counts of NaT values.
+
+ Returns
+ -------
+ Series
+ """
+ from pandas import Series, Index
+
+ if dropna:
+ values = self[~self.isna()]._data
+ else:
+ values = self._data
+
+ cls = type(self)
+
+ result = value_counts(values, sort=False, dropna=dropna)
+ index = Index(cls(result.index.view('i8'), dtype=self.dtype),
+ name=result.index.name)
+ return Series(result.values, index=index, name=result.name)
+
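+    # Illustrative sketch (not part of the vendored sources): the counts are
+    # indexed by boxed values of the same dtype, and NaT is only counted
+    # when dropna is False (the default here).
+    #
+    #   >>> import pandas as pd
+    #   >>> arr = pd.to_datetime(['2019-01-01', '2019-01-01', None]).array
+    #   >>> arr.value_counts(dropna=True)     # single row: 2019-01-01 -> 2
+    #   >>> arr.value_counts(dropna=False)    # adds a row counting NaT
+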
+ def map(self, mapper):
+ # TODO(GH-23179): Add ExtensionArray.map
+ # Need to figure out if we want ExtensionArray.map first.
+ # If so, then we can refactor IndexOpsMixin._map_values to
+ # a standalone function and call from here..
+ # Else, just rewrite _map_infer_values to do the right thing.
+ from pandas import Index
+
+ return Index(self).map(mapper).array
+
+ # ------------------------------------------------------------------
+ # Null Handling
+
+ def isna(self):
+ return self._isnan
+
+ @property # NB: override with cache_readonly in immutable subclasses
+ def _isnan(self):
+ """
+ return if each value is nan
+ """
+ return (self.asi8 == iNaT)
+
+ @property # NB: override with cache_readonly in immutable subclasses
+ def _hasnans(self):
+ """
+ return if I have any nans; enables various perf speedups
+ """
+ return bool(self._isnan.any())
+
+ def _maybe_mask_results(self, result, fill_value=iNaT, convert=None):
+ """
+ Parameters
+ ----------
+ result : a ndarray
+ fill_value : object, default iNaT
+ convert : string/dtype or None
+
+ Returns
+ -------
+        result : ndarray with values replaced by the fill_value
+
+        mask the result if needed, convert to the provided dtype if it's not
+        None
+
+ This is an internal routine
+ """
+
+ if self._hasnans:
+ if convert:
+ result = result.astype(convert)
+ if fill_value is None:
+ fill_value = np.nan
+ result[self._isnan] = fill_value
+ return result
+
+ def fillna(self, value=None, method=None, limit=None):
+ # TODO(GH-20300): remove this
+ # Just overriding to ensure that we avoid an astype(object).
+ # Either 20300 or a `_values_for_fillna` would avoid this duplication.
+ if isinstance(value, ABCSeries):
+ value = value.array
+
+ value, method = validate_fillna_kwargs(value, method)
+
+ mask = self.isna()
+
+ if is_array_like(value):
+ if len(value) != len(self):
+ raise ValueError("Length of 'value' does not match. Got ({}) "
+ " expected {}".format(len(value), len(self)))
+ value = value[mask]
+
+ if mask.any():
+ if method is not None:
+ if method == 'pad':
+ func = missing.pad_1d
+ else:
+ func = missing.backfill_1d
+
+ values = self._data
+ if not is_period_dtype(self):
+ # For PeriodArray self._data is i8, which gets copied
+ # by `func`. Otherwise we need to make a copy manually
+ # to avoid modifying `self` in-place.
+ values = values.copy()
+
+ new_values = func(values, limit=limit,
+ mask=mask)
+ if is_datetime64tz_dtype(self):
+ # we need to pass int64 values to the constructor to avoid
+ # re-localizing incorrectly
+ new_values = new_values.view("i8")
+ new_values = type(self)(new_values, dtype=self.dtype)
+ else:
+ # fill with value
+ new_values = self.copy()
+ new_values[mask] = value
+ else:
+ new_values = self.copy()
+ return new_values
+
+ # ------------------------------------------------------------------
+ # Frequency Properties/Methods
+
+ @property
+ def freq(self):
+ """
+ Return the frequency object if it is set, otherwise None.
+ """
+ return self._freq
+
+ @freq.setter
+ def freq(self, value):
+ if value is not None:
+ value = frequencies.to_offset(value)
+ self._validate_frequency(self, value)
+
+ self._freq = value
+
+ @property
+ def freqstr(self):
+ """
+        Return the frequency object as a string if it's set, otherwise None.
+ """
+ if self.freq is None:
+ return None
+ return self.freq.freqstr
+
+ @property # NB: override with cache_readonly in immutable subclasses
+ def inferred_freq(self):
+ """
+        Tries to return a string representing a frequency guess,
+ generated by infer_freq. Returns None if it can't autodetect the
+ frequency.
+ """
+ try:
+ return frequencies.infer_freq(self)
+ except ValueError:
+ return None
+
+ @property # NB: override with cache_readonly in immutable subclasses
+ def _resolution(self):
+ return frequencies.Resolution.get_reso_from_freq(self.freqstr)
+
+ @property # NB: override with cache_readonly in immutable subclasses
+ def resolution(self):
+ """
+ Returns day, hour, minute, second, millisecond or microsecond
+ """
+ return frequencies.Resolution.get_str(self._resolution)
+
+ @classmethod
+ def _validate_frequency(cls, index, freq, **kwargs):
+ """
+ Validate that a frequency is compatible with the values of a given
+ Datetime Array/Index or Timedelta Array/Index
+
+ Parameters
+ ----------
+ index : DatetimeIndex or TimedeltaIndex
+ The index on which to determine if the given frequency is valid
+ freq : DateOffset
+ The frequency to validate
+ """
+ if is_period_dtype(cls):
+ # Frequency validation is not meaningful for Period Array/Index
+ return None
+
+ inferred = index.inferred_freq
+ if index.size == 0 or inferred == freq.freqstr:
+ return None
+
+ try:
+ on_freq = cls._generate_range(start=index[0], end=None,
+ periods=len(index), freq=freq,
+ **kwargs)
+ if not np.array_equal(index.asi8, on_freq.asi8):
+ raise ValueError
+ except ValueError as e:
+ if "non-fixed" in str(e):
+ # non-fixed frequencies are not meaningful for timedelta64;
+ # we retain that error message
+ raise e
+ # GH#11587 the main way this is reached is if the `np.array_equal`
+ # check above is False. This can also be reached if index[0]
+ # is `NaT`, in which case the call to `cls._generate_range` will
+ # raise a ValueError, which we re-raise with a more targeted
+ # message.
+ raise ValueError('Inferred frequency {infer} from passed values '
+ 'does not conform to passed frequency {passed}'
+ .format(infer=inferred, passed=freq.freqstr))
+
+ # monotonicity/uniqueness properties are called via frequencies.infer_freq,
+ # see GH#23789
+
+ @property
+ def _is_monotonic_increasing(self):
+ return algos.is_monotonic(self.asi8, timelike=True)[0]
+
+ @property
+ def _is_monotonic_decreasing(self):
+ return algos.is_monotonic(self.asi8, timelike=True)[1]
+
+ @property
+ def _is_unique(self):
+ return len(unique1d(self.asi8)) == len(self)
+
+ # ------------------------------------------------------------------
+ # Arithmetic Methods
+
+ def _add_datetimelike_scalar(self, other):
+        # Overridden by TimedeltaArray
+ raise TypeError("cannot add {cls} and {typ}"
+ .format(cls=type(self).__name__,
+ typ=type(other).__name__))
+
+ _add_datetime_arraylike = _add_datetimelike_scalar
+
+ def _sub_datetimelike_scalar(self, other):
+ # Overridden by DatetimeArray
+ assert other is not NaT
+ raise TypeError("cannot subtract a datelike from a {cls}"
+ .format(cls=type(self).__name__))
+
+ _sub_datetime_arraylike = _sub_datetimelike_scalar
+
+ def _sub_period(self, other):
+        # Overridden by PeriodArray
+ raise TypeError("cannot subtract Period from a {cls}"
+ .format(cls=type(self).__name__))
+
+ def _add_offset(self, offset):
+ raise AbstractMethodError(self)
+
+ def _add_delta(self, other):
+ """
+ Add a timedelta-like, Tick or TimedeltaIndex-like object
+ to self, yielding an int64 numpy array
+
+ Parameters
+ ----------
+        other : {timedelta, np.timedelta64, Tick,
+ TimedeltaIndex, ndarray[timedelta64]}
+
+ Returns
+ -------
+ result : ndarray[int64]
+
+ Notes
+ -----
+ The result's name is set outside of _add_delta by the calling
+ method (__add__ or __sub__), if necessary (i.e. for Indexes).
+ """
+ if isinstance(other, (Tick, timedelta, np.timedelta64)):
+ new_values = self._add_timedeltalike_scalar(other)
+ elif is_timedelta64_dtype(other):
+ # ndarray[timedelta64] or TimedeltaArray/index
+ new_values = self._add_delta_tdi(other)
+
+ return new_values
+
+ def _add_timedeltalike_scalar(self, other):
+ """
+ Add a delta of a timedeltalike
+ return the i8 result view
+ """
+ if isna(other):
+            # i.e. np.timedelta64("NaT"), not recognized by delta_to_nanoseconds
+ new_values = np.empty(len(self), dtype='i8')
+ new_values[:] = iNaT
+ return new_values
+
+ inc = delta_to_nanoseconds(other)
+ new_values = checked_add_with_arr(self.asi8, inc,
+ arr_mask=self._isnan).view('i8')
+ new_values = self._maybe_mask_results(new_values)
+ return new_values.view('i8')
+
+ def _add_delta_tdi(self, other):
+ """
+ Add a delta of a TimedeltaIndex
+ return the i8 result view
+ """
+ if len(self) != len(other):
+ raise ValueError("cannot add indices of unequal length")
+
+ if isinstance(other, np.ndarray):
+ # ndarray[timedelta64]; wrap in TimedeltaIndex for op
+ from pandas import TimedeltaIndex
+ other = TimedeltaIndex(other)
+
+ self_i8 = self.asi8
+ other_i8 = other.asi8
+ new_values = checked_add_with_arr(self_i8, other_i8,
+ arr_mask=self._isnan,
+ b_mask=other._isnan)
+ if self._hasnans or other._hasnans:
+ mask = (self._isnan) | (other._isnan)
+ new_values[mask] = iNaT
+ return new_values.view('i8')
+
+ def _add_nat(self):
+ """
+ Add pd.NaT to self
+ """
+ if is_period_dtype(self):
+ raise TypeError('Cannot add {cls} and {typ}'
+ .format(cls=type(self).__name__,
+ typ=type(NaT).__name__))
+
+ # GH#19124 pd.NaT is treated like a timedelta for both timedelta
+ # and datetime dtypes
+ result = np.zeros(len(self), dtype=np.int64)
+ result.fill(iNaT)
+ return type(self)(result, dtype=self.dtype, freq=None)
+
+ def _sub_nat(self):
+ """
+ Subtract pd.NaT from self
+ """
+ # GH#19124 Timedelta - datetime is not in general well-defined.
+ # We make an exception for pd.NaT, which in this case quacks
+ # like a timedelta.
+ # For datetime64 dtypes by convention we treat NaT as a datetime, so
+ # this subtraction returns a timedelta64 dtype.
+ # For period dtype, timedelta64 is a close-enough return dtype.
+ result = np.zeros(len(self), dtype=np.int64)
+ result.fill(iNaT)
+ return result.view('timedelta64[ns]')
+
+ def _sub_period_array(self, other):
+ """
+ Subtract a Period Array/Index from self. This is only valid if self
+ is itself a Period Array/Index, raises otherwise. Both objects must
+ have the same frequency.
+
+ Parameters
+ ----------
+ other : PeriodIndex or PeriodArray
+
+ Returns
+ -------
+ result : np.ndarray[object]
+ Array of DateOffset objects; nulls represented by NaT
+ """
+ if not is_period_dtype(self):
+ raise TypeError("cannot subtract {dtype}-dtype from {cls}"
+ .format(dtype=other.dtype,
+ cls=type(self).__name__))
+
+ if len(self) != len(other):
+ raise ValueError("cannot subtract arrays/indices of "
+ "unequal length")
+ if self.freq != other.freq:
+ msg = DIFFERENT_FREQ.format(cls=type(self).__name__,
+ own_freq=self.freqstr,
+ other_freq=other.freqstr)
+ raise IncompatibleFrequency(msg)
+
+ new_values = checked_add_with_arr(self.asi8, -other.asi8,
+ arr_mask=self._isnan,
+ b_mask=other._isnan)
+
+ new_values = np.array([self.freq.base * x for x in new_values])
+ if self._hasnans or other._hasnans:
+ mask = (self._isnan) | (other._isnan)
+ new_values[mask] = NaT
+ return new_values
+
+ def _addsub_int_array(self, other, op):
+ """
+ Add or subtract array-like of integers equivalent to applying
+ `_time_shift` pointwise.
+
+ Parameters
+ ----------
+ other : Index, ExtensionArray, np.ndarray
+ integer-dtype
+ op : {operator.add, operator.sub}
+
+ Returns
+ -------
+ result : same class as self
+ """
+        # _addsub_int_array is overridden by PeriodArray
+ assert not is_period_dtype(self)
+ assert op in [operator.add, operator.sub]
+
+ if self.freq is None:
+ # GH#19123
+ raise NullFrequencyError("Cannot shift with no freq")
+
+ elif isinstance(self.freq, Tick):
+ # easy case where we can convert to timedelta64 operation
+ td = Timedelta(self.freq)
+ return op(self, td * other)
+
+ # We should only get here with DatetimeIndex; dispatch
+ # to _addsub_offset_array
+ assert not is_timedelta64_dtype(self)
+ return op(self, np.array(other) * self.freq)
+
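+    # Illustrative sketch (not part of the vendored sources; integer ops on
+    # datetime-likes are deprecated): with a Tick freq the integers act as
+    # multiples of that freq, and with freq=None a NullFrequencyError is
+    # raised.
+    #
+    #   >>> import pandas as pd, numpy as np
+    #   >>> dti = pd.date_range('2019-01-01', periods=2, freq='D')
+    #   >>> dti + np.array([1, 2])   # same as dti + pd.to_timedelta([1, 2], 'D')
+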
+ def _addsub_offset_array(self, other, op):
+ """
+ Add or subtract array-like of DateOffset objects
+
+ Parameters
+ ----------
+ other : Index, np.ndarray
+ object-dtype containing pd.DateOffset objects
+ op : {operator.add, operator.sub}
+
+ Returns
+ -------
+ result : same class as self
+ """
+ assert op in [operator.add, operator.sub]
+ if len(other) == 1:
+ return op(self, other[0])
+
+ warnings.warn("Adding/subtracting array of DateOffsets to "
+ "{cls} not vectorized"
+ .format(cls=type(self).__name__), PerformanceWarning)
+
+ # For EA self.astype('O') returns a numpy array, not an Index
+ left = lib.values_from_object(self.astype('O'))
+
+ res_values = op(left, np.array(other))
+ kwargs = {}
+ if not is_period_dtype(self):
+ kwargs['freq'] = 'infer'
+ return self._from_sequence(res_values, **kwargs)
+
+ def _time_shift(self, periods, freq=None):
+ """
+ Shift each value by `periods`.
+
+ Note this is different from ExtensionArray.shift, which
+ shifts the *position* of each element, padding the end with
+ missing values.
+
+ Parameters
+ ----------
+ periods : int
+ Number of periods to shift by.
+ freq : pandas.DateOffset, pandas.Timedelta, or string
+ Frequency increment to shift by.
+ """
+ if freq is not None and freq != self.freq:
+ if isinstance(freq, compat.string_types):
+ freq = frequencies.to_offset(freq)
+ offset = periods * freq
+ result = self + offset
+ return result
+
+ if periods == 0:
+ # immutable so OK
+ return self.copy()
+
+ if self.freq is None:
+ raise NullFrequencyError("Cannot shift with no freq")
+
+ start = self[0] + periods * self.freq
+ end = self[-1] + periods * self.freq
+
+ # Note: in the DatetimeTZ case, _generate_range will infer the
+ # appropriate timezone from `start` and `end`, so tz does not need
+ # to be passed explicitly.
+ return self._generate_range(start=start, end=end, periods=None,
+ freq=self.freq)
+
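+    # Illustrative sketch (not part of the vendored sources): _time_shift
+    # moves the values themselves, unlike a positional shift.
+    #
+    #   >>> import pandas as pd
+    #   >>> dti = pd.date_range('2019-01-01', periods=3, freq='D')
+    #   >>> dti.shift(1)                              # every timestamp +1 day
+    #   >>> pd.Series(range(3), index=dti).shift(1)   # positions move, NaN appears
+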
+ def __add__(self, other):
+ other = lib.item_from_zerodim(other)
+ if isinstance(other, (ABCSeries, ABCDataFrame)):
+ return NotImplemented
+
+ # scalar others
+ elif other is NaT:
+ result = self._add_nat()
+ elif isinstance(other, (Tick, timedelta, np.timedelta64)):
+ result = self._add_delta(other)
+ elif isinstance(other, DateOffset):
+ # specifically _not_ a Tick
+ result = self._add_offset(other)
+ elif isinstance(other, (datetime, np.datetime64)):
+ result = self._add_datetimelike_scalar(other)
+ elif lib.is_integer(other):
+ # This check must come after the check for np.timedelta64
+ # as is_integer returns True for these
+ if not is_period_dtype(self):
+ maybe_integer_op_deprecated(self)
+ result = self._time_shift(other)
+
+ # array-like others
+ elif is_timedelta64_dtype(other):
+ # TimedeltaIndex, ndarray[timedelta64]
+ result = self._add_delta(other)
+ elif is_offsetlike(other):
+ # Array/Index of DateOffset objects
+ result = self._addsub_offset_array(other, operator.add)
+ elif is_datetime64_dtype(other) or is_datetime64tz_dtype(other):
+ # DatetimeIndex, ndarray[datetime64]
+ return self._add_datetime_arraylike(other)
+ elif is_integer_dtype(other):
+ if not is_period_dtype(self):
+ maybe_integer_op_deprecated(self)
+ result = self._addsub_int_array(other, operator.add)
+ elif is_float_dtype(other):
+ # Explicitly catch invalid dtypes
+ raise TypeError("cannot add {dtype}-dtype to {cls}"
+ .format(dtype=other.dtype,
+ cls=type(self).__name__))
+ elif is_period_dtype(other):
+ # if self is a TimedeltaArray and other is a PeriodArray with
+ # a timedelta-like (i.e. Tick) freq, this operation is valid.
+ # Defer to the PeriodArray implementation.
+ # In remaining cases, this will end up raising TypeError.
+ return NotImplemented
+ elif is_extension_array_dtype(other):
+ # Categorical op will raise; defer explicitly
+ return NotImplemented
+ else: # pragma: no cover
+ return NotImplemented
+
+ if is_timedelta64_dtype(result) and isinstance(result, np.ndarray):
+ from pandas.core.arrays import TimedeltaArray
+ # TODO: infer freq?
+ return TimedeltaArray(result)
+ return result
+
+ def __radd__(self, other):
+ # alias for __add__
+ return self.__add__(other)
+
+ def __sub__(self, other):
+ other = lib.item_from_zerodim(other)
+ if isinstance(other, (ABCSeries, ABCDataFrame)):
+ return NotImplemented
+
+ # scalar others
+ elif other is NaT:
+ result = self._sub_nat()
+ elif isinstance(other, (Tick, timedelta, np.timedelta64)):
+ result = self._add_delta(-other)
+ elif isinstance(other, DateOffset):
+ # specifically _not_ a Tick
+ result = self._add_offset(-other)
+ elif isinstance(other, (datetime, np.datetime64)):
+ result = self._sub_datetimelike_scalar(other)
+ elif lib.is_integer(other):
+ # This check must come after the check for np.timedelta64
+ # as is_integer returns True for these
+ if not is_period_dtype(self):
+ maybe_integer_op_deprecated(self)
+ result = self._time_shift(-other)
+
+ elif isinstance(other, Period):
+ result = self._sub_period(other)
+
+ # array-like others
+ elif is_timedelta64_dtype(other):
+ # TimedeltaIndex, ndarray[timedelta64]
+ result = self._add_delta(-other)
+ elif is_offsetlike(other):
+ # Array/Index of DateOffset objects
+ result = self._addsub_offset_array(other, operator.sub)
+ elif is_datetime64_dtype(other) or is_datetime64tz_dtype(other):
+ # DatetimeIndex, ndarray[datetime64]
+ result = self._sub_datetime_arraylike(other)
+ elif is_period_dtype(other):
+ # PeriodIndex
+ result = self._sub_period_array(other)
+ elif is_integer_dtype(other):
+ if not is_period_dtype(self):
+ maybe_integer_op_deprecated(self)
+ result = self._addsub_int_array(other, operator.sub)
+ elif isinstance(other, ABCIndexClass):
+ raise TypeError("cannot subtract {cls} and {typ}"
+ .format(cls=type(self).__name__,
+ typ=type(other).__name__))
+ elif is_float_dtype(other):
+ # Explicitly catch invalid dtypes
+ raise TypeError("cannot subtract {dtype}-dtype from {cls}"
+ .format(dtype=other.dtype,
+ cls=type(self).__name__))
+ elif is_extension_array_dtype(other):
+ # Categorical op will raise; defer explicitly
+ return NotImplemented
+ else: # pragma: no cover
+ return NotImplemented
+
+ if is_timedelta64_dtype(result) and isinstance(result, np.ndarray):
+ from pandas.core.arrays import TimedeltaArray
+ # TODO: infer freq?
+ return TimedeltaArray(result)
+ return result
+
+ def __rsub__(self, other):
+ if is_datetime64_dtype(other) and is_timedelta64_dtype(self):
+ # ndarray[datetime64] cannot be subtracted from self, so
+ # we need to wrap in DatetimeArray/Index and flip the operation
+ if not isinstance(other, DatetimeLikeArrayMixin):
+ # Avoid down-casting DatetimeIndex
+ from pandas.core.arrays import DatetimeArray
+ other = DatetimeArray(other)
+ return other - self
+ elif (is_datetime64_any_dtype(self) and hasattr(other, 'dtype') and
+ not is_datetime64_any_dtype(other)):
+ # GH#19959 datetime - datetime is well-defined as timedelta,
+ # but any other type - datetime is not well-defined.
+ raise TypeError("cannot subtract {cls} from {typ}"
+ .format(cls=type(self).__name__,
+ typ=type(other).__name__))
+ elif is_period_dtype(self) and is_timedelta64_dtype(other):
+ # TODO: Can we simplify/generalize these cases at all?
+ raise TypeError("cannot subtract {cls} from {dtype}"
+ .format(cls=type(self).__name__,
+ dtype=other.dtype))
+ return -(self - other)
+
+ # FIXME: DTA/TDA/PA inplace methods should actually be inplace, GH#24115
+ def __iadd__(self, other):
+ # alias for __add__
+ return self.__add__(other)
+
+ def __isub__(self, other):
+ # alias for __sub__
+ return self.__sub__(other)
+
+ # --------------------------------------------------------------
+ # Comparison Methods
+
+ def _ensure_localized(self, arg, ambiguous='raise', nonexistent='raise',
+ from_utc=False):
+ """
+ Ensure that we are re-localized.
+
+ This is for compat as we can then call this on all datetimelike
+ arrays generally (ignored for Period/Timedelta)
+
+ Parameters
+ ----------
+ arg : Union[DatetimeLikeArray, DatetimeIndexOpsMixin, ndarray]
+ ambiguous : str, bool, or bool-ndarray, default 'raise'
+ nonexistent : str, default 'raise'
+ from_utc : bool, default False
+ If True, localize the i8 ndarray to UTC first before converting to
+ the appropriate tz. If False, localize directly to the tz.
+
+ Returns
+ -------
+ localized array
+ """
+
+ # reconvert to local tz
+ tz = getattr(self, 'tz', None)
+ if tz is not None:
+ if not isinstance(arg, type(self)):
+ arg = self._simple_new(arg)
+ if from_utc:
+ arg = arg.tz_localize('UTC').tz_convert(self.tz)
+ else:
+ arg = arg.tz_localize(
+ self.tz, ambiguous=ambiguous, nonexistent=nonexistent
+ )
+ return arg
+
+ # --------------------------------------------------------------
+ # Reductions
+
+ def _reduce(self, name, axis=0, skipna=True, **kwargs):
+ op = getattr(self, name, None)
+ if op:
+ return op(axis=axis, skipna=skipna, **kwargs)
+ else:
+ return super(DatetimeLikeArrayMixin, self)._reduce(
+ name, skipna, **kwargs
+ )
+
+ def min(self, axis=None, skipna=True, *args, **kwargs):
+ """
+ Return the minimum value of the Array or minimum along
+ an axis.
+
+ See Also
+ --------
+ numpy.ndarray.min
+ Index.min : Return the minimum value in an Index.
+ Series.min : Return the minimum value in a Series.
+ """
+ nv.validate_min(args, kwargs)
+ nv.validate_minmax_axis(axis)
+
+ result = nanops.nanmin(self.asi8, skipna=skipna, mask=self.isna())
+ if isna(result):
+ # Period._from_ordinal does not handle np.nan gracefully
+ return NaT
+ return self._box_func(result)
+
+ def max(self, axis=None, skipna=True, *args, **kwargs):
+ """
+ Return the maximum value of the Array or maximum along
+ an axis.
+
+ See Also
+ --------
+ numpy.ndarray.max
+ Index.max : Return the maximum value in an Index.
+ Series.max : Return the maximum value in a Series.
+ """
+ # TODO: skipna is broken with max.
+ # See https://github.com/pandas-dev/pandas/issues/24265
+ nv.validate_max(args, kwargs)
+ nv.validate_minmax_axis(axis)
+
+ mask = self.isna()
+ if skipna:
+ values = self[~mask].asi8
+ elif mask.any():
+ return NaT
+ else:
+ values = self.asi8
+
+ if not len(values):
+            # short-circuit for empty max / min
+ return NaT
+
+ result = nanops.nanmax(values, skipna=skipna)
+ # Don't have to worry about NA `result`, since no NA went in.
+ return self._box_func(result)
+
+
+# -------------------------------------------------------------------
+# Shared Constructor Helpers
+
+def validate_periods(periods):
+ """
+ If a `periods` argument is passed to the Datetime/Timedelta Array/Index
+ constructor, cast it to an integer.
+
+ Parameters
+ ----------
+ periods : None, float, int
+
+ Returns
+ -------
+ periods : None or int
+
+ Raises
+ ------
+ TypeError
+        if periods is neither None nor a number (int or float)
+ """
+ if periods is not None:
+ if lib.is_float(periods):
+ periods = int(periods)
+ elif not lib.is_integer(periods):
+ raise TypeError('periods must be a number, got {periods}'
+ .format(periods=periods))
+ return periods
+
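+# Illustrative sketch (not part of the vendored sources) of the coercion
+# performed by validate_periods:
+#
+#   validate_periods(None)   -> None
+#   validate_periods(5.0)    -> 5
+#   validate_periods('5')    -> TypeError("periods must be a number ...")
+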
+
+def validate_endpoints(closed):
+ """
+ Check that the `closed` argument is among [None, "left", "right"]
+
+ Parameters
+ ----------
+ closed : {None, "left", "right"}
+
+ Returns
+ -------
+ left_closed : bool
+ right_closed : bool
+
+ Raises
+ ------
+ ValueError : if argument is not among valid values
+ """
+ left_closed = False
+ right_closed = False
+
+ if closed is None:
+ left_closed = True
+ right_closed = True
+ elif closed == "left":
+ left_closed = True
+ elif closed == "right":
+ right_closed = True
+ else:
+ raise ValueError("Closed has to be either 'left', 'right' or None")
+
+ return left_closed, right_closed
+
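+# Illustrative sketch (not part of the vendored sources): the mapping
+# implemented by validate_endpoints is
+#
+#   closed=None     -> (True, True)    # both endpoints included
+#   closed='left'   -> (True, False)
+#   closed='right'  -> (False, True)
+#   anything else   -> ValueError
+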
+
+def validate_inferred_freq(freq, inferred_freq, freq_infer):
+ """
+ If the user passes a freq and another freq is inferred from passed data,
+ require that they match.
+
+ Parameters
+ ----------
+ freq : DateOffset or None
+ inferred_freq : DateOffset or None
+ freq_infer : bool
+
+ Returns
+ -------
+ freq : DateOffset or None
+ freq_infer : bool
+
+ Notes
+ -----
+ We assume at this point that `maybe_infer_freq` has been called, so
+ `freq` is either a DateOffset object or None.
+ """
+ if inferred_freq is not None:
+ if freq is not None and freq != inferred_freq:
+ raise ValueError('Inferred frequency {inferred} from passed '
+ 'values does not conform to passed frequency '
+ '{passed}'
+ .format(inferred=inferred_freq,
+ passed=freq.freqstr))
+ elif freq is None:
+ freq = inferred_freq
+ freq_infer = False
+
+ return freq, freq_infer
+
+
+def maybe_infer_freq(freq):
+ """
+ Comparing a DateOffset to the string "infer" raises, so we need to
+ be careful about comparisons. Make a dummy variable `freq_infer` to
+ signify the case where the given freq is "infer" and set freq to None
+ to avoid comparison trouble later on.
+
+ Parameters
+ ----------
+ freq : {DateOffset, None, str}
+
+ Returns
+ -------
+ freq : {DateOffset, None}
+ freq_infer : bool
+ """
+ freq_infer = False
+ if not isinstance(freq, DateOffset):
+ # if a passed freq is None, don't infer automatically
+ if freq != 'infer':
+ freq = frequencies.to_offset(freq)
+ else:
+ freq_infer = True
+ freq = None
+ return freq, freq_infer
+
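+# Illustrative sketch (not part of the vendored sources) of the three cases
+# handled by maybe_infer_freq:
+#
+#   maybe_infer_freq('D')       -> (<Day offset>, False)
+#   maybe_infer_freq('infer')   -> (None, True)
+#   maybe_infer_freq(None)      -> (None, False)
+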
+
+def _ensure_datetimelike_to_i8(other, to_utc=False):
+ """
+ Helper for coercing an input scalar or array to i8.
+
+ Parameters
+ ----------
+ other : 1d array
+ to_utc : bool, default False
+ If True, convert the values to UTC before extracting the i8 values
+ If False, extract the i8 values directly.
+
+ Returns
+ -------
+ i8 1d array
+ """
+ from pandas import Index
+ from pandas.core.arrays import PeriodArray
+
+ if lib.is_scalar(other) and isna(other):
+ return iNaT
+ elif isinstance(other, (PeriodArray, ABCIndexClass,
+ DatetimeLikeArrayMixin)):
+ # convert tz if needed
+ if getattr(other, 'tz', None) is not None:
+ if to_utc:
+ other = other.tz_convert('UTC')
+ else:
+ other = other.tz_localize(None)
+ else:
+ try:
+ return np.array(other, copy=False).view('i8')
+ except TypeError:
+ # period array cannot be coerced to int
+ other = Index(other)
+ return other.asi8
diff --git a/contrib/python/pandas/py2/pandas/core/arrays/datetimes.py b/contrib/python/pandas/py2/pandas/core/arrays/datetimes.py
new file mode 100644
index 00000000000..69cb787e0b8
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/core/arrays/datetimes.py
@@ -0,0 +1,2152 @@
+# -*- coding: utf-8 -*-
+from datetime import datetime, time, timedelta
+import textwrap
+import warnings
+
+import numpy as np
+from pytz import utc
+
+from pandas._libs import lib, tslib
+from pandas._libs.tslibs import (
+ NaT, Timestamp, ccalendar, conversion, fields, iNaT, normalize_date,
+ resolution as libresolution, timezones)
+import pandas.compat as compat
+from pandas.errors import PerformanceWarning
+from pandas.util._decorators import Appender
+
+from pandas.core.dtypes.common import (
+ _INT64_DTYPE, _NS_DTYPE, is_categorical_dtype, is_datetime64_dtype,
+ is_datetime64_ns_dtype, is_datetime64tz_dtype, is_dtype_equal,
+ is_extension_type, is_float_dtype, is_object_dtype, is_period_dtype,
+ is_string_dtype, is_timedelta64_dtype, pandas_dtype)
+from pandas.core.dtypes.dtypes import DatetimeTZDtype
+from pandas.core.dtypes.generic import (
+ ABCDataFrame, ABCIndexClass, ABCPandasArray, ABCSeries)
+from pandas.core.dtypes.missing import isna
+
+from pandas.core import ops
+from pandas.core.algorithms import checked_add_with_arr
+from pandas.core.arrays import datetimelike as dtl
+from pandas.core.arrays._ranges import generate_regular_range
+import pandas.core.common as com
+
+from pandas.tseries.frequencies import get_period_alias, to_offset
+from pandas.tseries.offsets import Day, Tick
+
+_midnight = time(0, 0)
+# TODO(GH-24559): Remove warning, int_as_wall_time parameter.
+_i8_message = """
+ Passing integer-dtype data and a timezone to DatetimeIndex. Integer values
+ will be interpreted differently in a future version of pandas. Previously,
+ these were viewed as datetime64[ns] values representing the wall time
+ *in the specified timezone*. In the future, these will be viewed as
+ datetime64[ns] values representing the wall time *in UTC*. This is similar
+ to a nanosecond-precision UNIX epoch. To accept the future behavior, use
+
+ pd.to_datetime(integer_data, utc=True).tz_convert(tz)
+
+ To keep the previous behavior, use
+
+ pd.to_datetime(integer_data).tz_localize(tz)
+"""
+
+
+def tz_to_dtype(tz):
+ """
+ Return a datetime64[ns] dtype appropriate for the given timezone.
+
+ Parameters
+ ----------
+ tz : tzinfo or None
+
+ Returns
+ -------
+    np.dtype or DatetimeTZDtype
+ """
+ if tz is None:
+ return _NS_DTYPE
+ else:
+ return DatetimeTZDtype(tz=tz)
+
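+# Illustrative sketch (not part of the vendored sources):
+#
+#   tz_to_dtype(None)       -> np.dtype('datetime64[ns]')
+#   tz_to_dtype(pytz.utc)   -> DatetimeTZDtype(tz=pytz.utc), i.e.
+#                              the 'datetime64[ns, UTC]' extension dtype
+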
+
+def _to_M8(key, tz=None):
+ """
+ Timestamp-like => dt64
+ """
+ if not isinstance(key, Timestamp):
+ # this also converts strings
+ key = Timestamp(key)
+ if key.tzinfo is not None and tz is not None:
+ # Don't tz_localize(None) if key is already tz-aware
+ key = key.tz_convert(tz)
+ else:
+ key = key.tz_localize(tz)
+
+ return np.int64(conversion.pydt_to_i8(key)).view(_NS_DTYPE)
+
+
+def _field_accessor(name, field, docstring=None):
+ def f(self):
+ values = self.asi8
+ if self.tz is not None and not timezones.is_utc(self.tz):
+ values = self._local_timestamps()
+
+ if field in self._bool_ops:
+ if field.endswith(('start', 'end')):
+ freq = self.freq
+ month_kw = 12
+ if freq:
+ kwds = freq.kwds
+ month_kw = kwds.get('startingMonth', kwds.get('month', 12))
+
+ result = fields.get_start_end_field(values, field,
+ self.freqstr, month_kw)
+ else:
+ result = fields.get_date_field(values, field)
+
+            # these return a boolean by definition
+ return result
+
+ if field in self._object_ops:
+ result = fields.get_date_name_field(values, field)
+ result = self._maybe_mask_results(result, fill_value=None)
+
+ else:
+ result = fields.get_date_field(values, field)
+ result = self._maybe_mask_results(result, fill_value=None,
+ convert='float64')
+
+ return result
+
+ f.__name__ = name
+ f.__doc__ = "\n{}\n".format(docstring)
+ return property(f)
+
+
+def _dt_array_cmp(cls, op):
+ """
+ Wrap comparison operations to convert datetime-like to datetime64
+ """
+ opname = '__{name}__'.format(name=op.__name__)
+ nat_result = True if opname == '__ne__' else False
+
+ def wrapper(self, other):
+ if isinstance(other, (ABCDataFrame, ABCSeries, ABCIndexClass)):
+ return NotImplemented
+
+ other = lib.item_from_zerodim(other)
+
+ if isinstance(other, (datetime, np.datetime64, compat.string_types)):
+ if isinstance(other, (datetime, np.datetime64)):
+ # GH#18435 strings get a pass from tzawareness compat
+ self._assert_tzawareness_compat(other)
+
+ try:
+ other = _to_M8(other, tz=self.tz)
+ except ValueError:
+ # string that cannot be parsed to Timestamp
+ return ops.invalid_comparison(self, other, op)
+
+ result = op(self.asi8, other.view('i8'))
+ if isna(other):
+ result.fill(nat_result)
+ elif lib.is_scalar(other) or np.ndim(other) == 0:
+ return ops.invalid_comparison(self, other, op)
+ elif len(other) != len(self):
+ raise ValueError("Lengths must match")
+ else:
+ if isinstance(other, list):
+ try:
+ other = type(self)._from_sequence(other)
+ except ValueError:
+ other = np.array(other, dtype=np.object_)
+ elif not isinstance(other, (np.ndarray, ABCIndexClass, ABCSeries,
+ DatetimeArray)):
+ # Following Timestamp convention, __eq__ is all-False
+ # and __ne__ is all True, others raise TypeError.
+ return ops.invalid_comparison(self, other, op)
+
+ if is_object_dtype(other):
+ # We have to use _comp_method_OBJECT_ARRAY instead of numpy
+ # comparison otherwise it would fail to raise when
+ # comparing tz-aware and tz-naive
+ with np.errstate(all='ignore'):
+ result = ops._comp_method_OBJECT_ARRAY(op,
+ self.astype(object),
+ other)
+ o_mask = isna(other)
+ elif not (is_datetime64_dtype(other) or
+ is_datetime64tz_dtype(other)):
+ # e.g. is_timedelta64_dtype(other)
+ return ops.invalid_comparison(self, other, op)
+ else:
+ self._assert_tzawareness_compat(other)
+ if isinstance(other, (ABCIndexClass, ABCSeries)):
+ other = other.array
+
+ if (is_datetime64_dtype(other) and
+ not is_datetime64_ns_dtype(other) or
+ not hasattr(other, 'asi8')):
+ # e.g. other.dtype == 'datetime64[s]'
+ # or an object-dtype ndarray
+ other = type(self)._from_sequence(other)
+
+ result = op(self.view('i8'), other.view('i8'))
+ o_mask = other._isnan
+
+ result = com.values_from_object(result)
+
+ # Make sure to pass an array to result[...]; indexing with
+ # Series breaks with older version of numpy
+ o_mask = np.array(o_mask)
+ if o_mask.any():
+ result[o_mask] = nat_result
+
+ if self._hasnans:
+ result[self._isnan] = nat_result
+
+ return result
+
+ return compat.set_function_name(wrapper, opname, cls)
+
+
+class DatetimeArray(dtl.DatetimeLikeArrayMixin,
+ dtl.TimelikeOps,
+ dtl.DatelikeOps):
+ """
+ Pandas ExtensionArray for tz-naive or tz-aware datetime data.
+
+ .. versionadded:: 0.24.0
+
+ .. warning::
+
+ DatetimeArray is currently experimental, and its API may change
+ without warning. In particular, :attr:`DatetimeArray.dtype` is
+ expected to change to always be an instance of an ``ExtensionDtype``
+ subclass.
+
+ Parameters
+ ----------
+ values : Series, Index, DatetimeArray, ndarray
+ The datetime data.
+
+ For DatetimeArray `values` (or a Series or Index boxing one),
+ `dtype` and `freq` will be extracted from `values`, with
+ precedence given to
+
+ dtype : numpy.dtype or DatetimeTZDtype
+ Note that the only NumPy dtype allowed is 'datetime64[ns]'.
+ freq : str or Offset, optional
+ copy : bool, default False
+ Whether to copy the underlying array of values.
+ """
+ _typ = "datetimearray"
+ _scalar_type = Timestamp
+
+ # define my properties & methods for delegation
+ _bool_ops = ['is_month_start', 'is_month_end',
+ 'is_quarter_start', 'is_quarter_end', 'is_year_start',
+ 'is_year_end', 'is_leap_year']
+ _object_ops = ['weekday_name', 'freq', 'tz']
+ _field_ops = ['year', 'month', 'day', 'hour', 'minute', 'second',
+ 'weekofyear', 'week', 'weekday', 'dayofweek',
+ 'dayofyear', 'quarter', 'days_in_month',
+ 'daysinmonth', 'microsecond',
+ 'nanosecond']
+ _other_ops = ['date', 'time', 'timetz']
+ _datetimelike_ops = _field_ops + _object_ops + _bool_ops + _other_ops
+ _datetimelike_methods = ['to_period', 'tz_localize',
+ 'tz_convert',
+ 'normalize', 'strftime', 'round', 'floor',
+ 'ceil', 'month_name', 'day_name']
+
+ # dummy attribute so that datetime.__eq__(DatetimeArray) defers
+ # by returning NotImplemented
+ timetuple = None
+
+    # Needed so that Timestamp.__richcmp__(DatetimeArray) operates pointwise
+ ndim = 1
+
+ # ensure that operations with numpy arrays defer to our implementation
+ __array_priority__ = 1000
+
+ # -----------------------------------------------------------------
+ # Constructors
+
+ _attributes = ["freq", "tz"]
+ _dtype = None # type: Union[np.dtype, DatetimeTZDtype]
+ _freq = None
+
+ def __init__(self, values, dtype=_NS_DTYPE, freq=None, copy=False):
+ if isinstance(values, (ABCSeries, ABCIndexClass)):
+ values = values._values
+
+ inferred_freq = getattr(values, "_freq", None)
+
+ if isinstance(values, type(self)):
+ # validation
+ dtz = getattr(dtype, 'tz', None)
+ if dtz and values.tz is None:
+ dtype = DatetimeTZDtype(tz=dtype.tz)
+ elif dtz and values.tz:
+ if not timezones.tz_compare(dtz, values.tz):
+ msg = (
+ "Timezone of the array and 'dtype' do not match. "
+ "'{}' != '{}'"
+ )
+ raise TypeError(msg.format(dtz, values.tz))
+ elif values.tz:
+ dtype = values.dtype
+ # freq = validate_values_freq(values, freq)
+ if freq is None:
+ freq = values.freq
+ values = values._data
+
+ if not isinstance(values, np.ndarray):
+ msg = (
+ "Unexpected type '{}'. 'values' must be a DatetimeArray "
+ "ndarray, or Series or Index containing one of those."
+ )
+ raise ValueError(msg.format(type(values).__name__))
+
+ if values.dtype == 'i8':
+ # for compat with datetime/timedelta/period shared methods,
+ # we can sometimes get here with int64 values. These represent
+ # nanosecond UTC (or tz-naive) unix timestamps
+ values = values.view(_NS_DTYPE)
+
+ if values.dtype != _NS_DTYPE:
+ msg = (
+ "The dtype of 'values' is incorrect. Must be 'datetime64[ns]'."
+ " Got {} instead."
+ )
+ raise ValueError(msg.format(values.dtype))
+
+ dtype = _validate_dt64_dtype(dtype)
+
+ if freq == "infer":
+ msg = (
+ "Frequency inference not allowed in DatetimeArray.__init__. "
+ "Use 'pd.array()' instead."
+ )
+ raise ValueError(msg)
+
+ if copy:
+ values = values.copy()
+ if freq:
+ freq = to_offset(freq)
+ if getattr(dtype, 'tz', None):
+ # https://github.com/pandas-dev/pandas/issues/18595
+ # Ensure that we have a standard timezone for pytz objects.
+ # Without this, things like adding an array of timedeltas and
+ # a tz-aware Timestamp (with a tz specific to its datetime) will
+ # be incorrect(ish?) for the array as a whole
+ dtype = DatetimeTZDtype(tz=timezones.tz_standardize(dtype.tz))
+
+ self._data = values
+ self._dtype = dtype
+ self._freq = freq
+
+ if inferred_freq is None and freq is not None:
+ type(self)._validate_frequency(self, freq)
+
+ @classmethod
+ def _simple_new(cls, values, freq=None, dtype=_NS_DTYPE):
+ assert isinstance(values, np.ndarray)
+ if values.dtype == 'i8':
+ values = values.view(_NS_DTYPE)
+
+ result = object.__new__(cls)
+ result._data = values
+ result._freq = freq
+ result._dtype = dtype
+ return result
+
+ @classmethod
+ def _from_sequence(cls, data, dtype=None, copy=False,
+ tz=None, freq=None,
+ dayfirst=False, yearfirst=False, ambiguous='raise',
+ int_as_wall_time=False):
+
+ freq, freq_infer = dtl.maybe_infer_freq(freq)
+
+ subarr, tz, inferred_freq = sequence_to_dt64ns(
+ data, dtype=dtype, copy=copy, tz=tz,
+ dayfirst=dayfirst, yearfirst=yearfirst,
+ ambiguous=ambiguous, int_as_wall_time=int_as_wall_time)
+
+ freq, freq_infer = dtl.validate_inferred_freq(freq, inferred_freq,
+ freq_infer)
+
+ dtype = tz_to_dtype(tz)
+ result = cls._simple_new(subarr, freq=freq, dtype=dtype)
+
+ if inferred_freq is None and freq is not None:
+ # this condition precludes `freq_infer`
+ cls._validate_frequency(result, freq, ambiguous=ambiguous)
+
+ elif freq_infer:
+ # Set _freq directly to bypass duplicative _validate_frequency
+ # check.
+ result._freq = to_offset(result.inferred_freq)
+
+ return result
+
+ @classmethod
+ def _generate_range(cls, start, end, periods, freq, tz=None,
+ normalize=False, ambiguous='raise',
+ nonexistent='raise', closed=None):
+
+ periods = dtl.validate_periods(periods)
+ if freq is None and any(x is None for x in [periods, start, end]):
+ raise ValueError('Must provide freq argument if no data is '
+ 'supplied')
+
+ if com.count_not_none(start, end, periods, freq) != 3:
+ raise ValueError('Of the four parameters: start, end, periods, '
+ 'and freq, exactly three must be specified')
+ freq = to_offset(freq)
+
+ if start is not None:
+ start = Timestamp(start)
+
+ if end is not None:
+ end = Timestamp(end)
+
+ if start is None and end is None:
+ if closed is not None:
+ raise ValueError("Closed has to be None if not both of start"
+ "and end are defined")
+ if start is NaT or end is NaT:
+ raise ValueError("Neither `start` nor `end` can be NaT")
+
+ left_closed, right_closed = dtl.validate_endpoints(closed)
+
+ start, end, _normalized = _maybe_normalize_endpoints(start, end,
+ normalize)
+
+ tz = _infer_tz_from_endpoints(start, end, tz)
+
+ if tz is not None:
+ # Localize the start and end arguments
+ start = _maybe_localize_point(
+ start, getattr(start, 'tz', None), start, freq, tz
+ )
+ end = _maybe_localize_point(
+ end, getattr(end, 'tz', None), end, freq, tz
+ )
+ if freq is not None:
+ # We break Day arithmetic (fixed 24 hour) here and opt for
+ # Day to mean calendar day (23/24/25 hour). Therefore, strip
+            # tz info from start and end to avoid DST arithmetic
+ if isinstance(freq, Day):
+ if start is not None:
+ start = start.tz_localize(None)
+ if end is not None:
+ end = end.tz_localize(None)
+ # TODO: consider re-implementing _cached_range; GH#17914
+ values, _tz = generate_regular_range(start, end, periods, freq)
+ index = cls._simple_new(values, freq=freq, dtype=tz_to_dtype(_tz))
+
+ if tz is not None and index.tz is None:
+ arr = conversion.tz_localize_to_utc(
+ index.asi8,
+ tz, ambiguous=ambiguous, nonexistent=nonexistent)
+
+ index = cls(arr)
+
+ # index is localized datetime64 array -> have to convert
+ # start/end as well to compare
+ if start is not None:
+ start = start.tz_localize(tz).asm8
+ if end is not None:
+ end = end.tz_localize(tz).asm8
+ else:
+ # Create a linearly spaced date_range in local time
+ # Nanosecond-granularity timestamps aren't always correctly
+ # representable with doubles, so we limit the range that we
+ # pass to np.linspace as much as possible
+ arr = np.linspace(
+ 0, end.value - start.value,
+ periods, dtype='int64') + start.value
+ dtype = tz_to_dtype(tz)
+ index = cls._simple_new(
+ arr.astype('M8[ns]', copy=False), freq=None, dtype=dtype
+ )
+
+ if not left_closed and len(index) and index[0] == start:
+ index = index[1:]
+ if not right_closed and len(index) and index[-1] == end:
+ index = index[:-1]
+
+ dtype = tz_to_dtype(tz)
+ return cls._simple_new(index.asi8, freq=freq, dtype=dtype)
+
+ # -----------------------------------------------------------------
+ # DatetimeLike Interface
+
+ def _unbox_scalar(self, value):
+ if not isinstance(value, self._scalar_type) and value is not NaT:
+ raise ValueError("'value' should be a Timestamp.")
+ if not isna(value):
+ self._check_compatible_with(value)
+ return value.value
+
+ def _scalar_from_string(self, value):
+ return Timestamp(value, tz=self.tz)
+
+ def _check_compatible_with(self, other):
+ if other is NaT:
+ return
+ if not timezones.tz_compare(self.tz, other.tz):
+ raise ValueError("Timezones don't match. '{own} != {other}'"
+ .format(own=self.tz, other=other.tz))
+
+ def _maybe_clear_freq(self):
+ self._freq = None
+
+ # -----------------------------------------------------------------
+ # Descriptive Properties
+
+ @property
+ def _box_func(self):
+ return lambda x: Timestamp(x, freq=self.freq, tz=self.tz)
+
+ @property
+ def dtype(self):
+ # type: () -> Union[np.dtype, DatetimeTZDtype]
+ """
+ The dtype for the DatetimeArray.
+
+ .. warning::
+
+ A future version of pandas will change dtype to never be a
+ ``numpy.dtype``. Instead, :attr:`DatetimeArray.dtype` will
+ always be an instance of an ``ExtensionDtype`` subclass.
+
+ Returns
+ -------
+ numpy.dtype or DatetimeTZDtype
+ If the values are tz-naive, then ``np.dtype('datetime64[ns]')``
+ is returned.
+
+ If the values are tz-aware, then the ``DatetimeTZDtype``
+ is returned.
+ """
+ return self._dtype
+
+ @property
+ def tz(self):
+ """
+ Return timezone, if any.
+
+ Returns
+ -------
+ datetime.tzinfo, pytz.tzinfo.BaseTZInfo, dateutil.tz.tz.tzfile, or None
+ Returns None when the array is tz-naive.
+ """
+ # GH 18595
+ return getattr(self.dtype, "tz", None)
+
+ @tz.setter
+ def tz(self, value):
+ # GH 3746: Prevent localizing or converting the index by setting tz
+ raise AttributeError("Cannot directly set timezone. Use tz_localize() "
+ "or tz_convert() as appropriate")
+
+ @property
+ def tzinfo(self):
+ """
+ Alias for tz attribute
+ """
+ return self.tz
+
+ @property # NB: override with cache_readonly in immutable subclasses
+ def _timezone(self):
+ """
+        Comparable timezone for both pytz and dateutil
+ """
+ return timezones.get_timezone(self.tzinfo)
+
+ @property # NB: override with cache_readonly in immutable subclasses
+ def is_normalized(self):
+ """
+ Returns True if all of the dates are at midnight ("no time")
+ """
+ return conversion.is_date_array_normalized(self.asi8, self.tz)
+
+ @property # NB: override with cache_readonly in immutable subclasses
+ def _resolution(self):
+ return libresolution.resolution(self.asi8, self.tz)
+
+ # ----------------------------------------------------------------
+ # Array-Like / EA-Interface Methods
+
+ def __array__(self, dtype=None):
+ if dtype is None and self.tz:
+ # The default for tz-aware is object, to preserve tz info
+ dtype = object
+
+ return super(DatetimeArray, self).__array__(dtype=dtype)
+
+ def __iter__(self):
+ """
+ Return an iterator over the boxed values
+
+ Yields
+ -------
+ tstamp : Timestamp
+ """
+
+ # convert in chunks of 10k for efficiency
+ data = self.asi8
+ length = len(self)
+ chunksize = 10000
+ chunks = int(length / chunksize) + 1
+ for i in range(chunks):
+ start_i = i * chunksize
+ end_i = min((i + 1) * chunksize, length)
+ converted = tslib.ints_to_pydatetime(data[start_i:end_i],
+ tz=self.tz, freq=self.freq,
+ box="timestamp")
+ for v in converted:
+ yield v
+
+ def astype(self, dtype, copy=True):
+ # We handle
+ # --> datetime
+ # --> period
+ # DatetimeLikeArrayMixin Super handles the rest.
+ dtype = pandas_dtype(dtype)
+
+ if (is_datetime64_ns_dtype(dtype) and
+ not is_dtype_equal(dtype, self.dtype)):
+ # GH#18951: datetime64_ns dtype but not equal means different tz
+ new_tz = getattr(dtype, 'tz', None)
+ if getattr(self.dtype, 'tz', None) is None:
+ return self.tz_localize(new_tz)
+ result = self.tz_convert(new_tz)
+ if new_tz is None:
+ # Do we want .astype('datetime64[ns]') to be an ndarray.
+ # The astype in Block._astype expects this to return an
+ # ndarray, but we could maybe work around it there.
+ result = result._data
+ return result
+ elif is_datetime64tz_dtype(self.dtype) and is_dtype_equal(self.dtype,
+ dtype):
+ if copy:
+ return self.copy()
+ return self
+ elif is_period_dtype(dtype):
+ return self.to_period(freq=dtype.freq)
+ return dtl.DatetimeLikeArrayMixin.astype(self, dtype, copy)
+
+ # ----------------------------------------------------------------
+ # ExtensionArray Interface
+
+ @Appender(dtl.DatetimeLikeArrayMixin._validate_fill_value.__doc__)
+ def _validate_fill_value(self, fill_value):
+ if isna(fill_value):
+ fill_value = iNaT
+ elif isinstance(fill_value, (datetime, np.datetime64)):
+ self._assert_tzawareness_compat(fill_value)
+ fill_value = Timestamp(fill_value).value
+ else:
+ raise ValueError("'fill_value' should be a Timestamp. "
+ "Got '{got}'.".format(got=fill_value))
+ return fill_value
+
+ # -----------------------------------------------------------------
+ # Rendering Methods
+
+ def _format_native_types(self, na_rep='NaT', date_format=None, **kwargs):
+ from pandas.io.formats.format import _get_format_datetime64_from_values
+ fmt = _get_format_datetime64_from_values(self, date_format)
+
+ return tslib.format_array_from_datetime(self.asi8,
+ tz=self.tz,
+ format=fmt,
+ na_rep=na_rep)
+
+ # -----------------------------------------------------------------
+ # Comparison Methods
+
+ _create_comparison_method = classmethod(_dt_array_cmp)
+
+ def _has_same_tz(self, other):
+ zzone = self._timezone
+
+        # vzone shouldn't be None if value is non-datetime-like
+ if isinstance(other, np.datetime64):
+ # convert to Timestamp as np.datetime64 doesn't have tz attr
+ other = Timestamp(other)
+ vzone = timezones.get_timezone(getattr(other, 'tzinfo', '__no_tz__'))
+ return zzone == vzone
+
+ def _assert_tzawareness_compat(self, other):
+ # adapted from _Timestamp._assert_tzawareness_compat
+ other_tz = getattr(other, 'tzinfo', None)
+ if is_datetime64tz_dtype(other):
+ # Get tzinfo from Series dtype
+ other_tz = other.dtype.tz
+ if other is NaT:
+ # pd.NaT quacks both aware and naive
+ pass
+ elif self.tz is None:
+ if other_tz is not None:
+ raise TypeError('Cannot compare tz-naive and tz-aware '
+ 'datetime-like objects.')
+ elif other_tz is None:
+ raise TypeError('Cannot compare tz-naive and tz-aware '
+ 'datetime-like objects')
+
+ # -----------------------------------------------------------------
+ # Arithmetic Methods
+
+ def _sub_datetime_arraylike(self, other):
+ """subtract DatetimeArray/Index or ndarray[datetime64]"""
+ if len(self) != len(other):
+ raise ValueError("cannot add indices of unequal length")
+
+ if isinstance(other, np.ndarray):
+ assert is_datetime64_dtype(other)
+ other = type(self)(other)
+
+ if not self._has_same_tz(other):
+ # require tz compat
+ raise TypeError("{cls} subtraction must have the same "
+ "timezones or no timezones"
+ .format(cls=type(self).__name__))
+
+ self_i8 = self.asi8
+ other_i8 = other.asi8
+ arr_mask = self._isnan | other._isnan
+ new_values = checked_add_with_arr(self_i8, -other_i8,
+ arr_mask=arr_mask)
+ if self._hasnans or other._hasnans:
+ new_values[arr_mask] = iNaT
+ return new_values.view('timedelta64[ns]')
+
+ def _add_offset(self, offset):
+ assert not isinstance(offset, Tick)
+ try:
+ if self.tz is not None:
+ values = self.tz_localize(None)
+ else:
+ values = self
+ result = offset.apply_index(values)
+ if self.tz is not None:
+ result = result.tz_localize(self.tz)
+
+ except NotImplementedError:
+ warnings.warn("Non-vectorized DateOffset being applied to Series "
+ "or DatetimeIndex", PerformanceWarning)
+ result = self.astype('O') + offset
+
+ return type(self)._from_sequence(result, freq='infer')
+
+ def _sub_datetimelike_scalar(self, other):
+ # subtract a datetime from myself, yielding a ndarray[timedelta64[ns]]
+ assert isinstance(other, (datetime, np.datetime64))
+ assert other is not NaT
+ other = Timestamp(other)
+ if other is NaT:
+ return self - NaT
+
+ if not self._has_same_tz(other):
+ # require tz compat
+ raise TypeError("Timestamp subtraction must have the same "
+ "timezones or no timezones")
+
+ i8 = self.asi8
+ result = checked_add_with_arr(i8, -other.value,
+ arr_mask=self._isnan)
+ result = self._maybe_mask_results(result)
+ return result.view('timedelta64[ns]')
+
+ def _add_delta(self, delta):
+ """
+ Add a timedelta-like, Tick, or TimedeltaIndex-like object
+ to self, yielding a new DatetimeArray
+
+ Parameters
+ ----------
+        delta : {timedelta, np.timedelta64, Tick,
+ TimedeltaIndex, ndarray[timedelta64]}
+
+ Returns
+ -------
+ result : DatetimeArray
+ """
+ new_values = super(DatetimeArray, self)._add_delta(delta)
+ return type(self)._from_sequence(new_values, tz=self.tz, freq='infer')
+
+ # -----------------------------------------------------------------
+ # Timezone Conversion and Localization Methods
+
+ def _local_timestamps(self):
+ """
+ Convert to an i8 (unix-like nanosecond timestamp) representation
+ while keeping the local timezone and not using UTC.
+ This is used to calculate time-of-day information as if the timestamps
+ were timezone-naive.
+ """
+ return conversion.tz_convert(self.asi8, utc, self.tz)
+
+ def tz_convert(self, tz):
+ """
+ Convert tz-aware Datetime Array/Index from one time zone to another.
+
+ Parameters
+ ----------
+ tz : string, pytz.timezone, dateutil.tz.tzfile or None
+ Time zone for time. Corresponding timestamps would be converted
+ to this time zone of the Datetime Array/Index. A `tz` of None will
+ convert to UTC and remove the timezone information.
+
+ Returns
+ -------
+ normalized : same type as self
+
+ Raises
+ ------
+ TypeError
+ If Datetime Array/Index is tz-naive.
+
+ See Also
+ --------
+ DatetimeIndex.tz : A timezone that has a variable offset from UTC.
+ DatetimeIndex.tz_localize : Localize tz-naive DatetimeIndex to a
+ given time zone, or remove timezone from a tz-aware DatetimeIndex.
+
+ Examples
+ --------
+ With the `tz` parameter, we can change the DatetimeIndex
+ to other time zones:
+
+ >>> dti = pd.date_range(start='2014-08-01 09:00',
+ ... freq='H', periods=3, tz='Europe/Berlin')
+
+ >>> dti
+ DatetimeIndex(['2014-08-01 09:00:00+02:00',
+ '2014-08-01 10:00:00+02:00',
+ '2014-08-01 11:00:00+02:00'],
+ dtype='datetime64[ns, Europe/Berlin]', freq='H')
+
+ >>> dti.tz_convert('US/Central')
+ DatetimeIndex(['2014-08-01 02:00:00-05:00',
+ '2014-08-01 03:00:00-05:00',
+ '2014-08-01 04:00:00-05:00'],
+ dtype='datetime64[ns, US/Central]', freq='H')
+
+        With ``tz=None``, we can remove the timezone (after converting
+ to UTC if necessary):
+
+ >>> dti = pd.date_range(start='2014-08-01 09:00',freq='H',
+ ... periods=3, tz='Europe/Berlin')
+
+ >>> dti
+ DatetimeIndex(['2014-08-01 09:00:00+02:00',
+ '2014-08-01 10:00:00+02:00',
+ '2014-08-01 11:00:00+02:00'],
+ dtype='datetime64[ns, Europe/Berlin]', freq='H')
+
+ >>> dti.tz_convert(None)
+ DatetimeIndex(['2014-08-01 07:00:00',
+ '2014-08-01 08:00:00',
+ '2014-08-01 09:00:00'],
+ dtype='datetime64[ns]', freq='H')
+ """
+ tz = timezones.maybe_get_tz(tz)
+
+ if self.tz is None:
+ # tz naive, use tz_localize
+ raise TypeError('Cannot convert tz-naive timestamps, use '
+ 'tz_localize to localize')
+
+ # No conversion since timestamps are all UTC to begin with
+ dtype = tz_to_dtype(tz)
+ return self._simple_new(self.asi8, dtype=dtype, freq=self.freq)
+
+ def tz_localize(self, tz, ambiguous='raise', nonexistent='raise',
+ errors=None):
+ """
+ Localize tz-naive Datetime Array/Index to tz-aware
+ Datetime Array/Index.
+
+ This method takes a time zone (tz) naive Datetime Array/Index object
+ and makes this time zone aware. It does not move the time to another
+ time zone.
+        Time zone localization helps to switch from time zone unaware to
+        time zone aware objects; passing ``tz=None`` does the inverse.
+
+ Parameters
+ ----------
+ tz : string, pytz.timezone, dateutil.tz.tzfile or None
+ Time zone to convert timestamps to. Passing ``None`` will
+ remove the time zone information preserving local time.
+ ambiguous : 'infer', 'NaT', bool array, default 'raise'
+ When clocks moved backward due to DST, ambiguous times may arise.
+ For example in Central European Time (UTC+01), when going from
+ 03:00 DST to 02:00 non-DST, 02:30:00 local time occurs both at
+ 00:30:00 UTC and at 01:30:00 UTC. In such a situation, the
+ `ambiguous` parameter dictates how ambiguous times should be
+ handled.
+
+ - 'infer' will attempt to infer fall dst-transition hours based on
+ order
+ - bool-ndarray where True signifies a DST time, False signifies a
+ non-DST time (note that this flag is only applicable for
+ ambiguous times)
+ - 'NaT' will return NaT where there are ambiguous times
+ - 'raise' will raise an AmbiguousTimeError if there are ambiguous
+ times
+
+        nonexistent : 'shift_forward', 'shift_backward', 'NaT', timedelta,
+ default 'raise'
+ A nonexistent time does not exist in a particular timezone
+ where clocks moved forward due to DST.
+
+ - 'shift_forward' will shift the nonexistent time forward to the
+ closest existing time
+ - 'shift_backward' will shift the nonexistent time backward to the
+ closest existing time
+ - 'NaT' will return NaT where there are nonexistent times
+ - timedelta objects will shift nonexistent times by the timedelta
+ - 'raise' will raise an NonExistentTimeError if there are
+ nonexistent times
+
+ .. versionadded:: 0.24.0
+
+ errors : {'raise', 'coerce'}, default None
+
+ - 'raise' will raise a NonExistentTimeError if a timestamp is not
+ valid in the specified time zone (e.g. due to a transition from
+ or to DST time). Use ``nonexistent='raise'`` instead.
+ - 'coerce' will return NaT if the timestamp can not be converted
+ to the specified time zone. Use ``nonexistent='NaT'`` instead.
+
+ .. deprecated:: 0.24.0
+
+ Returns
+ -------
+ result : same type as self
+ Array/Index converted to the specified time zone.
+
+ Raises
+ ------
+ TypeError
+ If the Datetime Array/Index is tz-aware and tz is not None.
+
+ See Also
+ --------
+ DatetimeIndex.tz_convert : Convert tz-aware DatetimeIndex from
+ one time zone to another.
+
+ Examples
+ --------
+ >>> tz_naive = pd.date_range('2018-03-01 09:00', periods=3)
+ >>> tz_naive
+ DatetimeIndex(['2018-03-01 09:00:00', '2018-03-02 09:00:00',
+ '2018-03-03 09:00:00'],
+ dtype='datetime64[ns]', freq='D')
+
+ Localize DatetimeIndex in US/Eastern time zone:
+
+ >>> tz_aware = tz_naive.tz_localize(tz='US/Eastern')
+ >>> tz_aware
+ DatetimeIndex(['2018-03-01 09:00:00-05:00',
+ '2018-03-02 09:00:00-05:00',
+ '2018-03-03 09:00:00-05:00'],
+ dtype='datetime64[ns, US/Eastern]', freq='D')
+
+        With ``tz=None``, we can remove the time zone information
+ while keeping the local time (not converted to UTC):
+
+ >>> tz_aware.tz_localize(None)
+ DatetimeIndex(['2018-03-01 09:00:00', '2018-03-02 09:00:00',
+ '2018-03-03 09:00:00'],
+ dtype='datetime64[ns]', freq='D')
+
+ Be careful with DST changes. When there is sequential data, pandas can
+        infer the DST time:
+
+ >>> s = pd.to_datetime(pd.Series([
+ ... '2018-10-28 01:30:00',
+ ... '2018-10-28 02:00:00',
+ ... '2018-10-28 02:30:00',
+ ... '2018-10-28 02:00:00',
+ ... '2018-10-28 02:30:00',
+ ... '2018-10-28 03:00:00',
+ ... '2018-10-28 03:30:00']))
+ >>> s.dt.tz_localize('CET', ambiguous='infer')
+ 2018-10-28 01:30:00+02:00 0
+ 2018-10-28 02:00:00+02:00 1
+ 2018-10-28 02:30:00+02:00 2
+ 2018-10-28 02:00:00+01:00 3
+ 2018-10-28 02:30:00+01:00 4
+ 2018-10-28 03:00:00+01:00 5
+ 2018-10-28 03:30:00+01:00 6
+ dtype: int64
+
+ In some cases, inferring the DST is impossible. In such cases, you can
+ pass an ndarray to the ambiguous parameter to set the DST explicitly
+
+ >>> s = pd.to_datetime(pd.Series([
+ ... '2018-10-28 01:20:00',
+ ... '2018-10-28 02:36:00',
+ ... '2018-10-28 03:46:00']))
+ >>> s.dt.tz_localize('CET', ambiguous=np.array([True, True, False]))
+ 0 2018-10-28 01:20:00+02:00
+ 1 2018-10-28 02:36:00+02:00
+ 2 2018-10-28 03:46:00+01:00
+ dtype: datetime64[ns, CET]
+
+ If the DST transition causes nonexistent times, you can shift these
+        dates forward or backward with a timedelta object or `'shift_forward'`
+        or `'shift_backward'`.
+
+ >>> s = pd.to_datetime(pd.Series([
+ ... '2015-03-29 02:30:00',
+ ... '2015-03-29 03:30:00']))
+ >>> s.dt.tz_localize('Europe/Warsaw', nonexistent='shift_forward')
+ 0 2015-03-29 03:00:00+02:00
+ 1 2015-03-29 03:30:00+02:00
+ dtype: datetime64[ns, 'Europe/Warsaw']
+ >>> s.dt.tz_localize('Europe/Warsaw', nonexistent='shift_backward')
+ 0 2015-03-29 01:59:59.999999999+01:00
+ 1 2015-03-29 03:30:00+02:00
+ dtype: datetime64[ns, 'Europe/Warsaw']
+ >>> s.dt.tz_localize('Europe/Warsaw', nonexistent=pd.Timedelta('1H'))
+ 0 2015-03-29 03:30:00+02:00
+ 1 2015-03-29 03:30:00+02:00
+ dtype: datetime64[ns, 'Europe/Warsaw']
+ """
+ if errors is not None:
+ warnings.warn("The errors argument is deprecated and will be "
+ "removed in a future release. Use "
+ "nonexistent='NaT' or nonexistent='raise' "
+ "instead.", FutureWarning)
+ if errors == 'coerce':
+ nonexistent = 'NaT'
+ elif errors == 'raise':
+ nonexistent = 'raise'
+ else:
+ raise ValueError("The errors argument must be either 'coerce' "
+ "or 'raise'.")
+
+ nonexistent_options = ('raise', 'NaT', 'shift_forward',
+ 'shift_backward')
+ if nonexistent not in nonexistent_options and not isinstance(
+ nonexistent, timedelta):
+ raise ValueError("The nonexistent argument must be one of 'raise',"
+ " 'NaT', 'shift_forward', 'shift_backward' or"
+ " a timedelta object")
+
+ if self.tz is not None:
+ if tz is None:
+ new_dates = conversion.tz_convert(self.asi8, timezones.UTC,
+ self.tz)
+ else:
+ raise TypeError("Already tz-aware, use tz_convert to convert.")
+ else:
+ tz = timezones.maybe_get_tz(tz)
+ # Convert to UTC
+
+ new_dates = conversion.tz_localize_to_utc(
+ self.asi8, tz, ambiguous=ambiguous, nonexistent=nonexistent,
+ )
+ new_dates = new_dates.view(_NS_DTYPE)
+ dtype = tz_to_dtype(tz)
+ return self._simple_new(new_dates, dtype=dtype, freq=self.freq)
+
+ # ----------------------------------------------------------------
+ # Conversion Methods - Vectorized analogues of Timestamp methods
+
+ def to_pydatetime(self):
+ """
+ Return Datetime Array/Index as object ndarray of datetime.datetime
+ objects
+
+ Returns
+ -------
+ datetimes : ndarray
+ """
+ return tslib.ints_to_pydatetime(self.asi8, tz=self.tz)
+
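+    # Quick usage sketch (not part of the upstream file): the result is an
+    # object ndarray of stdlib datetimes. Output is indicative:
+    #   >>> import pandas as pd
+    #   >>> pd.date_range('2019-01-01', periods=1).to_pydatetime()
+    #   array([datetime.datetime(2019, 1, 1, 0, 0)], dtype=object)
+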
+ def normalize(self):
+ """
+ Convert times to midnight.
+
+ The time component of the date-time is converted to midnight i.e.
+        00:00:00. This is useful in cases when the time does not matter.
+ Length is unaltered. The timezones are unaffected.
+
+ This method is available on Series with datetime values under
+ the ``.dt`` accessor, and directly on Datetime Array/Index.
+
+ Returns
+ -------
+ DatetimeArray, DatetimeIndex or Series
+ The same type as the original data. Series will have the same
+ name and index. DatetimeIndex will have the same name.
+
+ See Also
+ --------
+ floor : Floor the datetimes to the specified freq.
+ ceil : Ceil the datetimes to the specified freq.
+ round : Round the datetimes to the specified freq.
+
+ Examples
+ --------
+ >>> idx = pd.date_range(start='2014-08-01 10:00', freq='H',
+ ... periods=3, tz='Asia/Calcutta')
+ >>> idx
+ DatetimeIndex(['2014-08-01 10:00:00+05:30',
+ '2014-08-01 11:00:00+05:30',
+ '2014-08-01 12:00:00+05:30'],
+ dtype='datetime64[ns, Asia/Calcutta]', freq='H')
+ >>> idx.normalize()
+ DatetimeIndex(['2014-08-01 00:00:00+05:30',
+ '2014-08-01 00:00:00+05:30',
+ '2014-08-01 00:00:00+05:30'],
+ dtype='datetime64[ns, Asia/Calcutta]', freq=None)
+ """
+ if self.tz is None or timezones.is_utc(self.tz):
+ not_null = ~self.isna()
+ DAY_NS = ccalendar.DAY_SECONDS * 1000000000
+ new_values = self.asi8.copy()
+ adjustment = (new_values[not_null] % DAY_NS)
+ new_values[not_null] = new_values[not_null] - adjustment
+ else:
+ new_values = conversion.normalize_i8_timestamps(self.asi8, self.tz)
+ return type(self)._from_sequence(new_values,
+ freq='infer').tz_localize(self.tz)
+
+ def to_period(self, freq=None):
+ """
+ Cast to PeriodArray/Index at a particular frequency.
+
+ Converts DatetimeArray/Index to PeriodArray/Index.
+
+ Parameters
+ ----------
+ freq : string or Offset, optional
+ One of pandas' :ref:`offset strings <timeseries.offset_aliases>`
+ or an Offset object. Will be inferred by default.
+
+ Returns
+ -------
+ PeriodArray/Index
+
+ Raises
+ ------
+ ValueError
+ When converting a DatetimeArray/Index with non-regular values,
+ so that a frequency cannot be inferred.
+
+ See Also
+ --------
+ PeriodIndex: Immutable ndarray holding ordinal values.
+ DatetimeIndex.to_pydatetime: Return DatetimeIndex as object.
+
+ Examples
+ --------
+ >>> df = pd.DataFrame({"y": [1,2,3]},
+ ... index=pd.to_datetime(["2000-03-31 00:00:00",
+ ... "2000-05-31 00:00:00",
+ ... "2000-08-31 00:00:00"]))
+ >>> df.index.to_period("M")
+ PeriodIndex(['2000-03', '2000-05', '2000-08'],
+ dtype='period[M]', freq='M')
+
+ Infer the daily frequency
+
+ >>> idx = pd.date_range("2017-01-01", periods=2)
+ >>> idx.to_period()
+ PeriodIndex(['2017-01-01', '2017-01-02'],
+ dtype='period[D]', freq='D')
+ """
+ from pandas.core.arrays import PeriodArray
+
+ if self.tz is not None:
+ warnings.warn("Converting to PeriodArray/Index representation "
+ "will drop timezone information.", UserWarning)
+
+ if freq is None:
+ freq = self.freqstr or self.inferred_freq
+
+ if freq is None:
+ raise ValueError("You must pass a freq argument as "
+ "current index has none.")
+
+ freq = get_period_alias(freq)
+
+ return PeriodArray._from_datetime64(self._data, freq, tz=self.tz)
+
+ def to_perioddelta(self, freq):
+ """
+ Calculate TimedeltaArray of difference between index
+ values and index converted to PeriodArray at specified
+        freq. Used for vectorized offsets.
+
+ Parameters
+ ----------
+ freq : Period frequency
+
+ Returns
+ -------
+ TimedeltaArray/Index
+ """
+ # TODO: consider privatizing (discussion in GH#23113)
+ from pandas.core.arrays.timedeltas import TimedeltaArray
+ i8delta = self.asi8 - self.to_period(freq).to_timestamp().asi8
+ m8delta = i8delta.view('m8[ns]')
+ return TimedeltaArray(m8delta)
+
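+    # Illustrative sketch (not part of the upstream file): for freq='D' the
+    # result is each timestamp's offset from midnight. Output is indicative:
+    #   >>> import pandas as pd
+    #   >>> pd.DatetimeIndex(['2019-01-01 06:30']).to_perioddelta('D')
+    #   TimedeltaIndex(['06:30:00'], dtype='timedelta64[ns]', freq=None)
+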
+ # -----------------------------------------------------------------
+ # Properties - Vectorized Timestamp Properties/Methods
+
+ def month_name(self, locale=None):
+ """
+        Return the month names of the DatetimeIndex with specified locale.
+
+ .. versionadded:: 0.23.0
+
+ Parameters
+ ----------
+ locale : str, optional
+ Locale determining the language in which to return the month name.
+ Default is English locale.
+
+ Returns
+ -------
+ Index
+ Index of month names.
+
+ Examples
+ --------
+ >>> idx = pd.date_range(start='2018-01', freq='M', periods=3)
+ >>> idx
+ DatetimeIndex(['2018-01-31', '2018-02-28', '2018-03-31'],
+ dtype='datetime64[ns]', freq='M')
+ >>> idx.month_name()
+ Index(['January', 'February', 'March'], dtype='object')
+ """
+ if self.tz is not None and not timezones.is_utc(self.tz):
+ values = self._local_timestamps()
+ else:
+ values = self.asi8
+
+ result = fields.get_date_name_field(values, 'month_name',
+ locale=locale)
+ result = self._maybe_mask_results(result, fill_value=None)
+ return result
+
+ def day_name(self, locale=None):
+ """
+        Return the day names of the DatetimeIndex with specified locale.
+
+ .. versionadded:: 0.23.0
+
+ Parameters
+ ----------
+ locale : str, optional
+ Locale determining the language in which to return the day name.
+ Default is English locale.
+
+ Returns
+ -------
+ Index
+ Index of day names.
+
+ Examples
+ --------
+ >>> idx = pd.date_range(start='2018-01-01', freq='D', periods=3)
+ >>> idx
+ DatetimeIndex(['2018-01-01', '2018-01-02', '2018-01-03'],
+ dtype='datetime64[ns]', freq='D')
+ >>> idx.day_name()
+ Index(['Monday', 'Tuesday', 'Wednesday'], dtype='object')
+ """
+ if self.tz is not None and not timezones.is_utc(self.tz):
+ values = self._local_timestamps()
+ else:
+ values = self.asi8
+
+ result = fields.get_date_name_field(values, 'day_name',
+ locale=locale)
+ result = self._maybe_mask_results(result, fill_value=None)
+ return result
+
+ @property
+ def time(self):
+ """
+ Returns numpy array of datetime.time. The time part of the Timestamps.
+ """
+ # If the Timestamps have a timezone that is not UTC,
+ # convert them into their i8 representation while
+ # keeping their timezone and not using UTC
+ if self.tz is not None and not timezones.is_utc(self.tz):
+ timestamps = self._local_timestamps()
+ else:
+ timestamps = self.asi8
+
+ return tslib.ints_to_pydatetime(timestamps, box="time")
+
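+    # Illustrative sketch (not part of the upstream file): time-of-day is
+    # reported in local (wall) time for tz-aware data. Output is indicative:
+    #   >>> import pandas as pd
+    #   >>> pd.DatetimeIndex(['2019-01-01 09:30'], tz='US/Eastern').time
+    #   array([datetime.time(9, 30)], dtype=object)
+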
+ @property
+ def timetz(self):
+ """
+ Returns numpy array of datetime.time also containing timezone
+ information. The time part of the Timestamps.
+ """
+ return tslib.ints_to_pydatetime(self.asi8, self.tz, box="time")
+
+ @property
+ def date(self):
+ """
+ Returns numpy array of python datetime.date objects (namely, the date
+ part of Timestamps without timezone information).
+ """
+ # If the Timestamps have a timezone that is not UTC,
+ # convert them into their i8 representation while
+ # keeping their timezone and not using UTC
+ if self.tz is not None and not timezones.is_utc(self.tz):
+ timestamps = self._local_timestamps()
+ else:
+ timestamps = self.asi8
+
+ return tslib.ints_to_pydatetime(timestamps, box="date")
+
+ year = _field_accessor('year', 'Y', "The year of the datetime.")
+ month = _field_accessor('month', 'M',
+ "The month as January=1, December=12. ")
+ day = _field_accessor('day', 'D', "The days of the datetime.")
+ hour = _field_accessor('hour', 'h', "The hours of the datetime.")
+ minute = _field_accessor('minute', 'm', "The minutes of the datetime.")
+ second = _field_accessor('second', 's', "The seconds of the datetime.")
+ microsecond = _field_accessor('microsecond', 'us',
+ "The microseconds of the datetime.")
+ nanosecond = _field_accessor('nanosecond', 'ns',
+ "The nanoseconds of the datetime.")
+ weekofyear = _field_accessor('weekofyear', 'woy',
+ "The week ordinal of the year.")
+ week = weekofyear
+ _dayofweek_doc = """
+ The day of the week with Monday=0, Sunday=6.
+
+ Return the day of the week. It is assumed the week starts on
+ Monday, which is denoted by 0 and ends on Sunday which is denoted
+ by 6. This method is available on both Series with datetime
+ values (using the `dt` accessor) or DatetimeIndex.
+
+ Returns
+ -------
+ Series or Index
+ Containing integers indicating the day number.
+
+ See Also
+ --------
+ Series.dt.dayofweek : Alias.
+ Series.dt.weekday : Alias.
+ Series.dt.day_name : Returns the name of the day of the week.
+
+ Examples
+ --------
+ >>> s = pd.date_range('2016-12-31', '2017-01-08', freq='D').to_series()
+ >>> s.dt.dayofweek
+ 2016-12-31 5
+ 2017-01-01 6
+ 2017-01-02 0
+ 2017-01-03 1
+ 2017-01-04 2
+ 2017-01-05 3
+ 2017-01-06 4
+ 2017-01-07 5
+ 2017-01-08 6
+ Freq: D, dtype: int64
+ """
+ dayofweek = _field_accessor('dayofweek', 'dow', _dayofweek_doc)
+ weekday = dayofweek
+
+ weekday_name = _field_accessor(
+ 'weekday_name',
+ 'weekday_name',
+ "The name of day in a week (ex: Friday)\n\n.. deprecated:: 0.23.0")
+
+ dayofyear = _field_accessor('dayofyear', 'doy',
+ "The ordinal day of the year.")
+ quarter = _field_accessor('quarter', 'q', "The quarter of the date.")
+ days_in_month = _field_accessor(
+ 'days_in_month',
+ 'dim',
+ "The number of days in the month.")
+ daysinmonth = days_in_month
+ _is_month_doc = """
+ Indicates whether the date is the {first_or_last} day of the month.
+
+ Returns
+ -------
+ Series or array
+ For Series, returns a Series with boolean values.
+ For DatetimeIndex, returns a boolean array.
+
+ See Also
+ --------
+ is_month_start : Return a boolean indicating whether the date
+ is the first day of the month.
+ is_month_end : Return a boolean indicating whether the date
+ is the last day of the month.
+
+ Examples
+ --------
+ This method is available on Series with datetime values under
+ the ``.dt`` accessor, and directly on DatetimeIndex.
+
+ >>> s = pd.Series(pd.date_range("2018-02-27", periods=3))
+ >>> s
+ 0 2018-02-27
+ 1 2018-02-28
+ 2 2018-03-01
+ dtype: datetime64[ns]
+ >>> s.dt.is_month_start
+ 0 False
+ 1 False
+ 2 True
+ dtype: bool
+ >>> s.dt.is_month_end
+ 0 False
+ 1 True
+ 2 False
+ dtype: bool
+
+ >>> idx = pd.date_range("2018-02-27", periods=3)
+ >>> idx.is_month_start
+ array([False, False, True])
+ >>> idx.is_month_end
+ array([False, True, False])
+ """
+ is_month_start = _field_accessor(
+ 'is_month_start',
+ 'is_month_start',
+ _is_month_doc.format(first_or_last='first'))
+
+ is_month_end = _field_accessor(
+ 'is_month_end',
+ 'is_month_end',
+ _is_month_doc.format(first_or_last='last'))
+
+ is_quarter_start = _field_accessor(
+ 'is_quarter_start',
+ 'is_quarter_start',
+ """
+ Indicator for whether the date is the first day of a quarter.
+
+ Returns
+ -------
+ is_quarter_start : Series or DatetimeIndex
+ The same type as the original data with boolean values. Series will
+ have the same name and index. DatetimeIndex will have the same
+ name.
+
+ See Also
+ --------
+ quarter : Return the quarter of the date.
+    is_quarter_end : Similar property for indicating the quarter end.
+
+ Examples
+ --------
+ This method is available on Series with datetime values under
+ the ``.dt`` accessor, and directly on DatetimeIndex.
+
+ >>> df = pd.DataFrame({'dates': pd.date_range("2017-03-30",
+ ... periods=4)})
+ >>> df.assign(quarter=df.dates.dt.quarter,
+ ... is_quarter_start=df.dates.dt.is_quarter_start)
+ dates quarter is_quarter_start
+ 0 2017-03-30 1 False
+ 1 2017-03-31 1 False
+ 2 2017-04-01 2 True
+ 3 2017-04-02 2 False
+
+ >>> idx = pd.date_range('2017-03-30', periods=4)
+ >>> idx
+ DatetimeIndex(['2017-03-30', '2017-03-31', '2017-04-01', '2017-04-02'],
+ dtype='datetime64[ns]', freq='D')
+
+ >>> idx.is_quarter_start
+ array([False, False, True, False])
+ """)
+ is_quarter_end = _field_accessor(
+ 'is_quarter_end',
+ 'is_quarter_end',
+ """
+ Indicator for whether the date is the last day of a quarter.
+
+ Returns
+ -------
+ is_quarter_end : Series or DatetimeIndex
+ The same type as the original data with boolean values. Series will
+ have the same name and index. DatetimeIndex will have the same
+ name.
+
+ See Also
+ --------
+ quarter : Return the quarter of the date.
+ is_quarter_start : Similar property indicating the quarter start.
+
+ Examples
+ --------
+ This method is available on Series with datetime values under
+ the ``.dt`` accessor, and directly on DatetimeIndex.
+
+ >>> df = pd.DataFrame({'dates': pd.date_range("2017-03-30",
+ ... periods=4)})
+ >>> df.assign(quarter=df.dates.dt.quarter,
+ ... is_quarter_end=df.dates.dt.is_quarter_end)
+ dates quarter is_quarter_end
+ 0 2017-03-30 1 False
+ 1 2017-03-31 1 True
+ 2 2017-04-01 2 False
+ 3 2017-04-02 2 False
+
+ >>> idx = pd.date_range('2017-03-30', periods=4)
+ >>> idx
+ DatetimeIndex(['2017-03-30', '2017-03-31', '2017-04-01', '2017-04-02'],
+ dtype='datetime64[ns]', freq='D')
+
+ >>> idx.is_quarter_end
+ array([False, True, False, False])
+ """)
+ is_year_start = _field_accessor(
+ 'is_year_start',
+ 'is_year_start',
+ """
+ Indicate whether the date is the first day of a year.
+
+ Returns
+ -------
+ Series or DatetimeIndex
+ The same type as the original data with boolean values. Series will
+ have the same name and index. DatetimeIndex will have the same
+ name.
+
+ See Also
+ --------
+ is_year_end : Similar property indicating the last day of the year.
+
+ Examples
+ --------
+ This method is available on Series with datetime values under
+ the ``.dt`` accessor, and directly on DatetimeIndex.
+
+ >>> dates = pd.Series(pd.date_range("2017-12-30", periods=3))
+ >>> dates
+ 0 2017-12-30
+ 1 2017-12-31
+ 2 2018-01-01
+ dtype: datetime64[ns]
+
+ >>> dates.dt.is_year_start
+ 0 False
+ 1 False
+ 2 True
+ dtype: bool
+
+ >>> idx = pd.date_range("2017-12-30", periods=3)
+ >>> idx
+ DatetimeIndex(['2017-12-30', '2017-12-31', '2018-01-01'],
+ dtype='datetime64[ns]', freq='D')
+
+ >>> idx.is_year_start
+ array([False, False, True])
+ """)
+ is_year_end = _field_accessor(
+ 'is_year_end',
+ 'is_year_end',
+ """
+ Indicate whether the date is the last day of the year.
+
+ Returns
+ -------
+ Series or DatetimeIndex
+ The same type as the original data with boolean values. Series will
+ have the same name and index. DatetimeIndex will have the same
+ name.
+
+ See Also
+ --------
+ is_year_start : Similar property indicating the start of the year.
+
+ Examples
+ --------
+ This method is available on Series with datetime values under
+ the ``.dt`` accessor, and directly on DatetimeIndex.
+
+ >>> dates = pd.Series(pd.date_range("2017-12-30", periods=3))
+ >>> dates
+ 0 2017-12-30
+ 1 2017-12-31
+ 2 2018-01-01
+ dtype: datetime64[ns]
+
+ >>> dates.dt.is_year_end
+ 0 False
+ 1 True
+ 2 False
+ dtype: bool
+
+ >>> idx = pd.date_range("2017-12-30", periods=3)
+ >>> idx
+ DatetimeIndex(['2017-12-30', '2017-12-31', '2018-01-01'],
+ dtype='datetime64[ns]', freq='D')
+
+ >>> idx.is_year_end
+ array([False, True, False])
+ """)
+ is_leap_year = _field_accessor(
+ 'is_leap_year',
+ 'is_leap_year',
+ """
+ Boolean indicator if the date belongs to a leap year.
+
+    A leap year is a year that has 366 days (instead of 365), including
+    the 29th of February as an intercalary day.
+ Leap years are years which are multiples of four with the exception
+ of years divisible by 100 but not by 400.
+
+ Returns
+ -------
+ Series or ndarray
+ Booleans indicating if dates belong to a leap year.
+
+ Examples
+ --------
+ This method is available on Series with datetime values under
+ the ``.dt`` accessor, and directly on DatetimeIndex.
+
+ >>> idx = pd.date_range("2012-01-01", "2015-01-01", freq="Y")
+ >>> idx
+ DatetimeIndex(['2012-12-31', '2013-12-31', '2014-12-31'],
+ dtype='datetime64[ns]', freq='A-DEC')
+ >>> idx.is_leap_year
+ array([ True, False, False], dtype=bool)
+
+    >>> dates_series = pd.Series(idx)
+    >>> dates_series
+ 0 2012-12-31
+ 1 2013-12-31
+ 2 2014-12-31
+ dtype: datetime64[ns]
+ >>> dates_series.dt.is_leap_year
+ 0 True
+ 1 False
+ 2 False
+ dtype: bool
+ """)
+
+ def to_julian_date(self):
+ """
+ Convert Datetime Array to float64 ndarray of Julian Dates.
+ 0 Julian date is noon January 1, 4713 BC.
+ http://en.wikipedia.org/wiki/Julian_day
+ """
+
+ # http://mysite.verizon.net/aesir_research/date/jdalg2.htm
+ year = np.asarray(self.year)
+ month = np.asarray(self.month)
+ day = np.asarray(self.day)
+ testarr = month < 3
+ year[testarr] -= 1
+ month[testarr] += 12
+ return (day +
+ np.fix((153 * month - 457) / 5) +
+ 365 * year +
+ np.floor(year / 4) -
+ np.floor(year / 100) +
+ np.floor(year / 400) +
+ 1721118.5 +
+ (self.hour +
+ self.minute / 60.0 +
+ self.second / 3600.0 +
+ self.microsecond / 3600.0 / 1e+6 +
+ self.nanosecond / 3600.0 / 1e+9
+ ) / 24.0)
+
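+    # Illustrative check (not part of the upstream file): with this
+    # algorithm the J2000 epoch, 2000-01-01 12:00, maps to Julian Date
+    # 2451545.0. Output is indicative:
+    #   >>> import pandas as pd
+    #   >>> pd.DatetimeIndex(['2000-01-01 12:00']).to_julian_date()
+    #   Float64Index([2451545.0], dtype='float64')
+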
+
+DatetimeArray._add_comparison_ops()
+
+
+# -------------------------------------------------------------------
+# Constructor Helpers
+
+def sequence_to_dt64ns(data, dtype=None, copy=False,
+ tz=None,
+ dayfirst=False, yearfirst=False, ambiguous='raise',
+ int_as_wall_time=False):
+ """
+ Parameters
+ ----------
+ data : list-like
+ dtype : dtype, str, or None, default None
+ copy : bool, default False
+ tz : tzinfo, str, or None, default None
+ dayfirst : bool, default False
+ yearfirst : bool, default False
+ ambiguous : str, bool, or arraylike, default 'raise'
+ See pandas._libs.tslibs.conversion.tz_localize_to_utc
+ int_as_wall_time : bool, default False
+ Whether to treat ints as wall time in specified timezone, or as
+ nanosecond-precision UNIX epoch (wall time in UTC).
+ This is used in DatetimeIndex.__init__ to deprecate the wall-time
+ behaviour.
+
+        .. versionadded:: 0.24.0
+
+ Returns
+ -------
+ result : numpy.ndarray
+ The sequence converted to a numpy array with dtype ``datetime64[ns]``.
+ tz : tzinfo or None
+ Either the user-provided tzinfo or one inferred from the data.
+ inferred_freq : Tick or None
+ The inferred frequency of the sequence.
+
+ Raises
+ ------
+    TypeError : PeriodDtype data is passed
+ """
+
+ inferred_freq = None
+
+ dtype = _validate_dt64_dtype(dtype)
+
+ if not hasattr(data, "dtype"):
+ # e.g. list, tuple
+ if np.ndim(data) == 0:
+ # i.e. generator
+ data = list(data)
+ data = np.asarray(data)
+ copy = False
+ elif isinstance(data, ABCSeries):
+ data = data._values
+ if isinstance(data, ABCPandasArray):
+ data = data.to_numpy()
+
+ if hasattr(data, "freq"):
+ # i.e. DatetimeArray/Index
+ inferred_freq = data.freq
+
+ # if dtype has an embedded tz, capture it
+ tz = validate_tz_from_dtype(dtype, tz)
+
+ if isinstance(data, ABCIndexClass):
+ data = data._data
+
+ # By this point we are assured to have either a numpy array or Index
+ data, copy = maybe_convert_dtype(data, copy)
+
+ if is_object_dtype(data) or is_string_dtype(data):
+ # TODO: We do not have tests specific to string-dtypes,
+ # also complex or categorical or other extension
+ copy = False
+ if lib.infer_dtype(data, skipna=False) == 'integer':
+ data = data.astype(np.int64)
+ else:
+ # data comes back here as either i8 to denote UTC timestamps
+ # or M8[ns] to denote wall times
+ data, inferred_tz = objects_to_datetime64ns(
+ data, dayfirst=dayfirst, yearfirst=yearfirst)
+ tz = maybe_infer_tz(tz, inferred_tz)
+ # When a sequence of timestamp objects is passed, we always
+ # want to treat the (now i8-valued) data as UTC timestamps,
+ # not wall times.
+ int_as_wall_time = False
+
+ # `data` may have originally been a Categorical[datetime64[ns, tz]],
+ # so we need to handle these types.
+ if is_datetime64tz_dtype(data):
+ # DatetimeArray -> ndarray
+ tz = maybe_infer_tz(tz, data.tz)
+ result = data._data
+
+ elif is_datetime64_dtype(data):
+ # tz-naive DatetimeArray or ndarray[datetime64]
+ data = getattr(data, "_data", data)
+ if data.dtype != _NS_DTYPE:
+ data = conversion.ensure_datetime64ns(data)
+
+ if tz is not None:
+ # Convert tz-naive to UTC
+ tz = timezones.maybe_get_tz(tz)
+ data = conversion.tz_localize_to_utc(data.view('i8'), tz,
+ ambiguous=ambiguous)
+ data = data.view(_NS_DTYPE)
+
+ assert data.dtype == _NS_DTYPE, data.dtype
+ result = data
+
+ else:
+ # must be integer dtype otherwise
+ # assume this data are epoch timestamps
+ if tz:
+ tz = timezones.maybe_get_tz(tz)
+
+ if data.dtype != _INT64_DTYPE:
+ data = data.astype(np.int64, copy=False)
+ if int_as_wall_time and tz is not None and not timezones.is_utc(tz):
+ warnings.warn(_i8_message, FutureWarning, stacklevel=4)
+ data = conversion.tz_localize_to_utc(data.view('i8'), tz,
+ ambiguous=ambiguous)
+ data = data.view(_NS_DTYPE)
+ result = data.view(_NS_DTYPE)
+
+ if copy:
+ # TODO: should this be deepcopy?
+ result = result.copy()
+
+ assert isinstance(result, np.ndarray), type(result)
+ assert result.dtype == 'M8[ns]', result.dtype
+
+ # We have to call this again after possibly inferring a tz above
+ validate_tz_from_dtype(dtype, tz)
+
+ return result, tz, inferred_freq
+
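+# Illustrative sketch (not part of the upstream file): this helper is the
+# shared conversion path used when constructing DatetimeArray/Index from
+# generic input. Indicative behaviour for a list of strings:
+#   >>> from pandas.core.arrays.datetimes import sequence_to_dt64ns
+#   >>> arr, tz, freq = sequence_to_dt64ns(['2019-01-01', '2019-01-02'])
+#   >>> arr.dtype, tz, freq
+#   (dtype('<M8[ns]'), None, None)
+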
+
+def objects_to_datetime64ns(data, dayfirst, yearfirst,
+ utc=False, errors="raise",
+ require_iso8601=False, allow_object=False):
+ """
+ Convert data to array of timestamps.
+
+ Parameters
+ ----------
+ data : np.ndarray[object]
+ dayfirst : bool
+ yearfirst : bool
+ utc : bool, default False
+ Whether to convert timezone-aware timestamps to UTC
+    errors : {'raise', 'ignore', 'coerce'}
+    require_iso8601 : bool, default False
+        Whether to require strings to be in ISO 8601 format.
+    allow_object : bool
+ Whether to return an object-dtype ndarray instead of raising if the
+ data contains more than one timezone.
+
+ Returns
+ -------
+ result : ndarray
+ np.int64 dtype if returned values represent UTC timestamps
+ np.datetime64[ns] if returned values represent wall times
+ object if mixed timezones
+ inferred_tz : tzinfo or None
+
+ Raises
+ ------
+ ValueError : if data cannot be converted to datetimes
+ """
+ assert errors in ["raise", "ignore", "coerce"]
+
+ # if str-dtype, convert
+ data = np.array(data, copy=False, dtype=np.object_)
+
+ try:
+ result, tz_parsed = tslib.array_to_datetime(
+ data,
+ errors=errors,
+ utc=utc,
+ dayfirst=dayfirst,
+ yearfirst=yearfirst,
+ require_iso8601=require_iso8601
+ )
+ except ValueError as e:
+ try:
+ values, tz_parsed = conversion.datetime_to_datetime64(data)
+ # If tzaware, these values represent unix timestamps, so we
+ # return them as i8 to distinguish from wall times
+ return values.view('i8'), tz_parsed
+ except (ValueError, TypeError):
+ raise e
+
+ if tz_parsed is not None:
+ # We can take a shortcut since the datetime64 numpy array
+ # is in UTC
+ # Return i8 values to denote unix timestamps
+ return result.view('i8'), tz_parsed
+ elif is_datetime64_dtype(result):
+ # returning M8[ns] denotes wall-times; since tz is None
+ # the distinction is a thin one
+ return result, tz_parsed
+ elif is_object_dtype(result):
+ # GH#23675 when called via `pd.to_datetime`, returning an object-dtype
+ # array is allowed. When called via `pd.DatetimeIndex`, we can
+ # only accept datetime64 dtype, so raise TypeError if object-dtype
+ # is returned, as that indicates the values can be recognized as
+ # datetimes but they have conflicting timezones/awareness
+ if allow_object:
+ return result, tz_parsed
+ raise TypeError(result)
+ else: # pragma: no cover
+ # GH#23675 this TypeError should never be hit, whereas the TypeError
+ # in the object-dtype branch above is reachable.
+ raise TypeError(result)
+
+
+def maybe_convert_dtype(data, copy):
+ """
+ Convert data based on dtype conventions, issuing deprecation warnings
+ or errors where appropriate.
+
+ Parameters
+ ----------
+ data : np.ndarray or pd.Index
+ copy : bool
+
+ Returns
+ -------
+ data : np.ndarray or pd.Index
+ copy : bool
+
+ Raises
+ ------
+    TypeError : PeriodDtype data is passed
+ """
+ if is_float_dtype(data):
+ # Note: we must cast to datetime64[ns] here in order to treat these
+ # as wall-times instead of UTC timestamps.
+ data = data.astype(_NS_DTYPE)
+ copy = False
+ # TODO: deprecate this behavior to instead treat symmetrically
+ # with integer dtypes. See discussion in GH#23675
+
+ elif is_timedelta64_dtype(data):
+ warnings.warn("Passing timedelta64-dtype data is deprecated, will "
+ "raise a TypeError in a future version",
+ FutureWarning, stacklevel=5)
+ data = data.view(_NS_DTYPE)
+
+ elif is_period_dtype(data):
+ # Note: without explicitly raising here, PeriodIndex
+ # test_setops.test_join_does_not_recur fails
+ raise TypeError("Passing PeriodDtype data is invalid. "
+ "Use `data.to_timestamp()` instead")
+
+ elif is_categorical_dtype(data):
+ # GH#18664 preserve tz in going DTI->Categorical->DTI
+ # TODO: cases where we need to do another pass through this func,
+ # e.g. the categories are timedelta64s
+ data = data.categories.take(data.codes, fill_value=NaT)._values
+ copy = False
+
+ elif is_extension_type(data) and not is_datetime64tz_dtype(data):
+ # Includes categorical
+ # TODO: We have no tests for these
+ data = np.array(data, dtype=np.object_)
+ copy = False
+
+ return data, copy
+
+
+# -------------------------------------------------------------------
+# Validation and Inference
+
+def maybe_infer_tz(tz, inferred_tz):
+ """
+ If a timezone is inferred from data, check that it is compatible with
+ the user-provided timezone, if any.
+
+ Parameters
+ ----------
+ tz : tzinfo or None
+ inferred_tz : tzinfo or None
+
+ Returns
+ -------
+ tz : tzinfo or None
+
+ Raises
+ ------
+ TypeError : if both timezones are present but do not match
+ """
+ if tz is None:
+ tz = inferred_tz
+ elif inferred_tz is None:
+ pass
+ elif not timezones.tz_compare(tz, inferred_tz):
+ raise TypeError('data is already tz-aware {inferred_tz}, unable to '
+ 'set specified tz: {tz}'
+ .format(inferred_tz=inferred_tz, tz=tz))
+ return tz
+
+
+def _validate_dt64_dtype(dtype):
+ """
+ Check that a dtype, if passed, represents either a numpy datetime64[ns]
+ dtype or a pandas DatetimeTZDtype.
+
+ Parameters
+ ----------
+ dtype : object
+
+ Returns
+ -------
+ dtype : None, numpy.dtype, or DatetimeTZDtype
+
+ Raises
+ ------
+ ValueError : invalid dtype
+
+ Notes
+ -----
+ Unlike validate_tz_from_dtype, this does _not_ allow non-existent
+ tz errors to go through
+ """
+ if dtype is not None:
+ dtype = pandas_dtype(dtype)
+ if is_dtype_equal(dtype, np.dtype("M8")):
+ # no precision, warn
+ dtype = _NS_DTYPE
+ msg = textwrap.dedent("""\
+ Passing in 'datetime64' dtype with no precision is deprecated
+ and will raise in a future version. Please pass in
+ 'datetime64[ns]' instead.""")
+ warnings.warn(msg, FutureWarning, stacklevel=5)
+
+ if ((isinstance(dtype, np.dtype) and dtype != _NS_DTYPE)
+ or not isinstance(dtype, (np.dtype, DatetimeTZDtype))):
+ raise ValueError("Unexpected value for 'dtype': '{dtype}'. "
+ "Must be 'datetime64[ns]' or DatetimeTZDtype'."
+ .format(dtype=dtype))
+ return dtype
+
+
+def validate_tz_from_dtype(dtype, tz):
+ """
+ If the given dtype is a DatetimeTZDtype, extract the implied
+ tzinfo object from it and check that it does not conflict with the given
+ tz.
+
+ Parameters
+ ----------
+ dtype : dtype, str
+ tz : None, tzinfo
+
+ Returns
+ -------
+ tz : consensus tzinfo
+
+ Raises
+ ------
+ ValueError : on tzinfo mismatch
+ """
+ if dtype is not None:
+ if isinstance(dtype, compat.string_types):
+ try:
+ dtype = DatetimeTZDtype.construct_from_string(dtype)
+ except TypeError:
+ # Things like `datetime64[ns]`, which is OK for the
+ # constructors, but also nonsense, which should be validated
+ # but not by us. We *do* allow non-existent tz errors to
+ # go through
+ pass
+ dtz = getattr(dtype, 'tz', None)
+ if dtz is not None:
+ if tz is not None and not timezones.tz_compare(tz, dtz):
+ raise ValueError("cannot supply both a tz and a dtype"
+ " with a tz")
+ tz = dtz
+
+ if tz is not None and is_datetime64_dtype(dtype):
+ # We also need to check for the case where the user passed a
+ # tz-naive dtype (i.e. datetime64[ns])
+ if tz is not None and not timezones.tz_compare(tz, dtz):
+ raise ValueError("cannot supply both a tz and a "
+ "timezone-naive dtype (i.e. datetime64[ns]")
+
+ return tz
+
+
+def _infer_tz_from_endpoints(start, end, tz):
+ """
+ If a timezone is not explicitly given via `tz`, see if one can
+ be inferred from the `start` and `end` endpoints. If more than one
+ of these inputs provides a timezone, require that they all agree.
+
+ Parameters
+ ----------
+ start : Timestamp
+ end : Timestamp
+ tz : tzinfo or None
+
+ Returns
+ -------
+ tz : tzinfo or None
+
+ Raises
+ ------
+ TypeError : if start and end timezones do not agree
+ """
+ try:
+ inferred_tz = timezones.infer_tzinfo(start, end)
+ except Exception:
+ raise TypeError('Start and end cannot both be tz-aware with '
+ 'different timezones')
+
+ inferred_tz = timezones.maybe_get_tz(inferred_tz)
+ tz = timezones.maybe_get_tz(tz)
+
+ if tz is not None and inferred_tz is not None:
+ if not timezones.tz_compare(inferred_tz, tz):
+ raise AssertionError("Inferred time zone not equal to passed "
+ "time zone")
+
+ elif inferred_tz is not None:
+ tz = inferred_tz
+
+ return tz
+
+
+def _maybe_normalize_endpoints(start, end, normalize):
+ _normalized = True
+
+ if start is not None:
+ if normalize:
+ start = normalize_date(start)
+ _normalized = True
+ else:
+ _normalized = _normalized and start.time() == _midnight
+
+ if end is not None:
+ if normalize:
+ end = normalize_date(end)
+ _normalized = True
+ else:
+ _normalized = _normalized and end.time() == _midnight
+
+ return start, end, _normalized
+
+
+def _maybe_localize_point(ts, is_none, is_not_none, freq, tz):
+ """
+ Localize a start or end Timestamp to the timezone of the corresponding
+ start or end Timestamp
+
+ Parameters
+ ----------
+ ts : start or end Timestamp to potentially localize
+ is_none : argument that should be None
+ is_not_none : argument that should not be None
+ freq : Tick, DateOffset, or None
+ tz : str, timezone object or None
+
+ Returns
+ -------
+ ts : Timestamp
+ """
+ # Make sure start and end are timezone localized if:
+ # 1) freq = a Timedelta-like frequency (Tick)
+ # 2) freq = None i.e. generating a linspaced range
+ if isinstance(freq, Tick) or freq is None:
+ localize_args = {'tz': tz, 'ambiguous': False}
+ else:
+ localize_args = {'tz': None}
+ if is_none is None and is_not_none is not None:
+ ts = ts.tz_localize(**localize_args)
+ return ts
diff --git a/contrib/python/pandas/py2/pandas/core/arrays/integer.py b/contrib/python/pandas/py2/pandas/core/arrays/integer.py
new file mode 100644
index 00000000000..a6a4a49d3a9
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/core/arrays/integer.py
@@ -0,0 +1,706 @@
+import copy
+import sys
+import warnings
+
+import numpy as np
+
+from pandas._libs import lib
+from pandas.compat import range, set_function_name, string_types
+from pandas.util._decorators import cache_readonly
+
+from pandas.core.dtypes.base import ExtensionDtype
+from pandas.core.dtypes.cast import astype_nansafe
+from pandas.core.dtypes.common import (
+ is_bool_dtype, is_float, is_float_dtype, is_integer, is_integer_dtype,
+ is_list_like, is_object_dtype, is_scalar)
+from pandas.core.dtypes.dtypes import register_extension_dtype
+from pandas.core.dtypes.generic import ABCIndexClass, ABCSeries
+from pandas.core.dtypes.missing import isna, notna
+
+from pandas.core import nanops
+from pandas.core.arrays import ExtensionArray, ExtensionOpsMixin
+from pandas.core.tools.numeric import to_numeric
+
+
+class _IntegerDtype(ExtensionDtype):
+ """
+ An ExtensionDtype to hold a single size & kind of integer dtype.
+
+ These specific implementations are subclasses of the non-public
+    _IntegerDtype. For example, we have Int8Dtype to represent signed
+    8-bit integers.
+
+ The attributes name & type are set when these subclasses are created.
+ """
+ name = None
+ base = None
+ type = None
+ na_value = np.nan
+
+ def __repr__(self):
+ sign = 'U' if self.is_unsigned_integer else ''
+ return "{sign}Int{size}Dtype()".format(sign=sign,
+ size=8 * self.itemsize)
+
+ @cache_readonly
+ def is_signed_integer(self):
+ return self.kind == 'i'
+
+ @cache_readonly
+ def is_unsigned_integer(self):
+ return self.kind == 'u'
+
+ @property
+ def _is_numeric(self):
+ return True
+
+ @cache_readonly
+ def numpy_dtype(self):
+ """ Return an instance of our numpy dtype """
+ return np.dtype(self.type)
+
+ @cache_readonly
+ def kind(self):
+ return self.numpy_dtype.kind
+
+ @cache_readonly
+ def itemsize(self):
+ """ Return the number of bytes in this dtype """
+ return self.numpy_dtype.itemsize
+
+ @classmethod
+ def construct_array_type(cls):
+ """Return the array type associated with this dtype
+
+ Returns
+ -------
+ type
+ """
+ return IntegerArray
+
+ @classmethod
+ def construct_from_string(cls, string):
+ """
+ Construction from a string, raise a TypeError if not
+ possible
+ """
+ if string == cls.name:
+ return cls()
+ raise TypeError("Cannot construct a '{}' from "
+ "'{}'".format(cls, string))
+
+
+def integer_array(values, dtype=None, copy=False):
+ """
+ Infer and return an integer array of the values.
+
+ Parameters
+ ----------
+ values : 1D list-like
+ dtype : dtype, optional
+ dtype to coerce
+ copy : boolean, default False
+
+ Returns
+ -------
+ IntegerArray
+
+ Raises
+ ------
+ TypeError if incompatible types
+ """
+ values, mask = coerce_to_array(values, dtype=dtype, copy=copy)
+ return IntegerArray(values, mask)
+
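+# Quick usage sketch (not part of the upstream file), indicative behaviour:
+#   >>> integer_array([1, None, 3])
+#   <IntegerArray>
+#   [1, NaN, 3]
+#   Length: 3, dtype: Int64
+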
+
+def safe_cast(values, dtype, copy):
+ """
+    Safely cast the values to the dtype if they are equivalent,
+    meaning float values must be exactly representable as the
+    target integers.
+
+ """
+
+ try:
+ return values.astype(dtype, casting='safe', copy=copy)
+ except TypeError:
+
+ casted = values.astype(dtype, copy=copy)
+ if (casted == values).all():
+ return casted
+
+ raise TypeError("cannot safely cast non-equivalent {} to {}".format(
+ values.dtype, np.dtype(dtype)))
+
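+# Illustrative sketch (not part of the upstream file): floats that are exact
+# integers pass the fallback equality check, others raise. Indicative:
+#   >>> import numpy as np
+#   >>> safe_cast(np.array([1.0, 2.0]), np.int64, copy=False)
+#   array([1, 2])
+#   >>> safe_cast(np.array([1.5]), np.int64, copy=False)
+#   Traceback (most recent call last):
+#   ...
+#   TypeError: cannot safely cast non-equivalent float64 to int64
+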
+
+def coerce_to_array(values, dtype, mask=None, copy=False):
+ """
+ Coerce the input values array to numpy arrays with a mask
+
+ Parameters
+ ----------
+ values : 1D list-like
+ dtype : integer dtype
+ mask : boolean 1D array, optional
+ copy : boolean, default False
+ if True, copy the input
+
+ Returns
+ -------
+ tuple of (values, mask)
+ """
+    # if values is an integer numpy array, preserve its dtype
+ if dtype is None and hasattr(values, 'dtype'):
+ if is_integer_dtype(values.dtype):
+ dtype = values.dtype
+
+ if dtype is not None:
+ if (isinstance(dtype, string_types) and
+ (dtype.startswith("Int") or dtype.startswith("UInt"))):
+ # Avoid DeprecationWarning from NumPy about np.dtype("Int64")
+ # https://github.com/numpy/numpy/pull/7476
+ dtype = dtype.lower()
+
+ if not issubclass(type(dtype), _IntegerDtype):
+ try:
+ dtype = _dtypes[str(np.dtype(dtype))]
+ except KeyError:
+ raise ValueError("invalid dtype specified {}".format(dtype))
+
+ if isinstance(values, IntegerArray):
+ values, mask = values._data, values._mask
+ if dtype is not None:
+ values = values.astype(dtype.numpy_dtype, copy=False)
+
+ if copy:
+ values = values.copy()
+ mask = mask.copy()
+ return values, mask
+
+ values = np.array(values, copy=copy)
+ if is_object_dtype(values):
+ inferred_type = lib.infer_dtype(values, skipna=True)
+ if inferred_type == 'empty':
+ values = np.empty(len(values))
+ values.fill(np.nan)
+ elif inferred_type not in ['floating', 'integer',
+ 'mixed-integer', 'mixed-integer-float']:
+ raise TypeError("{} cannot be converted to an IntegerDtype".format(
+ values.dtype))
+
+ elif not (is_integer_dtype(values) or is_float_dtype(values)):
+ raise TypeError("{} cannot be converted to an IntegerDtype".format(
+ values.dtype))
+
+ if mask is None:
+ mask = isna(values)
+ else:
+ assert len(mask) == len(values)
+
+ if not values.ndim == 1:
+ raise TypeError("values must be a 1D list-like")
+ if not mask.ndim == 1:
+ raise TypeError("mask must be a 1D list-like")
+
+ # infer dtype if needed
+ if dtype is None:
+ dtype = np.dtype('int64')
+ else:
+ dtype = dtype.type
+
+ # if we are float, let's make sure that we can
+ # safely cast
+
+    # we copy as we need to coerce here
+ if mask.any():
+ values = values.copy()
+ values[mask] = 1
+ values = safe_cast(values, dtype, copy=False)
+ else:
+ values = safe_cast(values, dtype, copy=False)
+
+ return values, mask
+
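+# Illustrative sketch (not part of the upstream file): missing values are
+# replaced by 1 in the data and tracked in the mask. Indicative behaviour:
+#   >>> import numpy as np
+#   >>> values, mask = coerce_to_array([1, np.nan, 3], dtype='Int64')
+#   >>> values, mask
+#   (array([1, 1, 3]), array([False,  True, False]))
+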
+
+class IntegerArray(ExtensionArray, ExtensionOpsMixin):
+ """
+ Array of integer (optional missing) values.
+
+ .. versionadded:: 0.24.0
+
+ .. warning::
+
+ IntegerArray is currently experimental, and its API or internal
+ implementation may change without warning.
+
+ We represent an IntegerArray with 2 numpy arrays:
+
+ - data: contains a numpy integer array of the appropriate dtype
+ - mask: a boolean array holding a mask on the data, True is missing
+
+ To construct an IntegerArray from generic array-like input, use
+ :func:`pandas.array` with one of the integer dtypes (see examples).
+
+ See :ref:`integer_na` for more.
+
+ Parameters
+ ----------
+ values : numpy.ndarray
+ A 1-d integer-dtype array.
+ mask : numpy.ndarray
+ A 1-d boolean-dtype array indicating missing values.
+ copy : bool, default False
+ Whether to copy the `values` and `mask`.
+
+ Returns
+ -------
+ IntegerArray
+
+ Examples
+ --------
+ Create an IntegerArray with :func:`pandas.array`.
+
+ >>> int_array = pd.array([1, None, 3], dtype=pd.Int32Dtype())
+ >>> int_array
+ <IntegerArray>
+ [1, NaN, 3]
+ Length: 3, dtype: Int32
+
+ String aliases for the dtypes are also available. They are capitalized.
+
+ >>> pd.array([1, None, 3], dtype='Int32')
+ <IntegerArray>
+ [1, NaN, 3]
+ Length: 3, dtype: Int32
+
+ >>> pd.array([1, None, 3], dtype='UInt16')
+ <IntegerArray>
+ [1, NaN, 3]
+ Length: 3, dtype: UInt16
+ """
+
+ @cache_readonly
+ def dtype(self):
+ return _dtypes[str(self._data.dtype)]
+
+ def __init__(self, values, mask, copy=False):
+ if not (isinstance(values, np.ndarray)
+ and is_integer_dtype(values.dtype)):
+ raise TypeError("values should be integer numpy array. Use "
+ "the 'integer_array' function instead")
+ if not (isinstance(mask, np.ndarray) and is_bool_dtype(mask.dtype)):
+ raise TypeError("mask should be boolean numpy array. Use "
+ "the 'integer_array' function instead")
+
+ if copy:
+ values = values.copy()
+ mask = mask.copy()
+
+ self._data = values
+ self._mask = mask
+
+ @classmethod
+ def _from_sequence(cls, scalars, dtype=None, copy=False):
+ return integer_array(scalars, dtype=dtype, copy=copy)
+
+ @classmethod
+ def _from_sequence_of_strings(cls, strings, dtype=None, copy=False):
+ scalars = to_numeric(strings, errors="raise")
+ return cls._from_sequence(scalars, dtype, copy)
+
+ @classmethod
+ def _from_factorized(cls, values, original):
+ return integer_array(values, dtype=original.dtype)
+
+ def _formatter(self, boxed=False):
+ def fmt(x):
+ if isna(x):
+ return 'NaN'
+ return str(x)
+ return fmt
+
+ def __getitem__(self, item):
+ if is_integer(item):
+ if self._mask[item]:
+ return self.dtype.na_value
+ return self._data[item]
+ return type(self)(self._data[item], self._mask[item])
+
+ def _coerce_to_ndarray(self):
+ """
+        Coerce to an ndarray of object dtype.
+ """
+
+ # TODO(jreback) make this better
+ data = self._data.astype(object)
+ data[self._mask] = self._na_value
+ return data
+
+ __array_priority__ = 1000 # higher than ndarray so ops dispatch to us
+
+ def __array__(self, dtype=None):
+ """
+ the array interface, return my values
+ We return an object array here to preserve our scalar values
+ """
+ return self._coerce_to_ndarray()
+
+ def __iter__(self):
+ for i in range(len(self)):
+ if self._mask[i]:
+ yield self.dtype.na_value
+ else:
+ yield self._data[i]
+
+ def take(self, indexer, allow_fill=False, fill_value=None):
+ from pandas.api.extensions import take
+
+ # we always fill with 1 internally
+ # to avoid upcasting
+ data_fill_value = 1 if isna(fill_value) else fill_value
+ result = take(self._data, indexer, fill_value=data_fill_value,
+ allow_fill=allow_fill)
+
+ mask = take(self._mask, indexer, fill_value=True,
+ allow_fill=allow_fill)
+
+ # if we are filling
+ # we only fill where the indexer is null
+ # not existing missing values
+ # TODO(jreback) what if we have a non-na float as a fill value?
+ if allow_fill and notna(fill_value):
+ fill_mask = np.asarray(indexer) == -1
+ result[fill_mask] = fill_value
+ mask = mask ^ fill_mask
+
+ return type(self)(result, mask, copy=False)
+
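+    # Illustrative sketch (not part of the upstream file): -1 positions
+    # become missing when allow_fill=True. Indicative behaviour:
+    #   >>> import pandas as pd
+    #   >>> pd.array([1, None, 3], dtype='Int64').take([0, -1],
+    #   ...                                            allow_fill=True)
+    #   <IntegerArray>
+    #   [1, NaN]
+    #   Length: 2, dtype: Int64
+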
+ def copy(self, deep=False):
+ data, mask = self._data, self._mask
+ if deep:
+ data = copy.deepcopy(data)
+ mask = copy.deepcopy(mask)
+ else:
+ data = data.copy()
+ mask = mask.copy()
+ return type(self)(data, mask, copy=False)
+
+ def __setitem__(self, key, value):
+ _is_scalar = is_scalar(value)
+ if _is_scalar:
+ value = [value]
+ value, mask = coerce_to_array(value, dtype=self.dtype)
+
+ if _is_scalar:
+ value = value[0]
+ mask = mask[0]
+
+ self._data[key] = value
+ self._mask[key] = mask
+
+ def __len__(self):
+ return len(self._data)
+
+ @property
+ def nbytes(self):
+ return self._data.nbytes + self._mask.nbytes
+
+ def isna(self):
+ return self._mask
+
+ @property
+ def _na_value(self):
+ return np.nan
+
+ @classmethod
+ def _concat_same_type(cls, to_concat):
+ data = np.concatenate([x._data for x in to_concat])
+ mask = np.concatenate([x._mask for x in to_concat])
+ return cls(data, mask)
+
+ def astype(self, dtype, copy=True):
+ """
+ Cast to a NumPy array or IntegerArray with 'dtype'.
+
+ Parameters
+ ----------
+ dtype : str or dtype
+ Typecode or data-type to which the array is cast.
+ copy : bool, default True
+ Whether to copy the data, even if not necessary. If False,
+ a copy is made only if the old dtype does not match the
+ new dtype.
+
+ Returns
+ -------
+ array : ndarray or IntegerArray
+            NumPy ndarray or IntegerArray with 'dtype' for its dtype.
+
+ Raises
+ ------
+ TypeError
+ if incompatible type with an IntegerDtype, equivalent of same_kind
+ casting
+ """
+
+ # if we are astyping to an existing IntegerDtype we can fastpath
+ if isinstance(dtype, _IntegerDtype):
+ result = self._data.astype(dtype.numpy_dtype, copy=False)
+ return type(self)(result, mask=self._mask, copy=False)
+
+ # coerce
+ data = self._coerce_to_ndarray()
+ return astype_nansafe(data, dtype, copy=None)
+
+ @property
+ def _ndarray_values(self):
+ # type: () -> np.ndarray
+ """Internal pandas method for lossy conversion to a NumPy ndarray.
+
+ This method is not part of the pandas interface.
+
+ The expectation is that this is cheap to compute, and is primarily
+ used for interacting with our indexers.
+ """
+ return self._data
+
+ def value_counts(self, dropna=True):
+ """
+        Returns a Series containing counts of each unique value.
+
+        Missing values are excluded from the counts unless ``dropna=False``.
+
+ Parameters
+ ----------
+ dropna : boolean, default True
+ Don't include counts of NaN.
+
+ Returns
+ -------
+ counts : Series
+
+ See Also
+ --------
+ Series.value_counts
+
+ """
+
+ from pandas import Index, Series
+
+ # compute counts on the data with no nans
+ data = self._data[~self._mask]
+ value_counts = Index(data).value_counts()
+ array = value_counts.values
+
+ # TODO(extension)
+ # if we have allow Index to hold an ExtensionArray
+ # this is easier
+ index = value_counts.index.astype(object)
+
+ # if we want nans, count the mask
+ if not dropna:
+
+ # TODO(extension)
+ # appending to an Index *always* infers
+ # w/o passing the dtype
+ array = np.append(array, [self._mask.sum()])
+ index = Index(np.concatenate(
+ [index.values,
+ np.array([np.nan], dtype=object)]), dtype=object)
+
+ return Series(array, index=index)
+
+ def _values_for_argsort(self):
+ # type: () -> ndarray
+ """Return values for sorting.
+
+ Returns
+ -------
+ ndarray
+ The transformed values should maintain the ordering between values
+ within the array.
+
+ See Also
+ --------
+ ExtensionArray.argsort
+ """
+ data = self._data.copy()
+ data[self._mask] = data.min() - 1
+ return data
+
+ @classmethod
+ def _create_comparison_method(cls, op):
+ def cmp_method(self, other):
+
+ op_name = op.__name__
+ mask = None
+
+ if isinstance(other, (ABCSeries, ABCIndexClass)):
+ # Rely on pandas to unbox and dispatch to us.
+ return NotImplemented
+
+ if isinstance(other, IntegerArray):
+ other, mask = other._data, other._mask
+
+ elif is_list_like(other):
+ other = np.asarray(other)
+ if other.ndim > 0 and len(self) != len(other):
+ raise ValueError('Lengths must match to compare')
+
+ other = lib.item_from_zerodim(other)
+
+ # numpy will show a DeprecationWarning on invalid elementwise
+ # comparisons, this will raise in the future
+ with warnings.catch_warnings():
+ warnings.filterwarnings("ignore", "elementwise", FutureWarning)
+ with np.errstate(all='ignore'):
+ result = op(self._data, other)
+
+ # nans propagate
+ if mask is None:
+ mask = self._mask
+ else:
+ mask = self._mask | mask
+
+ result[mask] = True if op_name == 'ne' else False
+ return result
+
+ name = '__{name}__'.format(name=op.__name__)
+ return set_function_name(cmp_method, name, cls)
+
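+    # Illustrative sketch (not part of the upstream file): comparisons
+    # return a plain boolean ndarray, with masked slots forced to False
+    # (True for '!='). Indicative behaviour:
+    #   >>> import pandas as pd
+    #   >>> pd.array([1, None, 3], dtype='Int64') == 1
+    #   array([ True, False, False])
+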
+ def _reduce(self, name, skipna=True, **kwargs):
+ data = self._data
+ mask = self._mask
+
+ # coerce to a nan-aware float if needed
+ if mask.any():
+ data = self._data.astype('float64')
+ data[mask] = self._na_value
+
+ op = getattr(nanops, 'nan' + name)
+ result = op(data, axis=0, skipna=skipna, mask=mask)
+
+ # if we have a boolean op, don't coerce
+ if name in ['any', 'all']:
+ pass
+
+ # if we have a preservable numeric op,
+ # provide coercion back to an integer type if possible
+ elif name in ['sum', 'min', 'max', 'prod'] and notna(result):
+ int_result = int(result)
+ if int_result == result:
+ result = int_result
+
+ return result
+
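+    # Illustrative sketch (not part of the upstream file): reductions skip
+    # masked values, and integer-valued results come back as ints.
+    # Indicative behaviour:
+    #   >>> import pandas as pd
+    #   >>> pd.Series([1, 2, None], dtype='Int64').sum()
+    #   3
+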
+ def _maybe_mask_result(self, result, mask, other, op_name):
+ """
+ Parameters
+ ----------
+ result : array-like
+ mask : array-like bool
+ other : scalar or array-like
+ op_name : str
+ """
+
+ # may need to fill infs
+ # and mask wraparound
+ if is_float_dtype(result):
+ mask |= (result == np.inf) | (result == -np.inf)
+
+ # if we have a float operand we are by-definition
+ # a float result
+ # or our op is a divide
+ if ((is_float_dtype(other) or is_float(other)) or
+ (op_name in ['rtruediv', 'truediv', 'rdiv', 'div'])):
+ result[mask] = np.nan
+ return result
+
+ return type(self)(result, mask, copy=False)
+
+ @classmethod
+ def _create_arithmetic_method(cls, op):
+ def integer_arithmetic_method(self, other):
+
+ op_name = op.__name__
+ mask = None
+
+ if isinstance(other, (ABCSeries, ABCIndexClass)):
+ # Rely on pandas to unbox and dispatch to us.
+ return NotImplemented
+
+ if getattr(other, 'ndim', 0) > 1:
+ raise NotImplementedError(
+ "can only perform ops with 1-d structures")
+
+ if isinstance(other, IntegerArray):
+ other, mask = other._data, other._mask
+
+ elif getattr(other, 'ndim', None) == 0:
+ other = other.item()
+
+ elif is_list_like(other):
+ other = np.asarray(other)
+ if not other.ndim:
+ other = other.item()
+ elif other.ndim == 1:
+ if not (is_float_dtype(other) or is_integer_dtype(other)):
+ raise TypeError(
+ "can only perform ops with numeric values")
+ else:
+ if not (is_float(other) or is_integer(other)):
+ raise TypeError("can only perform ops with numeric values")
+
+ # nans propagate
+ if mask is None:
+ mask = self._mask
+ else:
+ mask = self._mask | mask
+
+ # 1 ** np.nan is 1. So we have to unmask those.
+ if op_name == 'pow':
+ mask = np.where(self == 1, False, mask)
+
+ elif op_name == 'rpow':
+ mask = np.where(other == 1, False, mask)
+
+ with np.errstate(all='ignore'):
+ result = op(self._data, other)
+
+ # divmod returns a tuple
+ if op_name == 'divmod':
+ div, mod = result
+ return (self._maybe_mask_result(div, mask, other, 'floordiv'),
+ self._maybe_mask_result(mod, mask, other, 'mod'))
+
+ return self._maybe_mask_result(result, mask, other, op_name)
+
+ name = '__{name}__'.format(name=op.__name__)
+ return set_function_name(integer_arithmetic_method, name, cls)
+
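+    # Illustrative sketch (not part of the upstream file): arithmetic
+    # propagates the mask and re-wraps the result as an IntegerArray.
+    # Indicative behaviour:
+    #   >>> import pandas as pd
+    #   >>> pd.array([1, None, 3], dtype='Int64') + 1
+    #   <IntegerArray>
+    #   [2, NaN, 4]
+    #   Length: 3, dtype: Int64
+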
+
+IntegerArray._add_arithmetic_ops()
+IntegerArray._add_comparison_ops()
+
+
+module = sys.modules[__name__]
+
+
+# create the Dtype
+_dtypes = {}
+for dtype in ['int8', 'int16', 'int32', 'int64',
+ 'uint8', 'uint16', 'uint32', 'uint64']:
+
+ if dtype.startswith('u'):
+ name = "U{}".format(dtype[1:].capitalize())
+ else:
+ name = dtype.capitalize()
+ classname = "{}Dtype".format(name)
+ numpy_dtype = getattr(np, dtype)
+ attributes_dict = {'type': numpy_dtype,
+ 'name': name}
+ dtype_type = register_extension_dtype(
+ type(classname, (_IntegerDtype, ), attributes_dict)
+ )
+ setattr(module, classname, dtype_type)
+
+ _dtypes[dtype] = dtype_type()
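+
+# Illustrative sketch (not part of the upstream file): the loop above creates
+# and registers Int8Dtype ... UInt64Dtype, so the string aliases resolve to
+# them. Indicative behaviour:
+#   >>> import pandas as pd
+#   >>> pd.array([1, None], dtype='UInt16').dtype
+#   UInt16Dtype()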
diff --git a/contrib/python/pandas/py2/pandas/core/arrays/interval.py b/contrib/python/pandas/py2/pandas/core/arrays/interval.py
new file mode 100644
index 00000000000..1e671c7bd95
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/core/arrays/interval.py
@@ -0,0 +1,1104 @@
+from operator import le, lt
+import textwrap
+
+import numpy as np
+
+from pandas._libs.interval import (
+ Interval, IntervalMixin, intervals_to_interval_bounds)
+from pandas.compat import add_metaclass
+from pandas.compat.numpy import function as nv
+from pandas.util._decorators import Appender
+from pandas.util._doctools import _WritableDoc
+
+from pandas.core.dtypes.cast import maybe_convert_platform
+from pandas.core.dtypes.common import (
+ is_categorical_dtype, is_datetime64_any_dtype, is_float_dtype,
+ is_integer_dtype, is_interval, is_interval_dtype, is_scalar,
+ is_string_dtype, is_timedelta64_dtype, pandas_dtype)
+from pandas.core.dtypes.dtypes import IntervalDtype
+from pandas.core.dtypes.generic import (
+ ABCDatetimeIndex, ABCInterval, ABCIntervalIndex, ABCPeriodIndex, ABCSeries)
+from pandas.core.dtypes.missing import isna, notna
+
+from pandas.core.arrays.base import (
+ ExtensionArray, _extension_array_shared_docs)
+from pandas.core.arrays.categorical import Categorical
+import pandas.core.common as com
+from pandas.core.config import get_option
+from pandas.core.indexes.base import Index, ensure_index
+
+_VALID_CLOSED = {'left', 'right', 'both', 'neither'}
+_interval_shared_docs = {}
+
+_shared_docs_kwargs = dict(
+ klass='IntervalArray',
+ qualname='arrays.IntervalArray',
+ name=''
+)
+
+
+_interval_shared_docs['class'] = """
+%(summary)s
+
+.. versionadded:: %(versionadded)s
+
+.. warning::
+
+ The indexing behaviors are provisional and may change in
+ a future version of pandas.
+
+Parameters
+----------
+data : array-like (1-dimensional)
+ Array-like containing Interval objects from which to build the
+ %(klass)s.
+closed : {'left', 'right', 'both', 'neither'}, default 'right'
+ Whether the intervals are closed on the left-side, right-side, both or
+ neither.
+dtype : dtype or None, default None
+ If None, dtype will be inferred.
+
+ .. versionadded:: 0.23.0
+copy : bool, default False
+ Copy the input data.
+%(name)s\
+verify_integrity : bool, default True
+ Verify that the %(klass)s is valid.
+
+Attributes
+----------
+left
+right
+closed
+mid
+length
+is_non_overlapping_monotonic
+%(extra_attributes)s\
+
+Methods
+-------
+from_arrays
+from_tuples
+from_breaks
+overlaps
+set_closed
+to_tuples
+%(extra_methods)s\
+
+See Also
+--------
+Index : The base pandas Index type.
+Interval : A bounded slice-like interval; the elements of an %(klass)s.
+interval_range : Function to create a fixed frequency IntervalIndex.
+cut : Bin values into discrete Intervals.
+qcut : Bin values into equal-sized Intervals based on rank or sample quantiles.
+
+Notes
+-----
+See the `user guide
+<http://pandas.pydata.org/pandas-docs/stable/advanced.html#intervalindex>`_
+for more.
+
+%(examples)s\
+"""
+
+
+@Appender(_interval_shared_docs['class'] % dict(
+ klass="IntervalArray",
+ summary="Pandas array for interval data that are closed on the same side.",
+ versionadded="0.24.0",
+ name='',
+ extra_attributes='',
+ extra_methods='',
+ examples=textwrap.dedent("""\
+ Examples
+ --------
+ A new ``IntervalArray`` can be constructed directly from an array-like of
+ ``Interval`` objects:
+
+ >>> pd.arrays.IntervalArray([pd.Interval(0, 1), pd.Interval(1, 5)])
+ IntervalArray([(0, 1], (1, 5]],
+ closed='right',
+ dtype='interval[int64]')
+
+ It may also be constructed using one of the constructor
+ methods: :meth:`IntervalArray.from_arrays`,
+ :meth:`IntervalArray.from_breaks`, and :meth:`IntervalArray.from_tuples`.
+ """),
+))
+@add_metaclass(_WritableDoc)
+class IntervalArray(IntervalMixin, ExtensionArray):
+ dtype = IntervalDtype()
+ ndim = 1
+ can_hold_na = True
+ _na_value = _fill_value = np.nan
+
+ def __new__(cls, data, closed=None, dtype=None, copy=False,
+ verify_integrity=True):
+
+ if isinstance(data, ABCSeries) and is_interval_dtype(data):
+ data = data.values
+
+ if isinstance(data, (cls, ABCIntervalIndex)):
+ left = data.left
+ right = data.right
+ closed = closed or data.closed
+ else:
+
+ # don't allow scalars
+ if is_scalar(data):
+ msg = ("{}(...) must be called with a collection of some kind,"
+ " {} was passed")
+ raise TypeError(msg.format(cls.__name__, data))
+
+ # might need to convert empty or purely na data
+ data = maybe_convert_platform_interval(data)
+ left, right, infer_closed = intervals_to_interval_bounds(
+ data, validate_closed=closed is None)
+ closed = closed or infer_closed
+
+ return cls._simple_new(left, right, closed, copy=copy, dtype=dtype,
+ verify_integrity=verify_integrity)
+
+ @classmethod
+ def _simple_new(cls, left, right, closed=None,
+ copy=False, dtype=None, verify_integrity=True):
+ result = IntervalMixin.__new__(cls)
+
+ closed = closed or 'right'
+ left = ensure_index(left, copy=copy)
+ right = ensure_index(right, copy=copy)
+
+ if dtype is not None:
+ # GH 19262: dtype must be an IntervalDtype to override inferred
+ dtype = pandas_dtype(dtype)
+ if not is_interval_dtype(dtype):
+ msg = 'dtype must be an IntervalDtype, got {dtype}'
+ raise TypeError(msg.format(dtype=dtype))
+ elif dtype.subtype is not None:
+ left = left.astype(dtype.subtype)
+ right = right.astype(dtype.subtype)
+
+ # coerce dtypes to match if needed
+ if is_float_dtype(left) and is_integer_dtype(right):
+ right = right.astype(left.dtype)
+ elif is_float_dtype(right) and is_integer_dtype(left):
+ left = left.astype(right.dtype)
+
+ if type(left) != type(right):
+ msg = ('must not have differing left [{ltype}] and right '
+ '[{rtype}] types')
+ raise ValueError(msg.format(ltype=type(left).__name__,
+ rtype=type(right).__name__))
+ elif is_categorical_dtype(left.dtype) or is_string_dtype(left.dtype):
+ # GH 19016
+ msg = ('category, object, and string subtypes are not supported '
+ 'for IntervalArray')
+ raise TypeError(msg)
+ elif isinstance(left, ABCPeriodIndex):
+ msg = 'Period dtypes are not supported, use a PeriodIndex instead'
+ raise ValueError(msg)
+ elif (isinstance(left, ABCDatetimeIndex) and
+ str(left.tz) != str(right.tz)):
+ msg = ("left and right must have the same time zone, got "
+ "'{left_tz}' and '{right_tz}'")
+ raise ValueError(msg.format(left_tz=left.tz, right_tz=right.tz))
+
+ result._left = left
+ result._right = right
+ result._closed = closed
+ if verify_integrity:
+ result._validate()
+ return result
+
+ @classmethod
+ def _from_sequence(cls, scalars, dtype=None, copy=False):
+ return cls(scalars, dtype=dtype, copy=copy)
+
+ @classmethod
+ def _from_factorized(cls, values, original):
+ if len(values) == 0:
+ # An empty array returns object-dtype here. We can't create
+ # a new IA from an (empty) object-dtype array, so turn it into the
+ # correct dtype.
+ values = values.astype(original.dtype.subtype)
+ return cls(values, closed=original.closed)
+
+ _interval_shared_docs['from_breaks'] = """
+ Construct an %(klass)s from an array of splits.
+
+ Parameters
+ ----------
+ breaks : array-like (1-dimensional)
+ Left and right bounds for each interval.
+ closed : {'left', 'right', 'both', 'neither'}, default 'right'
+ Whether the intervals are closed on the left-side, right-side, both
+ or neither.
+ copy : boolean, default False
+        Copy the data.
+ dtype : dtype or None, default None
+ If None, dtype will be inferred
+
+ .. versionadded:: 0.23.0
+
+ See Also
+ --------
+ interval_range : Function to create a fixed frequency IntervalIndex.
+ %(klass)s.from_arrays : Construct from a left and right array.
+ %(klass)s.from_tuples : Construct from a sequence of tuples.
+
+ Examples
+ --------
+ >>> pd.%(qualname)s.from_breaks([0, 1, 2, 3])
+ %(klass)s([(0, 1], (1, 2], (2, 3]],
+ closed='right',
+ dtype='interval[int64]')
+ """
+
+ @classmethod
+ @Appender(_interval_shared_docs['from_breaks'] % _shared_docs_kwargs)
+ def from_breaks(cls, breaks, closed='right', copy=False, dtype=None):
+ breaks = maybe_convert_platform_interval(breaks)
+
+ return cls.from_arrays(breaks[:-1], breaks[1:], closed, copy=copy,
+ dtype=dtype)
+
+ _interval_shared_docs['from_arrays'] = """
+ Construct from two arrays defining the left and right bounds.
+
+ Parameters
+ ----------
+ left : array-like (1-dimensional)
+ Left bounds for each interval.
+ right : array-like (1-dimensional)
+ Right bounds for each interval.
+ closed : {'left', 'right', 'both', 'neither'}, default 'right'
+ Whether the intervals are closed on the left-side, right-side, both
+ or neither.
+ copy : boolean, default False
+ Copy the data.
+ dtype : dtype, optional
+ If None, dtype will be inferred.
+
+ .. versionadded:: 0.23.0
+
+ Returns
+ -------
+ %(klass)s
+
+ Raises
+ ------
+ ValueError
+ When a value is missing in only one of `left` or `right`.
+ When a value in `left` is greater than the corresponding value
+ in `right`.
+
+ See Also
+ --------
+ interval_range : Function to create a fixed frequency IntervalIndex.
+ %(klass)s.from_breaks : Construct an %(klass)s from an array of
+ splits.
+ %(klass)s.from_tuples : Construct an %(klass)s from an
+ array-like of tuples.
+
+ Notes
+ -----
+ Each element of `left` must be less than or equal to the `right`
+ element at the same position. If an element is missing, it must be
+ missing in both `left` and `right`. A TypeError is raised when
+ using an unsupported type for `left` or `right`. At the moment,
+ 'category', 'object', and 'string' subtypes are not supported.
+
+ Examples
+ --------
+ >>> %(klass)s.from_arrays([0, 1, 2], [1, 2, 3])
+ %(klass)s([(0, 1], (1, 2], (2, 3]],
+ closed='right',
+ dtype='interval[int64]')
+ """
+
+ @classmethod
+ @Appender(_interval_shared_docs['from_arrays'] % _shared_docs_kwargs)
+ def from_arrays(cls, left, right, closed='right', copy=False, dtype=None):
+ left = maybe_convert_platform_interval(left)
+ right = maybe_convert_platform_interval(right)
+
+ return cls._simple_new(left, right, closed, copy=copy,
+ dtype=dtype, verify_integrity=True)
+
+ _interval_shared_docs['from_intervals'] = """
+ Construct an %(klass)s from a 1d array of Interval objects
+
+ .. deprecated:: 0.23.0
+
+ Parameters
+ ----------
+ data : array-like (1-dimensional)
+ Array of Interval objects. All intervals must be closed on the same
+ sides.
+ copy : boolean, default False
+        Whether to copy the data; present for compatibility only and ignored.
+ dtype : dtype or None, default None
+ If None, dtype will be inferred
+
+        .. versionadded:: 0.23.0
+
+ See Also
+ --------
+ interval_range : Function to create a fixed frequency IntervalIndex.
+ %(klass)s.from_arrays : Construct an %(klass)s from a left and
+ right array.
+ %(klass)s.from_breaks : Construct an %(klass)s from an array of
+ splits.
+ %(klass)s.from_tuples : Construct an %(klass)s from an
+ array-like of tuples.
+
+ Examples
+ --------
+ >>> pd.%(qualname)s.from_intervals([pd.Interval(0, 1),
+ ... pd.Interval(1, 2)])
+ %(klass)s([(0, 1], (1, 2]],
+ closed='right', dtype='interval[int64]')
+
+    The generic Index constructor works identically when it infers an array
+ of all intervals:
+
+ >>> pd.Index([pd.Interval(0, 1), pd.Interval(1, 2)])
+ %(klass)s([(0, 1], (1, 2]],
+ closed='right', dtype='interval[int64]')
+ """
+
+ _interval_shared_docs['from_tuples'] = """
+ Construct an %(klass)s from an array-like of tuples
+
+ Parameters
+ ----------
+ data : array-like (1-dimensional)
+ Array of tuples
+ closed : {'left', 'right', 'both', 'neither'}, default 'right'
+ Whether the intervals are closed on the left-side, right-side, both
+ or neither.
+ copy : boolean, default False
+        Whether to copy the data; present for compatibility only and ignored.
+ dtype : dtype or None, default None
+ If None, dtype will be inferred
+
+        .. versionadded:: 0.23.0
+
+ See Also
+ --------
+ interval_range : Function to create a fixed frequency IntervalIndex.
+ %(klass)s.from_arrays : Construct an %(klass)s from a left and
+ right array.
+ %(klass)s.from_breaks : Construct an %(klass)s from an array of
+ splits.
+
+ Examples
+ --------
+ >>> pd.%(qualname)s.from_tuples([(0, 1), (1, 2)])
+ %(klass)s([(0, 1], (1, 2]],
+ closed='right', dtype='interval[int64]')
+ """
+
+ @classmethod
+ @Appender(_interval_shared_docs['from_tuples'] % _shared_docs_kwargs)
+ def from_tuples(cls, data, closed='right', copy=False, dtype=None):
+ if len(data):
+ left, right = [], []
+ else:
+ # ensure that empty data keeps input dtype
+ left = right = data
+
+ for d in data:
+ if isna(d):
+ lhs = rhs = np.nan
+ else:
+ name = cls.__name__
+ try:
+ # need list of length 2 tuples, e.g. [(0, 1), (1, 2), ...]
+ lhs, rhs = d
+ except ValueError:
+ msg = ('{name}.from_tuples requires tuples of '
+ 'length 2, got {tpl}').format(name=name, tpl=d)
+ raise ValueError(msg)
+ except TypeError:
+ msg = ('{name}.from_tuples received an invalid '
+ 'item, {tpl}').format(name=name, tpl=d)
+ raise TypeError(msg)
+ left.append(lhs)
+ right.append(rhs)
+
+ return cls.from_arrays(left, right, closed, copy=False,
+ dtype=dtype)
+
+ def _validate(self):
+ """Verify that the IntervalArray is valid.
+
+ Checks that
+
+ * closed is valid
+ * left and right match lengths
+ * left and right have the same missing values
+ * left is always below right
+ """
+ if self.closed not in _VALID_CLOSED:
+ raise ValueError("invalid option for 'closed': {closed}"
+ .format(closed=self.closed))
+ if len(self.left) != len(self.right):
+ raise ValueError('left and right must have the same length')
+ left_mask = notna(self.left)
+ right_mask = notna(self.right)
+ if not (left_mask == right_mask).all():
+ raise ValueError('missing values must be missing in the same '
+ 'location both left and right sides')
+ if not (self.left[left_mask] <= self.right[left_mask]).all():
+ raise ValueError('left side of interval must be <= right side')
+
+ # ---------
+ # Interface
+ # ---------
+ def __iter__(self):
+ return iter(np.asarray(self))
+
+ def __len__(self):
+ return len(self.left)
+
+ def __getitem__(self, value):
+ left = self.left[value]
+ right = self.right[value]
+
+ # scalar
+ if not isinstance(left, Index):
+ if isna(left):
+ return self._fill_value
+ return Interval(left, right, self.closed)
+
+ return self._shallow_copy(left, right)
+
+ def __setitem__(self, key, value):
+ # na value: need special casing to set directly on numpy arrays
+ needs_float_conversion = False
+ if is_scalar(value) and isna(value):
+ if is_integer_dtype(self.dtype.subtype):
+ # can't set NaN on a numpy integer array
+ needs_float_conversion = True
+ elif is_datetime64_any_dtype(self.dtype.subtype):
+ # need proper NaT to set directly on the numpy array
+ value = np.datetime64('NaT')
+ elif is_timedelta64_dtype(self.dtype.subtype):
+ # need proper NaT to set directly on the numpy array
+ value = np.timedelta64('NaT')
+ value_left, value_right = value, value
+
+ # scalar interval
+ elif is_interval_dtype(value) or isinstance(value, ABCInterval):
+ self._check_closed_matches(value, name="value")
+ value_left, value_right = value.left, value.right
+
+ else:
+ # list-like of intervals
+ try:
+ array = IntervalArray(value)
+ value_left, value_right = array.left, array.right
+ except TypeError:
+ # wrong type: not interval or NA
+ msg = "'value' should be an interval type, got {} instead."
+ raise TypeError(msg.format(type(value)))
+
+ # Need to ensure that left and right are updated atomically, so we're
+ # forced to copy, update the copy, and swap in the new values.
+ left = self.left.copy(deep=True)
+ if needs_float_conversion:
+ left = left.astype('float')
+ left.values[key] = value_left
+ self._left = left
+
+ right = self.right.copy(deep=True)
+ if needs_float_conversion:
+ right = right.astype('float')
+ right.values[key] = value_right
+ self._right = right
+
+ def fillna(self, value=None, method=None, limit=None):
+ """
+ Fill NA/NaN values using the specified method.
+
+ Parameters
+ ----------
+ value : scalar, dict, Series
+ If a scalar value is passed it is used to fill all missing values.
+ Alternatively, a Series or dict can be used to fill in different
+ values for each index. The value should not be a list. The
+ value(s) passed should be either Interval objects or NA/NaN.
+ method : {'backfill', 'bfill', 'pad', 'ffill', None}, default None
+ (Not implemented yet for IntervalArray)
+ Method to use for filling holes in reindexed Series
+ limit : int, default None
+ (Not implemented yet for IntervalArray)
+ If method is specified, this is the maximum number of consecutive
+ NaN values to forward/backward fill. In other words, if there is
+ a gap with more than this number of consecutive NaNs, it will only
+ be partially filled. If method is not specified, this is the
+ maximum number of entries along the entire axis where NaNs will be
+ filled.
+
+ Returns
+ -------
+ filled : IntervalArray with NA/NaN filled
+ """
+ if method is not None:
+ raise TypeError('Filling by method is not supported for '
+ 'IntervalArray.')
+ if limit is not None:
+ raise TypeError('limit is not supported for IntervalArray.')
+
+ if not isinstance(value, ABCInterval):
+ msg = ("'IntervalArray.fillna' only supports filling with a "
+ "scalar 'pandas.Interval'. Got a '{}' instead."
+ .format(type(value).__name__))
+ raise TypeError(msg)
+
+ value = getattr(value, '_values', value)
+ self._check_closed_matches(value, name="value")
+
+ left = self.left.fillna(value=value.left)
+ right = self.right.fillna(value=value.right)
+ return self._shallow_copy(left, right)
+
+ @property
+ def dtype(self):
+ return IntervalDtype(self.left.dtype)
+
+ def astype(self, dtype, copy=True):
+ """
+ Cast to an ExtensionArray or NumPy array with dtype 'dtype'.
+
+ Parameters
+ ----------
+ dtype : str or dtype
+ Typecode or data-type to which the array is cast.
+
+ copy : bool, default True
+ Whether to copy the data, even if not necessary. If False,
+ a copy is made only if the old dtype does not match the
+ new dtype.
+
+ Returns
+ -------
+ array : ExtensionArray or ndarray
+ ExtensionArray or NumPy ndarray with 'dtype' for its dtype.
+ """
+ dtype = pandas_dtype(dtype)
+ if is_interval_dtype(dtype):
+ if dtype == self.dtype:
+ return self.copy() if copy else self
+
+ # need to cast to different subtype
+ try:
+ new_left = self.left.astype(dtype.subtype)
+ new_right = self.right.astype(dtype.subtype)
+ except TypeError:
+ msg = ('Cannot convert {dtype} to {new_dtype}; subtypes are '
+ 'incompatible')
+ raise TypeError(msg.format(dtype=self.dtype, new_dtype=dtype))
+ return self._shallow_copy(new_left, new_right)
+ elif is_categorical_dtype(dtype):
+ return Categorical(np.asarray(self))
+ # TODO: This try/except will be repeated.
+ try:
+ return np.asarray(self).astype(dtype, copy=copy)
+ except (TypeError, ValueError):
+ msg = 'Cannot cast {name} to dtype {dtype}'
+ raise TypeError(msg.format(name=type(self).__name__, dtype=dtype))
+
+ @classmethod
+ def _concat_same_type(cls, to_concat):
+ """
+ Concatenate multiple IntervalArray
+
+ Parameters
+ ----------
+ to_concat : sequence of IntervalArray
+
+ Returns
+ -------
+ IntervalArray
+ """
+ closed = {interval.closed for interval in to_concat}
+ if len(closed) != 1:
+ raise ValueError("Intervals must all be closed on the same side.")
+ closed = closed.pop()
+
+ left = np.concatenate([interval.left for interval in to_concat])
+ right = np.concatenate([interval.right for interval in to_concat])
+ return cls._simple_new(left, right, closed=closed, copy=False)
+
+ def _shallow_copy(self, left=None, right=None, closed=None):
+ """
+ Return a new IntervalArray with the replacement attributes
+
+ Parameters
+ ----------
+ left : array-like
+            Values to be used for the left-side of the intervals.
+ If None, the existing left and right values will be used.
+
+ right : array-like
+            Values to be used for the right-side of the intervals.
+ If None and left is IntervalArray-like, the left and right
+ of the IntervalArray-like will be used.
+
+ closed : {'left', 'right', 'both', 'neither'}, optional
+ Whether the intervals are closed on the left-side, right-side, both
+ or neither. If None, the existing closed will be used.
+ """
+ if left is None:
+
+ # no values passed
+ left, right = self.left, self.right
+
+ elif right is None:
+
+ # only single value passed, could be an IntervalArray
+ # or array of Intervals
+ if not isinstance(left, (type(self), ABCIntervalIndex)):
+ left = type(self)(left)
+
+ left, right = left.left, left.right
+ else:
+
+ # both left and right are values
+ pass
+
+ closed = closed or self.closed
+ return self._simple_new(
+ left, right, closed=closed, verify_integrity=False)
+
+ def copy(self, deep=False):
+ """
+ Return a copy of the array.
+
+ Parameters
+ ----------
+ deep : bool, default False
+ Also copy the underlying data backing this array.
+
+ Returns
+ -------
+ IntervalArray
+ """
+ left = self.left.copy(deep=True) if deep else self.left
+ right = self.right.copy(deep=True) if deep else self.right
+ closed = self.closed
+ # TODO: Could skip verify_integrity here.
+ return type(self).from_arrays(left, right, closed=closed)
+
+ def isna(self):
+ return isna(self.left)
+
+ @property
+ def nbytes(self):
+ return self.left.nbytes + self.right.nbytes
+
+ @property
+ def size(self):
+ # Avoid materializing self.values
+ return self.left.size
+
+ @property
+ def shape(self):
+ return self.left.shape
+
+ def take(self, indices, allow_fill=False, fill_value=None, axis=None,
+ **kwargs):
+ """
+ Take elements from the IntervalArray.
+
+ Parameters
+ ----------
+ indices : sequence of integers
+ Indices to be taken.
+
+ allow_fill : bool, default False
+ How to handle negative values in `indices`.
+
+ * False: negative values in `indices` indicate positional indices
+ from the right (the default). This is similar to
+ :func:`numpy.take`.
+
+ * True: negative values in `indices` indicate
+              missing values. These values are set to `fill_value`. Any
+              other negative values raise a ``ValueError``.
+
+ fill_value : Interval or NA, optional
+ Fill value to use for NA-indices when `allow_fill` is True.
+ This may be ``None``, in which case the default NA value for
+ the type, ``self.dtype.na_value``, is used.
+
+ For many ExtensionArrays, there will be two representations of
+ `fill_value`: a user-facing "boxed" scalar, and a low-level
+ physical NA value. `fill_value` should be the user-facing version,
+ and the implementation should handle translating that to the
+ physical version for processing the take if necessary.
+
+ axis : any, default None
+ Present for compat with IntervalIndex; does nothing.
+
+ Returns
+ -------
+ IntervalArray
+
+ Raises
+ ------
+ IndexError
+ When the indices are out of bounds for the array.
+ ValueError
+ When `indices` contains negative values other than ``-1``
+ and `allow_fill` is True.
+ """
+ from pandas.core.algorithms import take
+
+ nv.validate_take(tuple(), kwargs)
+
+ fill_left = fill_right = fill_value
+ if allow_fill:
+ if fill_value is None:
+ fill_left = fill_right = self.left._na_value
+ elif is_interval(fill_value):
+ self._check_closed_matches(fill_value, name='fill_value')
+ fill_left, fill_right = fill_value.left, fill_value.right
+ elif not is_scalar(fill_value) and notna(fill_value):
+                msg = ("'fill_value' for 'IntervalArray.take' must be a "
+                       "scalar 'pandas.Interval' or NA. Got a '{}' instead."
+ .format(type(fill_value).__name__))
+ raise ValueError(msg)
+
+ left_take = take(self.left, indices,
+ allow_fill=allow_fill, fill_value=fill_left)
+ right_take = take(self.right, indices,
+ allow_fill=allow_fill, fill_value=fill_right)
+
+ return self._shallow_copy(left_take, right_take)
+
+ def value_counts(self, dropna=True):
+ """
+ Returns a Series containing counts of each interval.
+
+ Parameters
+ ----------
+ dropna : boolean, default True
+ Don't include counts of NaN.
+
+ Returns
+ -------
+ counts : Series
+
+ See Also
+ --------
+ Series.value_counts
+ """
+        # TODO: implement this in a non-naive way!
+ from pandas.core.algorithms import value_counts
+ return value_counts(np.asarray(self), dropna=dropna)
+
+ # Formatting
+
+ def _format_data(self):
+
+ # TODO: integrate with categorical and make generic
+ # name argument is unused here; just for compat with base / categorical
+ n = len(self)
+ max_seq_items = min((get_option(
+ 'display.max_seq_items') or n) // 10, 10)
+
+ formatter = str
+
+ if n == 0:
+ summary = '[]'
+ elif n == 1:
+ first = formatter(self[0])
+ summary = '[{first}]'.format(first=first)
+ elif n == 2:
+ first = formatter(self[0])
+ last = formatter(self[-1])
+ summary = '[{first}, {last}]'.format(first=first, last=last)
+ else:
+
+ if n > max_seq_items:
+ n = min(max_seq_items // 2, 10)
+ head = [formatter(x) for x in self[:n]]
+ tail = [formatter(x) for x in self[-n:]]
+ summary = '[{head} ... {tail}]'.format(
+ head=', '.join(head), tail=', '.join(tail))
+ else:
+ tail = [formatter(x) for x in self]
+ summary = '[{tail}]'.format(tail=', '.join(tail))
+
+ return summary
+
+ def __repr__(self):
+ tpl = textwrap.dedent("""\
+ {cls}({data},
+ {lead}closed='{closed}',
+ {lead}dtype='{dtype}')""")
+ return tpl.format(cls=self.__class__.__name__,
+ data=self._format_data(),
+ lead=' ' * len(self.__class__.__name__) + ' ',
+ closed=self.closed, dtype=self.dtype)
+
+ def _format_space(self):
+ space = ' ' * (len(self.__class__.__name__) + 1)
+ return "\n{space}".format(space=space)
+
+ @property
+ def left(self):
+ """
+ Return the left endpoints of each Interval in the IntervalArray as
+ an Index
+ """
+ return self._left
+
+ @property
+ def right(self):
+ """
+ Return the right endpoints of each Interval in the IntervalArray as
+ an Index
+ """
+ return self._right
+
+ @property
+ def closed(self):
+ """
+ Whether the intervals are closed on the left-side, right-side, both or
+ neither
+ """
+ return self._closed
+
+ _interval_shared_docs['set_closed'] = """
+ Return an %(klass)s identical to the current one, but closed on the
+ specified side
+
+ .. versionadded:: 0.24.0
+
+ Parameters
+ ----------
+ closed : {'left', 'right', 'both', 'neither'}
+ Whether the intervals are closed on the left-side, right-side, both
+ or neither.
+
+ Returns
+ -------
+ new_index : %(klass)s
+
+ Examples
+ --------
+ >>> index = pd.interval_range(0, 3)
+ >>> index
+ IntervalIndex([(0, 1], (1, 2], (2, 3]],
+ closed='right',
+ dtype='interval[int64]')
+ >>> index.set_closed('both')
+ IntervalIndex([[0, 1], [1, 2], [2, 3]],
+ closed='both',
+ dtype='interval[int64]')
+ """
+
+ @Appender(_interval_shared_docs['set_closed'] % _shared_docs_kwargs)
+ def set_closed(self, closed):
+ if closed not in _VALID_CLOSED:
+ msg = "invalid option for 'closed': {closed}"
+ raise ValueError(msg.format(closed=closed))
+
+ return self._shallow_copy(closed=closed)
+
+ @property
+ def length(self):
+ """
+ Return an Index with entries denoting the length of each Interval in
+ the IntervalArray
+ """
+ try:
+ return self.right - self.left
+ except TypeError:
+ # length not defined for some types, e.g. string
+ msg = ('IntervalArray contains Intervals without defined length, '
+ 'e.g. Intervals with string endpoints')
+ raise TypeError(msg)
+
+ @property
+ def mid(self):
+ """
+ Return the midpoint of each Interval in the IntervalArray as an Index
+ """
+ try:
+ return 0.5 * (self.left + self.right)
+ except TypeError:
+ # datetime safe version
+ return self.left + 0.5 * self.length
+
+ _interval_shared_docs['is_non_overlapping_monotonic'] = """
+ Return True if the %(klass)s is non-overlapping (no Intervals share
+ points) and is either monotonic increasing or monotonic decreasing,
+ else False
+ """
+
+ @property
+ @Appender(_interval_shared_docs['is_non_overlapping_monotonic']
+ % _shared_docs_kwargs)
+ def is_non_overlapping_monotonic(self):
+ # must be increasing (e.g., [0, 1), [1, 2), [2, 3), ... )
+ # or decreasing (e.g., [-1, 0), [-2, -1), [-3, -2), ...)
+ # we already require left <= right
+
+ # strict inequality for closed == 'both'; equality implies overlapping
+ # at a point when both sides of intervals are included
+ if self.closed == 'both':
+ return bool((self.right[:-1] < self.left[1:]).all() or
+ (self.left[:-1] > self.right[1:]).all())
+
+ # non-strict inequality when closed != 'both'; at least one side is
+ # not included in the intervals, so equality does not imply overlapping
+ return bool((self.right[:-1] <= self.left[1:]).all() or
+ (self.left[:-1] >= self.right[1:]).all())
+
+ # Conversion
+ def __array__(self, dtype=None):
+ """
+ Return the IntervalArray's data as a numpy array of Interval
+ objects (with dtype='object')
+ """
+ left = self.left
+ right = self.right
+ mask = self.isna()
+ closed = self._closed
+
+ result = np.empty(len(left), dtype=object)
+ for i in range(len(left)):
+ if mask[i]:
+ result[i] = np.nan
+ else:
+ result[i] = Interval(left[i], right[i], closed)
+ return result
+
+ _interval_shared_docs['to_tuples'] = """\
+ Return an %(return_type)s of tuples of the form (left, right)
+
+ Parameters
+ ----------
+ na_tuple : boolean, default True
+            If True, return NA as the tuple ``(nan, nan)``; if False, return
+            the NA value itself, ``nan``.
+
+ .. versionadded:: 0.23.0
+
+ Returns
+ -------
+ tuples: %(return_type)s
+ %(examples)s\
+ """
+
+ @Appender(_interval_shared_docs['to_tuples'] % dict(
+ return_type='ndarray',
+ examples='',
+ ))
+ def to_tuples(self, na_tuple=True):
+ tuples = com.asarray_tuplesafe(zip(self.left, self.right))
+ if not na_tuple:
+ # GH 18756
+ tuples = np.where(~self.isna(), tuples, np.nan)
+ return tuples
+
+ @Appender(_extension_array_shared_docs['repeat'] % _shared_docs_kwargs)
+ def repeat(self, repeats, axis=None):
+ nv.validate_repeat(tuple(), dict(axis=axis))
+ left_repeat = self.left.repeat(repeats)
+ right_repeat = self.right.repeat(repeats)
+ return self._shallow_copy(left=left_repeat, right=right_repeat)
+
+ _interval_shared_docs['overlaps'] = """
+ Check elementwise if an Interval overlaps the values in the %(klass)s.
+
+ Two intervals overlap if they share a common point, including closed
+ endpoints. Intervals that only have an open endpoint in common do not
+ overlap.
+
+ .. versionadded:: 0.24.0
+
+ Parameters
+ ----------
+ other : Interval
+ Interval to check against for an overlap.
+
+ Returns
+ -------
+ ndarray
+ Boolean array positionally indicating where an overlap occurs.
+
+ See Also
+ --------
+ Interval.overlaps : Check whether two Interval objects overlap.
+
+ Examples
+ --------
+ >>> intervals = pd.%(qualname)s.from_tuples([(0, 1), (1, 3), (2, 4)])
+ >>> intervals
+ %(klass)s([(0, 1], (1, 3], (2, 4]],
+ closed='right',
+ dtype='interval[int64]')
+ >>> intervals.overlaps(pd.Interval(0.5, 1.5))
+ array([ True, True, False])
+
+ Intervals that share closed endpoints overlap:
+
+ >>> intervals.overlaps(pd.Interval(1, 3, closed='left'))
+ array([ True, True, True])
+
+ Intervals that only have an open endpoint in common do not overlap:
+
+ >>> intervals.overlaps(pd.Interval(1, 2, closed='right'))
+ array([False, True, False])
+ """
+
+ @Appender(_interval_shared_docs['overlaps'] % _shared_docs_kwargs)
+ def overlaps(self, other):
+ if isinstance(other, (IntervalArray, ABCIntervalIndex)):
+ raise NotImplementedError
+ elif not isinstance(other, Interval):
+ msg = '`other` must be Interval-like, got {other}'
+ raise TypeError(msg.format(other=type(other).__name__))
+
+ # equality is okay if both endpoints are closed (overlap at a point)
+ op1 = le if (self.closed_left and other.closed_right) else lt
+ op2 = le if (other.closed_left and self.closed_right) else lt
+
+        # overlaps is equivalent to the negation of the two intervals being
+        # disjoint:
+        # disjoint = (A.left > B.right) or (B.left > A.right)
+        # (simplifying the negation allows this to be done in fewer operations)
+ return op1(self.left, other.right) & op2(other.left, self.right)
+
+
+def maybe_convert_platform_interval(values):
+ """
+ Try to do platform conversion, with special casing for IntervalArray.
+ Wrapper around maybe_convert_platform that alters the default return
+ dtype in certain cases to be compatible with IntervalArray. For example,
+ empty lists return with integer dtype instead of object dtype, which is
+ prohibited for IntervalArray.
+
+ Parameters
+ ----------
+ values : array-like
+
+ Returns
+ -------
+ array
+ """
+ if isinstance(values, (list, tuple)) and len(values) == 0:
+ # GH 19016
+        # empty lists/tuples get object dtype by default, but object dtype
+        # is prohibited for IntervalArray, so coerce to integer instead
+ return np.array([], dtype=np.int64)
+ elif is_categorical_dtype(values):
+ values = np.asarray(values)
+
+ return maybe_convert_platform(values)
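A rough usage sketch for the array defined above (assuming the pandas >= 0.24 public namespace `pd.arrays.IntervalArray`):

import pandas as pd

arr = pd.arrays.IntervalArray.from_breaks([0, 1, 2, 3])
len(arr), arr.closed                      # (3, 'right')
arr.mid                                   # Float64Index([0.5, 1.5, 2.5], dtype='float64')
arr.overlaps(pd.Interval(0.5, 1.5))       # array([ True,  True, False])

# per maybe_convert_platform_interval above, an empty list is coerced to
# int64 rather than object dtype
pd.arrays.IntervalArray.from_tuples([]).dtype   # interval[int64]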
diff --git a/contrib/python/pandas/py2/pandas/core/arrays/numpy_.py b/contrib/python/pandas/py2/pandas/core/arrays/numpy_.py
new file mode 100644
index 00000000000..791ff44303e
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/core/arrays/numpy_.py
@@ -0,0 +1,458 @@
+import numbers
+
+import numpy as np
+
+from pandas._libs import lib
+from pandas.compat.numpy import function as nv
+from pandas.util._validators import validate_fillna_kwargs
+
+from pandas.core.dtypes.dtypes import ExtensionDtype
+from pandas.core.dtypes.generic import ABCIndexClass, ABCSeries
+from pandas.core.dtypes.inference import is_array_like, is_list_like
+
+from pandas import compat
+from pandas.core import nanops
+from pandas.core.missing import backfill_1d, pad_1d
+
+from .base import ExtensionArray, ExtensionOpsMixin
+
+
+class PandasDtype(ExtensionDtype):
+ """
+ A Pandas ExtensionDtype for NumPy dtypes.
+
+ .. versionadded:: 0.24.0
+
+ This is mostly for internal compatibility, and is not especially
+ useful on its own.
+
+ Parameters
+ ----------
+ dtype : numpy.dtype
+ """
+ _metadata = ('_dtype',)
+
+ def __init__(self, dtype):
+ dtype = np.dtype(dtype)
+ self._dtype = dtype
+ self._name = dtype.name
+ self._type = dtype.type
+
+ def __repr__(self):
+ return "PandasDtype({!r})".format(self.name)
+
+ @property
+ def numpy_dtype(self):
+ """The NumPy dtype this PandasDtype wraps."""
+ return self._dtype
+
+ @property
+ def name(self):
+ return self._name
+
+ @property
+ def type(self):
+ return self._type
+
+ @property
+ def _is_numeric(self):
+ # exclude object, str, unicode, void.
+ return self.kind in set('biufc')
+
+ @property
+ def _is_boolean(self):
+ return self.kind == 'b'
+
+ @classmethod
+ def construct_from_string(cls, string):
+ return cls(np.dtype(string))
+
+    @classmethod
+    def construct_array_type(cls):
+ return PandasArray
+
+ @property
+ def kind(self):
+ return self._dtype.kind
+
+ @property
+ def itemsize(self):
+ """The element size of this data-type object."""
+ return self._dtype.itemsize
+
+
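A minimal sketch of how the dtype wrapper above behaves when constructed directly (it is normally created internally when an ndarray is wrapped; the import path assumes this module lives at pandas.core.arrays.numpy_):

import numpy as np
from pandas.core.arrays.numpy_ import PandasDtype

dtype = PandasDtype(np.dtype('float64'))
dtype.name, dtype.kind       # ('float64', 'f')
dtype._is_numeric            # True, since kind 'f' is in 'biufc'
dtype.itemsize               # 8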
+# TODO(NumPy1.13): remove this
+# Compat for NumPy 1.12, which doesn't provide NDArrayOperatorsMixin
+# or __array_ufunc__, so those operations won't be available to people
+# on older NumPys.
+#
+# We would normally write this as bases=(...), then "class Foo(*bases):",
+# but Python2 doesn't allow unpacking tuples in the class statement.
+# So, we fall back to "object", to avoid writing a metaclass.
+try:
+ from numpy.lib.mixins import NDArrayOperatorsMixin
+except ImportError:
+ NDArrayOperatorsMixin = object
+
+
+class PandasArray(ExtensionArray, ExtensionOpsMixin, NDArrayOperatorsMixin):
+ """
+ A pandas ExtensionArray for NumPy data.
+
+    .. versionadded:: 0.24.0
+
+ This is mostly for internal compatibility, and is not especially
+ useful on its own.
+
+ Parameters
+ ----------
+ values : ndarray
+ The NumPy ndarray to wrap. Must be 1-dimensional.
+ copy : bool, default False
+ Whether to copy `values`.
+
+ Notes
+ -----
+    Operations like ``+`` and applying ufuncs require NumPy >= 1.13.
+ """
+ # If you're wondering why pd.Series(cls) doesn't put the array in an
+ # ExtensionBlock, search for `ABCPandasArray`. We check for
+ # that _typ to ensure that that users don't unnecessarily use EAs inside
+ # pandas internals, which turns off things like block consolidation.
+ _typ = "npy_extension"
+ __array_priority__ = 1000
+
+ # ------------------------------------------------------------------------
+ # Constructors
+
+ def __init__(self, values, copy=False):
+ if isinstance(values, type(self)):
+ values = values._ndarray
+ if not isinstance(values, np.ndarray):
+ raise ValueError("'values' must be a NumPy array.")
+
+ if values.ndim != 1:
+ raise ValueError("PandasArray must be 1-dimensional.")
+
+ if copy:
+ values = values.copy()
+
+ self._ndarray = values
+ self._dtype = PandasDtype(values.dtype)
+
+ @classmethod
+ def _from_sequence(cls, scalars, dtype=None, copy=False):
+ if isinstance(dtype, PandasDtype):
+ dtype = dtype._dtype
+
+ result = np.asarray(scalars, dtype=dtype)
+ if copy and result is scalars:
+ result = result.copy()
+ return cls(result)
+
+ @classmethod
+ def _from_factorized(cls, values, original):
+ return cls(values)
+
+ @classmethod
+ def _concat_same_type(cls, to_concat):
+ return cls(np.concatenate(to_concat))
+
+ # ------------------------------------------------------------------------
+ # Data
+
+ @property
+ def dtype(self):
+ return self._dtype
+
+ # ------------------------------------------------------------------------
+ # NumPy Array Interface
+
+ def __array__(self, dtype=None):
+ return np.asarray(self._ndarray, dtype=dtype)
+
+ _HANDLED_TYPES = (np.ndarray, numbers.Number)
+
+ def __array_ufunc__(self, ufunc, method, *inputs, **kwargs):
+ # Lightly modified version of
+ # https://docs.scipy.org/doc/numpy-1.15.1/reference/generated/\
+ # numpy.lib.mixins.NDArrayOperatorsMixin.html
+ # The primary modification is not boxing scalar return values
+ # in PandasArray, since pandas' ExtensionArrays are 1-d.
+ out = kwargs.get('out', ())
+ for x in inputs + out:
+ # Only support operations with instances of _HANDLED_TYPES.
+ # Use PandasArray instead of type(self) for isinstance to
+ # allow subclasses that don't override __array_ufunc__ to
+ # handle PandasArray objects.
+ if not isinstance(x, self._HANDLED_TYPES + (PandasArray,)):
+ return NotImplemented
+
+ # Defer to the implementation of the ufunc on unwrapped values.
+ inputs = tuple(x._ndarray if isinstance(x, PandasArray) else x
+ for x in inputs)
+ if out:
+ kwargs['out'] = tuple(
+ x._ndarray if isinstance(x, PandasArray) else x
+ for x in out)
+ result = getattr(ufunc, method)(*inputs, **kwargs)
+
+ if type(result) is tuple and len(result):
+ # multiple return values
+ if not lib.is_scalar(result[0]):
+ # re-box array-like results
+ return tuple(type(self)(x) for x in result)
+ else:
+ # but not scalar reductions
+ return result
+ elif method == 'at':
+ # no return value
+ return None
+ else:
+ # one return value
+ if not lib.is_scalar(result):
+ # re-box array-like results, but not scalar reductions
+ result = type(self)(result)
+ return result
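A short sketch of the boxing rules implemented above (hedged; assumes NumPy >= 1.13 so that `__array_ufunc__` is honoured, and pandas >= 0.24 exposing `pd.arrays.PandasArray`):

import numpy as np
import pandas as pd

arr = pd.arrays.PandasArray(np.array([1.0, 4.0, 9.0]))
np.sqrt(arr)         # PandasArray([1., 2., 3.]) -- array results are re-boxed
np.add.reduce(arr)   # 14.0 -- scalar reductions are returned unboxed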
+
+ # ------------------------------------------------------------------------
+ # Pandas ExtensionArray Interface
+
+ def __getitem__(self, item):
+ if isinstance(item, type(self)):
+ item = item._ndarray
+
+ result = self._ndarray[item]
+ if not lib.is_scalar(item):
+ result = type(self)(result)
+ return result
+
+ def __setitem__(self, key, value):
+ from pandas.core.internals.arrays import extract_array
+
+ value = extract_array(value, extract_numpy=True)
+
+ if not lib.is_scalar(key) and is_list_like(key):
+ key = np.asarray(key)
+
+ if not lib.is_scalar(value):
+ value = np.asarray(value)
+
+ values = self._ndarray
+ t = np.result_type(value, values)
+ if t != self._ndarray.dtype:
+ values = values.astype(t, casting='safe')
+ values[key] = value
+ self._dtype = PandasDtype(t)
+ self._ndarray = values
+ else:
+ self._ndarray[key] = value
+
+ def __len__(self):
+ return len(self._ndarray)
+
+ @property
+ def nbytes(self):
+ return self._ndarray.nbytes
+
+ def isna(self):
+ from pandas import isna
+
+ return isna(self._ndarray)
+
+ def fillna(self, value=None, method=None, limit=None):
+ # TODO(_values_for_fillna): remove this
+ value, method = validate_fillna_kwargs(value, method)
+
+ mask = self.isna()
+
+ if is_array_like(value):
+ if len(value) != len(self):
+                raise ValueError("Length of 'value' does not match. Got ({}), "
+                                 "expected {}".format(len(value), len(self)))
+ value = value[mask]
+
+ if mask.any():
+ if method is not None:
+ func = pad_1d if method == 'pad' else backfill_1d
+ new_values = func(self._ndarray, limit=limit,
+ mask=mask)
+ new_values = self._from_sequence(new_values, dtype=self.dtype)
+ else:
+ # fill with value
+ new_values = self.copy()
+ new_values[mask] = value
+ else:
+ new_values = self.copy()
+ return new_values
+
+ def take(self, indices, allow_fill=False, fill_value=None):
+ from pandas.core.algorithms import take
+
+ result = take(self._ndarray, indices, allow_fill=allow_fill,
+ fill_value=fill_value)
+ return type(self)(result)
+
+ def copy(self, deep=False):
+ return type(self)(self._ndarray.copy())
+
+ def _values_for_argsort(self):
+ return self._ndarray
+
+ def _values_for_factorize(self):
+ return self._ndarray, -1
+
+ def unique(self):
+ from pandas import unique
+
+ return type(self)(unique(self._ndarray))
+
+ # ------------------------------------------------------------------------
+ # Reductions
+
+ def _reduce(self, name, skipna=True, **kwargs):
+ meth = getattr(self, name, None)
+ if meth:
+ return meth(skipna=skipna, **kwargs)
+ else:
+ msg = (
+ "'{}' does not implement reduction '{}'"
+ )
+ raise TypeError(msg.format(type(self).__name__, name))
+
+ def any(self, axis=None, out=None, keepdims=False, skipna=True):
+ nv.validate_any((), dict(out=out, keepdims=keepdims))
+ return nanops.nanany(self._ndarray, axis=axis, skipna=skipna)
+
+ def all(self, axis=None, out=None, keepdims=False, skipna=True):
+ nv.validate_all((), dict(out=out, keepdims=keepdims))
+ return nanops.nanall(self._ndarray, axis=axis, skipna=skipna)
+
+ def min(self, axis=None, out=None, keepdims=False, skipna=True):
+ nv.validate_min((), dict(out=out, keepdims=keepdims))
+ return nanops.nanmin(self._ndarray, axis=axis, skipna=skipna)
+
+ def max(self, axis=None, out=None, keepdims=False, skipna=True):
+ nv.validate_max((), dict(out=out, keepdims=keepdims))
+ return nanops.nanmax(self._ndarray, axis=axis, skipna=skipna)
+
+ def sum(self, axis=None, dtype=None, out=None, keepdims=False,
+ initial=None, skipna=True, min_count=0):
+ nv.validate_sum((), dict(dtype=dtype, out=out, keepdims=keepdims,
+ initial=initial))
+ return nanops.nansum(self._ndarray, axis=axis, skipna=skipna,
+ min_count=min_count)
+
+ def prod(self, axis=None, dtype=None, out=None, keepdims=False,
+ initial=None, skipna=True, min_count=0):
+ nv.validate_prod((), dict(dtype=dtype, out=out, keepdims=keepdims,
+ initial=initial))
+ return nanops.nanprod(self._ndarray, axis=axis, skipna=skipna,
+ min_count=min_count)
+
+ def mean(self, axis=None, dtype=None, out=None, keepdims=False,
+ skipna=True):
+ nv.validate_mean((), dict(dtype=dtype, out=out, keepdims=keepdims))
+ return nanops.nanmean(self._ndarray, axis=axis, skipna=skipna)
+
+ def median(self, axis=None, out=None, overwrite_input=False,
+ keepdims=False, skipna=True):
+ nv.validate_median((), dict(out=out, overwrite_input=overwrite_input,
+ keepdims=keepdims))
+ return nanops.nanmedian(self._ndarray, axis=axis, skipna=skipna)
+
+ def std(self, axis=None, dtype=None, out=None, ddof=1, keepdims=False,
+ skipna=True):
+ nv.validate_stat_ddof_func((), dict(dtype=dtype, out=out,
+ keepdims=keepdims),
+ fname='std')
+ return nanops.nanstd(self._ndarray, axis=axis, skipna=skipna,
+ ddof=ddof)
+
+ def var(self, axis=None, dtype=None, out=None, ddof=1, keepdims=False,
+ skipna=True):
+ nv.validate_stat_ddof_func((), dict(dtype=dtype, out=out,
+ keepdims=keepdims),
+ fname='var')
+ return nanops.nanvar(self._ndarray, axis=axis, skipna=skipna,
+ ddof=ddof)
+
+ def sem(self, axis=None, dtype=None, out=None, ddof=1, keepdims=False,
+ skipna=True):
+ nv.validate_stat_ddof_func((), dict(dtype=dtype, out=out,
+ keepdims=keepdims),
+ fname='sem')
+ return nanops.nansem(self._ndarray, axis=axis, skipna=skipna,
+ ddof=ddof)
+
+ def kurt(self, axis=None, dtype=None, out=None, keepdims=False,
+ skipna=True):
+ nv.validate_stat_ddof_func((), dict(dtype=dtype, out=out,
+ keepdims=keepdims),
+ fname='kurt')
+ return nanops.nankurt(self._ndarray, axis=axis, skipna=skipna)
+
+ def skew(self, axis=None, dtype=None, out=None, keepdims=False,
+ skipna=True):
+ nv.validate_stat_ddof_func((), dict(dtype=dtype, out=out,
+ keepdims=keepdims),
+ fname='skew')
+ return nanops.nanskew(self._ndarray, axis=axis, skipna=skipna)
+
+ # ------------------------------------------------------------------------
+ # Additional Methods
+ def to_numpy(self, dtype=None, copy=False):
+ """
+ Convert the PandasArray to a :class:`numpy.ndarray`.
+
+ By default, this requires no coercion or copying of data.
+
+ Parameters
+ ----------
+ dtype : numpy.dtype
+ The NumPy dtype to pass to :func:`numpy.asarray`.
+ copy : bool, default False
+ Whether to copy the underlying data.
+
+ Returns
+ -------
+ ndarray
+ """
+ result = np.asarray(self._ndarray, dtype=dtype)
+ if copy and result is self._ndarray:
+ result = result.copy()
+
+ return result
+
+ # ------------------------------------------------------------------------
+ # Ops
+
+ def __invert__(self):
+ return type(self)(~self._ndarray)
+
+ @classmethod
+ def _create_arithmetic_method(cls, op):
+ def arithmetic_method(self, other):
+ if isinstance(other, (ABCIndexClass, ABCSeries)):
+ return NotImplemented
+
+ elif isinstance(other, cls):
+ other = other._ndarray
+
+ with np.errstate(all="ignore"):
+ result = op(self._ndarray, other)
+
+ if op is divmod:
+ a, b = result
+ return cls(a), cls(b)
+
+ return cls(result)
+
+ return compat.set_function_name(arithmetic_method,
+ "__{}__".format(op.__name__),
+ cls)
+
+ _create_comparison_method = _create_arithmetic_method
+
+
+PandasArray._add_arithmetic_ops()
+PandasArray._add_comparison_ops()
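A rough usage sketch for the wrapper as a whole (assuming pandas >= 0.24 exposes it as `pd.arrays.PandasArray`):

import numpy as np
import pandas as pd

arr = pd.arrays.PandasArray(np.array([1.0, 2.0, np.nan]))
arr.isna()                # array([False, False,  True])
arr.sum()                 # 3.0 -- skipna=True by default, via nanops
arr + 1                   # PandasArray([2., 3., nan]) through the generated ops
arr.to_numpy(copy=True)   # plain ndarray copy of the wrapped data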
diff --git a/contrib/python/pandas/py2/pandas/core/arrays/period.py b/contrib/python/pandas/py2/pandas/core/arrays/period.py
new file mode 100644
index 00000000000..47135932a69
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/core/arrays/period.py
@@ -0,0 +1,956 @@
+# -*- coding: utf-8 -*-
+from datetime import timedelta
+import operator
+
+import numpy as np
+
+from pandas._libs.tslibs import (
+ NaT, frequencies as libfrequencies, iNaT, period as libperiod)
+from pandas._libs.tslibs.fields import isleapyear_arr
+from pandas._libs.tslibs.period import (
+ DIFFERENT_FREQ, IncompatibleFrequency, Period, get_period_field_arr,
+ period_asfreq_arr)
+from pandas._libs.tslibs.timedeltas import Timedelta, delta_to_nanoseconds
+import pandas.compat as compat
+from pandas.util._decorators import Appender, cache_readonly
+
+from pandas.core.dtypes.common import (
+ _TD_DTYPE, ensure_object, is_datetime64_dtype, is_float_dtype,
+ is_list_like, is_period_dtype, pandas_dtype)
+from pandas.core.dtypes.dtypes import PeriodDtype
+from pandas.core.dtypes.generic import (
+ ABCDataFrame, ABCIndexClass, ABCPeriodIndex, ABCSeries)
+from pandas.core.dtypes.missing import isna, notna
+
+import pandas.core.algorithms as algos
+from pandas.core.arrays import datetimelike as dtl
+import pandas.core.common as com
+
+from pandas.tseries import frequencies
+from pandas.tseries.offsets import DateOffset, Tick, _delta_to_tick
+
+
+def _field_accessor(name, alias, docstring=None):
+ def f(self):
+ base, mult = libfrequencies.get_freq_code(self.freq)
+ result = get_period_field_arr(alias, self.asi8, base)
+ return result
+
+ f.__name__ = name
+ f.__doc__ = docstring
+ return property(f)
+
+
+def _period_array_cmp(cls, op):
+ """
+ Wrap comparison operations to convert Period-like to PeriodDtype
+ """
+ opname = '__{name}__'.format(name=op.__name__)
+    nat_result = opname == '__ne__'
+
+ def wrapper(self, other):
+ op = getattr(self.asi8, opname)
+
+ if isinstance(other, (ABCDataFrame, ABCSeries, ABCIndexClass)):
+ return NotImplemented
+
+ if is_list_like(other) and len(other) != len(self):
+ raise ValueError("Lengths must match")
+
+ if isinstance(other, Period):
+ self._check_compatible_with(other)
+
+ result = op(other.ordinal)
+ elif isinstance(other, cls):
+ self._check_compatible_with(other)
+
+ result = op(other.asi8)
+
+ mask = self._isnan | other._isnan
+ if mask.any():
+ result[mask] = nat_result
+
+ return result
+ elif other is NaT:
+ result = np.empty(len(self.asi8), dtype=bool)
+ result.fill(nat_result)
+ else:
+ other = Period(other, freq=self.freq)
+ result = op(other.ordinal)
+
+ if self._hasnans:
+ result[self._isnan] = nat_result
+
+ return result
+
+ return compat.set_function_name(wrapper, opname, cls)
+
+
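For reference, a sketch of the comparison semantics produced by the wrapper above (assuming `.array` on a PeriodIndex returns the underlying PeriodArray, as in pandas >= 0.24):

import pandas as pd

parr = pd.period_range('2019-01', periods=3, freq='M').array
parr == pd.Period('2019-02', freq='M')   # array([False,  True, False])
parr != pd.NaT                           # all True: comparisons against NaT
                                         # yield nat_result, which is True
                                         # only for '__ne__'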
+class PeriodArray(dtl.DatetimeLikeArrayMixin, dtl.DatelikeOps):
+ """
+ Pandas ExtensionArray for storing Period data.
+
+ Users should use :func:`period_array` to create new instances.
+
+ Parameters
+ ----------
+    values : Union[PeriodArray, Series[period], ndarray[int], PeriodIndex]
+ The data to store. These should be arrays that can be directly
+ converted to ordinals without inference or copy (PeriodArray,
+ ndarray[int64]), or a box around such an array (Series[period],
+ PeriodIndex).
+ freq : str or DateOffset
+ The `freq` to use for the array. Mostly applicable when `values`
+ is an ndarray of integers, when `freq` is required. When `values`
+ is a PeriodArray (or box around), it's checked that ``values.freq``
+ matches `freq`.
+ copy : bool, default False
+ Whether to copy the ordinals before storing.
+
+ See Also
+ --------
+ period_array : Create a new PeriodArray.
+ pandas.PeriodIndex : Immutable Index for period data.
+
+ Notes
+ -----
+ There are two components to a PeriodArray
+
+ - ordinals : integer ndarray
+ - freq : pd.tseries.offsets.Offset
+
+    The values are physically stored as a 1-D ndarray of integers. These are
+    called "ordinals" and represent an integer offset from a base date, in
+    units of `freq`.
+
+ The `freq` indicates the span covered by each element of the array.
+ All elements in the PeriodArray have the same `freq`.
+ """
+ # array priority higher than numpy scalars
+ __array_priority__ = 1000
+ _attributes = ["freq"]
+ _typ = "periodarray" # ABCPeriodArray
+ _scalar_type = Period
+
+ # Names others delegate to us
+ _other_ops = []
+ _bool_ops = ['is_leap_year']
+ _object_ops = ['start_time', 'end_time', 'freq']
+ _field_ops = ['year', 'month', 'day', 'hour', 'minute', 'second',
+ 'weekofyear', 'weekday', 'week', 'dayofweek',
+ 'dayofyear', 'quarter', 'qyear',
+ 'days_in_month', 'daysinmonth']
+ _datetimelike_ops = _field_ops + _object_ops + _bool_ops
+ _datetimelike_methods = ['strftime', 'to_timestamp', 'asfreq']
+
+ # --------------------------------------------------------------------
+ # Constructors
+
+ def __init__(self, values, freq=None, dtype=None, copy=False):
+ freq = validate_dtype_freq(dtype, freq)
+
+ if freq is not None:
+ freq = Period._maybe_convert_freq(freq)
+
+ if isinstance(values, ABCSeries):
+ values = values._values
+ if not isinstance(values, type(self)):
+ raise TypeError("Incorrect dtype")
+
+ elif isinstance(values, ABCPeriodIndex):
+ values = values._values
+
+ if isinstance(values, type(self)):
+ if freq is not None and freq != values.freq:
+ msg = DIFFERENT_FREQ.format(cls=type(self).__name__,
+ own_freq=values.freq.freqstr,
+ other_freq=freq.freqstr)
+ raise IncompatibleFrequency(msg)
+ values, freq = values._data, values.freq
+
+ values = np.array(values, dtype='int64', copy=copy)
+ self._data = values
+ if freq is None:
+ raise ValueError('freq is not specified and cannot be inferred')
+ self._dtype = PeriodDtype(freq)
+
+ @classmethod
+ def _simple_new(cls, values, freq=None, **kwargs):
+ # alias for PeriodArray.__init__
+ return cls(values, freq=freq, **kwargs)
+
+ @classmethod
+ def _from_sequence(cls, scalars, dtype=None, copy=False):
+ # type: (Sequence[Optional[Period]], PeriodDtype, bool) -> PeriodArray
+ if dtype:
+ freq = dtype.freq
+ else:
+ freq = None
+
+ if isinstance(scalars, cls):
+ validate_dtype_freq(scalars.dtype, freq)
+ if copy:
+ scalars = scalars.copy()
+ return scalars
+
+ periods = np.asarray(scalars, dtype=object)
+ if copy:
+ periods = periods.copy()
+
+ freq = freq or libperiod.extract_freq(periods)
+ ordinals = libperiod.extract_ordinals(periods, freq)
+ return cls(ordinals, freq=freq)
+
+ @classmethod
+ def _from_datetime64(cls, data, freq, tz=None):
+ """
+ Construct a PeriodArray from a datetime64 array
+
+ Parameters
+ ----------
+ data : ndarray[datetime64[ns], datetime64[ns, tz]]
+ freq : str or Tick
+ tz : tzinfo, optional
+
+ Returns
+ -------
+ PeriodArray[freq]
+ """
+ data, freq = dt64arr_to_periodarr(data, freq, tz)
+ return cls(data, freq=freq)
+
+ @classmethod
+ def _generate_range(cls, start, end, periods, freq, fields):
+ periods = dtl.validate_periods(periods)
+
+ if freq is not None:
+ freq = Period._maybe_convert_freq(freq)
+
+ field_count = len(fields)
+ if start is not None or end is not None:
+ if field_count > 0:
+ raise ValueError('Can either instantiate from fields '
+ 'or endpoints, but not both')
+ subarr, freq = _get_ordinal_range(start, end, periods, freq)
+ elif field_count > 0:
+ subarr, freq = _range_from_fields(freq=freq, **fields)
+ else:
+ raise ValueError('Not enough parameters to construct '
+ 'Period range')
+
+ return subarr, freq
+
+ # -----------------------------------------------------------------
+ # DatetimeLike Interface
+
+ def _unbox_scalar(self, value):
+ # type: (Union[Period, NaTType]) -> int
+ if value is NaT:
+ return value.value
+ elif isinstance(value, self._scalar_type):
+ if not isna(value):
+ self._check_compatible_with(value)
+ return value.ordinal
+ else:
+ raise ValueError("'value' should be a Period. Got '{val}' instead."
+ .format(val=value))
+
+ def _scalar_from_string(self, value):
+ # type: (str) -> Period
+ return Period(value, freq=self.freq)
+
+ def _check_compatible_with(self, other):
+ if other is NaT:
+ return
+ if self.freqstr != other.freqstr:
+ _raise_on_incompatible(self, other)
+
+ # --------------------------------------------------------------------
+ # Data / Attributes
+
+ @cache_readonly
+ def dtype(self):
+ return self._dtype
+
+ @property
+ def freq(self):
+ """
+ Return the frequency object for this PeriodArray.
+ """
+ return self.dtype.freq
+
+ def __array__(self, dtype=None):
+ # overriding DatetimelikeArray
+ return np.array(list(self), dtype=object)
+
+ # --------------------------------------------------------------------
+ # Vectorized analogues of Period properties
+
+ year = _field_accessor('year', 0, "The year of the period")
+ month = _field_accessor('month', 3, "The month as January=1, December=12")
+ day = _field_accessor('day', 4, "The days of the period")
+ hour = _field_accessor('hour', 5, "The hour of the period")
+ minute = _field_accessor('minute', 6, "The minute of the period")
+ second = _field_accessor('second', 7, "The second of the period")
+ weekofyear = _field_accessor('week', 8, "The week ordinal of the year")
+ week = weekofyear
+ dayofweek = _field_accessor('dayofweek', 10,
+ "The day of the week with Monday=0, Sunday=6")
+ weekday = dayofweek
+ dayofyear = day_of_year = _field_accessor('dayofyear', 9,
+ "The ordinal day of the year")
+ quarter = _field_accessor('quarter', 2, "The quarter of the date")
+ qyear = _field_accessor('qyear', 1)
+ days_in_month = _field_accessor('days_in_month', 11,
+ "The number of days in the month")
+ daysinmonth = days_in_month
+
+ @property
+ def is_leap_year(self):
+ """
+ Logical indicating if the date belongs to a leap year
+ """
+ return isleapyear_arr(np.asarray(self.year))
+
+ @property
+ def start_time(self):
+ return self.to_timestamp(how='start')
+
+ @property
+ def end_time(self):
+ return self.to_timestamp(how='end')
+
+ def to_timestamp(self, freq=None, how='start'):
+ """
+ Cast to DatetimeArray/Index.
+
+ Parameters
+ ----------
+ freq : string or DateOffset, optional
+ Target frequency. The default is 'D' for week or longer,
+ 'S' otherwise
+ how : {'s', 'e', 'start', 'end'}
+
+ Returns
+ -------
+ DatetimeArray/Index
+ """
+ from pandas.core.arrays import DatetimeArray
+
+ how = libperiod._validate_end_alias(how)
+
+ end = how == 'E'
+ if end:
+ if freq == 'B':
+ # roll forward to ensure we land on B date
+ adjust = Timedelta(1, 'D') - Timedelta(1, 'ns')
+ return self.to_timestamp(how='start') + adjust
+ else:
+ adjust = Timedelta(1, 'ns')
+ return (self + self.freq).to_timestamp(how='start') - adjust
+
+ if freq is None:
+ base, mult = libfrequencies.get_freq_code(self.freq)
+ freq = libfrequencies.get_to_timestamp_base(base)
+ else:
+ freq = Period._maybe_convert_freq(freq)
+
+ base, mult = libfrequencies.get_freq_code(freq)
+ new_data = self.asfreq(freq, how=how)
+
+ new_data = libperiod.periodarr_to_dt64arr(new_data.asi8, base)
+ return DatetimeArray._from_sequence(new_data, freq='infer')
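A short sketch of the start/end adjustment implemented above (hedged; exact reprs vary between pandas versions):

import pandas as pd

parr = pd.period_range('2019-01', periods=2, freq='M').array
parr.to_timestamp(how='start')   # 2019-01-01, 2019-02-01
parr.to_timestamp(how='end')     # the last nanosecond of each month, i.e.
                                 # (period + freq).to_timestamp('start') - 1ns
                                 # as computed above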
+
+ # --------------------------------------------------------------------
+ # Array-like / EA-Interface Methods
+
+ def _formatter(self, boxed=False):
+ if boxed:
+ return str
+ return "'{}'".format
+
+ @Appender(dtl.DatetimeLikeArrayMixin._validate_fill_value.__doc__)
+ def _validate_fill_value(self, fill_value):
+ if isna(fill_value):
+ fill_value = iNaT
+ elif isinstance(fill_value, Period):
+ self._check_compatible_with(fill_value)
+ fill_value = fill_value.ordinal
+ else:
+ raise ValueError("'fill_value' should be a Period. "
+ "Got '{got}'.".format(got=fill_value))
+ return fill_value
+
+ # --------------------------------------------------------------------
+
+ def _time_shift(self, periods, freq=None):
+ """
+ Shift each value by `periods`.
+
+ Note this is different from ExtensionArray.shift, which
+ shifts the *position* of each element, padding the end with
+ missing values.
+
+ Parameters
+ ----------
+ periods : int
+ Number of periods to shift by.
+ freq : pandas.DateOffset, pandas.Timedelta, or string
+ Frequency increment to shift by.
+ """
+ if freq is not None:
+ raise TypeError("`freq` argument is not supported for "
+ "{cls}._time_shift"
+ .format(cls=type(self).__name__))
+ values = self.asi8 + periods * self.freq.n
+ if self._hasnans:
+ values[self._isnan] = iNaT
+ return type(self)(values, freq=self.freq)
+
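+ # Sketch of ``_time_shift`` (a private helper; values are illustrative):
+ # each ordinal moves by ``periods * self.freq.n``, so an annual array
+ # shifted by 1 lands on the following year.
+ #
+ #   >>> arr = period_array(['2000', '2001'], freq='A')
+ #   >>> arr._time_shift(1)    # -> PeriodArray ['2001', '2002'], freq A-DEC
+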
+ @property
+ def _box_func(self):
+ return lambda x: Period._from_ordinal(ordinal=x, freq=self.freq)
+
+ def asfreq(self, freq=None, how='E'):
+ """
+ Convert the Period Array/Index to the specified frequency `freq`.
+
+ Parameters
+ ----------
+ freq : str
+ a frequency
+ how : str {'E', 'S'}
+ 'E', 'END', or 'FINISH' for end,
+ 'S', 'START', or 'BEGIN' for start.
+ Whether the elements should be aligned to the end
+ or start within a period. January 31st ('END') vs.
+ January 1st ('START'), for example.
+
+ Returns
+ -------
+ new : Period Array/Index with the new frequency
+
+ Examples
+ --------
+ >>> pidx = pd.period_range('2010-01-01', '2015-01-01', freq='A')
+ >>> pidx
+ <class 'pandas.core.indexes.period.PeriodIndex'>
+ [2010, ..., 2015]
+ Length: 6, Freq: A-DEC
+
+ >>> pidx.asfreq('M')
+ <class 'pandas.core.indexes.period.PeriodIndex'>
+ [2010-12, ..., 2015-12]
+ Length: 6, Freq: M
+
+ >>> pidx.asfreq('M', how='S')
+ <class 'pandas.core.indexes.period.PeriodIndex'>
+ [2010-01, ..., 2015-01]
+ Length: 6, Freq: M
+ """
+ how = libperiod._validate_end_alias(how)
+
+ freq = Period._maybe_convert_freq(freq)
+
+ base1, mult1 = libfrequencies.get_freq_code(self.freq)
+ base2, mult2 = libfrequencies.get_freq_code(freq)
+
+ asi8 = self.asi8
+ # mult1 can't be negative or 0
+ end = how == 'E'
+ if end:
+ ordinal = asi8 + mult1 - 1
+ else:
+ ordinal = asi8
+
+ new_data = period_asfreq_arr(ordinal, base1, base2, end)
+
+ if self._hasnans:
+ new_data[self._isnan] = iNaT
+
+ return type(self)(new_data, freq=freq)
+
+ # ------------------------------------------------------------------
+ # Rendering Methods
+
+ def _format_native_types(self, na_rep=u'NaT', date_format=None, **kwargs):
+ """
+ Format the values as native types, applying `date_format` when given.
+ """
+ values = self.astype(object)
+
+ if date_format:
+ formatter = lambda dt: dt.strftime(date_format)
+ else:
+ formatter = lambda dt: u'%s' % dt
+
+ if self._hasnans:
+ mask = self._isnan
+ values[mask] = na_rep
+ imask = ~mask
+ values[imask] = np.array([formatter(dt) for dt
+ in values[imask]])
+ else:
+ values = np.array([formatter(dt) for dt in values])
+ return values
+
+ # ------------------------------------------------------------------
+
+ def astype(self, dtype, copy=True):
+ # We handle Period[T] -> Period[U]
+ # Our parent handles everything else.
+ dtype = pandas_dtype(dtype)
+
+ if is_period_dtype(dtype):
+ return self.asfreq(dtype.freq)
+ return super(PeriodArray, self).astype(dtype, copy=copy)
+
+ @property
+ def flags(self):
+ # TODO: remove
+ # We need this since reduction.SeriesBinGrouper uses values.flags
+ # Ideally, we wouldn't be passing objects down there in the first
+ # place.
+ return self._data.flags
+
+ # ------------------------------------------------------------------
+ # Arithmetic Methods
+ _create_comparison_method = classmethod(_period_array_cmp)
+
+ def _sub_datelike(self, other):
+ assert other is not NaT
+ return NotImplemented
+
+ def _sub_period(self, other):
+ # If the operation is well-defined, we return an object-Index
+ # of DateOffsets. Null entries are filled with pd.NaT
+ self._check_compatible_with(other)
+ asi8 = self.asi8
+ new_data = asi8 - other.ordinal
+ new_data = np.array([self.freq * x for x in new_data])
+
+ if self._hasnans:
+ new_data[self._isnan] = NaT
+
+ return new_data
+
+ @Appender(dtl.DatetimeLikeArrayMixin._addsub_int_array.__doc__)
+ def _addsub_int_array(
+ self,
+ other, # type: Union[Index, ExtensionArray, np.ndarray[int]]
+ op # type: Callable[Any, Any]
+ ):
+ # type: (...) -> PeriodArray
+
+ assert op in [operator.add, operator.sub]
+ if op is operator.sub:
+ other = -other
+ res_values = algos.checked_add_with_arr(self.asi8, other,
+ arr_mask=self._isnan)
+ res_values = res_values.view('i8')
+ res_values[self._isnan] = iNaT
+ return type(self)(res_values, freq=self.freq)
+
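+ # For intuition, integer addition shifts each element by that many
+ # periods of ``self.freq`` (a sketch with illustrative values):
+ #
+ #   >>> arr = period_array(['2000-01', '2000-02'], freq='M')
+ #   >>> arr + np.array([1, 2])    # -> PeriodArray ['2000-02', '2000-04']
+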
+ def _add_offset(self, other):
+ assert not isinstance(other, Tick)
+ base = libfrequencies.get_base_alias(other.rule_code)
+ if base != self.freq.rule_code:
+ _raise_on_incompatible(self, other)
+
+ # Note: when calling parent class's _add_timedeltalike_scalar,
+ # it will call delta_to_nanoseconds(delta). Because delta here
+ # is an integer, delta_to_nanoseconds will return it unchanged.
+ result = super(PeriodArray, self)._add_timedeltalike_scalar(other.n)
+ return type(self)(result, freq=self.freq)
+
+ def _add_timedeltalike_scalar(self, other):
+ """
+ Parameters
+ ----------
+ other : timedelta, Tick, np.timedelta64
+
+ Returns
+ -------
+ result : ndarray[int64]
+ """
+ assert isinstance(self.freq, Tick) # checked by calling function
+ assert isinstance(other, (timedelta, np.timedelta64, Tick))
+
+ if notna(other):
+ # special handling for np.timedelta64("NaT"), avoid calling
+ # _check_timedeltalike_freq_compat as that would raise TypeError
+ other = self._check_timedeltalike_freq_compat(other)
+
+ # Note: when calling parent class's _add_timedeltalike_scalar,
+ # it will call delta_to_nanoseconds(delta). Because delta here
+ # is an integer, delta_to_nanoseconds will return it unchanged.
+ ordinals = super(PeriodArray, self)._add_timedeltalike_scalar(other)
+ return ordinals
+
+ def _add_delta_tdi(self, other):
+ """
+ Parameters
+ ----------
+ other : TimedeltaArray or ndarray[timedelta64]
+
+ Returns
+ -------
+ result : ndarray[int64]
+ """
+ assert isinstance(self.freq, Tick) # checked by calling function
+
+ delta = self._check_timedeltalike_freq_compat(other)
+ return self._addsub_int_array(delta, operator.add).asi8
+
+ def _add_delta(self, other):
+ """
+ Add a timedelta-like, Tick, or TimedeltaIndex-like object
+ to self, yielding a new PeriodArray
+
+ Parameters
+ ----------
+ other : {timedelta, np.timedelta64, Tick,
+ TimedeltaIndex, ndarray[timedelta64]}
+
+ Returns
+ -------
+ result : PeriodArray
+ """
+ if not isinstance(self.freq, Tick):
+ # We cannot add timedelta-like to non-tick PeriodArray
+ _raise_on_incompatible(self, other)
+
+ new_ordinals = super(PeriodArray, self)._add_delta(other)
+ return type(self)(new_ordinals, freq=self.freq)
+
+ def _check_timedeltalike_freq_compat(self, other):
+ """
+ Arithmetic operations with timedelta-like scalars or array `other`
+ are only valid if `other` is an integer multiple of `self.freq`.
+ If the operation is valid, find that integer multiple. Otherwise,
+ raise because the operation is invalid.
+
+ Parameters
+ ----------
+ other : timedelta, np.timedelta64, Tick,
+ ndarray[timedelta64], TimedeltaArray, TimedeltaIndex
+
+ Returns
+ -------
+ multiple : int or ndarray[int64]
+
+ Raises
+ ------
+ IncompatibleFrequency
+ """
+ assert isinstance(self.freq, Tick) # checked by calling function
+ own_offset = frequencies.to_offset(self.freq.rule_code)
+ base_nanos = delta_to_nanoseconds(own_offset)
+
+ if isinstance(other, (timedelta, np.timedelta64, Tick)):
+ nanos = delta_to_nanoseconds(other)
+
+ elif isinstance(other, np.ndarray):
+ # numpy timedelta64 array; all entries must be compatible
+ assert other.dtype.kind == 'm'
+ if other.dtype != _TD_DTYPE:
+ # i.e. non-nano unit
+ # TODO: disallow unit-less timedelta64
+ other = other.astype(_TD_DTYPE)
+ nanos = other.view('i8')
+ else:
+ # TimedeltaArray/Index
+ nanos = other.asi8
+
+ if np.all(nanos % base_nanos == 0):
+ # nanos being added is an integer multiple of the
+ # base-frequency to self.freq
+ delta = nanos // base_nanos
+ # delta is the integer (or integer-array) number of periods
+ # that will be added to self.
+ return delta
+
+ _raise_on_incompatible(self, other)
+
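+ # Sketch of the frequency-compatibility check (illustrative values):
+ # with daily frequency, a two-day timedelta is an exact multiple and
+ # resolves to 2, while e.g. 36 hours would raise IncompatibleFrequency.
+ #
+ #   >>> arr = period_array(['2000-01-01', '2000-01-02'], freq='D')
+ #   >>> arr._check_timedeltalike_freq_compat(pd.Timedelta(days=2))
+ #   2
+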
+ def _values_for_argsort(self):
+ return self._data
+
+
+PeriodArray._add_comparison_ops()
+
+
+def _raise_on_incompatible(left, right):
+ """
+ Helper function to render a consistent error message when raising
+ IncompatibleFrequency.
+
+ Parameters
+ ----------
+ left : PeriodArray
+ right : DateOffset, Period, ndarray, or timedelta-like
+
+ Raises
+ ------
+ IncompatibleFrequency
+ """
+ # GH#24283 error message format depends on whether right is scalar
+ if isinstance(right, np.ndarray):
+ other_freq = None
+ elif isinstance(right, (ABCPeriodIndex, PeriodArray, Period, DateOffset)):
+ other_freq = right.freqstr
+ else:
+ other_freq = _delta_to_tick(Timedelta(right)).freqstr
+
+ msg = DIFFERENT_FREQ.format(cls=type(left).__name__,
+ own_freq=left.freqstr,
+ other_freq=other_freq)
+ raise IncompatibleFrequency(msg)
+
+
+# -------------------------------------------------------------------
+# Constructor Helpers
+
+def period_array(data, freq=None, copy=False):
+ # type: (Sequence[Optional[Period]], Optional[Tick], bool) -> PeriodArray
+ """
+ Construct a new PeriodArray from a sequence of Period scalars.
+
+ Parameters
+ ----------
+ data : Sequence of Period objects
+ A sequence of Period objects. These are required to all have
+ the same ``freq``. Missing values can be indicated by ``None``
+ or ``pandas.NaT``.
+ freq : str, Tick, or Offset
+ The frequency of every element of the array. This can be specified
+ to avoid inferring the `freq` from `data`.
+ copy : bool, default False
+ Whether to ensure a copy of the data is made.
+
+ Returns
+ -------
+ PeriodArray
+
+ See Also
+ --------
+ PeriodArray
+ pandas.PeriodIndex
+
+ Examples
+ --------
+ >>> period_array([pd.Period('2017', freq='A'),
+ ... pd.Period('2018', freq='A')])
+ <PeriodArray>
+ ['2017', '2018']
+ Length: 2, dtype: period[A-DEC]
+
+ >>> period_array([pd.Period('2017', freq='A'),
+ ... pd.Period('2018', freq='A'),
+ ... pd.NaT])
+ <PeriodArray>
+ ['2017', '2018', 'NaT']
+ Length: 3, dtype: period[A-DEC]
+
+ Integers that look like years are handled
+
+ >>> period_array([2000, 2001, 2002], freq='D')
+ ['2000-01-01', '2001-01-01', '2002-01-01']
+ Length: 3, dtype: period[D]
+
+ Datetime-like strings may also be passed
+
+ >>> period_array(['2000-Q1', '2000-Q2', '2000-Q3', '2000-Q4'], freq='Q')
+ <PeriodArray>
+ ['2000Q1', '2000Q2', '2000Q3', '2000Q4']
+ Length: 4, dtype: period[Q-DEC]
+ """
+ if is_datetime64_dtype(data):
+ return PeriodArray._from_datetime64(data, freq)
+ if isinstance(data, (ABCPeriodIndex, ABCSeries, PeriodArray)):
+ return PeriodArray(data, freq)
+
+ # other iterable of some kind
+ if not isinstance(data, (np.ndarray, list, tuple)):
+ data = list(data)
+
+ data = np.asarray(data)
+
+ if freq:
+ dtype = PeriodDtype(freq)
+ else:
+ dtype = None
+
+ if is_float_dtype(data) and len(data) > 0:
+ raise TypeError("PeriodIndex does not allow "
+ "floating point in construction")
+
+ data = ensure_object(data)
+
+ return PeriodArray._from_sequence(data, dtype=dtype)
+
+
+def validate_dtype_freq(dtype, freq):
+ """
+ If both a dtype and a freq are available, ensure they match. If only
+ dtype is available, extract the implied freq.
+
+ Parameters
+ ----------
+ dtype : dtype
+ freq : DateOffset or None
+
+ Returns
+ -------
+ freq : DateOffset
+
+ Raises
+ ------
+ ValueError : non-period dtype
+ IncompatibleFrequency : mismatch between dtype and freq
+ """
+ if freq is not None:
+ freq = frequencies.to_offset(freq)
+
+ if dtype is not None:
+ dtype = pandas_dtype(dtype)
+ if not is_period_dtype(dtype):
+ raise ValueError('dtype must be PeriodDtype')
+ if freq is None:
+ freq = dtype.freq
+ elif freq != dtype.freq:
+ raise IncompatibleFrequency('specified freq and dtype '
+ 'are different')
+ return freq
+
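+# A short sketch of ``validate_dtype_freq`` (illustrative values):
+#
+#   >>> validate_dtype_freq('period[M]', None)   # freq taken from dtype
+#   <MonthEnd>
+#   >>> validate_dtype_freq('period[M]', 'D')    # mismatch
+#   ...raises IncompatibleFrequency: specified freq and dtype are different
+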
+
+def dt64arr_to_periodarr(data, freq, tz=None):
+ """
+ Convert a datetime-like array to an array of Period ordinals.
+
+ Parameters
+ ----------
+ data : Union[Series[datetime64[ns]], DatetimeIndex, ndarray[datetime64ns]]
+ freq : Optional[Union[str, Tick]]
+ Must match the `freq` on the `data` if `data` is a DatetimeIndex
+ or Series.
+ tz : Optional[tzinfo]
+
+ Returns
+ -------
+ ordinals : ndarray[int]
+ freq : Tick
+ The frequency extracted from the Series or DatetimeIndex if one
+ is used.
+
+ """
+ if data.dtype != np.dtype('M8[ns]'):
+ raise ValueError('Wrong dtype: {dtype}'.format(dtype=data.dtype))
+
+ if freq is None:
+ if isinstance(data, ABCIndexClass):
+ data, freq = data._values, data.freq
+ elif isinstance(data, ABCSeries):
+ data, freq = data._values, data.dt.freq
+
+ freq = Period._maybe_convert_freq(freq)
+
+ if isinstance(data, (ABCIndexClass, ABCSeries)):
+ data = data._values
+
+ base, mult = libfrequencies.get_freq_code(freq)
+ return libperiod.dt64arr_to_periodarr(data.view('i8'), base, tz), freq
+
+
+def _get_ordinal_range(start, end, periods, freq, mult=1):
+ if com.count_not_none(start, end, periods) != 2:
+ raise ValueError('Of the three parameters: start, end, and periods, '
+ 'exactly two must be specified')
+
+ if freq is not None:
+ _, mult = libfrequencies.get_freq_code(freq)
+
+ if start is not None:
+ start = Period(start, freq)
+ if end is not None:
+ end = Period(end, freq)
+
+ is_start_per = isinstance(start, Period)
+ is_end_per = isinstance(end, Period)
+
+ if is_start_per and is_end_per and start.freq != end.freq:
+ raise ValueError('start and end must have same freq')
+ if (start is NaT or end is NaT):
+ raise ValueError('start and end must not be NaT')
+
+ if freq is None:
+ if is_start_per:
+ freq = start.freq
+ elif is_end_per:
+ freq = end.freq
+ else: # pragma: no cover
+ raise ValueError('Could not infer freq from start/end')
+
+ if periods is not None:
+ periods = periods * mult
+ if start is None:
+ data = np.arange(end.ordinal - periods + mult,
+ end.ordinal + 1, mult,
+ dtype=np.int64)
+ else:
+ data = np.arange(start.ordinal, start.ordinal + periods, mult,
+ dtype=np.int64)
+ else:
+ data = np.arange(start.ordinal, end.ordinal + 1, mult, dtype=np.int64)
+
+ return data, freq
+
+
+def _range_from_fields(year=None, month=None, quarter=None, day=None,
+ hour=None, minute=None, second=None, freq=None):
+ if hour is None:
+ hour = 0
+ if minute is None:
+ minute = 0
+ if second is None:
+ second = 0
+ if day is None:
+ day = 1
+
+ ordinals = []
+
+ if quarter is not None:
+ if freq is None:
+ freq = 'Q'
+ base = libfrequencies.FreqGroup.FR_QTR
+ else:
+ base, mult = libfrequencies.get_freq_code(freq)
+ if base != libfrequencies.FreqGroup.FR_QTR:
+ raise AssertionError("base must equal FR_QTR")
+
+ year, quarter = _make_field_arrays(year, quarter)
+ for y, q in compat.zip(year, quarter):
+ y, m = libperiod.quarter_to_myear(y, q, freq)
+ val = libperiod.period_ordinal(y, m, 1, 1, 1, 1, 0, 0, base)
+ ordinals.append(val)
+ else:
+ base, mult = libfrequencies.get_freq_code(freq)
+ arrays = _make_field_arrays(year, month, day, hour, minute, second)
+ for y, mth, d, h, mn, s in compat.zip(*arrays):
+ ordinals.append(libperiod.period_ordinal(
+ y, mth, d, h, mn, s, 0, 0, base))
+
+ return np.array(ordinals, dtype=np.int64), freq
+
+
+def _make_field_arrays(*fields):
+ length = None
+ for x in fields:
+ if isinstance(x, (list, np.ndarray, ABCSeries)):
+ if length is not None and len(x) != length:
+ raise ValueError('Mismatched Period array lengths')
+ elif length is None:
+ length = len(x)
+
+ arrays = [np.asarray(x) if isinstance(x, (np.ndarray, list, ABCSeries))
+ else np.repeat(x, length) for x in fields]
+
+ return arrays
diff --git a/contrib/python/pandas/py2/pandas/core/arrays/sparse.py b/contrib/python/pandas/py2/pandas/core/arrays/sparse.py
new file mode 100644
index 00000000000..9be2c9af169
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/core/arrays/sparse.py
@@ -0,0 +1,2028 @@
+"""
+SparseArray data structure
+"""
+from __future__ import division
+
+import numbers
+import operator
+import re
+import warnings
+
+import numpy as np
+
+from pandas._libs import index as libindex, lib
+import pandas._libs.sparse as splib
+from pandas._libs.sparse import BlockIndex, IntIndex
+from pandas._libs.tslibs import NaT
+import pandas.compat as compat
+from pandas.compat.numpy import function as nv
+from pandas.errors import PerformanceWarning
+
+from pandas.core.dtypes.base import ExtensionDtype
+from pandas.core.dtypes.cast import (
+ astype_nansafe, construct_1d_arraylike_from_scalar, find_common_type,
+ infer_dtype_from_scalar, maybe_convert_platform)
+from pandas.core.dtypes.common import (
+ is_array_like, is_bool_dtype, is_datetime64_any_dtype, is_dtype_equal,
+ is_integer, is_list_like, is_object_dtype, is_scalar, is_string_dtype,
+ pandas_dtype)
+from pandas.core.dtypes.dtypes import register_extension_dtype
+from pandas.core.dtypes.generic import (
+ ABCIndexClass, ABCSeries, ABCSparseSeries)
+from pandas.core.dtypes.missing import isna, na_value_for_dtype, notna
+
+from pandas.core.accessor import PandasDelegate, delegate_names
+import pandas.core.algorithms as algos
+from pandas.core.arrays import ExtensionArray, ExtensionOpsMixin
+from pandas.core.base import PandasObject
+import pandas.core.common as com
+from pandas.core.missing import interpolate_2d
+
+import pandas.io.formats.printing as printing
+
+
+# ----------------------------------------------------------------------------
+# Dtype
+@register_extension_dtype
+class SparseDtype(ExtensionDtype):
+ """
+ Dtype for data stored in :class:`SparseArray`.
+
+ This dtype implements the pandas ExtensionDtype interface.
+
+ .. versionadded:: 0.24.0
+
+ Parameters
+ ----------
+ dtype : str, ExtensionDtype, numpy.dtype, type, default numpy.float64
+ The dtype of the underlying array storing the non-fill value values.
+ fill_value : scalar, optional
+ The scalar value not stored in the SparseArray. By default, this
+ depends on `dtype`.
+
+ =========== ==========
+ dtype na_value
+ =========== ==========
+ float ``np.nan``
+ int ``0``
+ bool ``False``
+ datetime64 ``pd.NaT``
+ timedelta64 ``pd.NaT``
+ =========== ==========
+
+ The default value may be overridden by specifying a `fill_value`.
+ """
+ # We include `_is_na_fill_value` in the metadata to avoid hash collisions
+ # between SparseDtype(float, 0.0) and SparseDtype(float, nan).
+ # Without is_na_fill_value in the comparison, those would be equal since
+ # hash(nan) is (sometimes?) 0.
+ _metadata = ('_dtype', '_fill_value', '_is_na_fill_value')
+
+ def __init__(self, dtype=np.float64, fill_value=None):
+ # type: (Union[str, np.dtype, 'ExtensionDtype', type], Any) -> None
+ from pandas.core.dtypes.missing import na_value_for_dtype
+ from pandas.core.dtypes.common import (
+ pandas_dtype, is_string_dtype, is_scalar
+ )
+
+ if isinstance(dtype, type(self)):
+ if fill_value is None:
+ fill_value = dtype.fill_value
+ dtype = dtype.subtype
+
+ dtype = pandas_dtype(dtype)
+ if is_string_dtype(dtype):
+ dtype = np.dtype('object')
+
+ if fill_value is None:
+ fill_value = na_value_for_dtype(dtype)
+
+ if not is_scalar(fill_value):
+ raise ValueError("fill_value must be a scalar. Got {} "
+ "instead".format(fill_value))
+ self._dtype = dtype
+ self._fill_value = fill_value
+
+ def __hash__(self):
+ # Python3 doesn't inherit __hash__ when a base class overrides
+ # __eq__, so we explicitly do it here.
+ return super(SparseDtype, self).__hash__()
+
+ def __eq__(self, other):
+ # We have to override __eq__ to handle NA values in _metadata.
+ # The base class does simple == checks, which fail for NA.
+ if isinstance(other, compat.string_types):
+ try:
+ other = self.construct_from_string(other)
+ except TypeError:
+ return False
+
+ if isinstance(other, type(self)):
+ subtype = self.subtype == other.subtype
+ if self._is_na_fill_value:
+ # this case is complicated by two things:
+ # SparseDtype(float, float(nan)) == SparseDtype(float, np.nan)
+ # SparseDtype(float, np.nan) != SparseDtype(float, pd.NaT)
+ # i.e. we want to treat any floating-point NaN as equal, but
+ # not a floating-point NaN and a datetime NaT.
+ fill_value = (
+ other._is_na_fill_value and
+ isinstance(self.fill_value, type(other.fill_value)) or
+ isinstance(other.fill_value, type(self.fill_value))
+ )
+ else:
+ fill_value = self.fill_value == other.fill_value
+
+ return subtype and fill_value
+ return False
+
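+ # Illustration of the NA handling described above (a sketch):
+ #
+ #   >>> SparseDtype(float, np.nan) == SparseDtype(float, float('nan'))
+ #   True
+ #   >>> SparseDtype(float, np.nan) == SparseDtype(float, 0.0)
+ #   False
+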
+ @property
+ def fill_value(self):
+ """
+ The fill value of the array.
+
+ Converting the SparseArray to a dense ndarray will fill the
+ array with this value.
+
+ .. warning::
+
+ It's possible to end up with a SparseArray that has ``fill_value``
+ values in ``sp_values``. This can occur, for example, when setting
+ ``SparseArray.fill_value`` directly.
+ """
+ return self._fill_value
+
+ @property
+ def _is_na_fill_value(self):
+ from pandas.core.dtypes.missing import isna
+ return isna(self.fill_value)
+
+ @property
+ def _is_numeric(self):
+ from pandas.core.dtypes.common import is_object_dtype
+ return not is_object_dtype(self.subtype)
+
+ @property
+ def _is_boolean(self):
+ from pandas.core.dtypes.common import is_bool_dtype
+ return is_bool_dtype(self.subtype)
+
+ @property
+ def kind(self):
+ """
+ The kind character of the subtype, as given by ``numpy.dtype.kind``.
+ """
+ return self.subtype.kind
+
+ @property
+ def type(self):
+ return self.subtype.type
+
+ @property
+ def subtype(self):
+ return self._dtype
+
+ @property
+ def name(self):
+ return 'Sparse[{}, {}]'.format(self.subtype.name, self.fill_value)
+
+ def __repr__(self):
+ return self.name
+
+ @classmethod
+ def construct_array_type(cls):
+ return SparseArray
+
+ @classmethod
+ def construct_from_string(cls, string):
+ """
+ Construct a SparseDtype from a string form.
+
+ Parameters
+ ----------
+ string : str
+ Can take the following forms.
+
+ string dtype
+ ================ ============================
+ 'int' SparseDtype[np.int64, 0]
+ 'Sparse' SparseDtype[np.float64, nan]
+ 'Sparse[int]' SparseDtype[np.int64, 0]
+ 'Sparse[int, 0]' SparseDtype[np.int64, 0]
+ ================ ============================
+
+ It is not possible to specify non-default fill values
+ with a string. An argument like ``'Sparse[int, 1]'``
+ will raise a ``TypeError`` because the default fill value
+ for integers is 0.
+
+ Returns
+ -------
+ SparseDtype
+ """
+ msg = "Could not construct SparseDtype from '{}'".format(string)
+ if string.startswith("Sparse"):
+ try:
+ sub_type, has_fill_value = cls._parse_subtype(string)
+ result = SparseDtype(sub_type)
+ except Exception:
+ raise TypeError(msg)
+ else:
+ msg = ("Could not construct SparseDtype from '{}'.\n\nIt "
+ "looks like the fill_value in the string is not "
+ "the default for the dtype. Non-default fill_values "
+ "are not supported. Use the 'SparseDtype()' "
+ "constructor instead.")
+ if has_fill_value and str(result) != string:
+ raise TypeError(msg.format(string))
+ return result
+ else:
+ raise TypeError(msg)
+
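+ # Usage sketch for ``construct_from_string``, mirroring the table above:
+ #
+ #   >>> SparseDtype.construct_from_string('Sparse[int]')
+ #   Sparse[int64, 0]
+ #   >>> SparseDtype.construct_from_string('Sparse[int, 1]')
+ #   ...raises TypeError (non-default fill_values are not supported)
+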
+ @staticmethod
+ def _parse_subtype(dtype):
+ """
+ Parse a string to get the subtype
+
+ Parameters
+ ----------
+ dtype : str
+ A string like
+
+ * Sparse[subtype]
+ * Sparse[subtype, fill_value]
+
+ Returns
+ -------
+ subtype : str
+
+ Raises
+ ------
+ ValueError
+ When the subtype cannot be extracted.
+ """
+ xpr = re.compile(
+ r"Sparse\[(?P<subtype>[^,]*)(, )?(?P<fill_value>.*?)?\]$"
+ )
+ m = xpr.match(dtype)
+ has_fill_value = False
+ if m:
+ subtype = m.groupdict()['subtype']
+ has_fill_value = m.groupdict()['fill_value'] or has_fill_value
+ elif dtype == "Sparse":
+ subtype = 'float64'
+ else:
+ raise ValueError("Cannot parse {}".format(dtype))
+ return subtype, has_fill_value
+
+ @classmethod
+ def is_dtype(cls, dtype):
+ dtype = getattr(dtype, 'dtype', dtype)
+ if (isinstance(dtype, compat.string_types) and
+ dtype.startswith("Sparse")):
+ sub_type, _ = cls._parse_subtype(dtype)
+ dtype = np.dtype(sub_type)
+ elif isinstance(dtype, cls):
+ return True
+ return isinstance(dtype, np.dtype) or dtype == 'Sparse'
+
+ def update_dtype(self, dtype):
+ """
+ Convert the SparseDtype to a new dtype.
+
+ This takes care of converting the ``fill_value``.
+
+ Parameters
+ ----------
+ dtype : Union[str, numpy.dtype, SparseDtype]
+ The new dtype to use.
+
+ * For a SparseDtype, it is simply returned
+ * For a NumPy dtype (or str), the current fill value
+ is converted to the new dtype, and a SparseDtype
+ with `dtype` and the new fill value is returned.
+
+ Returns
+ -------
+ SparseDtype
+ A new SparseDtype with the correct `dtype` and fill value
+ for that `dtype`.
+
+ Raises
+ ------
+ ValueError
+ When the current fill value cannot be converted to the
+ new `dtype` (e.g. trying to convert ``np.nan`` to an
+ integer dtype).
+
+
+ Examples
+ --------
+ >>> SparseDtype(int, 0).update_dtype(float)
+ Sparse[float64, 0.0]
+
+ >>> SparseDtype(int, 1).update_dtype(SparseDtype(float, np.nan))
+ Sparse[float64, nan]
+ """
+ cls = type(self)
+ dtype = pandas_dtype(dtype)
+
+ if not isinstance(dtype, cls):
+ fill_value = astype_nansafe(np.array(self.fill_value),
+ dtype).item()
+ dtype = cls(dtype, fill_value=fill_value)
+
+ return dtype
+
+ @property
+ def _subtype_with_str(self):
+ """
+ The SparseDtype's subtype, promoted to ``str`` when the fill value
+ is a string.
+
+ Typically, pandas will store string data in an object-dtype array.
+ When converting values to a dtype, e.g. in ``.astype``, we need to
+ be more specific: we need the actual underlying type.
+
+ Returns
+ -------
+ dtype or type
+
+ Examples
+ --------
+ >>> SparseDtype(int, 1)._subtype_with_str
+ dtype('int64')
+
+ >>> SparseDtype(object, 1)._subtype_with_str
+ dtype('O')
+
+ >>> dtype = SparseDtype(str, '')
+ >>> dtype.subtype
+ dtype('O')
+
+ >>> dtype._subtype_with_str
+ str
+ """
+ if isinstance(self.fill_value, compat.string_types):
+ return type(self.fill_value)
+ return self.subtype
+
+
+# ----------------------------------------------------------------------------
+# Array
+
+
+_sparray_doc_kwargs = dict(klass='SparseArray')
+
+
+def _get_fill(arr):
+ # type: (SparseArray) -> ndarray
+ """
+ Create a 0-dim ndarray containing the fill value
+
+ Parameters
+ ----------
+ arr : SparseArray
+
+ Returns
+ -------
+ fill_value : ndarray
+ 0-dim ndarray with just the fill value.
+
+ Notes
+ -----
+ coerce fill_value to arr dtype if possible
+ int64 SparseArray can have NaN as fill_value if there are no missing values
+ """
+ try:
+ return np.asarray(arr.fill_value, dtype=arr.dtype.subtype)
+ except ValueError:
+ return np.asarray(arr.fill_value)
+
+
+def _sparse_array_op(left, right, op, name):
+ # type: (SparseArray, SparseArray, Callable, str) -> Any
+ """
+ Perform a binary operation between two arrays.
+
+ Parameters
+ ----------
+ left : Union[SparseArray, ndarray]
+ right : Union[SparseArray, ndarray]
+ op : Callable
+ The binary operation to perform
+ name : str
+ Name of the callable.
+
+ Returns
+ -------
+ SparseArray
+ """
+ if name.startswith('__'):
+ # For lookups in _libs.sparse we need non-dunder op name
+ name = name[2:-2]
+
+ # dtype used to find corresponding sparse method
+ ltype = left.dtype.subtype
+ rtype = right.dtype.subtype
+
+ if not is_dtype_equal(ltype, rtype):
+ subtype = find_common_type([ltype, rtype])
+ ltype = SparseDtype(subtype, left.fill_value)
+ rtype = SparseDtype(subtype, right.fill_value)
+
+ # TODO(GH-23092): pass copy=False. Need to fix astype_nansafe
+ left = left.astype(ltype)
+ right = right.astype(rtype)
+ dtype = ltype.subtype
+ else:
+ dtype = ltype
+
+ # dtype the result must have
+ result_dtype = None
+
+ if left.sp_index.ngaps == 0 or right.sp_index.ngaps == 0:
+ with np.errstate(all='ignore'):
+ result = op(left.get_values(), right.get_values())
+ fill = op(_get_fill(left), _get_fill(right))
+
+ if left.sp_index.ngaps == 0:
+ index = left.sp_index
+ else:
+ index = right.sp_index
+ elif left.sp_index.equals(right.sp_index):
+ with np.errstate(all='ignore'):
+ result = op(left.sp_values, right.sp_values)
+ fill = op(_get_fill(left), _get_fill(right))
+ index = left.sp_index
+ else:
+ if name[0] == 'r':
+ left, right = right, left
+ name = name[1:]
+
+ if name in ('and', 'or') and dtype == 'bool':
+ opname = 'sparse_{name}_uint8'.format(name=name)
+ # to make template simple, cast here
+ left_sp_values = left.sp_values.view(np.uint8)
+ right_sp_values = right.sp_values.view(np.uint8)
+ result_dtype = np.bool
+ else:
+ opname = 'sparse_{name}_{dtype}'.format(name=name, dtype=dtype)
+ left_sp_values = left.sp_values
+ right_sp_values = right.sp_values
+
+ sparse_op = getattr(splib, opname)
+
+ with np.errstate(all='ignore'):
+ result, index, fill = sparse_op(
+ left_sp_values, left.sp_index, left.fill_value,
+ right_sp_values, right.sp_index, right.fill_value)
+
+ if result_dtype is None:
+ result_dtype = result.dtype
+
+ return _wrap_result(name, result, index, fill, dtype=result_dtype)
+
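+# Arithmetic between two SparseArrays is routed through _sparse_array_op;
+# a rough sketch of the observable behaviour (illustrative values):
+#
+#   >>> a = SparseArray([0, 1, 0], fill_value=0)
+#   >>> b = SparseArray([0, 0, 2], fill_value=0)
+#   >>> np.asarray(a + b)
+#   array([0, 1, 2])
+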
+
+def _wrap_result(name, data, sparse_index, fill_value, dtype=None):
+ """
+ wrap op result to have correct dtype
+ """
+ if name.startswith('__'):
+ # e.g. __eq__ --> eq
+ name = name[2:-2]
+
+ if name in ('eq', 'ne', 'lt', 'gt', 'le', 'ge'):
+ dtype = np.bool
+
+ fill_value = lib.item_from_zerodim(fill_value)
+
+ if is_bool_dtype(dtype):
+ # fill_value may be np.bool_
+ fill_value = bool(fill_value)
+ return SparseArray(data,
+ sparse_index=sparse_index,
+ fill_value=fill_value,
+ dtype=dtype)
+
+
+class SparseArray(PandasObject, ExtensionArray, ExtensionOpsMixin):
+ """
+ An ExtensionArray for storing sparse data.
+
+ .. versionchanged:: 0.24.0
+
+ Implements the ExtensionArray interface.
+
+ Parameters
+ ----------
+ data : array-like
+ A dense array of values to store in the SparseArray. This may contain
+ `fill_value`.
+ sparse_index : SparseIndex, optional
+ index : Index
+ fill_value : scalar, optional
+ Elements in `data` that are `fill_value` are not stored in the
+ SparseArray. For memory savings, this should be the most common value
+ in `data`. By default, `fill_value` depends on the dtype of `data`:
+
+ =========== ==========
+ data.dtype na_value
+ =========== ==========
+ float ``np.nan``
+ int ``0``
+ bool False
+ datetime64 ``pd.NaT``
+ timedelta64 ``pd.NaT``
+ =========== ==========
+
+ The fill value is potentially specified in three ways. In order of
+ precedence, these are
+
+ 1. The `fill_value` argument
+ 2. ``dtype.fill_value`` if `fill_value` is None and `dtype` is
+ a ``SparseDtype``
+ 3. ``data.dtype.fill_value`` if `fill_value` is None and `dtype`
+ is not a ``SparseDtype`` and `data` is a ``SparseArray``.
+
+
+ kind : {'integer', 'block'}, default 'integer'
+ The type of storage for sparse locations.
+
+ * 'block': Stores a `block` and `block_length` for each
+ contiguous *span* of sparse values. This is best when
+ sparse data tends to be clumped together, with large
+ regions of ``fill_value`` values between sparse values.
+ * 'integer': uses an integer to store the location of
+ each sparse value.
+
+ dtype : np.dtype or SparseDtype, optional
+ The dtype to use for the SparseArray. For numpy dtypes, this
+ determines the dtype of ``self.sp_values``. For SparseDtype,
+ this determines ``self.sp_values`` and ``self.fill_value``.
+ copy : bool, default False
+ Whether to explicitly copy the incoming `data` array.
+ """
+
+ __array_priority__ = 15
+ _pandas_ftype = 'sparse'
+ _subtyp = 'sparse_array' # register ABCSparseArray
+
+ def __init__(self, data, sparse_index=None, index=None, fill_value=None,
+ kind='integer', dtype=None, copy=False):
+ from pandas.core.internals import SingleBlockManager
+
+ if isinstance(data, SingleBlockManager):
+ data = data.internal_values()
+
+ if fill_value is None and isinstance(dtype, SparseDtype):
+ fill_value = dtype.fill_value
+
+ if isinstance(data, (type(self), ABCSparseSeries)):
+ # disable normal inference on dtype, sparse_index, & fill_value
+ if sparse_index is None:
+ sparse_index = data.sp_index
+ if fill_value is None:
+ fill_value = data.fill_value
+ if dtype is None:
+ dtype = data.dtype
+ # TODO: make kind=None, and use data.kind?
+ data = data.sp_values
+
+ # Handle user-provided dtype
+ if isinstance(dtype, compat.string_types):
+ # Two options: dtype='int', regular numpy dtype
+ # or dtype='Sparse[int]', a sparse dtype
+ try:
+ dtype = SparseDtype.construct_from_string(dtype)
+ except TypeError:
+ dtype = pandas_dtype(dtype)
+
+ if isinstance(dtype, SparseDtype):
+ if fill_value is None:
+ fill_value = dtype.fill_value
+ dtype = dtype.subtype
+
+ if index is not None and not is_scalar(data):
+ raise Exception("must only pass scalars with an index ")
+
+ if is_scalar(data):
+ if index is not None:
+ if data is None:
+ data = np.nan
+
+ if index is not None:
+ npoints = len(index)
+ elif sparse_index is None:
+ npoints = 1
+ else:
+ npoints = sparse_index.length
+
+ dtype = infer_dtype_from_scalar(data)[0]
+ data = construct_1d_arraylike_from_scalar(
+ data, npoints, dtype
+ )
+
+ if dtype is not None:
+ dtype = pandas_dtype(dtype)
+
+ # TODO: disentangle the fill_value dtype inference from
+ # dtype inference
+ if data is None:
+ # XXX: What should the empty dtype be? Object or float?
+ data = np.array([], dtype=dtype)
+
+ if not is_array_like(data):
+ try:
+ # probably shared code in sanitize_series
+ from pandas.core.internals.construction import sanitize_array
+ data = sanitize_array(data, index=None)
+ except ValueError:
+ # NumPy may raise a ValueError on data like [1, []]
+ # we retry with object dtype here.
+ if dtype is None:
+ dtype = object
+ data = np.atleast_1d(np.asarray(data, dtype=dtype))
+ else:
+ raise
+
+ if copy:
+ # TODO: avoid double copy when dtype forces cast.
+ data = data.copy()
+
+ if fill_value is None:
+ fill_value_dtype = data.dtype if dtype is None else dtype
+ if fill_value_dtype is None:
+ fill_value = np.nan
+ else:
+ fill_value = na_value_for_dtype(fill_value_dtype)
+
+ if isinstance(data, type(self)) and sparse_index is None:
+ sparse_index = data._sparse_index
+ sparse_values = np.asarray(data.sp_values, dtype=dtype)
+ elif sparse_index is None:
+ sparse_values, sparse_index, fill_value = make_sparse(
+ data, kind=kind, fill_value=fill_value, dtype=dtype
+ )
+ else:
+ sparse_values = np.asarray(data, dtype=dtype)
+ if len(sparse_values) != sparse_index.npoints:
+ raise AssertionError("Non array-like type {type} must "
+ "have the same length as the index"
+ .format(type=type(sparse_values)))
+ self._sparse_index = sparse_index
+ self._sparse_values = sparse_values
+ self._dtype = SparseDtype(sparse_values.dtype, fill_value)
+
+ @classmethod
+ def _simple_new(cls, sparse_array, sparse_index, dtype):
+ # type: (np.ndarray, SparseIndex, SparseDtype) -> 'SparseArray'
+ new = cls([])
+ new._sparse_index = sparse_index
+ new._sparse_values = sparse_array
+ new._dtype = dtype
+ return new
+
+ def __array__(self, dtype=None, copy=True):
+ fill_value = self.fill_value
+
+ if self.sp_index.ngaps == 0:
+ # Compat for na dtype and int values.
+ return self.sp_values
+ if dtype is None:
+ # Can NumPy represent this type?
+ # If not, `np.result_type` will raise. We catch that
+ # and return object.
+ if is_datetime64_any_dtype(self.sp_values.dtype):
+ # However, we *do* special-case the common case of
+ # a datetime64 with pandas NaT.
+ if fill_value is NaT:
+ # Can't put pd.NaT in a datetime64[ns]
+ fill_value = np.datetime64('NaT')
+ try:
+ dtype = np.result_type(self.sp_values.dtype, type(fill_value))
+ except TypeError:
+ dtype = object
+
+ out = np.full(self.shape, fill_value, dtype=dtype)
+ out[self.sp_index.to_int_index().indices] = self.sp_values
+ return out
+
+ def __setitem__(self, key, value):
+ # I suppose we could allow setting of non-fill_value elements.
+ # TODO(SparseArray.__setitem__): remove special cases in
+ # ExtensionBlock.where
+ msg = "SparseArray does not support item assignment via setitem"
+ raise TypeError(msg)
+
+ @classmethod
+ def _from_sequence(cls, scalars, dtype=None, copy=False):
+ return cls(scalars, dtype=dtype)
+
+ @classmethod
+ def _from_factorized(cls, values, original):
+ return cls(values, dtype=original.dtype)
+
+ # ------------------------------------------------------------------------
+ # Data
+ # ------------------------------------------------------------------------
+ @property
+ def sp_index(self):
+ """
+ The SparseIndex containing the location of non- ``fill_value`` points.
+ """
+ return self._sparse_index
+
+ @property
+ def sp_values(self):
+ """
+ An ndarray containing the non- ``fill_value`` values.
+
+ Examples
+ --------
+ >>> s = SparseArray([0, 0, 1, 0, 2], fill_value=0)
+ >>> s.sp_values
+ array([1, 2])
+ """
+ return self._sparse_values
+
+ @property
+ def dtype(self):
+ return self._dtype
+
+ @property
+ def fill_value(self):
+ """
+ Elements in `data` that are `fill_value` are not stored.
+
+ For memory savings, this should be the most common value in the array.
+ """
+ return self.dtype.fill_value
+
+ @fill_value.setter
+ def fill_value(self, value):
+ self._dtype = SparseDtype(self.dtype.subtype, value)
+
+ @property
+ def kind(self):
+ """
+ The kind of sparse index for this array. One of {'integer', 'block'}.
+ """
+ if isinstance(self.sp_index, IntIndex):
+ return 'integer'
+ else:
+ return 'block'
+
+ @property
+ def _valid_sp_values(self):
+ sp_vals = self.sp_values
+ mask = notna(sp_vals)
+ return sp_vals[mask]
+
+ def __len__(self):
+ return self.sp_index.length
+
+ @property
+ def _null_fill_value(self):
+ return self._dtype._is_na_fill_value
+
+ def _fill_value_matches(self, fill_value):
+ if self._null_fill_value:
+ return isna(fill_value)
+ else:
+ return self.fill_value == fill_value
+
+ @property
+ def nbytes(self):
+ return self.sp_values.nbytes + self.sp_index.nbytes
+
+ @property
+ def density(self):
+ """
+ The percent of non- ``fill_value`` points, as decimal.
+
+ Examples
+ --------
+ >>> s = SparseArray([0, 0, 1, 1, 1], fill_value=0)
+ >>> s.density
+ 0.6
+ """
+ r = float(self.sp_index.npoints) / float(self.sp_index.length)
+ return r
+
+ @property
+ def npoints(self):
+ """
+ The number of non- ``fill_value`` points.
+
+ Examples
+ --------
+ >>> s = SparseArray([0, 0, 1, 1, 1], fill_value=0)
+ >>> s.npoints
+ 3
+ """
+ return self.sp_index.npoints
+
+ @property
+ def values(self):
+ """
+ Dense values
+ """
+ return self.to_dense()
+
+ def isna(self):
+ from pandas import isna
+ # If null fill value, we want SparseDtype[bool, true]
+ # to preserve the same memory usage.
+ dtype = SparseDtype(bool, self._null_fill_value)
+ return type(self)._simple_new(isna(self.sp_values),
+ self.sp_index, dtype)
+
+ def fillna(self, value=None, method=None, limit=None):
+ """
+ Fill missing values with `value`.
+
+ Parameters
+ ----------
+ value : scalar, optional
+ method : str, optional
+
+ .. warning::
+
+ Using 'method' will result in high memory use,
+ as the array is converted to an in-memory ndarray
+ before filling.
+
+ limit : int, optional
+
+ Returns
+ -------
+ SparseArray
+
+ Notes
+ -----
+ When `value` is specified, the result's ``fill_value`` depends on
+ ``self.fill_value``. The goal is to maintain low-memory use.
+
+ If ``self.fill_value`` is NA, the result dtype will be
+ ``SparseDtype(self.dtype, fill_value=value)``. This will preserve
+ the amount of memory used before and after filling.
+
+ When ``self.fill_value`` is not NA, the result dtype will be
+ ``self.dtype``. Again, this preserves the amount of memory used.
+ """
+ if ((method is None and value is None) or
+ (method is not None and value is not None)):
+ raise ValueError("Must specify one of 'method' or 'value'.")
+
+ elif method is not None:
+ msg = "fillna with 'method' requires high memory usage."
+ warnings.warn(msg, PerformanceWarning)
+ filled = interpolate_2d(np.asarray(self), method=method,
+ limit=limit)
+ return type(self)(filled, fill_value=self.fill_value)
+
+ else:
+ new_values = np.where(isna(self.sp_values), value, self.sp_values)
+
+ if self._null_fill_value:
+ # This is essentially just updating the dtype.
+ new_dtype = SparseDtype(self.dtype.subtype, fill_value=value)
+ else:
+ new_dtype = self.dtype
+
+ return self._simple_new(new_values, self._sparse_index, new_dtype)
+
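+ # Sketch of the fill_value behaviour described above, for the NA
+ # fill-value case (illustrative values):
+ #
+ #   >>> arr = SparseArray([1.0, np.nan, 2.0])
+ #   >>> arr.fillna(0.0).dtype
+ #   Sparse[float64, 0.0]
+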
+ def shift(self, periods=1, fill_value=None):
+
+ if not len(self) or periods == 0:
+ return self.copy()
+
+ if isna(fill_value):
+ fill_value = self.dtype.na_value
+
+ subtype = np.result_type(fill_value, self.dtype.subtype)
+
+ if subtype != self.dtype.subtype:
+ # just coerce up front
+ arr = self.astype(SparseDtype(subtype, self.fill_value))
+ else:
+ arr = self
+
+ empty = self._from_sequence(
+ [fill_value] * min(abs(periods), len(self)),
+ dtype=arr.dtype
+ )
+
+ if periods > 0:
+ a = empty
+ b = arr[:-periods]
+ else:
+ a = arr[abs(periods):]
+ b = empty
+ return arr._concat_same_type([a, b])
+
+ def _first_fill_value_loc(self):
+ """
+ Get the location of the first fill value.
+
+ Returns
+ -------
+ int
+ """
+ if len(self) == 0 or self.sp_index.npoints == len(self):
+ return -1
+
+ indices = self.sp_index.to_int_index().indices
+ if not len(indices) or indices[0] > 0:
+ return 0
+
+ diff = indices[1:] - indices[:-1]
+ return np.searchsorted(diff, 2) + 1
+
+ def unique(self):
+ uniques = list(algos.unique(self.sp_values))
+ fill_loc = self._first_fill_value_loc()
+ if fill_loc >= 0:
+ uniques.insert(fill_loc, self.fill_value)
+ return type(self)._from_sequence(uniques, dtype=self.dtype)
+
+ def _values_for_factorize(self):
+ # Still override this for hash_pandas_object
+ return np.asarray(self), self.fill_value
+
+ def factorize(self, na_sentinel=-1):
+ # Currently, ExtensionArray.factorize -> Tuple[ndarray, EA]
+ # The sparsity on this is backwards from what Sparse would want. Want
+ # ExtensionArray.factorize -> Tuple[EA, EA]
+ # Given that we have to return a dense array of labels, why bother
+ # implementing an efficient factorize?
+ labels, uniques = algos.factorize(np.asarray(self),
+ na_sentinel=na_sentinel)
+ uniques = SparseArray(uniques, dtype=self.dtype)
+ return labels, uniques
+
+ def value_counts(self, dropna=True):
+ """
+ Returns a Series containing counts of unique values.
+
+ Parameters
+ ----------
+ dropna : boolean, default True
+ Don't include counts of NaN, even if NaN is in sp_values.
+
+ Returns
+ -------
+ counts : Series
+ """
+ from pandas import Index, Series
+
+ keys, counts = algos._value_counts_arraylike(self.sp_values,
+ dropna=dropna)
+ fcounts = self.sp_index.ngaps
+ if fcounts > 0:
+ if self._null_fill_value and dropna:
+ pass
+ else:
+ if self._null_fill_value:
+ mask = isna(keys)
+ else:
+ mask = keys == self.fill_value
+
+ if mask.any():
+ counts[mask] += fcounts
+ else:
+ keys = np.insert(keys, 0, self.fill_value)
+ counts = np.insert(counts, 0, fcounts)
+
+ if not isinstance(keys, ABCIndexClass):
+ keys = Index(keys)
+ result = Series(counts, index=keys)
+ return result
+
+ # --------
+ # Indexing
+ # --------
+
+ def __getitem__(self, key):
+ if isinstance(key, tuple):
+ if len(key) > 1:
+ raise IndexError("too many indices for array.")
+ key = key[0]
+
+ if is_integer(key):
+ return self._get_val_at(key)
+ elif isinstance(key, tuple):
+ data_slice = self.values[key]
+ elif isinstance(key, slice):
+ # special case to preserve dtypes
+ if key == slice(None):
+ return self.copy()
+ # TODO: this logic is surely elsewhere
+ # TODO: this could be more efficient
+ indices = np.arange(len(self), dtype=np.int32)[key]
+ return self.take(indices)
+ else:
+ # TODO: I think we can avoid densifying when masking a
+ # boolean SparseArray with another. Need to look at the
+ # key's fill_value for True / False, and then do an intersection
+ # on the indices of the sp_values.
+ if isinstance(key, SparseArray):
+ if is_bool_dtype(key):
+ key = key.to_dense()
+ else:
+ key = np.asarray(key)
+
+ if com.is_bool_indexer(key) and len(self) == len(key):
+ return self.take(np.arange(len(key), dtype=np.int32)[key])
+ elif hasattr(key, '__len__'):
+ return self.take(key)
+ else:
+ raise ValueError("Cannot slice with '{}'".format(key))
+
+ return type(self)(data_slice, kind=self.kind)
+
+ def _get_val_at(self, loc):
+ n = len(self)
+ if loc < 0:
+ loc += n
+
+ if loc >= n or loc < 0:
+ raise IndexError('Out of bounds access')
+
+ sp_loc = self.sp_index.lookup(loc)
+ if sp_loc == -1:
+ return self.fill_value
+ else:
+ return libindex.get_value_at(self.sp_values, sp_loc)
+
+ def take(self, indices, allow_fill=False, fill_value=None):
+ if is_scalar(indices):
+ raise ValueError("'indices' must be an array, not a "
+ "scalar '{}'.".format(indices))
+ indices = np.asarray(indices, dtype=np.int32)
+
+ if indices.size == 0:
+ result = []
+ kwargs = {'dtype': self.dtype}
+ elif allow_fill:
+ result = self._take_with_fill(indices, fill_value=fill_value)
+ kwargs = {}
+ else:
+ result = self._take_without_fill(indices)
+ kwargs = {'dtype': self.dtype}
+
+ return type(self)(result, fill_value=self.fill_value, kind=self.kind,
+ **kwargs)
+
+ def _take_with_fill(self, indices, fill_value=None):
+ if fill_value is None:
+ fill_value = self.dtype.na_value
+
+ if indices.min() < -1:
+ raise ValueError("Invalid value in 'indices'. Must be between -1 "
+ "and the length of the array.")
+
+ if indices.max() >= len(self):
+ raise IndexError("out of bounds value in 'indices'.")
+
+ if len(self) == 0:
+ # Empty... Allow taking only if all empty
+ if (indices == -1).all():
+ dtype = np.result_type(self.sp_values, type(fill_value))
+ taken = np.empty_like(indices, dtype=dtype)
+ taken.fill(fill_value)
+ return taken
+ else:
+ raise IndexError('cannot do a non-empty take from an empty '
+ 'axes.')
+
+ sp_indexer = self.sp_index.lookup_array(indices)
+
+ if self.sp_index.npoints == 0:
+ # Avoid taking from the empty self.sp_values
+ taken = np.full(sp_indexer.shape, fill_value=fill_value,
+ dtype=np.result_type(type(fill_value)))
+ else:
+ taken = self.sp_values.take(sp_indexer)
+
+ # sp_indexer may be -1 for two reasons
+ # 1.) we took for an index of -1 (new)
+ # 2.) we took a value that was self.fill_value (old)
+ new_fill_indices = indices == -1
+ old_fill_indices = (sp_indexer == -1) & ~new_fill_indices
+
+ # Fill in two steps.
+ # Old fill values
+ # New fill values
+ # potentially coercing to a new dtype at each stage.
+
+ m0 = sp_indexer[old_fill_indices] < 0
+ m1 = sp_indexer[new_fill_indices] < 0
+
+ result_type = taken.dtype
+
+ if m0.any():
+ result_type = np.result_type(result_type,
+ type(self.fill_value))
+ taken = taken.astype(result_type)
+ taken[old_fill_indices] = self.fill_value
+
+ if m1.any():
+ result_type = np.result_type(result_type, type(fill_value))
+ taken = taken.astype(result_type)
+ taken[new_fill_indices] = fill_value
+
+ return taken
+
+ def _take_without_fill(self, indices):
+ to_shift = indices < 0
+ indices = indices.copy()
+
+ n = len(self)
+
+ if (indices.max() >= n) or (indices.min() < -n):
+ if n == 0:
+ raise IndexError("cannot do a non-empty take from an "
+ "empty axes.")
+ else:
+ raise IndexError("out of bounds value in 'indices'.")
+
+ if to_shift.any():
+ indices[to_shift] += n
+
+ if self.sp_index.npoints == 0:
+ # edge case in take: there are no stored points, so every
+ # position takes the fill value; build the result directly.
+ out = np.full(indices.shape, self.fill_value,
+ dtype=np.result_type(type(self.fill_value)))
+ arr, sp_index, fill_value = make_sparse(out,
+ fill_value=self.fill_value)
+ return type(self)(arr, sparse_index=sp_index,
+ fill_value=fill_value)
+
+ sp_indexer = self.sp_index.lookup_array(indices)
+ taken = self.sp_values.take(sp_indexer)
+ fillable = (sp_indexer < 0)
+
+ if fillable.any():
+ # TODO: may need to coerce array to fill value
+ result_type = np.result_type(taken, type(self.fill_value))
+ taken = taken.astype(result_type)
+ taken[fillable] = self.fill_value
+
+ return taken
+
+ def searchsorted(self, v, side="left", sorter=None):
+ msg = "searchsorted requires high memory usage."
+ warnings.warn(msg, PerformanceWarning, stacklevel=2)
+ if not is_scalar(v):
+ v = np.asarray(v)
+ v = np.asarray(v)
+ return np.asarray(self, dtype=self.dtype.subtype).searchsorted(
+ v, side, sorter
+ )
+
+ def copy(self, deep=False):
+ if deep:
+ values = self.sp_values.copy()
+ else:
+ values = self.sp_values
+
+ return self._simple_new(values, self.sp_index, self.dtype)
+
+ @classmethod
+ def _concat_same_type(cls, to_concat):
+ fill_values = [x.fill_value for x in to_concat]
+
+ fill_value = fill_values[0]
+
+ # np.nan isn't a singleton, so we may end up with multiple
+ # NaNs here, so we ignore the all-NA case too.
+ if not (len(set(fill_values)) == 1 or isna(fill_values).all()):
+ warnings.warn("Concatenating sparse arrays with multiple fill "
+ "values: '{}'. Picking the first and "
+ "converting the rest.".format(fill_values),
+ PerformanceWarning,
+ stacklevel=6)
+ keep = to_concat[0]
+ to_concat2 = [keep]
+
+ for arr in to_concat[1:]:
+ to_concat2.append(cls(np.asarray(arr), fill_value=fill_value))
+
+ to_concat = to_concat2
+
+ values = []
+ length = 0
+
+ if to_concat:
+ sp_kind = to_concat[0].kind
+ else:
+ sp_kind = 'integer'
+
+ if sp_kind == 'integer':
+ indices = []
+
+ for arr in to_concat:
+ idx = arr.sp_index.to_int_index().indices.copy()
+ idx += length # TODO: wraparound
+ length += arr.sp_index.length
+
+ values.append(arr.sp_values)
+ indices.append(idx)
+
+ data = np.concatenate(values)
+ indices = np.concatenate(indices)
+ sp_index = IntIndex(length, indices)
+
+ else:
+ # When concatenating block indices, we don't claim that you'll
+ # get an identical index as concatenating the values and then
+ # creating a new index. We don't want to spend the time trying
+ # to merge blocks across arrays in `to_concat`, so the resulting
+ # BlockIndex may have more blocks.
+ blengths = []
+ blocs = []
+
+ for arr in to_concat:
+ idx = arr.sp_index.to_block_index()
+
+ values.append(arr.sp_values)
+ blocs.append(idx.blocs.copy() + length)
+ blengths.append(idx.blengths)
+ length += arr.sp_index.length
+
+ data = np.concatenate(values)
+ blocs = np.concatenate(blocs)
+ blengths = np.concatenate(blengths)
+
+ sp_index = BlockIndex(length, blocs, blengths)
+
+ return cls(data, sparse_index=sp_index, fill_value=fill_value)
+
+ def astype(self, dtype=None, copy=True):
+ """
+ Change the dtype of a SparseArray.
+
+ The output will always be a SparseArray. To convert to a dense
+ ndarray with a certain dtype, use :meth:`numpy.asarray`.
+
+ Parameters
+ ----------
+ dtype : np.dtype or ExtensionDtype
+ For SparseDtype, this changes the dtype of
+ ``self.sp_values`` and the ``self.fill_value``.
+
+ For other dtypes, this only changes the dtype of
+ ``self.sp_values``.
+
+ copy : bool, default True
+ Whether to ensure a copy is made, even if not necessary.
+
+ Returns
+ -------
+ SparseArray
+
+ Examples
+ --------
+ >>> arr = SparseArray([0, 0, 1, 2])
+ >>> arr
+ [0, 0, 1, 2]
+ Fill: 0
+ IntIndex
+ Indices: array([2, 3], dtype=int32)
+
+ >>> arr.astype(np.dtype('int32'))
+ [0, 0, 1, 2]
+ Fill: 0
+ IntIndex
+ Indices: array([2, 3], dtype=int32)
+
+ Using a NumPy dtype with a different kind (e.g. float) will coerce
+ just ``self.sp_values``.
+
+ >>> arr.astype(np.dtype('float64'))
+ ... # doctest: +NORMALIZE_WHITESPACE
+ [0, 0, 1.0, 2.0]
+ Fill: 0
+ IntIndex
+ Indices: array([2, 3], dtype=int32)
+
+ Use a SparseDtype if you wish to change the fill value as well.
+
+ >>> arr.astype(SparseDtype("float64", fill_value=np.nan))
+ ... # doctest: +NORMALIZE_WHITESPACE
+ [nan, nan, 1.0, 2.0]
+ Fill: nan
+ IntIndex
+ Indices: array([2, 3], dtype=int32)
+ """
+ dtype = self.dtype.update_dtype(dtype)
+ subtype = dtype._subtype_with_str
+ sp_values = astype_nansafe(self.sp_values,
+ subtype,
+ copy=copy)
+ if sp_values is self.sp_values and copy:
+ sp_values = sp_values.copy()
+
+ return self._simple_new(sp_values,
+ self.sp_index,
+ dtype)
+
+ def map(self, mapper):
+ """
+ Map categories using input correspondence (dict, Series, or function).
+
+ Parameters
+ ----------
+ mapper : dict, Series, callable
+ The correspondence from old values to new.
+
+ Returns
+ -------
+ SparseArray
+ The output array will have the same density as the input.
+ The output fill value will be the result of applying the
+ mapping to ``self.fill_value``
+
+ Examples
+ --------
+ >>> arr = pd.SparseArray([0, 1, 2])
+ >>> arr.map(lambda x: x + 10)
+ [10, 11, 12]
+ Fill: 10
+ IntIndex
+ Indices: array([1, 2], dtype=int32)
+
+ >>> arr.map({0: 10, 1: 11, 2: 12})
+ [10, 11, 12]
+ Fill: 10
+ IntIndex
+ Indices: array([1, 2], dtype=int32)
+
+ >>> arr.map(pd.Series([10, 11, 12], index=[0, 1, 2]))
+ [10, 11, 12]
+ Fill: 10
+ IntIndex
+ Indices: array([1, 2], dtype=int32)
+ """
+ # this is used in apply.
+ # We get hit since we're an "is_extension_type" but regular extension
+ # types are not hit. This may be worth adding to the interface.
+ if isinstance(mapper, ABCSeries):
+ mapper = mapper.to_dict()
+
+ if isinstance(mapper, compat.Mapping):
+ fill_value = mapper.get(self.fill_value, self.fill_value)
+ sp_values = [mapper.get(x, None) for x in self.sp_values]
+ else:
+ fill_value = mapper(self.fill_value)
+ sp_values = [mapper(x) for x in self.sp_values]
+
+ return type(self)(sp_values, sparse_index=self.sp_index,
+ fill_value=fill_value)
+
+ def to_dense(self):
+ """
+ Convert SparseArray to a NumPy array.
+
+ Returns
+ -------
+ arr : NumPy array
+ """
+ return np.asarray(self, dtype=self.sp_values.dtype)
+
+ # TODO: Look into deprecating this in favor of `to_dense`.
+ get_values = to_dense
+
+ # ------------------------------------------------------------------------
+ # IO
+ # ------------------------------------------------------------------------
+ def __setstate__(self, state):
+ """Necessary for making this object picklable"""
+ if isinstance(state, tuple):
+ # Compat for pandas < 0.24.0
+ nd_state, (fill_value, sp_index) = state
+ sparse_values = np.array([])
+ sparse_values.__setstate__(nd_state)
+
+ self._sparse_values = sparse_values
+ self._sparse_index = sp_index
+ self._dtype = SparseDtype(sparse_values.dtype, fill_value)
+ else:
+ self.__dict__.update(state)
+
+ def nonzero(self):
+ if self.fill_value == 0:
+ return self.sp_index.to_int_index().indices,
+ else:
+ return self.sp_index.to_int_index().indices[self.sp_values != 0],
+
+ # ------------------------------------------------------------------------
+ # Reductions
+ # ------------------------------------------------------------------------
+
+ def _reduce(self, name, skipna=True, **kwargs):
+ method = getattr(self, name, None)
+
+ if method is None:
+ raise TypeError("cannot perform {name} with type {dtype}".format(
+ name=name, dtype=self.dtype))
+
+ if skipna:
+ arr = self
+ else:
+ arr = self.dropna()
+
+ # we don't support these kwargs.
+ # They should only be present when called via pandas, so do it here.
+ # instead of in `any` / `all` (which will raise if they're present,
+ # thanks to nv.validate
+ kwargs.pop('filter_type', None)
+ kwargs.pop('numeric_only', None)
+ kwargs.pop('op', None)
+ return getattr(arr, name)(**kwargs)
+
+ def all(self, axis=None, *args, **kwargs):
+ """
+ Tests whether all elements evaluate True
+
+ Returns
+ -------
+ all : bool
+
+ See Also
+ --------
+ numpy.all
+ """
+ nv.validate_all(args, kwargs)
+
+ values = self.sp_values
+
+ if len(values) != len(self) and not np.all(self.fill_value):
+ return False
+
+ return values.all()
+
+ def any(self, axis=0, *args, **kwargs):
+ """
+ Tests whether at least one element evaluates True
+
+ Returns
+ -------
+ any : bool
+
+ See Also
+ --------
+ numpy.any
+ """
+ nv.validate_any(args, kwargs)
+
+ values = self.sp_values
+
+ if len(values) != len(self) and np.any(self.fill_value):
+ return True
+
+ return values.any().item()
+
+ def sum(self, axis=0, *args, **kwargs):
+ """
+ Sum of non-NA/null values
+
+ Returns
+ -------
+ sum : float
+ """
+ nv.validate_sum(args, kwargs)
+ valid_vals = self._valid_sp_values
+ sp_sum = valid_vals.sum()
+ if self._null_fill_value:
+ return sp_sum
+ else:
+ nsparse = self.sp_index.ngaps
+ return sp_sum + self.fill_value * nsparse
+
+ def cumsum(self, axis=0, *args, **kwargs):
+ """
+ Cumulative sum of non-NA/null values.
+
+ When performing the cumulative summation, any NA/null values will
+ be skipped. The resulting SparseArray will preserve the locations of
+ NaN values, but the fill value will be `np.nan` regardless.
+
+ Parameters
+ ----------
+ axis : int or None
+ Axis over which to perform the cumulative summation. If None,
+ perform cumulative summation over flattened array.
+
+ Returns
+ -------
+ cumsum : SparseArray
+ """
+ nv.validate_cumsum(args, kwargs)
+
+ if axis is not None and axis >= self.ndim: # Mimic ndarray behaviour.
+ raise ValueError("axis(={axis}) out of bounds".format(axis=axis))
+
+ if not self._null_fill_value:
+ return SparseArray(self.to_dense()).cumsum()
+
+ return SparseArray(self.sp_values.cumsum(), sparse_index=self.sp_index,
+ fill_value=self.fill_value)
+
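+ # Sketch of ``cumsum`` with the default NaN fill value: NaN positions
+ # are preserved (shown densified for readability; illustrative values):
+ #
+ #   >>> np.asarray(SparseArray([1.0, np.nan, 2.0]).cumsum())
+ #   array([ 1., nan,  3.])
+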
+ def mean(self, axis=0, *args, **kwargs):
+ """
+ Mean of non-NA/null values
+
+ Returns
+ -------
+ mean : float
+ """
+ nv.validate_mean(args, kwargs)
+ valid_vals = self._valid_sp_values
+ sp_sum = valid_vals.sum()
+ ct = len(valid_vals)
+
+ if self._null_fill_value:
+ return sp_sum / ct
+ else:
+ nsparse = self.sp_index.ngaps
+ return (sp_sum + self.fill_value * nsparse) / (ct + nsparse)
+
+ def transpose(self, *axes):
+ """
+ Returns the SparseArray.
+ """
+ return self
+
+ @property
+ def T(self):
+ """
+ Returns the SparseArray.
+ """
+ return self
+
+ # ------------------------------------------------------------------------
+ # Ufuncs
+ # ------------------------------------------------------------------------
+
+ def __array_wrap__(self, array, context=None):
+ from pandas.core.dtypes.generic import ABCSparseSeries
+
+ ufunc, inputs, _ = context
+ inputs = tuple(x.values if isinstance(x, ABCSparseSeries) else x
+ for x in inputs)
+ return self.__array_ufunc__(ufunc, '__call__', *inputs)
+
+ _HANDLED_TYPES = (np.ndarray, numbers.Number)
+
+ def __array_ufunc__(self, ufunc, method, *inputs, **kwargs):
+ out = kwargs.get('out', ())
+
+ for x in inputs + out:
+ if not isinstance(x, self._HANDLED_TYPES + (SparseArray,)):
+ return NotImplemented
+
+ special = {'add', 'sub', 'mul', 'pow', 'mod', 'floordiv', 'truediv',
+ 'divmod', 'eq', 'ne', 'lt', 'gt', 'le', 'ge', 'remainder'}
+ if compat.PY2:
+ special.add('div')
+ aliases = {
+ 'subtract': 'sub',
+ 'multiply': 'mul',
+ 'floor_divide': 'floordiv',
+ 'true_divide': 'truediv',
+ 'power': 'pow',
+ 'remainder': 'mod',
+ 'divide': 'div',
+ 'equal': 'eq',
+ 'not_equal': 'ne',
+ 'less': 'lt',
+ 'less_equal': 'le',
+ 'greater': 'gt',
+ 'greater_equal': 'ge',
+ }
+
+ flipped = {
+ 'lt': '__gt__',
+ 'le': '__ge__',
+ 'gt': '__lt__',
+ 'ge': '__le__',
+ 'eq': '__eq__',
+ 'ne': '__ne__',
+ }
+
+ op_name = ufunc.__name__
+ op_name = aliases.get(op_name, op_name)
+
+ if op_name in special and kwargs.get('out') is None:
+ if isinstance(inputs[0], type(self)):
+ return getattr(self, '__{}__'.format(op_name))(inputs[1])
+ else:
+ name = flipped.get(op_name, '__r{}__'.format(op_name))
+ return getattr(self, name)(inputs[0])
+
+ if len(inputs) == 1:
+ # No alignment necessary.
+ sp_values = getattr(ufunc, method)(self.sp_values, **kwargs)
+ fill_value = getattr(ufunc, method)(self.fill_value, **kwargs)
+ return self._simple_new(sp_values,
+ self.sp_index,
+ SparseDtype(sp_values.dtype, fill_value))
+
+ result = getattr(ufunc, method)(*[np.asarray(x) for x in inputs],
+ **kwargs)
+ if out:
+ if len(out) == 1:
+ out = out[0]
+ return out
+
+ if type(result) is tuple:
+ return tuple(type(self)(x) for x in result)
+ elif method == 'at':
+ # no return value
+ return None
+ else:
+ return type(self)(result)
+
+ def __abs__(self):
+ return np.abs(self)
+
+ # ------------------------------------------------------------------------
+ # Ops
+ # ------------------------------------------------------------------------
+
+ @classmethod
+ def _create_unary_method(cls, op):
+ def sparse_unary_method(self):
+ fill_value = op(np.array(self.fill_value)).item()
+ values = op(self.sp_values)
+ dtype = SparseDtype(values.dtype, fill_value)
+ return cls._simple_new(values, self.sp_index, dtype)
+
+ name = '__{name}__'.format(name=op.__name__)
+ return compat.set_function_name(sparse_unary_method, name, cls)
+
+ @classmethod
+ def _create_arithmetic_method(cls, op):
+ def sparse_arithmetic_method(self, other):
+ op_name = op.__name__
+
+ if isinstance(other, (ABCSeries, ABCIndexClass)):
+ # Rely on pandas to dispatch to us.
+ return NotImplemented
+
+ if isinstance(other, SparseArray):
+ return _sparse_array_op(self, other, op, op_name)
+
+ elif is_scalar(other):
+ with np.errstate(all='ignore'):
+ fill = op(_get_fill(self), np.asarray(other))
+ result = op(self.sp_values, other)
+
+ if op_name == 'divmod':
+ left, right = result
+ lfill, rfill = fill
+ return (_wrap_result(op_name, left, self.sp_index, lfill),
+ _wrap_result(op_name, right, self.sp_index, rfill))
+
+ return _wrap_result(op_name, result, self.sp_index, fill)
+
+ else:
+ other = np.asarray(other)
+ with np.errstate(all='ignore'):
+ # TODO: delete sparse stuff in core/ops.py
+ # TODO: look into _wrap_result
+ if len(self) != len(other):
+ raise AssertionError(
+ ("length mismatch: {self} vs. {other}".format(
+ self=len(self), other=len(other))))
+ if not isinstance(other, SparseArray):
+ dtype = getattr(other, 'dtype', None)
+ other = SparseArray(other, fill_value=self.fill_value,
+ dtype=dtype)
+ return _sparse_array_op(self, other, op, op_name)
+
+ name = '__{name}__'.format(name=op.__name__)
+ return compat.set_function_name(sparse_arithmetic_method, name, cls)
+
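+    # Illustrative sketch of the scalar branch above: the op is applied to the
+    # stored values and, separately, to the fill value.
+    #
+    #     >>> arr = SparseArray([0, 1, 0, 2], fill_value=0)
+    #     >>> (arr + 1).fill_value            # op(fill_value, 1)
+    #     1
+    #     >>> (arr + 1).sp_values             # op(sp_values, 1)
+    #     array([2, 3])
+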
+ @classmethod
+ def _create_comparison_method(cls, op):
+ def cmp_method(self, other):
+ op_name = op.__name__
+
+ if op_name in {'and_', 'or_'}:
+ op_name = op_name[:-1]
+
+ if isinstance(other, (ABCSeries, ABCIndexClass)):
+ # Rely on pandas to unbox and dispatch to us.
+ return NotImplemented
+
+ if not is_scalar(other) and not isinstance(other, type(self)):
+ # convert list-like to ndarray
+ other = np.asarray(other)
+
+ if isinstance(other, np.ndarray):
+ # TODO: make this more flexible than just ndarray...
+ if len(self) != len(other):
+ raise AssertionError("length mismatch: {self} vs. {other}"
+ .format(self=len(self),
+ other=len(other)))
+ other = SparseArray(other, fill_value=self.fill_value)
+
+ if isinstance(other, SparseArray):
+ return _sparse_array_op(self, other, op, op_name)
+ else:
+ with np.errstate(all='ignore'):
+ fill_value = op(self.fill_value, other)
+ result = op(self.sp_values, other)
+
+ return type(self)(result,
+ sparse_index=self.sp_index,
+ fill_value=fill_value,
+ dtype=np.bool_)
+
+ name = '__{name}__'.format(name=op.__name__)
+ return compat.set_function_name(cmp_method, name, cls)
+
+ @classmethod
+ def _add_unary_ops(cls):
+ cls.__pos__ = cls._create_unary_method(operator.pos)
+ cls.__neg__ = cls._create_unary_method(operator.neg)
+ cls.__invert__ = cls._create_unary_method(operator.invert)
+
+ @classmethod
+ def _add_comparison_ops(cls):
+ cls.__and__ = cls._create_comparison_method(operator.and_)
+ cls.__or__ = cls._create_comparison_method(operator.or_)
+ super(SparseArray, cls)._add_comparison_ops()
+
+ # ----------
+ # Formatting
+ # -----------
+ def __unicode__(self):
+ return '{self}\nFill: {fill}\n{index}'.format(
+ self=printing.pprint_thing(self),
+ fill=printing.pprint_thing(self.fill_value),
+ index=printing.pprint_thing(self.sp_index))
+
+ def _formatter(self, boxed=False):
+ # Defer to the formatter from the GenericArrayFormatter calling us.
+ # This will infer the correct formatter from the dtype of the values.
+ return None
+
+
+SparseArray._add_arithmetic_ops()
+SparseArray._add_comparison_ops()
+SparseArray._add_unary_ops()
+
+
+def _maybe_to_dense(obj):
+ """
+ try to convert to dense
+ """
+ if hasattr(obj, 'to_dense'):
+ return obj.to_dense()
+ return obj
+
+
+def _maybe_to_sparse(array):
+ """
+    Convert a SparseSeries to a SparseArray; other inputs are returned as-is.
+ """
+ if isinstance(array, ABCSparseSeries):
+ array = array.values.copy()
+ return array
+
+
+def _sanitize_values(arr):
+ """
+ return an ndarray for our input,
+ in a platform independent manner
+ """
+
+ if hasattr(arr, 'values'):
+ arr = arr.values
+ else:
+
+ # scalar
+ if is_scalar(arr):
+ arr = [arr]
+
+ # ndarray
+ if isinstance(arr, np.ndarray):
+ pass
+
+ elif is_list_like(arr) and len(arr) > 0:
+ arr = maybe_convert_platform(arr)
+
+ else:
+ arr = np.asarray(arr)
+
+ return arr
+
+
+def make_sparse(arr, kind='block', fill_value=None, dtype=None, copy=False):
+ """
+ Convert ndarray to sparse format
+
+ Parameters
+ ----------
+ arr : ndarray
+ kind : {'block', 'integer'}
+ fill_value : NaN or another value
+ dtype : np.dtype, optional
+ copy : bool, default False
+
+ Returns
+ -------
+ (sparse_values, index, fill_value) : (ndarray, SparseIndex, Scalar)
+ """
+
+ arr = _sanitize_values(arr)
+
+ if arr.ndim > 1:
+ raise TypeError("expected dimension <= 1 data")
+
+ if fill_value is None:
+ fill_value = na_value_for_dtype(arr.dtype)
+
+ if isna(fill_value):
+ mask = notna(arr)
+ else:
+ # For str arrays in NumPy 1.12.0, operator!= below isn't
+ # element-wise but just returns False if fill_value is not str,
+ # so cast to object comparison to be safe
+ if is_string_dtype(arr):
+ arr = arr.astype(object)
+
+ if is_object_dtype(arr.dtype):
+            # numpy's element-wise equality check does not distinguish
+            # element types, e.g. 0, 0.0, and False are treated as the same,
+            # so we have to check both the type and the value.
+ mask = splib.make_mask_object_ndarray(arr, fill_value)
+ else:
+ mask = arr != fill_value
+
+ length = len(arr)
+ if length != len(mask):
+ # the arr is a SparseArray
+ indices = mask.sp_index.indices
+ else:
+ indices = mask.nonzero()[0].astype(np.int32)
+
+ index = _make_index(length, indices, kind)
+ sparsified_values = arr[mask]
+ if dtype is not None:
+ sparsified_values = astype_nansafe(sparsified_values, dtype=dtype)
+ # TODO: copy
+ return sparsified_values, index, fill_value
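+
+# Illustrative usage sketch of make_sparse, assuming a zero fill value and
+# kind='integer' (which yields an IntIndex of the non-fill positions):
+#
+#     >>> vals, sp_index, fill = make_sparse(np.array([0., 1., 0., 2.]),
+#     ...                                    kind='integer', fill_value=0.)
+#     >>> vals, sp_index.indices, fill
+#     (array([1., 2.]), array([1, 3], dtype=int32), 0.0)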
+
+
+def _make_index(length, indices, kind):
+
+ if kind == 'block' or isinstance(kind, BlockIndex):
+ locs, lens = splib.get_blocks(indices)
+ index = BlockIndex(length, locs, lens)
+ elif kind == 'integer' or isinstance(kind, IntIndex):
+ index = IntIndex(length, indices)
+ else: # pragma: no cover
+ raise ValueError('must be block or integer type')
+ return index
+
+
+# ----------------------------------------------------------------------------
+# Accessor
+
+@delegate_names(SparseArray, ['npoints', 'density', 'fill_value',
+ 'sp_values'],
+ typ='property')
+class SparseAccessor(PandasDelegate):
+ """
+    Accessor for sparse data, providing conversions to and from other
+    sparse matrix data types (e.g. scipy.sparse.coo_matrix).
+ """
+
+ def __init__(self, data=None):
+ self._validate(data)
+ # Store the Series since we need that for to_coo
+ self._parent = data
+
+ @staticmethod
+ def _validate(data):
+ if not isinstance(data.dtype, SparseDtype):
+ msg = "Can only use the '.sparse' accessor with Sparse data."
+ raise AttributeError(msg)
+
+ def _delegate_property_get(self, name, *args, **kwargs):
+ return getattr(self._parent.values, name)
+
+ def _delegate_method(self, name, *args, **kwargs):
+ if name == 'from_coo':
+ return self.from_coo(*args, **kwargs)
+ elif name == 'to_coo':
+ return self.to_coo(*args, **kwargs)
+ else:
+ raise ValueError
+
+ @classmethod
+ def from_coo(cls, A, dense_index=False):
+ """
+ Create a SparseSeries from a scipy.sparse.coo_matrix.
+
+ Parameters
+ ----------
+ A : scipy.sparse.coo_matrix
+ dense_index : bool, default False
+ If False (default), the SparseSeries index consists of only the
+ coords of the non-null entries of the original coo_matrix.
+ If True, the SparseSeries index consists of the full sorted
+ (row, col) coordinates of the coo_matrix.
+
+ Returns
+ -------
+ s : SparseSeries
+
+ Examples
+        --------
+ >>> from scipy import sparse
+ >>> A = sparse.coo_matrix(([3.0, 1.0, 2.0], ([1, 0, 0], [0, 2, 3])),
+ shape=(3, 4))
+ >>> A
+ <3x4 sparse matrix of type '<class 'numpy.float64'>'
+ with 3 stored elements in COOrdinate format>
+ >>> A.todense()
+ matrix([[ 0., 0., 1., 2.],
+ [ 3., 0., 0., 0.],
+ [ 0., 0., 0., 0.]])
+ >>> ss = pd.SparseSeries.from_coo(A)
+ >>> ss
+ 0 2 1
+ 3 2
+ 1 0 3
+ dtype: float64
+ BlockIndex
+ Block locations: array([0], dtype=int32)
+ Block lengths: array([3], dtype=int32)
+ """
+ from pandas.core.sparse.scipy_sparse import _coo_to_sparse_series
+ from pandas import Series
+
+ result = _coo_to_sparse_series(A, dense_index=dense_index)
+ # SparseSeries -> Series[sparse]
+ result = Series(result.values, index=result.index, copy=False)
+
+ return result
+
+ def to_coo(self, row_levels=(0, ), column_levels=(1, ), sort_labels=False):
+ """
+ Create a scipy.sparse.coo_matrix from a SparseSeries with MultiIndex.
+
+ Use row_levels and column_levels to determine the row and column
+ coordinates respectively. row_levels and column_levels are the names
+ (labels) or numbers of the levels. {row_levels, column_levels} must be
+ a partition of the MultiIndex level names (or numbers).
+
+ Parameters
+ ----------
+ row_levels : tuple/list
+ column_levels : tuple/list
+ sort_labels : bool, default False
+ Sort the row and column labels before forming the sparse matrix.
+
+ Returns
+ -------
+ y : scipy.sparse.coo_matrix
+ rows : list (row labels)
+ columns : list (column labels)
+
+ Examples
+ --------
+ >>> s = pd.Series([3.0, np.nan, 1.0, 3.0, np.nan, np.nan])
+ >>> s.index = pd.MultiIndex.from_tuples([(1, 2, 'a', 0),
+ (1, 2, 'a', 1),
+ (1, 1, 'b', 0),
+ (1, 1, 'b', 1),
+ (2, 1, 'b', 0),
+ (2, 1, 'b', 1)],
+ names=['A', 'B', 'C', 'D'])
+ >>> ss = s.to_sparse()
+ >>> A, rows, columns = ss.to_coo(row_levels=['A', 'B'],
+ column_levels=['C', 'D'],
+ sort_labels=True)
+ >>> A
+ <3x4 sparse matrix of type '<class 'numpy.float64'>'
+ with 3 stored elements in COOrdinate format>
+ >>> A.todense()
+ matrix([[ 0., 0., 1., 3.],
+ [ 3., 0., 0., 0.],
+ [ 0., 0., 0., 0.]])
+ >>> rows
+ [(1, 1), (1, 2), (2, 1)]
+ >>> columns
+ [('a', 0), ('a', 1), ('b', 0), ('b', 1)]
+ """
+ from pandas.core.sparse.scipy_sparse import _sparse_series_to_coo
+
+ A, rows, columns = _sparse_series_to_coo(self._parent,
+ row_levels,
+ column_levels,
+ sort_labels=sort_labels)
+ return A, rows, columns
diff --git a/contrib/python/pandas/py2/pandas/core/arrays/timedeltas.py b/contrib/python/pandas/py2/pandas/core/arrays/timedeltas.py
new file mode 100644
index 00000000000..7683cb5abbb
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/core/arrays/timedeltas.py
@@ -0,0 +1,1069 @@
+# -*- coding: utf-8 -*-
+from __future__ import division
+
+from datetime import timedelta
+import textwrap
+import warnings
+
+import numpy as np
+
+from pandas._libs import lib, tslibs
+from pandas._libs.tslibs import NaT, Timedelta, Timestamp, iNaT
+from pandas._libs.tslibs.fields import get_timedelta_field
+from pandas._libs.tslibs.timedeltas import (
+ array_to_timedelta64, parse_timedelta_unit, precision_from_unit)
+import pandas.compat as compat
+from pandas.util._decorators import Appender
+
+from pandas.core.dtypes.common import (
+ _NS_DTYPE, _TD_DTYPE, ensure_int64, is_datetime64_dtype, is_dtype_equal,
+ is_float_dtype, is_integer_dtype, is_list_like, is_object_dtype, is_scalar,
+ is_string_dtype, is_timedelta64_dtype, is_timedelta64_ns_dtype,
+ pandas_dtype)
+from pandas.core.dtypes.dtypes import DatetimeTZDtype
+from pandas.core.dtypes.generic import (
+ ABCDataFrame, ABCIndexClass, ABCSeries, ABCTimedeltaIndex)
+from pandas.core.dtypes.missing import isna
+
+from pandas.core import ops
+from pandas.core.algorithms import checked_add_with_arr
+import pandas.core.common as com
+
+from pandas.tseries.frequencies import to_offset
+from pandas.tseries.offsets import Tick
+
+from . import datetimelike as dtl
+
+_BAD_DTYPE = "dtype {dtype} cannot be converted to timedelta64[ns]"
+
+
+def _is_convertible_to_td(key):
+ return isinstance(key, (Tick, timedelta,
+ np.timedelta64, compat.string_types))
+
+
+def _field_accessor(name, alias, docstring=None):
+ def f(self):
+ values = self.asi8
+ result = get_timedelta_field(values, alias)
+ if self._hasnans:
+ result = self._maybe_mask_results(result, fill_value=None,
+ convert='float64')
+
+ return result
+
+ f.__name__ = name
+ f.__doc__ = "\n{}\n".format(docstring)
+ return property(f)
+
+
+def _td_array_cmp(cls, op):
+ """
+ Wrap comparison operations to convert timedelta-like to timedelta64
+ """
+ opname = '__{name}__'.format(name=op.__name__)
+ nat_result = True if opname == '__ne__' else False
+
+ def wrapper(self, other):
+ if isinstance(other, (ABCDataFrame, ABCSeries, ABCIndexClass)):
+ return NotImplemented
+
+ if _is_convertible_to_td(other) or other is NaT:
+ try:
+ other = Timedelta(other)
+ except ValueError:
+ # failed to parse as timedelta
+ return ops.invalid_comparison(self, other, op)
+
+ result = op(self.view('i8'), other.value)
+ if isna(other):
+ result.fill(nat_result)
+
+ elif not is_list_like(other):
+ return ops.invalid_comparison(self, other, op)
+
+ elif len(other) != len(self):
+ raise ValueError("Lengths must match")
+
+ else:
+ try:
+ other = type(self)._from_sequence(other)._data
+ except (ValueError, TypeError):
+ return ops.invalid_comparison(self, other, op)
+
+ result = op(self.view('i8'), other.view('i8'))
+ result = com.values_from_object(result)
+
+ o_mask = np.array(isna(other))
+ if o_mask.any():
+ result[o_mask] = nat_result
+
+ if self._hasnans:
+ result[self._isnan] = nat_result
+
+ return result
+
+ return compat.set_function_name(wrapper, opname, cls)
+
+
+class TimedeltaArray(dtl.DatetimeLikeArrayMixin, dtl.TimelikeOps):
+ """
+ Pandas ExtensionArray for timedelta data.
+
+ .. versionadded:: 0.24.0
+
+ .. warning::
+
+ TimedeltaArray is currently experimental, and its API may change
+ without warning. In particular, :attr:`TimedeltaArray.dtype` is
+ expected to change to be an instance of an ``ExtensionDtype``
+ subclass.
+
+ Parameters
+ ----------
+ values : array-like
+ The timedelta data.
+
+ dtype : numpy.dtype
+ Currently, only ``numpy.dtype("timedelta64[ns]")`` is accepted.
+ freq : Offset, optional
+ copy : bool, default False
+ Whether to copy the underlying array of data.
+ """
+ _typ = "timedeltaarray"
+ _scalar_type = Timedelta
+ __array_priority__ = 1000
+ # define my properties & methods for delegation
+ _other_ops = []
+ _bool_ops = []
+ _object_ops = ['freq']
+ _field_ops = ['days', 'seconds', 'microseconds', 'nanoseconds']
+ _datetimelike_ops = _field_ops + _object_ops + _bool_ops
+ _datetimelike_methods = ["to_pytimedelta", "total_seconds",
+ "round", "floor", "ceil"]
+
+    # Needed so that NaT.__richcmp__(TimedeltaArray) operates pointwise
+ ndim = 1
+
+ @property
+ def _box_func(self):
+ return lambda x: Timedelta(x, unit='ns')
+
+ @property
+ def dtype(self):
+ """
+ The dtype for the TimedeltaArray.
+
+ .. warning::
+
+ A future version of pandas will change dtype to be an instance
+ of a :class:`pandas.api.extensions.ExtensionDtype` subclass,
+ not a ``numpy.dtype``.
+
+ Returns
+ -------
+ numpy.dtype
+ """
+ return _TD_DTYPE
+
+ # ----------------------------------------------------------------
+ # Constructors
+ _attributes = ["freq"]
+
+ def __init__(self, values, dtype=_TD_DTYPE, freq=None, copy=False):
+ if isinstance(values, (ABCSeries, ABCIndexClass)):
+ values = values._values
+
+ inferred_freq = getattr(values, "_freq", None)
+
+ if isinstance(values, type(self)):
+ if freq is None:
+ freq = values.freq
+ elif freq and values.freq:
+ freq = to_offset(freq)
+ freq, _ = dtl.validate_inferred_freq(freq, values.freq, False)
+ values = values._data
+
+ if not isinstance(values, np.ndarray):
+ msg = (
+ "Unexpected type '{}'. 'values' must be a TimedeltaArray "
+ "ndarray, or Series or Index containing one of those."
+ )
+ raise ValueError(msg.format(type(values).__name__))
+ if values.ndim != 1:
+ raise ValueError("Only 1-dimensional input arrays are supported.")
+
+ if values.dtype == 'i8':
+ # for compat with datetime/timedelta/period shared methods,
+ # we can sometimes get here with int64 values. These represent
+ # nanosecond UTC (or tz-naive) unix timestamps
+ values = values.view(_TD_DTYPE)
+
+ _validate_td64_dtype(values.dtype)
+ dtype = _validate_td64_dtype(dtype)
+
+ if freq == "infer":
+ msg = (
+ "Frequency inference not allowed in TimedeltaArray.__init__. "
+ "Use 'pd.array()' instead."
+ )
+ raise ValueError(msg)
+
+ if copy:
+ values = values.copy()
+ if freq:
+ freq = to_offset(freq)
+
+ self._data = values
+ self._dtype = dtype
+ self._freq = freq
+
+ if inferred_freq is None and freq is not None:
+ type(self)._validate_frequency(self, freq)
+
+ @classmethod
+ def _simple_new(cls, values, freq=None, dtype=_TD_DTYPE):
+ assert dtype == _TD_DTYPE, dtype
+ assert isinstance(values, np.ndarray), type(values)
+
+ result = object.__new__(cls)
+ result._data = values.view(_TD_DTYPE)
+ result._freq = to_offset(freq)
+ result._dtype = _TD_DTYPE
+ return result
+
+ @classmethod
+ def _from_sequence(cls, data, dtype=_TD_DTYPE, copy=False,
+ freq=None, unit=None):
+ if dtype:
+ _validate_td64_dtype(dtype)
+ freq, freq_infer = dtl.maybe_infer_freq(freq)
+
+ data, inferred_freq = sequence_to_td64ns(data, copy=copy, unit=unit)
+ freq, freq_infer = dtl.validate_inferred_freq(freq, inferred_freq,
+ freq_infer)
+
+ result = cls._simple_new(data, freq=freq)
+
+ if inferred_freq is None and freq is not None:
+ # this condition precludes `freq_infer`
+ cls._validate_frequency(result, freq)
+
+ elif freq_infer:
+ # Set _freq directly to bypass duplicative _validate_frequency
+ # check.
+ result._freq = to_offset(result.inferred_freq)
+
+ return result
+
+ @classmethod
+ def _generate_range(cls, start, end, periods, freq, closed=None):
+
+ periods = dtl.validate_periods(periods)
+ if freq is None and any(x is None for x in [periods, start, end]):
+ raise ValueError('Must provide freq argument if no data is '
+ 'supplied')
+
+ if com.count_not_none(start, end, periods, freq) != 3:
+ raise ValueError('Of the four parameters: start, end, periods, '
+ 'and freq, exactly three must be specified')
+
+ if start is not None:
+ start = Timedelta(start)
+
+ if end is not None:
+ end = Timedelta(end)
+
+ if start is None and end is None:
+ if closed is not None:
+ raise ValueError("Closed has to be None if not both of start"
+ "and end are defined")
+
+ left_closed, right_closed = dtl.validate_endpoints(closed)
+
+ if freq is not None:
+ index = _generate_regular_range(start, end, periods, freq)
+ else:
+ index = np.linspace(start.value, end.value, periods).astype('i8')
+
+ if not left_closed:
+ index = index[1:]
+ if not right_closed:
+ index = index[:-1]
+
+ return cls._simple_new(index, freq=freq)
+
+ # ----------------------------------------------------------------
+ # DatetimeLike Interface
+
+ def _unbox_scalar(self, value):
+ if not isinstance(value, self._scalar_type) and value is not NaT:
+ raise ValueError("'value' should be a Timedelta.")
+ self._check_compatible_with(value)
+ return value.value
+
+ def _scalar_from_string(self, value):
+ return Timedelta(value)
+
+ def _check_compatible_with(self, other):
+ # we don't have anything to validate.
+ pass
+
+ def _maybe_clear_freq(self):
+ self._freq = None
+
+ # ----------------------------------------------------------------
+ # Array-Like / EA-Interface Methods
+
+ @Appender(dtl.DatetimeLikeArrayMixin._validate_fill_value.__doc__)
+ def _validate_fill_value(self, fill_value):
+ if isna(fill_value):
+ fill_value = iNaT
+ elif isinstance(fill_value, (timedelta, np.timedelta64, Tick)):
+ fill_value = Timedelta(fill_value).value
+ else:
+ raise ValueError("'fill_value' should be a Timedelta. "
+ "Got '{got}'.".format(got=fill_value))
+ return fill_value
+
+ def astype(self, dtype, copy=True):
+ # We handle
+ # --> timedelta64[ns]
+ # --> timedelta64
+ # DatetimeLikeArrayMixin super call handles other cases
+ dtype = pandas_dtype(dtype)
+
+ if is_timedelta64_dtype(dtype) and not is_timedelta64_ns_dtype(dtype):
+ # by pandas convention, converting to non-nano timedelta64
+ # returns an int64-dtyped array with ints representing multiples
+ # of the desired timedelta unit. This is essentially division
+ if self._hasnans:
+ # avoid double-copying
+ result = self._data.astype(dtype, copy=False)
+ values = self._maybe_mask_results(result,
+ fill_value=None,
+ convert='float64')
+ return values
+ result = self._data.astype(dtype, copy=copy)
+ return result.astype('i8')
+ elif is_timedelta64_ns_dtype(dtype):
+ if copy:
+ return self.copy()
+ return self
+ return dtl.DatetimeLikeArrayMixin.astype(self, dtype, copy=copy)
+
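+    # Illustrative sketch of the non-nano branch above, assuming NaT-free
+    # data: converting to a coarser timedelta64 unit returns int64 multiples
+    # of that unit.
+    #
+    #     >>> TimedeltaArray._from_sequence(['1 days']).astype('timedelta64[h]')
+    #     array([24])
+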
+ # ----------------------------------------------------------------
+ # Rendering Methods
+
+ def _formatter(self, boxed=False):
+ from pandas.io.formats.format import _get_format_timedelta64
+ return _get_format_timedelta64(self, box=True)
+
+ def _format_native_types(self, na_rep='NaT', date_format=None):
+ from pandas.io.formats.format import _get_format_timedelta64
+
+ formatter = _get_format_timedelta64(self._data, na_rep)
+ return np.array([formatter(x) for x in self._data])
+
+ # ----------------------------------------------------------------
+ # Arithmetic Methods
+
+ _create_comparison_method = classmethod(_td_array_cmp)
+
+ def _add_offset(self, other):
+ assert not isinstance(other, Tick)
+ raise TypeError("cannot add the type {typ} to a {cls}"
+ .format(typ=type(other).__name__,
+ cls=type(self).__name__))
+
+ def _add_delta(self, delta):
+ """
+ Add a timedelta-like, Tick, or TimedeltaIndex-like object
+ to self, yielding a new TimedeltaArray.
+
+ Parameters
+ ----------
+        delta : {timedelta, np.timedelta64, Tick,
+ TimedeltaIndex, ndarray[timedelta64]}
+
+ Returns
+ -------
+ result : TimedeltaArray
+ """
+ new_values = super(TimedeltaArray, self)._add_delta(delta)
+ return type(self)._from_sequence(new_values, freq='infer')
+
+ def _add_datetime_arraylike(self, other):
+ """
+ Add DatetimeArray/Index or ndarray[datetime64] to TimedeltaArray.
+ """
+ if isinstance(other, np.ndarray):
+ # At this point we have already checked that dtype is datetime64
+ from pandas.core.arrays import DatetimeArray
+ other = DatetimeArray(other)
+
+ # defer to implementation in DatetimeArray
+ return other + self
+
+ def _add_datetimelike_scalar(self, other):
+ # adding a timedeltaindex to a datetimelike
+ from pandas.core.arrays import DatetimeArray
+
+ assert other is not NaT
+ other = Timestamp(other)
+ if other is NaT:
+ # In this case we specifically interpret NaT as a datetime, not
+ # the timedelta interpretation we would get by returning self + NaT
+ result = self.asi8.view('m8[ms]') + NaT.to_datetime64()
+ return DatetimeArray(result)
+
+ i8 = self.asi8
+ result = checked_add_with_arr(i8, other.value,
+ arr_mask=self._isnan)
+ result = self._maybe_mask_results(result)
+ dtype = DatetimeTZDtype(tz=other.tz) if other.tz else _NS_DTYPE
+ return DatetimeArray(result, dtype=dtype, freq=self.freq)
+
+ def _addsub_offset_array(self, other, op):
+ # Add or subtract Array-like of DateOffset objects
+ try:
+ # TimedeltaIndex can only operate with a subset of DateOffset
+ # subclasses. Incompatible classes will raise AttributeError,
+ # which we re-raise as TypeError
+ return super(TimedeltaArray, self)._addsub_offset_array(
+ other, op
+ )
+ except AttributeError:
+ raise TypeError("Cannot add/subtract non-tick DateOffset to {cls}"
+ .format(cls=type(self).__name__))
+
+ def __mul__(self, other):
+ other = lib.item_from_zerodim(other)
+
+ if isinstance(other, (ABCDataFrame, ABCSeries, ABCIndexClass)):
+ return NotImplemented
+
+ if is_scalar(other):
+ # numpy will accept float and int, raise TypeError for others
+ result = self._data * other
+ freq = None
+ if self.freq is not None and not isna(other):
+ freq = self.freq * other
+ return type(self)(result, freq=freq)
+
+ if not hasattr(other, "dtype"):
+ # list, tuple
+ other = np.array(other)
+ if len(other) != len(self) and not is_timedelta64_dtype(other):
+ # Exclude timedelta64 here so we correctly raise TypeError
+ # for that instead of ValueError
+ raise ValueError("Cannot multiply with unequal lengths")
+
+ if is_object_dtype(other):
+ # this multiplication will succeed only if all elements of other
+ # are int or float scalars, so we will end up with
+ # timedelta64[ns]-dtyped result
+ result = [self[n] * other[n] for n in range(len(self))]
+ result = np.array(result)
+ return type(self)(result)
+
+ # numpy will accept float or int dtype, raise TypeError for others
+ result = self._data * other
+ return type(self)(result)
+
+ __rmul__ = __mul__
+
+ def __truediv__(self, other):
+ # timedelta / X is well-defined for timedelta-like or numeric X
+ other = lib.item_from_zerodim(other)
+
+ if isinstance(other, (ABCSeries, ABCDataFrame, ABCIndexClass)):
+ return NotImplemented
+
+ if isinstance(other, (timedelta, np.timedelta64, Tick)):
+ other = Timedelta(other)
+ if other is NaT:
+ # specifically timedelta64-NaT
+ result = np.empty(self.shape, dtype=np.float64)
+ result.fill(np.nan)
+ return result
+
+ # otherwise, dispatch to Timedelta implementation
+ return self._data / other
+
+ elif lib.is_scalar(other):
+ # assume it is numeric
+ result = self._data / other
+ freq = None
+ if self.freq is not None:
+ # Tick division is not implemented, so operate on Timedelta
+ freq = self.freq.delta / other
+ return type(self)(result, freq=freq)
+
+ if not hasattr(other, "dtype"):
+ # e.g. list, tuple
+ other = np.array(other)
+
+ if len(other) != len(self):
+ raise ValueError("Cannot divide vectors with unequal lengths")
+
+ elif is_timedelta64_dtype(other):
+ # let numpy handle it
+ return self._data / other
+
+ elif is_object_dtype(other):
+ # Note: we do not do type inference on the result, so either
+ # an object array or numeric-dtyped (if numpy does inference)
+ # will be returned. GH#23829
+ result = [self[n] / other[n] for n in range(len(self))]
+ result = np.array(result)
+ return result
+
+ else:
+ result = self._data / other
+ return type(self)(result)
+
+ def __rtruediv__(self, other):
+ # X / timedelta is defined only for timedelta-like X
+ other = lib.item_from_zerodim(other)
+
+ if isinstance(other, (ABCSeries, ABCDataFrame, ABCIndexClass)):
+ return NotImplemented
+
+ if isinstance(other, (timedelta, np.timedelta64, Tick)):
+ other = Timedelta(other)
+ if other is NaT:
+ # specifically timedelta64-NaT
+ result = np.empty(self.shape, dtype=np.float64)
+ result.fill(np.nan)
+ return result
+
+ # otherwise, dispatch to Timedelta implementation
+ return other / self._data
+
+ elif lib.is_scalar(other):
+ raise TypeError("Cannot divide {typ} by {cls}"
+ .format(typ=type(other).__name__,
+ cls=type(self).__name__))
+
+ if not hasattr(other, "dtype"):
+ # e.g. list, tuple
+ other = np.array(other)
+
+ if len(other) != len(self):
+ raise ValueError("Cannot divide vectors with unequal lengths")
+
+ elif is_timedelta64_dtype(other):
+ # let numpy handle it
+ return other / self._data
+
+ elif is_object_dtype(other):
+            # Note: unlike in __truediv__, we do not _need_ to do type
+ # inference on the result. It does not raise, a numeric array
+ # is returned. GH#23829
+ result = [other[n] / self[n] for n in range(len(self))]
+ return np.array(result)
+
+ else:
+ raise TypeError("Cannot divide {dtype} data by {cls}"
+ .format(dtype=other.dtype,
+ cls=type(self).__name__))
+
+ if compat.PY2:
+ __div__ = __truediv__
+ __rdiv__ = __rtruediv__
+
+ def __floordiv__(self, other):
+ if isinstance(other, (ABCSeries, ABCDataFrame, ABCIndexClass)):
+ return NotImplemented
+
+ other = lib.item_from_zerodim(other)
+ if is_scalar(other):
+ if isinstance(other, (timedelta, np.timedelta64, Tick)):
+ other = Timedelta(other)
+ if other is NaT:
+ # treat this specifically as timedelta-NaT
+ result = np.empty(self.shape, dtype=np.float64)
+ result.fill(np.nan)
+ return result
+
+ # dispatch to Timedelta implementation
+ result = other.__rfloordiv__(self._data)
+ return result
+
+ # at this point we should only have numeric scalars; anything
+ # else will raise
+ result = self.asi8 // other
+ result[self._isnan] = iNaT
+ freq = None
+ if self.freq is not None:
+ # Note: freq gets division, not floor-division
+ freq = self.freq / other
+ return type(self)(result.view('m8[ns]'), freq=freq)
+
+ if not hasattr(other, "dtype"):
+ # list, tuple
+ other = np.array(other)
+ if len(other) != len(self):
+ raise ValueError("Cannot divide with unequal lengths")
+
+ elif is_timedelta64_dtype(other):
+ other = type(self)(other)
+
+ # numpy timedelta64 does not natively support floordiv, so operate
+ # on the i8 values
+ result = self.asi8 // other.asi8
+ mask = self._isnan | other._isnan
+ if mask.any():
+ result = result.astype(np.int64)
+ result[mask] = np.nan
+ return result
+
+ elif is_object_dtype(other):
+ result = [self[n] // other[n] for n in range(len(self))]
+ result = np.array(result)
+ if lib.infer_dtype(result, skipna=False) == 'timedelta':
+ result, _ = sequence_to_td64ns(result)
+ return type(self)(result)
+ return result
+
+ elif is_integer_dtype(other) or is_float_dtype(other):
+ result = self._data // other
+ return type(self)(result)
+
+ else:
+ dtype = getattr(other, "dtype", type(other).__name__)
+ raise TypeError("Cannot divide {typ} by {cls}"
+ .format(typ=dtype, cls=type(self).__name__))
+
+ def __rfloordiv__(self, other):
+ if isinstance(other, (ABCSeries, ABCDataFrame, ABCIndexClass)):
+ return NotImplemented
+
+ other = lib.item_from_zerodim(other)
+ if is_scalar(other):
+ if isinstance(other, (timedelta, np.timedelta64, Tick)):
+ other = Timedelta(other)
+ if other is NaT:
+ # treat this specifically as timedelta-NaT
+ result = np.empty(self.shape, dtype=np.float64)
+ result.fill(np.nan)
+ return result
+
+ # dispatch to Timedelta implementation
+ result = other.__floordiv__(self._data)
+ return result
+
+ raise TypeError("Cannot divide {typ} by {cls}"
+ .format(typ=type(other).__name__,
+ cls=type(self).__name__))
+
+ if not hasattr(other, "dtype"):
+ # list, tuple
+ other = np.array(other)
+ if len(other) != len(self):
+ raise ValueError("Cannot divide with unequal lengths")
+
+ elif is_timedelta64_dtype(other):
+ other = type(self)(other)
+
+ # numpy timedelta64 does not natively support floordiv, so operate
+ # on the i8 values
+ result = other.asi8 // self.asi8
+ mask = self._isnan | other._isnan
+ if mask.any():
+ result = result.astype(np.int64)
+ result[mask] = np.nan
+ return result
+
+ elif is_object_dtype(other):
+ result = [other[n] // self[n] for n in range(len(self))]
+ result = np.array(result)
+ return result
+
+ else:
+ dtype = getattr(other, "dtype", type(other).__name__)
+ raise TypeError("Cannot divide {typ} by {cls}"
+ .format(typ=dtype, cls=type(self).__name__))
+
+ def __mod__(self, other):
+ # Note: This is a naive implementation, can likely be optimized
+ if isinstance(other, (ABCSeries, ABCDataFrame, ABCIndexClass)):
+ return NotImplemented
+
+ other = lib.item_from_zerodim(other)
+ if isinstance(other, (timedelta, np.timedelta64, Tick)):
+ other = Timedelta(other)
+ return self - (self // other) * other
+
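+    # Illustrative sketch of the naive identity used above,
+    # a % b == a - (a // b) * b, evaluated element-wise:
+    #
+    #     >>> tda = TimedeltaArray._from_sequence(['5 hours'])
+    #     >>> (tda % Timedelta('2 hours'))[0]
+    #     Timedelta('0 days 01:00:00')
+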
+ def __rmod__(self, other):
+ # Note: This is a naive implementation, can likely be optimized
+ if isinstance(other, (ABCSeries, ABCDataFrame, ABCIndexClass)):
+ return NotImplemented
+
+ other = lib.item_from_zerodim(other)
+ if isinstance(other, (timedelta, np.timedelta64, Tick)):
+ other = Timedelta(other)
+ return other - (other // self) * self
+
+ def __divmod__(self, other):
+ # Note: This is a naive implementation, can likely be optimized
+ if isinstance(other, (ABCSeries, ABCDataFrame, ABCIndexClass)):
+ return NotImplemented
+
+ other = lib.item_from_zerodim(other)
+ if isinstance(other, (timedelta, np.timedelta64, Tick)):
+ other = Timedelta(other)
+
+ res1 = self // other
+ res2 = self - res1 * other
+ return res1, res2
+
+ def __rdivmod__(self, other):
+ # Note: This is a naive implementation, can likely be optimized
+ if isinstance(other, (ABCSeries, ABCDataFrame, ABCIndexClass)):
+ return NotImplemented
+
+ other = lib.item_from_zerodim(other)
+ if isinstance(other, (timedelta, np.timedelta64, Tick)):
+ other = Timedelta(other)
+
+ res1 = other // self
+ res2 = other - res1 * self
+ return res1, res2
+
+ # Note: TimedeltaIndex overrides this in call to cls._add_numeric_methods
+ def __neg__(self):
+ if self.freq is not None:
+ return type(self)(-self._data, freq=-self.freq)
+ return type(self)(-self._data)
+
+ def __abs__(self):
+ # Note: freq is not preserved
+ return type(self)(np.abs(self._data))
+
+ # ----------------------------------------------------------------
+ # Conversion Methods - Vectorized analogues of Timedelta methods
+
+ def total_seconds(self):
+ """
+ Return total duration of each element expressed in seconds.
+
+ This method is available directly on TimedeltaArray, TimedeltaIndex
+ and on Series containing timedelta values under the ``.dt`` namespace.
+
+ Returns
+ -------
+ seconds : [ndarray, Float64Index, Series]
+ When the calling object is a TimedeltaArray, the return type
+ is ndarray. When the calling object is a TimedeltaIndex,
+ the return type is a Float64Index. When the calling object
+ is a Series, the return type is Series of type `float64` whose
+ index is the same as the original.
+
+ See Also
+ --------
+ datetime.timedelta.total_seconds : Standard library version
+ of this method.
+ TimedeltaIndex.components : Return a DataFrame with components of
+ each Timedelta.
+
+ Examples
+ --------
+ **Series**
+
+ >>> s = pd.Series(pd.to_timedelta(np.arange(5), unit='d'))
+ >>> s
+ 0 0 days
+ 1 1 days
+ 2 2 days
+ 3 3 days
+ 4 4 days
+ dtype: timedelta64[ns]
+
+ >>> s.dt.total_seconds()
+ 0 0.0
+ 1 86400.0
+ 2 172800.0
+ 3 259200.0
+ 4 345600.0
+ dtype: float64
+
+ **TimedeltaIndex**
+
+ >>> idx = pd.to_timedelta(np.arange(5), unit='d')
+ >>> idx
+ TimedeltaIndex(['0 days', '1 days', '2 days', '3 days', '4 days'],
+ dtype='timedelta64[ns]', freq=None)
+
+ >>> idx.total_seconds()
+ Float64Index([0.0, 86400.0, 172800.0, 259200.00000000003, 345600.0],
+ dtype='float64')
+ """
+ return self._maybe_mask_results(1e-9 * self.asi8, fill_value=None)
+
+ def to_pytimedelta(self):
+ """
+ Return Timedelta Array/Index as object ndarray of datetime.timedelta
+ objects.
+
+ Returns
+ -------
+        timedeltas : ndarray
+ """
+ return tslibs.ints_to_pytimedelta(self.asi8)
+
+ days = _field_accessor("days", "days",
+ "Number of days for each element.")
+ seconds = _field_accessor("seconds", "seconds",
+ "Number of seconds (>= 0 and less than 1 day) "
+ "for each element.")
+ microseconds = _field_accessor("microseconds", "microseconds",
+ "Number of microseconds (>= 0 and less "
+ "than 1 second) for each element.")
+ nanoseconds = _field_accessor("nanoseconds", "nanoseconds",
+ "Number of nanoseconds (>= 0 and less "
+ "than 1 microsecond) for each element.")
+
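+    # Illustrative sketch of the field accessors defined above, assuming
+    # NaT-free data (each returns the component of every element as ints):
+    #
+    #     >>> tda = TimedeltaArray._from_sequence(['1 days 02:00:03'])
+    #     >>> tda.days, tda.seconds
+    #     (array([1]), array([7203]))
+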
+ @property
+ def components(self):
+ """
+ Return a dataframe of the components (days, hours, minutes,
+ seconds, milliseconds, microseconds, nanoseconds) of the Timedeltas.
+
+ Returns
+ -------
+ a DataFrame
+ """
+ from pandas import DataFrame
+
+ columns = ['days', 'hours', 'minutes', 'seconds',
+ 'milliseconds', 'microseconds', 'nanoseconds']
+ hasnans = self._hasnans
+ if hasnans:
+ def f(x):
+ if isna(x):
+ return [np.nan] * len(columns)
+ return x.components
+ else:
+ def f(x):
+ return x.components
+
+ result = DataFrame([f(x) for x in self], columns=columns)
+ if not hasnans:
+ result = result.astype('int64')
+ return result
+
+
+TimedeltaArray._add_comparison_ops()
+
+
+# ---------------------------------------------------------------------
+# Constructor Helpers
+
+def sequence_to_td64ns(data, copy=False, unit="ns", errors="raise"):
+ """
+ Parameters
+ ----------
+ array : list-like
+ copy : bool, default False
+ unit : str, default "ns"
+ The timedelta unit to treat integers as multiples of.
+ errors : {"raise", "coerce", "ignore"}, default "raise"
+ How to handle elements that cannot be converted to timedelta64[ns].
+ See ``pandas.to_timedelta`` for details.
+
+ Returns
+ -------
+ converted : numpy.ndarray
+ The sequence converted to a numpy array with dtype ``timedelta64[ns]``.
+ inferred_freq : Tick or None
+ The inferred frequency of the sequence.
+
+ Raises
+ ------
+ ValueError : Data cannot be converted to timedelta64[ns].
+
+ Notes
+ -----
+    Unlike `pandas.to_timedelta`, setting ``errors=ignore`` here will not
+    cause errors to be ignored; they are caught and subsequently ignored at
+    a higher level.
+ """
+ inferred_freq = None
+ unit = parse_timedelta_unit(unit)
+
+ # Unwrap whatever we have into a np.ndarray
+ if not hasattr(data, 'dtype'):
+ # e.g. list, tuple
+ if np.ndim(data) == 0:
+ # i.e. generator
+ data = list(data)
+ data = np.array(data, copy=False)
+ elif isinstance(data, ABCSeries):
+ data = data._values
+ elif isinstance(data, (ABCTimedeltaIndex, TimedeltaArray)):
+ inferred_freq = data.freq
+ data = data._data
+
+ # Convert whatever we have into timedelta64[ns] dtype
+ if is_object_dtype(data.dtype) or is_string_dtype(data.dtype):
+ # no need to make a copy, need to convert if string-dtyped
+ data = objects_to_td64ns(data, unit=unit, errors=errors)
+ copy = False
+
+ elif is_integer_dtype(data.dtype):
+ # treat as multiples of the given unit
+ data, copy_made = ints_to_td64ns(data, unit=unit)
+ copy = copy and not copy_made
+
+ elif is_float_dtype(data.dtype):
+        # cast the unit, multiply base/frac separately
+ # to avoid precision issues from float -> int
+ mask = np.isnan(data)
+ m, p = precision_from_unit(unit)
+ base = data.astype(np.int64)
+ frac = data - base
+ if p:
+ frac = np.round(frac, p)
+ data = (base * m + (frac * m).astype(np.int64)).view('timedelta64[ns]')
+ data[mask] = iNaT
+ copy = False
+
+ elif is_timedelta64_dtype(data.dtype):
+ if data.dtype != _TD_DTYPE:
+ # non-nano unit
+ # TODO: watch out for overflows
+ data = data.astype(_TD_DTYPE)
+ copy = False
+
+ elif is_datetime64_dtype(data):
+ # GH#23539
+ warnings.warn("Passing datetime64-dtype data to TimedeltaIndex is "
+ "deprecated, will raise a TypeError in a future "
+ "version",
+ FutureWarning, stacklevel=4)
+ data = ensure_int64(data).view(_TD_DTYPE)
+
+ else:
+ raise TypeError("dtype {dtype} cannot be converted to timedelta64[ns]"
+ .format(dtype=data.dtype))
+
+ data = np.array(data, copy=copy)
+ if data.ndim != 1:
+ raise ValueError("Only 1-dimensional input arrays are supported.")
+
+ assert data.dtype == 'm8[ns]', data
+ return data, inferred_freq
+
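+# Illustrative usage sketch: strings are parsed element-wise, while integers
+# are treated as multiples of ``unit``; the inferred frequency (second return
+# value) is None unless the input carried one.
+#
+#     >>> sequence_to_td64ns(['1 days', '2 days'])[0]
+#     array([ 86400000000000, 172800000000000], dtype='timedelta64[ns]')
+#     >>> sequence_to_td64ns([1, 2], unit='s')[0]
+#     array([1000000000, 2000000000], dtype='timedelta64[ns]')
+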
+
+def ints_to_td64ns(data, unit="ns"):
+ """
+ Convert an ndarray with integer-dtype to timedelta64[ns] dtype, treating
+ the integers as multiples of the given timedelta unit.
+
+ Parameters
+ ----------
+ data : numpy.ndarray with integer-dtype
+ unit : str, default "ns"
+ The timedelta unit to treat integers as multiples of.
+
+ Returns
+ -------
+ numpy.ndarray : timedelta64[ns] array converted from data
+ bool : whether a copy was made
+ """
+ copy_made = False
+ unit = unit if unit is not None else "ns"
+
+ if data.dtype != np.int64:
+ # converting to int64 makes a copy, so we can avoid
+ # re-copying later
+ data = data.astype(np.int64)
+ copy_made = True
+
+ if unit != "ns":
+ dtype_str = "timedelta64[{unit}]".format(unit=unit)
+ data = data.view(dtype_str)
+
+ # TODO: watch out for overflows when converting from lower-resolution
+ data = data.astype("timedelta64[ns]")
+ # the astype conversion makes a copy, so we can avoid re-copying later
+ copy_made = True
+
+ else:
+ data = data.view("timedelta64[ns]")
+
+ return data, copy_made
+
+
+def objects_to_td64ns(data, unit="ns", errors="raise"):
+ """
+    Convert an object-dtyped or string-dtyped array into a
+    timedelta64[ns]-dtyped array.
+
+ Parameters
+ ----------
+ data : ndarray or Index
+ unit : str, default "ns"
+ The timedelta unit to treat integers as multiples of.
+ errors : {"raise", "coerce", "ignore"}, default "raise"
+ How to handle elements that cannot be converted to timedelta64[ns].
+ See ``pandas.to_timedelta`` for details.
+
+ Returns
+ -------
+ numpy.ndarray : timedelta64[ns] array converted from data
+
+ Raises
+ ------
+ ValueError : Data cannot be converted to timedelta64[ns].
+
+ Notes
+ -----
+    Unlike `pandas.to_timedelta`, setting `errors=ignore` here will not
+    cause errors to be ignored; they are caught and subsequently ignored at
+    a higher level.
+ """
+ # coerce Index to np.ndarray, converting string-dtype if necessary
+ values = np.array(data, dtype=np.object_, copy=False)
+
+ result = array_to_timedelta64(values,
+ unit=unit, errors=errors)
+ return result.view('timedelta64[ns]')
+
+
+def _validate_td64_dtype(dtype):
+ dtype = pandas_dtype(dtype)
+ if is_dtype_equal(dtype, np.dtype("timedelta64")):
+ dtype = _TD_DTYPE
+ msg = textwrap.dedent("""\
+ Passing in 'timedelta' dtype with no precision is deprecated
+ and will raise in a future version. Please pass in
+ 'timedelta64[ns]' instead.""")
+ warnings.warn(msg, FutureWarning, stacklevel=4)
+
+ if not is_dtype_equal(dtype, _TD_DTYPE):
+ raise ValueError(_BAD_DTYPE.format(dtype=dtype))
+
+ return dtype
+
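+# Illustrative sketch: only nanosecond precision is accepted; a bare
+# 'timedelta64' is upgraded with a FutureWarning, anything else raises.
+#
+#     >>> _validate_td64_dtype('timedelta64[ns]')
+#     dtype('<m8[ns]')
+#     >>> _validate_td64_dtype('int64')
+#     ValueError: dtype int64 cannot be converted to timedelta64[ns]
+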
+
+def _generate_regular_range(start, end, periods, offset):
+ stride = offset.nanos
+ if periods is None:
+ b = Timedelta(start).value
+ e = Timedelta(end).value
+ e += stride - e % stride
+ elif start is not None:
+ b = Timedelta(start).value
+ e = b + periods * stride
+ elif end is not None:
+ e = Timedelta(end).value + stride
+ b = e - periods * stride
+ else:
+ raise ValueError("at least 'start' or 'end' should be specified "
+ "if a 'period' is given.")
+
+ data = np.arange(b, e, stride, dtype=np.int64)
+ return data
diff --git a/contrib/python/pandas/py2/pandas/core/base.py b/contrib/python/pandas/py2/pandas/core/base.py
new file mode 100644
index 00000000000..061ec85f820
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/core/base.py
@@ -0,0 +1,1530 @@
+"""
+Base and utility classes for pandas objects.
+"""
+import textwrap
+import warnings
+
+import numpy as np
+
+import pandas._libs.lib as lib
+import pandas.compat as compat
+from pandas.compat import PYPY, OrderedDict, builtins, map, range
+from pandas.compat.numpy import function as nv
+from pandas.errors import AbstractMethodError
+from pandas.util._decorators import Appender, Substitution, cache_readonly
+from pandas.util._validators import validate_bool_kwarg
+
+from pandas.core.dtypes.common import (
+ is_datetime64_ns_dtype, is_datetime64tz_dtype, is_datetimelike,
+ is_extension_array_dtype, is_extension_type, is_list_like, is_object_dtype,
+ is_scalar, is_timedelta64_ns_dtype)
+from pandas.core.dtypes.generic import ABCDataFrame, ABCIndexClass, ABCSeries
+from pandas.core.dtypes.missing import isna
+
+from pandas.core import algorithms, common as com
+from pandas.core.accessor import DirNamesMixin
+import pandas.core.nanops as nanops
+
+_shared_docs = dict()
+_indexops_doc_kwargs = dict(klass='IndexOpsMixin', inplace='',
+ unique='IndexOpsMixin', duplicated='IndexOpsMixin')
+
+
+class StringMixin(object):
+ """implements string methods so long as object defines a `__unicode__`
+ method.
+
+ Handles Python2/3 compatibility transparently.
+ """
+ # side note - this could be made into a metaclass if more than one
+    # object needs it
+
+ # ----------------------------------------------------------------------
+ # Formatting
+
+ def __unicode__(self):
+ raise AbstractMethodError(self)
+
+ def __str__(self):
+ """
+        Return a string representation for a particular object.
+
+ Invoked by str(df) in both py2/py3.
+ Yields Bytestring in Py2, Unicode String in py3.
+ """
+
+ if compat.PY3:
+ return self.__unicode__()
+ return self.__bytes__()
+
+ def __bytes__(self):
+ """
+ Return a string representation for a particular object.
+
+ Invoked by bytes(obj) in py3 only.
+ Yields a bytestring in both py2/py3.
+ """
+ from pandas.core.config import get_option
+
+ encoding = get_option("display.encoding")
+ return self.__unicode__().encode(encoding, 'replace')
+
+ def __repr__(self):
+ """
+ Return a string representation for a particular object.
+
+ Yields Bytestring in Py2, Unicode String in py3.
+ """
+ return str(self)
+
+
+class PandasObject(StringMixin, DirNamesMixin):
+
+ """baseclass for various pandas objects"""
+
+ @property
+ def _constructor(self):
+ """class constructor (for this class it's just `__class__`"""
+ return self.__class__
+
+ def __unicode__(self):
+ """
+ Return a string representation for a particular object.
+
+ Invoked by unicode(obj) in py2 only. Yields a Unicode String in both
+ py2/py3.
+ """
+ # Should be overwritten by base classes
+ return object.__repr__(self)
+
+ def _reset_cache(self, key=None):
+ """
+ Reset cached properties. If ``key`` is passed, only clears that key.
+ """
+ if getattr(self, '_cache', None) is None:
+ return
+ if key is None:
+ self._cache.clear()
+ else:
+ self._cache.pop(key, None)
+
+ def __sizeof__(self):
+ """
+        Generate the total memory usage for an object whose ``memory_usage``
+        returns either a single value or a Series of values.
+ """
+ if hasattr(self, 'memory_usage'):
+ mem = self.memory_usage(deep=True)
+ if not is_scalar(mem):
+ mem = mem.sum()
+ return int(mem)
+
+ # no memory_usage attribute, so fall back to
+ # object's 'sizeof'
+ return super(PandasObject, self).__sizeof__()
+
+
+class NoNewAttributesMixin(object):
+ """Mixin which prevents adding new attributes.
+
+    Prevents setting additional attributes via xxx.attribute = "something"
+    after a call to `self._freeze()`. Mainly used to prevent the user from
+    using wrong attributes on an accessor (`Series.cat/.str/.dt`).
+
+ If you really want to add a new attribute at a later time, you need to use
+ `object.__setattr__(self, key, value)`.
+ """
+
+ def _freeze(self):
+ """Prevents setting additional attributes"""
+ object.__setattr__(self, "__frozen", True)
+
+ # prevent adding any attribute via s.xxx.new_attribute = ...
+ def __setattr__(self, key, value):
+ # _cache is used by a decorator
+ # We need to check both 1.) cls.__dict__ and 2.) getattr(self, key)
+ # because
+ # 1.) getattr is false for attributes that raise errors
+ # 2.) cls.__dict__ doesn't traverse into base classes
+ if (getattr(self, "__frozen", False) and not
+ (key == "_cache" or
+ key in type(self).__dict__ or
+ getattr(self, key, None) is not None)):
+ raise AttributeError("You cannot add any new attribute '{key}'".
+ format(key=key))
+ object.__setattr__(self, key, value)
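+
+    # Illustrative sketch of the freeze mechanism above:
+    #
+    #     >>> class Frozen(NoNewAttributesMixin):
+    #     ...     def __init__(self):
+    #     ...         self.allowed = 1
+    #     ...         self._freeze()
+    #     >>> Frozen().new_attr = 2
+    #     AttributeError: You cannot add any new attribute 'new_attr'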
+
+
+class GroupByError(Exception):
+ pass
+
+
+class DataError(GroupByError):
+ pass
+
+
+class SpecificationError(GroupByError):
+ pass
+
+
+class SelectionMixin(object):
+ """
+    Mixin implementing the selection & aggregation interface on a group-like
+    object; sub-classes need to define: obj, exclusions.
+ """
+ _selection = None
+ _internal_names = ['_cache', '__setstate__']
+ _internal_names_set = set(_internal_names)
+
+ _builtin_table = OrderedDict((
+ (builtins.sum, np.sum),
+ (builtins.max, np.max),
+ (builtins.min, np.min),
+ ))
+
+ _cython_table = OrderedDict((
+ (builtins.sum, 'sum'),
+ (builtins.max, 'max'),
+ (builtins.min, 'min'),
+ (np.all, 'all'),
+ (np.any, 'any'),
+ (np.sum, 'sum'),
+ (np.nansum, 'sum'),
+ (np.mean, 'mean'),
+ (np.nanmean, 'mean'),
+ (np.prod, 'prod'),
+ (np.nanprod, 'prod'),
+ (np.std, 'std'),
+ (np.nanstd, 'std'),
+ (np.var, 'var'),
+ (np.nanvar, 'var'),
+ (np.median, 'median'),
+ (np.nanmedian, 'median'),
+ (np.max, 'max'),
+ (np.nanmax, 'max'),
+ (np.min, 'min'),
+ (np.nanmin, 'min'),
+ (np.cumprod, 'cumprod'),
+ (np.nancumprod, 'cumprod'),
+ (np.cumsum, 'cumsum'),
+ (np.nancumsum, 'cumsum'),
+ ))
+
+ @property
+ def _selection_name(self):
+ """
+ return a name for myself; this would ideally be called
+ the 'name' property, but we cannot conflict with the
+ Series.name property which can be set
+ """
+ if self._selection is None:
+ return None # 'result'
+ else:
+ return self._selection
+
+ @property
+ def _selection_list(self):
+ if not isinstance(self._selection, (list, tuple, ABCSeries,
+ ABCIndexClass, np.ndarray)):
+ return [self._selection]
+ return self._selection
+
+ @cache_readonly
+ def _selected_obj(self):
+
+ if self._selection is None or isinstance(self.obj, ABCSeries):
+ return self.obj
+ else:
+ return self.obj[self._selection]
+
+ @cache_readonly
+ def ndim(self):
+ return self._selected_obj.ndim
+
+ @cache_readonly
+ def _obj_with_exclusions(self):
+ if self._selection is not None and isinstance(self.obj,
+ ABCDataFrame):
+ return self.obj.reindex(columns=self._selection_list)
+
+ if len(self.exclusions) > 0:
+ return self.obj.drop(self.exclusions, axis=1)
+ else:
+ return self.obj
+
+ def __getitem__(self, key):
+ if self._selection is not None:
+ raise IndexError('Column(s) {selection} already selected'
+ .format(selection=self._selection))
+
+ if isinstance(key, (list, tuple, ABCSeries, ABCIndexClass,
+ np.ndarray)):
+ if len(self.obj.columns.intersection(key)) != len(key):
+ bad_keys = list(set(key).difference(self.obj.columns))
+ raise KeyError("Columns not found: {missing}"
+ .format(missing=str(bad_keys)[1:-1]))
+ return self._gotitem(list(key), ndim=2)
+
+ elif not getattr(self, 'as_index', False):
+ if key not in self.obj.columns:
+ raise KeyError("Column not found: {key}".format(key=key))
+ return self._gotitem(key, ndim=2)
+
+ else:
+ if key not in self.obj:
+ raise KeyError("Column not found: {key}".format(key=key))
+ return self._gotitem(key, ndim=1)
+
+ def _gotitem(self, key, ndim, subset=None):
+ """
+    sub-classes must define this method;
+    it should return a sliced object
+
+ Parameters
+ ----------
+ key : string / list of selections
+ ndim : 1,2
+ requested ndim of result
+ subset : object, default None
+ subset to act on
+
+ """
+ raise AbstractMethodError(self)
+
+ def aggregate(self, func, *args, **kwargs):
+ raise AbstractMethodError(self)
+
+ agg = aggregate
+
+ def _try_aggregate_string_function(self, arg, *args, **kwargs):
+ """
+ if arg is a string, then try to operate on it:
+ - try to find a function (or attribute) on ourselves
+ - try to find a numpy function
+ - raise
+
+ """
+ assert isinstance(arg, compat.string_types)
+
+ f = getattr(self, arg, None)
+ if f is not None:
+ if callable(f):
+ return f(*args, **kwargs)
+
+ # people may try to aggregate on a non-callable attribute
+ # but don't let them think they can pass args to it
+ assert len(args) == 0
+ assert len([kwarg for kwarg in kwargs
+ if kwarg not in ['axis', '_level']]) == 0
+ return f
+
+ f = getattr(np, arg, None)
+ if f is not None:
+ return f(self, *args, **kwargs)
+
+ raise ValueError("{arg} is an unknown string function".format(arg=arg))
+
+ def _aggregate(self, arg, *args, **kwargs):
+ """
+ provide an implementation for the aggregators
+
+ Parameters
+ ----------
+ arg : string, dict, function
+ *args : args to pass on to the function
+ **kwargs : kwargs to pass on to the function
+
+ Returns
+ -------
+ tuple of result, how
+
+ Notes
+ -----
+        `how` can be a string describing the required post-processing, or
+        None if not required.
+ """
+ is_aggregator = lambda x: isinstance(x, (list, tuple, dict))
+ is_nested_renamer = False
+
+ _axis = kwargs.pop('_axis', None)
+ if _axis is None:
+ _axis = getattr(self, 'axis', 0)
+ _level = kwargs.pop('_level', None)
+
+ if isinstance(arg, compat.string_types):
+ return self._try_aggregate_string_function(arg, *args,
+ **kwargs), None
+
+ if isinstance(arg, dict):
+
+ # aggregate based on the passed dict
+ if _axis != 0: # pragma: no cover
+ raise ValueError('Can only pass dict with axis=0')
+
+ obj = self._selected_obj
+
+ def nested_renaming_depr(level=4):
+ # deprecation of nested renaming
+ # GH 15931
+ warnings.warn(
+ ("using a dict with renaming "
+ "is deprecated and will be removed in a future "
+ "version"),
+ FutureWarning, stacklevel=level)
+
+ # if we have a dict of any non-scalars
+ # eg. {'A' : ['mean']}, normalize all to
+ # be list-likes
+ if any(is_aggregator(x) for x in compat.itervalues(arg)):
+ new_arg = compat.OrderedDict()
+ for k, v in compat.iteritems(arg):
+ if not isinstance(v, (tuple, list, dict)):
+ new_arg[k] = [v]
+ else:
+ new_arg[k] = v
+
+ # the keys must be in the columns
+ # for ndim=2, or renamers for ndim=1
+
+ # ok for now, but deprecated
+ # {'A': { 'ra': 'mean' }}
+ # {'A': { 'ra': ['mean'] }}
+ # {'ra': ['mean']}
+
+ # not ok
+ # {'ra' : { 'A' : 'mean' }}
+ if isinstance(v, dict):
+ is_nested_renamer = True
+
+ if k not in obj.columns:
+ msg = ('cannot perform renaming for {key} with a '
+ 'nested dictionary').format(key=k)
+ raise SpecificationError(msg)
+ nested_renaming_depr(4 + (_level or 0))
+
+ elif isinstance(obj, ABCSeries):
+ nested_renaming_depr()
+ elif (isinstance(obj, ABCDataFrame) and
+ k not in obj.columns):
+ raise KeyError(
+ "Column '{col}' does not exist!".format(col=k))
+
+ arg = new_arg
+
+ else:
+ # deprecation of renaming keys
+ # GH 15931
+ keys = list(compat.iterkeys(arg))
+ if (isinstance(obj, ABCDataFrame) and
+ len(obj.columns.intersection(keys)) != len(keys)):
+ nested_renaming_depr()
+
+ from pandas.core.reshape.concat import concat
+
+ def _agg_1dim(name, how, subset=None):
+ """
+ aggregate a 1-dim with how
+ """
+ colg = self._gotitem(name, ndim=1, subset=subset)
+ if colg.ndim != 1:
+ raise SpecificationError("nested dictionary is ambiguous "
+ "in aggregation")
+ return colg.aggregate(how, _level=(_level or 0) + 1)
+
+ def _agg_2dim(name, how):
+ """
+ aggregate a 2-dim with how
+ """
+ colg = self._gotitem(self._selection, ndim=2,
+ subset=obj)
+ return colg.aggregate(how, _level=None)
+
+ def _agg(arg, func):
+ """
+ run the aggregations over the arg with func
+ return an OrderedDict
+ """
+ result = compat.OrderedDict()
+ for fname, agg_how in compat.iteritems(arg):
+ result[fname] = func(fname, agg_how)
+ return result
+
+ # set the final keys
+ keys = list(compat.iterkeys(arg))
+ result = compat.OrderedDict()
+
+ # nested renamer
+ if is_nested_renamer:
+ result = list(_agg(arg, _agg_1dim).values())
+
+ if all(isinstance(r, dict) for r in result):
+
+ result, results = compat.OrderedDict(), result
+ for r in results:
+ result.update(r)
+ keys = list(compat.iterkeys(result))
+
+ else:
+
+ if self._selection is not None:
+ keys = None
+
+ # some selection on the object
+ elif self._selection is not None:
+
+ sl = set(self._selection_list)
+
+ # we are a Series like object,
+ # but may have multiple aggregations
+ if len(sl) == 1:
+
+ result = _agg(arg, lambda fname,
+ agg_how: _agg_1dim(self._selection, agg_how))
+
+ # we are selecting the same set as we are aggregating
+ elif not len(sl - set(keys)):
+
+ result = _agg(arg, _agg_1dim)
+
+ # we are a DataFrame, with possibly multiple aggregations
+ else:
+
+ result = _agg(arg, _agg_2dim)
+
+ # no selection
+ else:
+
+ try:
+ result = _agg(arg, _agg_1dim)
+ except SpecificationError:
+
+ # we are aggregating expecting all 1d-returns
+ # but we have 2d
+ result = _agg(arg, _agg_2dim)
+
+ # combine results
+
+ def is_any_series():
+ # return a boolean if we have *any* nested series
+ return any(isinstance(r, ABCSeries)
+ for r in compat.itervalues(result))
+
+ def is_any_frame():
+ # return a boolean if we have *any* nested frames
+ return any(isinstance(r, ABCDataFrame)
+ for r in compat.itervalues(result))
+
+ if isinstance(result, list):
+ return concat(result, keys=keys, axis=1, sort=True), True
+
+ elif is_any_frame():
+ # we have a dict of DataFrames
+ # return a MI DataFrame
+
+ return concat([result[k] for k in keys],
+ keys=keys, axis=1), True
+
+ elif isinstance(self, ABCSeries) and is_any_series():
+
+ # we have a dict of Series
+ # return a MI Series
+ try:
+ result = concat(result)
+ except TypeError:
+ # we want to give a nice error here if
+ # we have non-same sized objects, so
+ # we don't automatically broadcast
+
+ raise ValueError("cannot perform both aggregation "
+ "and transformation operations "
+ "simultaneously")
+
+ return result, True
+
+ # fall thru
+ from pandas import DataFrame, Series
+ try:
+ result = DataFrame(result)
+ except ValueError:
+
+ # we have a dict of scalars
+ result = Series(result,
+ name=getattr(self, 'name', None))
+
+ return result, True
+ elif is_list_like(arg) and arg not in compat.string_types:
+ # we require a list, but not an 'str'
+ return self._aggregate_multiple_funcs(arg,
+ _level=_level,
+ _axis=_axis), None
+ else:
+ result = None
+
+ f = self._is_cython_func(arg)
+ if f and not args and not kwargs:
+ return getattr(self, f)(), None
+
+ # caller can react
+ return result, True
+
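+ # Illustrative sketch (editorial comment, not part of the upstream module):
+ # _aggregate above dispatches on the type of ``arg`` -- a string such as
+ # 'sum' is resolved via _try_aggregate_string_function, a dict such as
+ # {'A': ['mean', 'max']} is normalized to list-like values and applied per
+ # column, and a list such as [np.min, np.max] is routed to
+ # _aggregate_multiple_funcs below.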
+ def _aggregate_multiple_funcs(self, arg, _level, _axis):
+ from pandas.core.reshape.concat import concat
+
+ if _axis != 0:
+ raise NotImplementedError("axis other than 0 is not supported")
+
+ if self._selected_obj.ndim == 1:
+ obj = self._selected_obj
+ else:
+ obj = self._obj_with_exclusions
+
+ results = []
+ keys = []
+
+ # degenerate case
+ if obj.ndim == 1:
+ for a in arg:
+ try:
+ colg = self._gotitem(obj.name, ndim=1, subset=obj)
+ results.append(colg.aggregate(a))
+
+ # make sure we find a good name
+ name = com.get_callable_name(a) or a
+ keys.append(name)
+ except (TypeError, DataError):
+ pass
+ except SpecificationError:
+ raise
+
+ # multiples
+ else:
+ for index, col in enumerate(obj):
+ try:
+ colg = self._gotitem(col, ndim=1,
+ subset=obj.iloc[:, index])
+ results.append(colg.aggregate(arg))
+ keys.append(col)
+ except (TypeError, DataError):
+ pass
+ except ValueError:
+ # cannot aggregate
+ continue
+ except SpecificationError:
+ raise
+
+ # if we are empty
+ if not len(results):
+ raise ValueError("no results")
+
+ try:
+ return concat(results, keys=keys, axis=1, sort=False)
+ except TypeError:
+
+ # we are concatting non-NDFrame objects,
+ # e.g. a list of scalars
+
+ from pandas.core.dtypes.cast import is_nested_object
+ from pandas import Series
+ result = Series(results, index=keys, name=self.name)
+ if is_nested_object(result):
+ raise ValueError("cannot combine transform and "
+ "aggregation operations")
+ return result
+
+ def _shallow_copy(self, obj=None, obj_type=None, **kwargs):
+ """
+ return a new object with the replacement attributes
+ """
+ if obj is None:
+ obj = self._selected_obj.copy()
+ if obj_type is None:
+ obj_type = self._constructor
+ if isinstance(obj, obj_type):
+ obj = obj.obj
+ for attr in self._attributes:
+ if attr not in kwargs:
+ kwargs[attr] = getattr(self, attr)
+ return obj_type(obj, **kwargs)
+
+ def _is_cython_func(self, arg):
+ """
+ if we define an internal function for this argument, return it
+ """
+ return self._cython_table.get(arg)
+
+ def _is_builtin_func(self, arg):
+ """
+ if we define a builtin function for this argument, return it,
+ otherwise return the arg
+ """
+ return self._builtin_table.get(arg, arg)
+
+
+class IndexOpsMixin(object):
+ """ common ops mixin to support a unified interface / docs for Series /
+ Index
+ """
+
+ # ndarray compatibility
+ __array_priority__ = 1000
+
+ def transpose(self, *args, **kwargs):
+ """
+ Return the transpose, which is by definition self.
+ """
+ nv.validate_transpose(args, kwargs)
+ return self
+
+ T = property(transpose, doc="Return the transpose, which is by "
+ "definition self.")
+
+ @property
+ def _is_homogeneous_type(self):
+ """
+ Whether the object has a single dtype.
+
+ By definition, Series and Index are always considered homogeneous.
+ A MultiIndex may or may not be homogeneous, depending on the
+ dtypes of the levels.
+
+ See Also
+ --------
+ DataFrame._is_homogeneous_type
+ MultiIndex._is_homogeneous_type
+ """
+ return True
+
+ @property
+ def shape(self):
+ """
+ Return a tuple of the shape of the underlying data.
+ """
+ return self._values.shape
+
+ @property
+ def ndim(self):
+ """
+ Number of dimensions of the underlying data, by definition 1.
+ """
+ return 1
+
+ def item(self):
+ """
+ Return the first element of the underlying data as a python scalar.
+ """
+ try:
+ return self.values.item()
+ except IndexError:
+ # copy numpy's message here because Py26 raises an IndexError
+ raise ValueError('can only convert an array of size 1 to a '
+ 'Python scalar')
+
+ @property
+ def data(self):
+ """
+ Return the data pointer of the underlying data.
+ """
+ warnings.warn("{obj}.data is deprecated and will be removed "
+ "in a future version".format(obj=type(self).__name__),
+ FutureWarning, stacklevel=2)
+ return self.values.data
+
+ @property
+ def itemsize(self):
+ """
+ Return the size of the dtype of the item of the underlying data.
+ """
+ warnings.warn("{obj}.itemsize is deprecated and will be removed "
+ "in a future version".format(obj=type(self).__name__),
+ FutureWarning, stacklevel=2)
+ return self._ndarray_values.itemsize
+
+ @property
+ def nbytes(self):
+ """
+ Return the number of bytes in the underlying data.
+ """
+ return self._values.nbytes
+
+ @property
+ def strides(self):
+ """
+ Return the strides of the underlying data.
+ """
+ warnings.warn("{obj}.strides is deprecated and will be removed "
+ "in a future version".format(obj=type(self).__name__),
+ FutureWarning, stacklevel=2)
+ return self._ndarray_values.strides
+
+ @property
+ def size(self):
+ """
+ Return the number of elements in the underlying data.
+ """
+ return len(self._values)
+
+ @property
+ def flags(self):
+ """
+ Return the ndarray.flags for the underlying data.
+ """
+ warnings.warn("{obj}.flags is deprecated and will be removed "
+ "in a future version".format(obj=type(self).__name__),
+ FutureWarning, stacklevel=2)
+ return self.values.flags
+
+ @property
+ def base(self):
+ """
+ Return the base object if the memory of the underlying data is shared.
+ """
+ warnings.warn("{obj}.base is deprecated and will be removed "
+ "in a future version".format(obj=type(self).__name__),
+ FutureWarning, stacklevel=2)
+ return self.values.base
+
+ @property
+ def array(self):
+ # type: () -> ExtensionArray
+ """
+ The ExtensionArray of the data backing this Series or Index.
+
+ .. versionadded:: 0.24.0
+
+ Returns
+ -------
+ array : ExtensionArray
+ An ExtensionArray of the values stored within. For extension
+ types, this is the actual array. For NumPy native types, this
+ is a thin (no copy) wrapper around :class:`numpy.ndarray`.
+
+ ``.array`` differs from ``.values``, which may require converting
+ the data to a different form.
+
+ See Also
+ --------
+ Index.to_numpy : Similar method that always returns a NumPy array.
+ Series.to_numpy : Similar method that always returns a NumPy array.
+
+ Notes
+ -----
+ This table lays out the different array types for each extension
+ dtype within pandas.
+
+ ================== =============================
+ dtype array type
+ ================== =============================
+ category Categorical
+ period PeriodArray
+ interval IntervalArray
+ IntegerNA IntegerArray
+ datetime64[ns, tz] DatetimeArray
+ ================== =============================
+
+ For any 3rd-party extension types, the array type will be an
+ ExtensionArray.
+
+ For all remaining dtypes ``.array`` will be a
+ :class:`arrays.PandasArray` wrapping the actual ndarray
+ stored within. If you absolutely need a NumPy array (possibly with
+ copying / coercing data), then use :meth:`Series.to_numpy` instead.
+
+ Examples
+ --------
+
+ For regular NumPy types like int, and float, a PandasArray
+ is returned.
+
+ >>> pd.Series([1, 2, 3]).array
+ <PandasArray>
+ [1, 2, 3]
+ Length: 3, dtype: int64
+
+ For extension types, like Categorical, the actual ExtensionArray
+ is returned
+
+ >>> ser = pd.Series(pd.Categorical(['a', 'b', 'a']))
+ >>> ser.array
+ [a, b, a]
+ Categories (2, object): [a, b]
+ """
+ result = self._values
+
+ if is_datetime64_ns_dtype(result.dtype):
+ from pandas.arrays import DatetimeArray
+ result = DatetimeArray(result)
+ elif is_timedelta64_ns_dtype(result.dtype):
+ from pandas.arrays import TimedeltaArray
+ result = TimedeltaArray(result)
+
+ elif not is_extension_array_dtype(result.dtype):
+ from pandas.core.arrays.numpy_ import PandasArray
+ result = PandasArray(result)
+
+ return result
+
+ def to_numpy(self, dtype=None, copy=False):
+ """
+ A NumPy ndarray representing the values in this Series or Index.
+
+ .. versionadded:: 0.24.0
+
+
+ Parameters
+ ----------
+ dtype : str or numpy.dtype, optional
+ The dtype to pass to :meth:`numpy.asarray`
+ copy : bool, default False
+ Whether to ensure that the returned value is not a view on
+ another array. Note that ``copy=False`` does not *ensure* that
+ ``to_numpy()`` is no-copy. Rather, ``copy=True`` ensures that
+ a copy is made, even if not strictly necessary.
+
+ Returns
+ -------
+ numpy.ndarray
+
+ See Also
+ --------
+ Series.array : Get the actual data stored within.
+ Index.array : Get the actual data stored within.
+ DataFrame.to_numpy : Similar method for DataFrame.
+
+ Notes
+ -----
+ The returned array will be the same up to equality (values equal
+ in `self` will be equal in the returned array; likewise for values
+ that are not equal). When `self` contains an ExtensionArray, the
+ dtype may be different. For example, for a category-dtype Series,
+ ``to_numpy()`` will return a NumPy array and the categorical dtype
+ will be lost.
+
+ For NumPy dtypes, this will be a reference to the actual data stored
+ in this Series or Index (assuming ``copy=False``). Modifying the result
+ in place will modify the data stored in the Series or Index (not that
+ we recommend doing that).
+
+ For extension types, ``to_numpy()`` *may* require copying data and
+ coercing the result to a NumPy type (possibly object), which may be
+ expensive. When you need a no-copy reference to the underlying data,
+ :attr:`Series.array` should be used instead.
+
+ This table lays out the different dtypes and default return types of
+ ``to_numpy()`` for various dtypes within pandas.
+
+ ================== ================================
+ dtype array type
+ ================== ================================
+ category[T] ndarray[T] (same dtype as input)
+ period ndarray[object] (Periods)
+ interval ndarray[object] (Intervals)
+ IntegerNA ndarray[object]
+ datetime64[ns] datetime64[ns]
+ datetime64[ns, tz] ndarray[object] (Timestamps)
+ ================== ================================
+
+ Examples
+ --------
+ >>> ser = pd.Series(pd.Categorical(['a', 'b', 'a']))
+ >>> ser.to_numpy()
+ array(['a', 'b', 'a'], dtype=object)
+
+ Specify the `dtype` to control how datetime-aware data is represented.
+ Use ``dtype=object`` to return an ndarray of pandas :class:`Timestamp`
+ objects, each with the correct ``tz``.
+
+ >>> ser = pd.Series(pd.date_range('2000', periods=2, tz="CET"))
+ >>> ser.to_numpy(dtype=object)
+ array([Timestamp('2000-01-01 00:00:00+0100', tz='CET', freq='D'),
+ Timestamp('2000-01-02 00:00:00+0100', tz='CET', freq='D')],
+ dtype=object)
+
+ Or ``dtype='datetime64[ns]'`` to return an ndarray of native
+ datetime64 values. The values are converted to UTC and the timezone
+ info is dropped.
+
+ >>> ser.to_numpy(dtype="datetime64[ns]")
+ ... # doctest: +ELLIPSIS
+ array(['1999-12-31T23:00:00.000000000', '2000-01-01T23:00:00...'],
+ dtype='datetime64[ns]')
+ """
+ if is_datetime64tz_dtype(self.dtype) and dtype is None:
+ # note: this is going to change very soon.
+ # I have a WIP PR making this unnecessary, but it's
+ # a bit out of scope for the DatetimeArray PR.
+ dtype = "object"
+
+ result = np.asarray(self._values, dtype=dtype)
+ # TODO(GH-24345): Avoid potential double copy
+ if copy:
+ result = result.copy()
+ return result
+
+ @property
+ def _ndarray_values(self):
+ # type: () -> np.ndarray
+ """
+ The data as an ndarray, possibly losing information.
+
+ The expectation is that this is cheap to compute, and is primarily
+ used for interacting with our indexers.
+
+ - categorical -> codes
+ """
+ if is_extension_array_dtype(self):
+ return self.array._ndarray_values
+ return self.values
+
+ @property
+ def empty(self):
+ return not self.size
+
+ def max(self, axis=None, skipna=True):
+ """
+ Return the maximum value of the Index.
+
+ Parameters
+ ----------
+ axis : int, optional
+ For compatibility with NumPy. Only 0 or None are allowed.
+ skipna : bool, default True
+
+ Returns
+ -------
+ scalar
+ Maximum value.
+
+ See Also
+ --------
+ Index.min : Return the minimum value in an Index.
+ Series.max : Return the maximum value in a Series.
+ DataFrame.max : Return the maximum values in a DataFrame.
+
+ Examples
+ --------
+ >>> idx = pd.Index([3, 2, 1])
+ >>> idx.max()
+ 3
+
+ >>> idx = pd.Index(['c', 'b', 'a'])
+ >>> idx.max()
+ 'c'
+
+ For a MultiIndex, the maximum is determined lexicographically.
+
+ >>> idx = pd.MultiIndex.from_product([('a', 'b'), (2, 1)])
+ >>> idx.max()
+ ('b', 2)
+ """
+ nv.validate_minmax_axis(axis)
+ return nanops.nanmax(self._values, skipna=skipna)
+
+ def argmax(self, axis=None, skipna=True):
+ """
+ Return the integer position of the maximum value.
+
+ Parameters
+ ----------
+ axis : {None}
+ Dummy argument for consistency with Series
+ skipna : bool, default True
+
+ See Also
+ --------
+ numpy.ndarray.argmax
+ """
+ nv.validate_minmax_axis(axis)
+ return nanops.nanargmax(self._values, skipna=skipna)
+
+ def min(self, axis=None, skipna=True):
+ """
+ Return the minimum value of the Index.
+
+ Parameters
+ ----------
+ axis : {None}
+ Dummy argument for consistency with Series
+ skipna : bool, default True
+
+ Returns
+ -------
+ scalar
+ Minimum value.
+
+ See Also
+ --------
+ Index.max : Return the maximum value of the object.
+ Series.min : Return the minimum value in a Series.
+ DataFrame.min : Return the minimum values in a DataFrame.
+
+ Examples
+ --------
+ >>> idx = pd.Index([3, 2, 1])
+ >>> idx.min()
+ 1
+
+ >>> idx = pd.Index(['c', 'b', 'a'])
+ >>> idx.min()
+ 'a'
+
+ For a MultiIndex, the minimum is determined lexicographically.
+
+ >>> idx = pd.MultiIndex.from_product([('a', 'b'), (2, 1)])
+ >>> idx.min()
+ ('a', 1)
+ """
+ nv.validate_minmax_axis(axis)
+ return nanops.nanmin(self._values, skipna=skipna)
+
+ def argmin(self, axis=None, skipna=True):
+ """
+ Return the integer position of the minimum value.
+
+ Parameters
+ ----------
+ axis : {None}
+ Dummy argument for consistency with Series
+ skipna : bool, default True
+
+ See Also
+ --------
+ numpy.ndarray.argmin
+ """
+ nv.validate_minmax_axis(axis)
+ return nanops.nanargmin(self._values, skipna=skipna)
+
+ def tolist(self):
+ """
+ Return a list of the values.
+
+ These are each a scalar type, which is a Python scalar
+ (for str, int, float) or a pandas scalar
+ (for Timestamp/Timedelta/Interval/Period)
+
+ See Also
+ --------
+ numpy.ndarray.tolist
+ """
+ if is_datetimelike(self._values):
+ return [com.maybe_box_datetimelike(x) for x in self._values]
+ elif is_extension_array_dtype(self._values):
+ return list(self._values)
+ else:
+ return self._values.tolist()
+
+ to_list = tolist
+
+ def __iter__(self):
+ """
+ Return an iterator of the values.
+
+ These are each a scalar type, which is a Python scalar
+ (for str, int, float) or a pandas scalar
+ (for Timestamp/Timedelta/Interval/Period)
+ """
+ # We are explicitly making element iterators.
+ if is_datetimelike(self._values):
+ return map(com.maybe_box_datetimelike, self._values)
+ elif is_extension_array_dtype(self._values):
+ return iter(self._values)
+ else:
+ return map(self._values.item, range(self._values.size))
+
+ @cache_readonly
+ def hasnans(self):
+ """
+ Return if I have any nans; enables various perf speedups.
+ """
+ return bool(isna(self).any())
+
+ def _reduce(self, op, name, axis=0, skipna=True, numeric_only=None,
+ filter_type=None, **kwds):
+ """ perform the reduction type operation if we can """
+ func = getattr(self, name, None)
+ if func is None:
+ raise TypeError("{klass} cannot perform the operation {op}".format(
+ klass=self.__class__.__name__, op=name))
+ return func(skipna=skipna, **kwds)
+
+ def _map_values(self, mapper, na_action=None):
+ """
+ An internal function that maps values using the input
+ correspondence (which can be a dict, Series, or function).
+
+ Parameters
+ ----------
+ mapper : function, dict, or Series
+ The input correspondence object
+ na_action : {None, 'ignore'}
+ If 'ignore', propagate NA values, without passing them to the
+ mapping function
+
+ Returns
+ -------
+ applied : Union[Index, MultiIndex], inferred
+ The output of the mapping function applied to the index.
+ If the function returns a tuple with more than one element
+ a MultiIndex will be returned.
+
+ """
+
+ # we can fastpath dict/Series to an efficient map
+ # as we know that we are not going to have to yield
+ # python types
+ if isinstance(mapper, dict):
+ if hasattr(mapper, '__missing__'):
+ # If a dictionary subclass defines a default value method,
+ # convert mapper to a lookup function (GH #15999).
+ dict_with_default = mapper
+ mapper = lambda x: dict_with_default[x]
+ else:
+ # Dictionary does not have a default. Thus it's safe to
+ # convert to a Series for efficiency.
+ # we specify the keys here to handle the
+ # possibility that they are tuples
+ from pandas import Series
+ mapper = Series(mapper)
+
+ if isinstance(mapper, ABCSeries):
+ # Since values were input this means we came from either
+ # a dict or a series and mapper should be an index
+ if is_extension_type(self.dtype):
+ values = self._values
+ else:
+ values = self.values
+
+ indexer = mapper.index.get_indexer(values)
+ new_values = algorithms.take_1d(mapper._values, indexer)
+
+ return new_values
+
+ # we must convert to python types
+ if is_extension_type(self.dtype):
+ values = self._values
+ if na_action is not None:
+ raise NotImplementedError
+ map_f = lambda values, f: values.map(f)
+ else:
+ values = self.astype(object)
+ values = getattr(values, 'values', values)
+ if na_action == 'ignore':
+ def map_f(values, f):
+ return lib.map_infer_mask(values, f,
+ isna(values).view(np.uint8))
+ else:
+ map_f = lib.map_infer
+
+ # mapper is a function
+ new_values = map_f(values, mapper)
+
+ return new_values
+
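+ # Illustrative sketch (editorial comment, not part of the upstream module):
+ # for a Series s = pd.Series(['a', 'b']), s._map_values({'a': 1}) takes the
+ # dict/Series fast path (index lookup plus take_1d) and yields
+ # array([1., nan]), while s._map_values(lambda x: x * 2) falls through to
+ # lib.map_infer and yields array(['aa', 'bb'], dtype=object).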
+ def value_counts(self, normalize=False, sort=True, ascending=False,
+ bins=None, dropna=True):
+ """
+ Return a Series containing counts of unique values.
+
+ The resulting object will be in descending order so that the
+ first element is the most frequently-occurring element.
+ Excludes NA values by default.
+
+ Parameters
+ ----------
+ normalize : boolean, default False
+ If True then the object returned will contain the relative
+ frequencies of the unique values.
+ sort : boolean, default True
+ Sort by values.
+ ascending : boolean, default False
+ Sort in ascending order.
+ bins : integer, optional
+ Rather than count values, group them into half-open bins,
+ a convenience for ``pd.cut``, only works with numeric data.
+ dropna : boolean, default True
+ Don't include counts of NaN.
+
+ Returns
+ -------
+ counts : Series
+
+ See Also
+ --------
+ Series.count: Number of non-NA elements in a Series.
+ DataFrame.count: Number of non-NA elements in a DataFrame.
+
+ Examples
+ --------
+ >>> index = pd.Index([3, 1, 2, 3, 4, np.nan])
+ >>> index.value_counts()
+ 3.0 2
+ 4.0 1
+ 2.0 1
+ 1.0 1
+ dtype: int64
+
+ With `normalize` set to `True`, returns the relative frequency by
+ dividing all values by the sum of values.
+
+ >>> s = pd.Series([3, 1, 2, 3, 4, np.nan])
+ >>> s.value_counts(normalize=True)
+ 3.0 0.4
+ 4.0 0.2
+ 2.0 0.2
+ 1.0 0.2
+ dtype: float64
+
+ **bins**
+
+ Bins can be useful for going from a continuous variable to a
+ categorical variable; instead of counting unique
+ occurrences of values, divide the index into the specified
+ number of half-open bins.
+
+ >>> s.value_counts(bins=3)
+ (2.0, 3.0] 2
+ (0.996, 2.0] 2
+ (3.0, 4.0] 1
+ dtype: int64
+
+ **dropna**
+
+ With `dropna` set to `False` we can also see NaN index values.
+
+ >>> s.value_counts(dropna=False)
+ 3.0 2
+ NaN 1
+ 4.0 1
+ 2.0 1
+ 1.0 1
+ dtype: int64
+ """
+ from pandas.core.algorithms import value_counts
+ result = value_counts(self, sort=sort, ascending=ascending,
+ normalize=normalize, bins=bins, dropna=dropna)
+ return result
+
+ def unique(self):
+ values = self._values
+
+ if hasattr(values, 'unique'):
+
+ result = values.unique()
+ else:
+ from pandas.core.algorithms import unique1d
+ result = unique1d(values)
+
+ return result
+
+ def nunique(self, dropna=True):
+ """
+ Return number of unique elements in the object.
+
+ Excludes NA values by default.
+
+ Parameters
+ ----------
+ dropna : boolean, default True
+ Don't include NaN in the count.
+
+ Returns
+ -------
+ nunique : int
+ """
+ uniqs = self.unique()
+ n = len(uniqs)
+ if dropna and isna(uniqs).any():
+ n -= 1
+ return n
+
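+ # Illustrative sketch (editorial comment, not part of the upstream module):
+ # pd.Series([1, 2, np.nan]).nunique() -> 2, while
+ # pd.Series([1, 2, np.nan]).nunique(dropna=False) -> 3, since the NaN is
+ # then kept in the count.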
+ @property
+ def is_unique(self):
+ """
+ Return boolean if values in the object are unique.
+
+ Returns
+ -------
+ is_unique : boolean
+ """
+ return self.nunique(dropna=False) == len(self)
+
+ @property
+ def is_monotonic(self):
+ """
+ Return boolean if values in the object are
+ monotonic_increasing.
+
+ .. versionadded:: 0.19.0
+
+ Returns
+ -------
+ is_monotonic : boolean
+ """
+ from pandas import Index
+ return Index(self).is_monotonic
+
+ is_monotonic_increasing = is_monotonic
+
+ @property
+ def is_monotonic_decreasing(self):
+ """
+ Return boolean if values in the object are
+ monotonic_decreasing.
+
+ .. versionadded:: 0.19.0
+
+ Returns
+ -------
+ is_monotonic_decreasing : boolean
+ """
+ from pandas import Index
+ return Index(self).is_monotonic_decreasing
+
+ def memory_usage(self, deep=False):
+ """
+ Memory usage of the values
+
+ Parameters
+ ----------
+ deep : bool
+ Introspect the data deeply, interrogate
+ `object` dtypes for system-level memory consumption
+
+ Returns
+ -------
+ bytes used
+
+ See Also
+ --------
+ numpy.ndarray.nbytes
+
+ Notes
+ -----
+ Memory usage does not include memory consumed by elements that
+ are not components of the array if deep=False or if used on PyPy
+ """
+ if hasattr(self.array, 'memory_usage'):
+ return self.array.memory_usage(deep=deep)
+
+ v = self.array.nbytes
+ if deep and is_object_dtype(self) and not PYPY:
+ v += lib.memory_usage_of_objects(self.array)
+ return v
+
+ @Substitution(
+ values='', order='', size_hint='',
+ sort=textwrap.dedent("""\
+ sort : boolean, default False
+ Sort `uniques` and shuffle `labels` to maintain the
+ relationship.
+ """))
+ @Appender(algorithms._shared_docs['factorize'])
+ def factorize(self, sort=False, na_sentinel=-1):
+ return algorithms.factorize(self, sort=sort, na_sentinel=na_sentinel)
+
+ _shared_docs['searchsorted'] = (
+ """
+ Find indices where elements should be inserted to maintain order.
+
+ Find the indices into a sorted %(klass)s `self` such that, if the
+ corresponding elements in `value` were inserted before the indices,
+ the order of `self` would be preserved.
+
+ Parameters
+ ----------
+ value : array_like
+ Values to insert into `self`.
+ side : {'left', 'right'}, optional
+ If 'left', the index of the first suitable location found is given.
+ If 'right', return the last such index. If there is no suitable
+ index, return either 0 or N (where N is the length of `self`).
+ sorter : 1-D array_like, optional
+ Optional array of integer indices that sort `self` into ascending
+ order. They are typically the result of ``np.argsort``.
+
+ Returns
+ -------
+ int or array of int
+ A scalar or array of insertion points with the
+ same shape as `value`.
+
+ .. versionchanged :: 0.24.0
+ If `value` is a scalar, an int is now always returned.
+ Previously, scalar inputs returned a 1-item array for
+ :class:`Series` and :class:`Categorical`.
+
+ See Also
+ --------
+ numpy.searchsorted
+
+ Notes
+ -----
+ Binary search is used to find the required insertion points.
+
+ Examples
+ --------
+
+ >>> x = pd.Series([1, 2, 3])
+ >>> x
+ 0 1
+ 1 2
+ 2 3
+ dtype: int64
+
+ >>> x.searchsorted(4)
+ 3
+
+ >>> x.searchsorted([0, 4])
+ array([0, 3])
+
+ >>> x.searchsorted([1, 3], side='left')
+ array([0, 2])
+
+ >>> x.searchsorted([1, 3], side='right')
+ array([1, 3])
+
+ >>> x = pd.Categorical(['apple', 'bread', 'bread',
+ ...                     'cheese', 'milk'], ordered=True)
+ >>> x
+ [apple, bread, bread, cheese, milk]
+ Categories (4, object): [apple < bread < cheese < milk]
+
+ >>> x.searchsorted('bread')
+ 1
+
+ >>> x.searchsorted(['bread'], side='right')
+ array([3])
+ """)
+
+ @Substitution(klass='IndexOpsMixin')
+ @Appender(_shared_docs['searchsorted'])
+ def searchsorted(self, value, side='left', sorter=None):
+ # needs coercion on the key (DatetimeIndex does already)
+ return self._values.searchsorted(value, side=side, sorter=sorter)
+
+ def drop_duplicates(self, keep='first', inplace=False):
+ inplace = validate_bool_kwarg(inplace, 'inplace')
+ if isinstance(self, ABCIndexClass):
+ if self.is_unique:
+ return self._shallow_copy()
+
+ duplicated = self.duplicated(keep=keep)
+ result = self[np.logical_not(duplicated)]
+ if inplace:
+ return self._update_inplace(result)
+ else:
+ return result
+
+ def duplicated(self, keep='first'):
+ from pandas.core.algorithms import duplicated
+ if isinstance(self, ABCIndexClass):
+ if self.is_unique:
+ return np.zeros(len(self), dtype=np.bool)
+ return duplicated(self, keep=keep)
+ else:
+ return self._constructor(duplicated(self, keep=keep),
+ index=self.index).__finalize__(self)
+
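+ # Illustrative sketch (editorial comment, not part of the upstream module):
+ # for idx = pd.Index([1, 1, 2]), idx.duplicated() gives
+ # array([False, True, False]) and idx.drop_duplicates() gives
+ # Int64Index([1, 2], dtype='int64'); a unique Index short-circuits to a
+ # shallow copy / all-False mask via the is_unique fast path above.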
+ # ----------------------------------------------------------------------
+ # abstracts
+
+ def _update_inplace(self, result, **kwargs):
+ raise AbstractMethodError(self)
diff --git a/contrib/python/pandas/py2/pandas/core/categorical.py b/contrib/python/pandas/py2/pandas/core/categorical.py
new file mode 100644
index 00000000000..43c35c4000b
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/core/categorical.py
@@ -0,0 +1,9 @@
+import warnings
+
+from pandas.core.dtypes.dtypes import CategoricalDtype # noqa
+
+from pandas.core.arrays import Categorical # noqa
+
+# TODO: Remove after 0.23.x
+warnings.warn("'pandas.core' is private. Use 'pandas.Categorical'",
+ FutureWarning, stacklevel=2)
diff --git a/contrib/python/pandas/py2/pandas/core/common.py b/contrib/python/pandas/py2/pandas/core/common.py
new file mode 100644
index 00000000000..b4de0daa13b
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/core/common.py
@@ -0,0 +1,472 @@
+"""
+Misc tools for implementing data structures
+
+Note: pandas.core.common is *not* part of the public API.
+"""
+
+import collections
+from datetime import datetime, timedelta
+from functools import partial
+import inspect
+
+import numpy as np
+
+from pandas._libs import lib, tslibs
+import pandas.compat as compat
+from pandas.compat import PY36, OrderedDict, iteritems
+
+from pandas.core.dtypes.cast import construct_1d_object_array_from_listlike
+from pandas.core.dtypes.common import (
+ is_array_like, is_bool_dtype, is_extension_array_dtype, is_integer)
+from pandas.core.dtypes.generic import ABCIndex, ABCIndexClass, ABCSeries
+from pandas.core.dtypes.inference import _iterable_not_string
+from pandas.core.dtypes.missing import isna, isnull, notnull # noqa
+
+
+class SettingWithCopyError(ValueError):
+ pass
+
+
+class SettingWithCopyWarning(Warning):
+ pass
+
+
+def flatten(l):
+ """Flatten an arbitrarily nested sequence.
+
+ Parameters
+ ----------
+ l : sequence
+ The non string sequence to flatten
+
+ Notes
+ -----
+ This doesn't consider strings to be sequences.
+
+ Returns
+ -------
+ flattened : generator
+ """
+ for el in l:
+ if _iterable_not_string(el):
+ for s in flatten(el):
+ yield s
+ else:
+ yield el
+
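+ # Illustrative sketch (editorial comment, not part of the upstream module):
+ # list(flatten([1, [2, (3, 4)], 'ab'])) -> [1, 2, 3, 4, 'ab']; strings are
+ # treated as scalars rather than expanded character by character.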
+
+def consensus_name_attr(objs):
+ name = objs[0].name
+ for obj in objs[1:]:
+ try:
+ if obj.name != name:
+ name = None
+ except ValueError:
+ name = None
+ return name
+
+
+def maybe_box(indexer, values, obj, key):
+
+ # if we have multiples coming back, box em
+ if isinstance(values, np.ndarray):
+ return obj[indexer.get_loc(key)]
+
+ # return the value
+ return values
+
+
+def maybe_box_datetimelike(value):
+ # turn a datetime like into a Timestamp/timedelta as needed
+
+ if isinstance(value, (np.datetime64, datetime)):
+ value = tslibs.Timestamp(value)
+ elif isinstance(value, (np.timedelta64, timedelta)):
+ value = tslibs.Timedelta(value)
+
+ return value
+
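+ # Illustrative sketch (editorial comment, not part of the upstream module):
+ # a np.datetime64 or stdlib datetime passed here comes back as a pandas
+ # Timestamp, a np.timedelta64 or stdlib timedelta as a pandas Timedelta,
+ # and any other value is returned unchanged.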
+
+values_from_object = lib.values_from_object
+
+
+def is_bool_indexer(key):
+ # type: (Any) -> bool
+ """
+ Check whether `key` is a valid boolean indexer.
+
+ Parameters
+ ----------
+ key : Any
+ Only list-likes may be considered boolean indexers.
+ All other types are not considered a boolean indexer.
+ For array-like input, boolean ndarrays or ExtensionArrays
+ with ``_is_boolean`` set are considered boolean indexers.
+
+ Returns
+ -------
+ bool
+
+ Raises
+ ------
+ ValueError
+ When the array is an object-dtype ndarray or ExtensionArray
+ and contains missing values.
+ """
+ na_msg = 'cannot index with vector containing NA / NaN values'
+ if (isinstance(key, (ABCSeries, np.ndarray, ABCIndex)) or
+ (is_array_like(key) and is_extension_array_dtype(key.dtype))):
+ if key.dtype == np.object_:
+ key = np.asarray(values_from_object(key))
+
+ if not lib.is_bool_array(key):
+ if isna(key).any():
+ raise ValueError(na_msg)
+ return False
+ return True
+ elif is_bool_dtype(key.dtype):
+ # an ndarray with bool-dtype by definition has no missing values.
+ # So we only need to check for NAs in ExtensionArrays
+ if is_extension_array_dtype(key.dtype):
+ if np.any(key.isna()):
+ raise ValueError(na_msg)
+ return True
+ elif isinstance(key, list):
+ try:
+ arr = np.asarray(key)
+ return arr.dtype == np.bool_ and len(arr) == len(key)
+ except TypeError: # pragma: no cover
+ return False
+
+ return False
+
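+ # Illustrative sketch (editorial comment, not part of the upstream module):
+ # is_bool_indexer(np.array([True, False]))   -> True
+ # is_bool_indexer([True, False])             -> True
+ # is_bool_indexer(np.array([1, 0]))          -> False
+ # is_bool_indexer(pd.Series([True, np.nan])) raises ValueError because the
+ # object-dtype key contains a missing value.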
+
+def cast_scalar_indexer(val):
+ """
+ To avoid numpy DeprecationWarnings, cast float to integer where valid.
+
+ Parameters
+ ----------
+ val : scalar
+
+ Returns
+ -------
+ outval : scalar
+ """
+ # assumes lib.is_scalar(val)
+ if lib.is_float(val) and val == int(val):
+ return int(val)
+ return val
+
+
+def _not_none(*args):
+ """Returns a generator consisting of the arguments that are not None"""
+ return (arg for arg in args if arg is not None)
+
+
+def _any_none(*args):
+ """Returns a boolean indicating if any argument is None"""
+ for arg in args:
+ if arg is None:
+ return True
+ return False
+
+
+def _all_none(*args):
+ """Returns a boolean indicating if all arguments are None"""
+ for arg in args:
+ if arg is not None:
+ return False
+ return True
+
+
+def _any_not_none(*args):
+ """Returns a boolean indicating if any argument is not None"""
+ for arg in args:
+ if arg is not None:
+ return True
+ return False
+
+
+def _all_not_none(*args):
+ """Returns a boolean indicating if all arguments are not None"""
+ for arg in args:
+ if arg is None:
+ return False
+ return True
+
+
+def count_not_none(*args):
+ """Returns the count of arguments that are not None"""
+ return sum(x is not None for x in args)
+
+
+def try_sort(iterable):
+ listed = list(iterable)
+ try:
+ return sorted(listed)
+ except Exception:
+ return listed
+
+
+def dict_keys_to_ordered_list(mapping):
+ # when pandas drops support for Python < 3.6, this function
+ # can be replaced by a simple list(mapping.keys())
+ if PY36 or isinstance(mapping, OrderedDict):
+ keys = list(mapping.keys())
+ else:
+ keys = try_sort(mapping)
+ return keys
+
+
+def asarray_tuplesafe(values, dtype=None):
+
+ if not (isinstance(values, (list, tuple)) or hasattr(values, '__array__')):
+ values = list(values)
+ elif isinstance(values, ABCIndexClass):
+ return values.values
+
+ if isinstance(values, list) and dtype in [np.object_, object]:
+ return construct_1d_object_array_from_listlike(values)
+
+ result = np.asarray(values, dtype=dtype)
+
+ if issubclass(result.dtype.type, compat.string_types):
+ result = np.asarray(values, dtype=object)
+
+ if result.ndim == 2:
+ # Avoid building an array of arrays:
+ # TODO: verify whether any path hits this except #18819 (invalid)
+ values = [tuple(x) for x in values]
+ result = construct_1d_object_array_from_listlike(values)
+
+ return result
+
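+ # Illustrative sketch (editorial comment, not part of the upstream module):
+ # asarray_tuplesafe([(1, 2), (3, 4)]) returns a 1-D object ndarray whose two
+ # elements are the tuples themselves, instead of the (2, 2) integer array
+ # that a plain np.asarray call would build.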
+
+def index_labels_to_array(labels, dtype=None):
+ """
+ Transform label or iterable of labels to array, for use in Index.
+
+ Parameters
+ ----------
+ dtype : dtype
+ If specified, use as dtype of the resulting array, otherwise infer.
+
+ Returns
+ -------
+ array
+ """
+ if isinstance(labels, (compat.string_types, tuple)):
+ labels = [labels]
+
+ if not isinstance(labels, (list, np.ndarray)):
+ try:
+ labels = list(labels)
+ except TypeError: # non-iterable
+ labels = [labels]
+
+ labels = asarray_tuplesafe(labels, dtype=dtype)
+
+ return labels
+
+
+def maybe_make_list(obj):
+ if obj is not None and not isinstance(obj, (tuple, list)):
+ return [obj]
+ return obj
+
+
+def is_null_slice(obj):
+ """ we have a null slice """
+ return (isinstance(obj, slice) and obj.start is None and
+ obj.stop is None and obj.step is None)
+
+
+def is_true_slices(l):
+ """
+ Find non-trivial slices in "l": return a list of booleans with same length.
+ """
+ return [isinstance(k, slice) and not is_null_slice(k) for k in l]
+
+
+# TODO: used only once in indexing; belongs elsewhere?
+def is_full_slice(obj, l):
+ """ we have a full length slice """
+ return (isinstance(obj, slice) and obj.start == 0 and obj.stop == l and
+ obj.step is None)
+
+
+def get_callable_name(obj):
+ # typical case has name
+ if hasattr(obj, '__name__'):
+ return getattr(obj, '__name__')
+ # some objects don't; could recurse
+ if isinstance(obj, partial):
+ return get_callable_name(obj.func)
+ # fall back to class name
+ if hasattr(obj, '__call__'):
+ return obj.__class__.__name__
+ # everything failed (probably because the argument
+ # wasn't actually callable); we return None
+ # instead of the empty string in this case to allow
+ # distinguishing between no name and a name of ''
+ return None
+
+
+def apply_if_callable(maybe_callable, obj, **kwargs):
+ """
+ Evaluate possibly callable input using obj and kwargs if it is callable,
+ otherwise return as it is
+
+ Parameters
+ ----------
+ maybe_callable : possibly a callable
+ obj : NDFrame
+ **kwargs
+ """
+
+ if callable(maybe_callable):
+ return maybe_callable(obj, **kwargs)
+
+ return maybe_callable
+
+
+def dict_compat(d):
+ """
+ Helper function to convert datetimelike-keyed dicts to Timestamp-keyed dict
+
+ Parameters
+ ----------
+ d : dict-like object
+
+ Returns
+ -------
+ dict
+
+ """
+ return {maybe_box_datetimelike(key): value for key, value in iteritems(d)}
+
+
+def standardize_mapping(into):
+ """
+ Helper function to standardize a supplied mapping.
+
+ .. versionadded:: 0.21.0
+
+ Parameters
+ ----------
+ into : instance or subclass of collections.Mapping
+ Must be a class, an initialized collections.defaultdict,
+ or an instance of a collections.Mapping subclass.
+
+ Returns
+ -------
+ mapping : a collections.Mapping subclass or other constructor
+ a callable object that can accept an iterator to create
+ the desired Mapping.
+
+ See Also
+ --------
+ DataFrame.to_dict
+ Series.to_dict
+ """
+ if not inspect.isclass(into):
+ if isinstance(into, collections.defaultdict):
+ return partial(
+ collections.defaultdict, into.default_factory)
+ into = type(into)
+ if not issubclass(into, compat.Mapping):
+ raise TypeError('unsupported type: {into}'.format(into=into))
+ elif into == collections.defaultdict:
+ raise TypeError(
+ 'to_dict() only accepts initialized defaultdicts')
+ return into
+
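+ # Illustrative sketch (editorial comment, not part of the upstream module):
+ # standardize_mapping(dict) returns the dict class unchanged, while an
+ # initialized collections.defaultdict(list) comes back as
+ # functools.partial(collections.defaultdict, list), preserving the default
+ # factory for the mapping that to_dict() will build.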
+
+def sentinel_factory():
+ class Sentinel(object):
+ pass
+
+ return Sentinel()
+
+
+def random_state(state=None):
+ """
+ Helper function for processing random_state arguments.
+
+ Parameters
+ ----------
+ state : int, np.random.RandomState, None.
+ If receives an int, passes to np.random.RandomState() as seed.
+ If receives an np.random.RandomState object, just returns object.
+ If receives `None`, returns np.random.
+ If receives anything else, raises an informative ValueError.
+ Default None.
+
+ Returns
+ -------
+ np.random.RandomState
+ """
+
+ if is_integer(state):
+ return np.random.RandomState(state)
+ elif isinstance(state, np.random.RandomState):
+ return state
+ elif state is None:
+ return np.random
+ else:
+ raise ValueError("random_state must be an integer, a numpy "
+ "RandomState, or None")
+
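+ # Illustrative sketch (editorial comment, not part of the upstream module):
+ # random_state(42)   -> np.random.RandomState seeded with 42
+ # random_state(None) -> the global np.random module
+ # random_state('x')  -> raises ValueError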
+
+def _pipe(obj, func, *args, **kwargs):
+ """
+ Apply a function ``func`` to object ``obj`` either by passing obj as the
+ first argument to the function or, in the case that the func is a tuple,
+ interpret the first element of the tuple as a function and pass the obj to
+ that function as a keyword argument whose key is the value of the second
+ element of the tuple.
+
+ Parameters
+ ----------
+ func : callable or tuple of (callable, string)
+ Function to apply to this object or, alternatively, a
+ ``(callable, data_keyword)`` tuple where ``data_keyword`` is a
+ string indicating the keyword of ``callable`` that expects the
+ object.
+ args : iterable, optional
+ positional arguments passed into ``func``.
+ kwargs : dict, optional
+ a dictionary of keyword arguments passed into ``func``.
+
+ Returns
+ -------
+ object : the return type of ``func``.
+ """
+ if isinstance(func, tuple):
+ func, target = func
+ if target in kwargs:
+ msg = '%s is both the pipe target and a keyword argument' % target
+ raise ValueError(msg)
+ kwargs[target] = obj
+ return func(*args, **kwargs)
+ else:
+ return func(obj, *args, **kwargs)
+
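+ # Illustrative sketch (editorial comment, not part of the upstream module;
+ # ``plot_func`` is just a hypothetical callable): _pipe(df, (plot_func,
+ # 'data'), x=1) calls plot_func(x=1, data=df), whereas _pipe(df, plot_func,
+ # 1) calls plot_func(df, 1); naming 'data' both as the pipe target and as a
+ # keyword argument raises ValueError.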
+
+def _get_rename_function(mapper):
+ """
+ Returns a function that will map names/labels, depending on whether
+ mapper is a dict, Series or just a function.
+ """
+ if isinstance(mapper, (compat.Mapping, ABCSeries)):
+
+ def f(x):
+ if x in mapper:
+ return mapper[x]
+ else:
+ return x
+ else:
+ f = mapper
+
+ return f
diff --git a/contrib/python/pandas/py2/pandas/core/computation/__init__.py b/contrib/python/pandas/py2/pandas/core/computation/__init__.py
new file mode 100644
index 00000000000..e69de29bb2d
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/core/computation/__init__.py
diff --git a/contrib/python/pandas/py2/pandas/core/computation/align.py b/contrib/python/pandas/py2/pandas/core/computation/align.py
new file mode 100644
index 00000000000..95117464809
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/core/computation/align.py
@@ -0,0 +1,179 @@
+"""Core eval alignment algorithms
+"""
+
+from functools import partial, wraps
+import warnings
+
+import numpy as np
+
+from pandas.compat import range, zip
+from pandas.errors import PerformanceWarning
+
+import pandas as pd
+from pandas import compat
+import pandas.core.common as com
+from pandas.core.computation.common import _result_type_many
+
+
+def _align_core_single_unary_op(term):
+ if isinstance(term.value, np.ndarray):
+ typ = partial(np.asanyarray, dtype=term.value.dtype)
+ else:
+ typ = type(term.value)
+ ret = typ,
+
+ if not hasattr(term.value, 'axes'):
+ ret += None,
+ else:
+ ret += _zip_axes_from_type(typ, term.value.axes),
+ return ret
+
+
+def _zip_axes_from_type(typ, new_axes):
+ axes = {ax_name: new_axes[ax_ind]
+ for ax_ind, ax_name in compat.iteritems(typ._AXIS_NAMES)}
+ return axes
+
+
+def _any_pandas_objects(terms):
+ """Check a sequence of terms for instances of PandasObject."""
+ return any(isinstance(term.value, pd.core.generic.PandasObject)
+ for term in terms)
+
+
+def _filter_special_cases(f):
+ @wraps(f)
+ def wrapper(terms):
+ # single unary operand
+ if len(terms) == 1:
+ return _align_core_single_unary_op(terms[0])
+
+ term_values = (term.value for term in terms)
+
+ # we don't have any pandas objects
+ if not _any_pandas_objects(terms):
+ return _result_type_many(*term_values), None
+
+ return f(terms)
+ return wrapper
+
+
+@_filter_special_cases
+def _align_core(terms):
+ term_index = [i for i, term in enumerate(terms)
+ if hasattr(term.value, 'axes')]
+ term_dims = [terms[i].value.ndim for i in term_index]
+ ndims = pd.Series(dict(zip(term_index, term_dims)))
+
+ # initial axes are the axes of the largest-axis'd term
+ biggest = terms[ndims.idxmax()].value
+ typ = biggest._constructor
+ axes = biggest.axes
+ naxes = len(axes)
+ gt_than_one_axis = naxes > 1
+
+ for value in (terms[i].value for i in term_index):
+ is_series = isinstance(value, pd.Series)
+ is_series_and_gt_one_axis = is_series and gt_than_one_axis
+
+ for axis, items in enumerate(value.axes):
+ if is_series_and_gt_one_axis:
+ ax, itm = naxes - 1, value.index
+ else:
+ ax, itm = axis, items
+
+ if not axes[ax].is_(itm):
+ axes[ax] = axes[ax].join(itm, how='outer')
+
+ for i, ndim in compat.iteritems(ndims):
+ for axis, items in zip(range(ndim), axes):
+ ti = terms[i].value
+
+ if hasattr(ti, 'reindex'):
+ transpose = isinstance(ti, pd.Series) and naxes > 1
+ reindexer = axes[naxes - 1] if transpose else items
+
+ term_axis_size = len(ti.axes[axis])
+ reindexer_size = len(reindexer)
+
+ ordm = np.log10(max(1, abs(reindexer_size - term_axis_size)))
+ if ordm >= 1 and reindexer_size >= 10000:
+ w = ('Alignment difference on axis {axis} is larger '
+ 'than an order of magnitude on term {term!r}, by '
+ 'more than {ordm:.4g}; performance may suffer'
+ ).format(axis=axis, term=terms[i].name, ordm=ordm)
+ warnings.warn(w, category=PerformanceWarning, stacklevel=6)
+
+ f = partial(ti.reindex, reindexer, axis=axis, copy=False)
+
+ terms[i].update(f())
+
+ terms[i].update(terms[i].value.values)
+
+ return typ, _zip_axes_from_type(typ, axes)
+
+
+def _align(terms):
+ """Align a set of terms"""
+ try:
+ # flatten the parse tree (a nested list, really)
+ terms = list(com.flatten(terms))
+ except TypeError:
+ # can't iterate so it must just be a constant or single variable
+ if isinstance(terms.value, pd.core.generic.NDFrame):
+ typ = type(terms.value)
+ return typ, _zip_axes_from_type(typ, terms.value.axes)
+ return np.result_type(terms.type), None
+
+ # if all resolved variables are numeric scalars
+ if all(term.is_scalar for term in terms):
+ return _result_type_many(*(term.value for term in terms)).type, None
+
+ # perform the main alignment
+ typ, axes = _align_core(terms)
+ return typ, axes
+
+
+def _reconstruct_object(typ, obj, axes, dtype):
+ """Reconstruct an object given its type, raw value, and possibly empty
+ (None) axes.
+
+ Parameters
+ ----------
+ typ : object
+ A type
+ obj : object
+ The value to use in the type constructor
+ axes : dict
+ The axes to use to construct the resulting pandas object
+
+ Returns
+ -------
+ ret : typ
+ An object of type ``typ`` with the value `obj` and possible axes
+ `axes`.
+ """
+ try:
+ typ = typ.type
+ except AttributeError:
+ pass
+
+ res_t = np.result_type(obj.dtype, dtype)
+
+ if (not isinstance(typ, partial) and
+ issubclass(typ, pd.core.generic.PandasObject)):
+ return typ(obj, dtype=res_t, **axes)
+
+ # special case for pathological things like ~True/~False
+ if hasattr(res_t, 'type') and typ == np.bool_ and res_t != np.bool_:
+ ret_value = res_t.type(obj)
+ else:
+ ret_value = typ(obj).astype(res_t)
+ # The condition is to distinguish 0-dim array (returned in case of
+ # scalar) and 1 element array
+ # e.g. np.array(0) and np.array([0])
+ if len(obj.shape) == 1 and len(obj) == 1:
+ if not isinstance(ret_value, np.ndarray):
+ ret_value = np.array([ret_value]).astype(res_t)
+
+ return ret_value
diff --git a/contrib/python/pandas/py2/pandas/core/computation/api.py b/contrib/python/pandas/py2/pandas/core/computation/api.py
new file mode 100644
index 00000000000..31e8a4873b0
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/core/computation/api.py
@@ -0,0 +1,3 @@
+# flake8: noqa
+
+from pandas.core.computation.eval import eval
diff --git a/contrib/python/pandas/py2/pandas/core/computation/check.py b/contrib/python/pandas/py2/pandas/core/computation/check.py
new file mode 100644
index 00000000000..da89bde56fe
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/core/computation/check.py
@@ -0,0 +1,24 @@
+from distutils.version import LooseVersion
+import warnings
+
+_NUMEXPR_INSTALLED = False
+_MIN_NUMEXPR_VERSION = "2.6.1"
+_NUMEXPR_VERSION = None
+
+try:
+ import numexpr as ne
+ ver = LooseVersion(ne.__version__)
+ _NUMEXPR_INSTALLED = ver >= LooseVersion(_MIN_NUMEXPR_VERSION)
+ _NUMEXPR_VERSION = ver
+
+ if not _NUMEXPR_INSTALLED:
+ warnings.warn(
+ "The installed version of numexpr {ver} is not supported "
+ "in pandas and will be not be used\nThe minimum supported "
+ "version is {min_ver}\n".format(
+ ver=ver, min_ver=_MIN_NUMEXPR_VERSION), UserWarning)
+
+except ImportError: # pragma: no cover
+ pass
+
+__all__ = ['_NUMEXPR_INSTALLED', '_NUMEXPR_VERSION']
diff --git a/contrib/python/pandas/py2/pandas/core/computation/common.py b/contrib/python/pandas/py2/pandas/core/computation/common.py
new file mode 100644
index 00000000000..e7eca04e413
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/core/computation/common.py
@@ -0,0 +1,26 @@
+import numpy as np
+
+from pandas.compat import reduce
+
+import pandas as pd
+
+
+def _ensure_decoded(s):
+ """ if we have bytes, decode them to unicode """
+ if isinstance(s, (np.bytes_, bytes)):
+ s = s.decode(pd.get_option('display.encoding'))
+ return s
+
+
+def _result_type_many(*arrays_and_dtypes):
+ """ wrapper around numpy.result_type which overcomes the NPY_MAXARGS (32)
+ argument limit """
+ try:
+ return np.result_type(*arrays_and_dtypes)
+ except ValueError:
+ # we have > NPY_MAXARGS terms in our expression
+ return reduce(np.result_type, arrays_and_dtypes)
+
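+ # Illustrative sketch (editorial comment, not part of the upstream module):
+ # np.result_type accepts at most NPY_MAXARGS (32) operands, so calling
+ # _result_type_many with, say, 40 float32 arrays takes the reduce() fallback
+ # and still returns dtype('float32').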
+
+class NameResolutionError(NameError):
+ pass
diff --git a/contrib/python/pandas/py2/pandas/core/computation/engines.py b/contrib/python/pandas/py2/pandas/core/computation/engines.py
new file mode 100644
index 00000000000..bccd37131c8
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/core/computation/engines.py
@@ -0,0 +1,151 @@
+"""
+Engine classes for :func:`~pandas.eval`
+"""
+
+import abc
+
+from pandas.compat import map
+
+from pandas import compat
+from pandas.core.computation.align import _align, _reconstruct_object
+from pandas.core.computation.ops import (
+ UndefinedVariableError, _mathops, _reductions)
+
+import pandas.io.formats.printing as printing
+
+_ne_builtins = frozenset(_mathops + _reductions)
+
+
+class NumExprClobberingError(NameError):
+ pass
+
+
+def _check_ne_builtin_clash(expr):
+ """Attempt to prevent foot-shooting in a helpful way.
+
+ Parameters
+ ----------
+ expr : Expr
+ The expression whose resolved names are checked against the
+ numexpr built-ins.
+ """
+ names = expr.names
+ overlap = names & _ne_builtins
+
+ if overlap:
+ s = ', '.join(map(repr, overlap))
+ raise NumExprClobberingError('Variables in expression "{expr}" '
+ 'overlap with builtins: ({s})'
+ .format(expr=expr, s=s))
+
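+ # Illustrative sketch (editorial comment, not part of the upstream module):
+ # an expression whose resolved names include 'sin' or 'sum' overlaps with
+ # _ne_builtins and is rejected with NumExprClobberingError before it ever
+ # reaches numexpr.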
+
+class AbstractEngine(object):
+
+ """Object serving as a base class for all engines."""
+
+ __metaclass__ = abc.ABCMeta
+
+ has_neg_frac = False
+
+ def __init__(self, expr):
+ self.expr = expr
+ self.aligned_axes = None
+ self.result_type = None
+
+ def convert(self):
+ """Convert an expression for evaluation.
+
+ Defaults to return the expression as a string.
+ """
+ return printing.pprint_thing(self.expr)
+
+ def evaluate(self):
+ """Run the engine on the expression
+
+ This method performs alignment which is necessary no matter what engine
+ is being used, thus its implementation is in the base class.
+
+ Returns
+ -------
+ obj : object
+ The result of the passed expression.
+ """
+ if not self._is_aligned:
+ self.result_type, self.aligned_axes = _align(self.expr.terms)
+
+ # make sure no names in resolvers and locals/globals clash
+ res = self._evaluate()
+ return _reconstruct_object(self.result_type, res, self.aligned_axes,
+ self.expr.terms.return_type)
+
+ @property
+ def _is_aligned(self):
+ return self.aligned_axes is not None and self.result_type is not None
+
+ @abc.abstractmethod
+ def _evaluate(self):
+ """Return an evaluated expression.
+
+ Parameters
+ ----------
+ env : Scope
+ The local and global environment in which to evaluate an
+ expression.
+
+ Notes
+ -----
+ Must be implemented by subclasses.
+ """
+ pass
+
+
+class NumExprEngine(AbstractEngine):
+
+ """NumExpr engine class"""
+ has_neg_frac = True
+
+ def __init__(self, expr):
+ super(NumExprEngine, self).__init__(expr)
+
+ def convert(self):
+ return str(super(NumExprEngine, self).convert())
+
+ def _evaluate(self):
+ import numexpr as ne
+
+ # convert the expression to a valid numexpr expression
+ s = self.convert()
+
+ try:
+ env = self.expr.env
+ scope = env.full_scope
+ truediv = scope['truediv']
+ _check_ne_builtin_clash(self.expr)
+ return ne.evaluate(s, local_dict=scope, truediv=truediv)
+ except KeyError as e:
+ # python 3 compat kludge
+ try:
+ msg = e.message
+ except AttributeError:
+ msg = compat.text_type(e)
+ raise UndefinedVariableError(msg)
+
+
+class PythonEngine(AbstractEngine):
+
+ """Evaluate an expression in Python space.
+
+ Mostly for testing purposes.
+ """
+ has_neg_frac = False
+
+ def __init__(self, expr):
+ super(PythonEngine, self).__init__(expr)
+
+ def evaluate(self):
+ return self.expr()
+
+ def _evaluate(self):
+ pass
+
+
+_engines = {'numexpr': NumExprEngine, 'python': PythonEngine}
diff --git a/contrib/python/pandas/py2/pandas/core/computation/eval.py b/contrib/python/pandas/py2/pandas/core/computation/eval.py
new file mode 100644
index 00000000000..b768ed6df30
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/core/computation/eval.py
@@ -0,0 +1,351 @@
+#!/usr/bin/env python
+
+"""Top level ``eval`` module.
+"""
+
+import tokenize
+import warnings
+
+from pandas.compat import string_types
+from pandas.util._validators import validate_bool_kwarg
+
+from pandas.core.computation.engines import _engines
+from pandas.core.computation.scope import _ensure_scope
+
+from pandas.io.formats.printing import pprint_thing
+
+
+def _check_engine(engine):
+ """Make sure a valid engine is passed.
+
+ Parameters
+ ----------
+ engine : str
+
+ Raises
+ ------
+ KeyError
+ * If an invalid engine is passed
+ ImportError
+ * If numexpr was requested but doesn't exist
+
+ Returns
+ -------
+ string engine
+
+ """
+ from pandas.core.computation.check import _NUMEXPR_INSTALLED
+
+ if engine is None:
+ if _NUMEXPR_INSTALLED:
+ engine = 'numexpr'
+ else:
+ engine = 'python'
+
+ if engine not in _engines:
+ valid = list(_engines.keys())
+ raise KeyError('Invalid engine {engine!r} passed, valid engines are'
+ ' {valid}'.format(engine=engine, valid=valid))
+
+ # TODO: validate this in a more general way (thinking of future engines
+ # that won't necessarily be import-able)
+ # Could potentially be done on engine instantiation
+ if engine == 'numexpr':
+ if not _NUMEXPR_INSTALLED:
+ raise ImportError("'numexpr' is not installed or an "
+ "unsupported version. Cannot use "
+ "engine='numexpr' for query/eval "
+ "if 'numexpr' is not installed")
+
+ return engine
+
+
+def _check_parser(parser):
+ """Make sure a valid parser is passed.
+
+ Parameters
+ ----------
+ parser : str
+
+ Raises
+ ------
+ KeyError
+ * If an invalid parser is passed
+ """
+ from pandas.core.computation.expr import _parsers
+
+ if parser not in _parsers:
+ raise KeyError('Invalid parser {parser!r} passed, valid parsers are'
+ ' {valid}'.format(parser=parser, valid=_parsers.keys()))
+
+
+def _check_resolvers(resolvers):
+ if resolvers is not None:
+ for resolver in resolvers:
+ if not hasattr(resolver, '__getitem__'):
+ name = type(resolver).__name__
+ raise TypeError('Resolver of type {name!r} does not implement '
+ 'the __getitem__ method'.format(name=name))
+
+
+def _check_expression(expr):
+ """Make sure an expression is not an empty string
+
+ Parameters
+ ----------
+ expr : object
+ An object that can be converted to a string
+
+ Raises
+ ------
+ ValueError
+ * If expr is an empty string
+ """
+ if not expr:
+ raise ValueError("expr cannot be an empty string")
+
+
+def _convert_expression(expr):
+ """Convert an object to an expression.
+
+ This function converts an object to an expression (a unicode string) and
+ checks to make sure it isn't empty after conversion. This is used to
+ convert operators to their string representation for recursive calls to
+ :func:`~pandas.eval`.
+
+ Parameters
+ ----------
+ expr : object
+ The object to be converted to a string.
+
+ Returns
+ -------
+ s : unicode
+ The string representation of an object.
+
+ Raises
+ ------
+ ValueError
+ * If the expression is empty.
+ """
+ s = pprint_thing(expr)
+ _check_expression(s)
+ return s
+
+
+def _check_for_locals(expr, stack_level, parser):
+ from pandas.core.computation.expr import tokenize_string
+
+ at_top_of_stack = stack_level == 0
+ not_pandas_parser = parser != 'pandas'
+
+ if not_pandas_parser:
+ msg = "The '@' prefix is only supported by the pandas parser"
+ elif at_top_of_stack:
+ msg = ("The '@' prefix is not allowed in "
+ "top-level eval calls, \nplease refer to "
+ "your variables by name without the '@' "
+ "prefix")
+
+ if at_top_of_stack or not_pandas_parser:
+ for toknum, tokval in tokenize_string(expr):
+ if toknum == tokenize.OP and tokval == '@':
+ raise SyntaxError(msg)
+
+
+def eval(expr, parser='pandas', engine=None, truediv=True,
+ local_dict=None, global_dict=None, resolvers=(), level=0,
+ target=None, inplace=False):
+ """Evaluate a Python expression as a string using various backends.
+
+ The following arithmetic operations are supported: ``+``, ``-``, ``*``,
+ ``/``, ``**``, ``%``, ``//`` (python engine only) along with the following
+ boolean operations: ``|`` (or), ``&`` (and), and ``~`` (not).
+ Additionally, the ``'pandas'`` parser allows the use of :keyword:`and`,
+ :keyword:`or`, and :keyword:`not` with the same semantics as the
+ corresponding bitwise operators. :class:`~pandas.Series` and
+ :class:`~pandas.DataFrame` objects are supported and behave as they would
+ with plain ol' Python evaluation.
+
+ Parameters
+ ----------
+ expr : str or unicode
+ The expression to evaluate. This string cannot contain any Python
+ `statements
+ <https://docs.python.org/3/reference/simple_stmts.html#simple-statements>`__,
+ only Python `expressions
+ <https://docs.python.org/3/reference/simple_stmts.html#expression-statements>`__.
+ parser : string, default 'pandas', {'pandas', 'python'}
+ The parser to use to construct the syntax tree from the expression. The
+ default of ``'pandas'`` parses code slightly different than standard
+ Python. Alternatively, you can parse an expression using the
+ ``'python'`` parser to retain strict Python semantics. See the
+ :ref:`enhancing performance <enhancingperf.eval>` documentation for
+ more details.
+ engine : string or None, default 'numexpr', {'python', 'numexpr'}
+
+ The engine used to evaluate the expression. Supported engines are
+
+ - None : tries to use ``numexpr``, falls back to ``python``
+ - ``'numexpr'``: This default engine evaluates pandas objects using
+ numexpr for large speed ups in complex expressions
+ with large frames.
+ - ``'python'``: Performs operations as if you had ``eval``'d in top
+ level python. This engine is generally not that useful.
+
+ More backends may be available in the future.
+
+ truediv : bool, optional
+ Whether to use true division, like in Python >= 3
+ local_dict : dict or None, optional
+ A dictionary of local variables, taken from locals() by default.
+ global_dict : dict or None, optional
+ A dictionary of global variables, taken from globals() by default.
+ resolvers : list of dict-like or None, optional
+ A list of objects implementing the ``__getitem__`` special method that
+ you can use to inject an additional collection of namespaces to use for
+ variable lookup. For example, this is used in the
+ :meth:`~pandas.DataFrame.query` method to inject the
+ ``DataFrame.index`` and ``DataFrame.columns``
+ variables that refer to their respective :class:`~pandas.DataFrame`
+ instance attributes.
+ level : int, optional
+ The number of prior stack frames to traverse and add to the current
+ scope. Most users will **not** need to change this parameter.
+ target : object, optional, default None
+ This is the target object for assignment. It is used when there is
+ variable assignment in the expression. If so, then `target` must
+ support item assignment with string keys, and if a copy is being
+ returned, it must also support `.copy()`.
+ inplace : bool, default False
+ If `target` is provided, and the expression mutates `target`, whether
+ to modify `target` inplace. Otherwise, return a copy of `target` with
+ the mutation.
+
+ Returns
+ -------
+ ndarray, numeric scalar, DataFrame, Series
+
+ Raises
+ ------
+ ValueError
+ There are many instances where such an error can be raised:
+
+ - `target=None`, but the expression is multiline.
+ - The expression is multiline, but not all of them have item assignment.
+ An example of such an arrangement is this:
+
+ a = b + 1
+ a + 2
+
+ Here, there are expressions on different lines, making it multiline,
+ but the last line has no variable assigned to the output of `a + 2`.
+ - `inplace=True`, but the expression is missing item assignment.
+ - Item assignment is provided, but the `target` does not support
+ string item assignment.
+ - Item assignment is provided and `inplace=False`, but the `target`
+ does not support the `.copy()` method
+
+ See Also
+ --------
+ pandas.DataFrame.query
+ pandas.DataFrame.eval
+
+ Notes
+ -----
+ The ``dtype`` of any objects involved in an arithmetic ``%`` operation is
+ recursively cast to ``float64``.
+
+ See the :ref:`enhancing performance <enhancingperf.eval>` documentation for
+ more details.
+ """
+ from pandas.core.computation.expr import Expr
+
+ inplace = validate_bool_kwarg(inplace, "inplace")
+
+ if isinstance(expr, string_types):
+ _check_expression(expr)
+ exprs = [e.strip() for e in expr.splitlines() if e.strip() != '']
+ else:
+ exprs = [expr]
+ multi_line = len(exprs) > 1
+
+ if multi_line and target is None:
+ raise ValueError("multi-line expressions are only valid in the "
+ "context of data, use DataFrame.eval")
+
+ ret = None
+ first_expr = True
+ target_modified = False
+
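+ # evaluate each (possibly assigning) expression in turn; for multi-line
+ # input every expression must assign, and each intermediate result is
+ # exposed to later lines through an additional resolver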
+ for expr in exprs:
+ expr = _convert_expression(expr)
+ engine = _check_engine(engine)
+ _check_parser(parser)
+ _check_resolvers(resolvers)
+ _check_for_locals(expr, level, parser)
+
+ # get our (possibly passed-in) scope
+ env = _ensure_scope(level + 1, global_dict=global_dict,
+ local_dict=local_dict, resolvers=resolvers,
+ target=target)
+
+ parsed_expr = Expr(expr, engine=engine, parser=parser, env=env,
+ truediv=truediv)
+
+ # construct the engine and evaluate the parsed expression
+ eng = _engines[engine]
+ eng_inst = eng(parsed_expr)
+ ret = eng_inst.evaluate()
+
+ if parsed_expr.assigner is None:
+ if multi_line:
+ raise ValueError("Multi-line expressions are only valid"
+ " if all expressions contain an assignment")
+ elif inplace:
+ raise ValueError("Cannot operate inplace "
+ "if there is no assignment")
+
+ # assign if needed
+ assigner = parsed_expr.assigner
+ if env.target is not None and assigner is not None:
+ target_modified = True
+
+ # if returning a copy, copy only on the first assignment
+ if not inplace and first_expr:
+ try:
+ target = env.target.copy()
+ except AttributeError:
+ raise ValueError("Cannot return a copy of the target")
+ else:
+ target = env.target
+
+ # TypeError is most commonly raised (e.g. int, list), but you
+ # get IndexError if you try to do this assignment on np.ndarray.
+ # we will ignore numpy warnings here; e.g. if trying
+ # to use a non-numeric indexer
+ try:
+ with warnings.catch_warnings(record=True):
+ # TODO: Filter the warnings we actually care about here.
+ target[assigner] = ret
+ except (TypeError, IndexError):
+ raise ValueError("Cannot assign expression output to target")
+
+ if not resolvers:
+ resolvers = ({assigner: ret},)
+ else:
+ # an existing resolver needs to be updated to handle the
+ # case of mutating an existing column in the copy
+ for resolver in resolvers:
+ if assigner in resolver:
+ resolver[assigner] = ret
+ break
+ else:
+ resolvers += ({assigner: ret},)
+
+ ret = None
+ first_expr = False
+
+ # We want to exclude `inplace=None` as being False.
+ if inplace is False:
+ return target if target_modified else ret
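+
+
+# Illustrative usage sketch (not part of the upstream pandas source):
+#
+#   import pandas as pd
+#   df = pd.DataFrame({'A': [1, 2], 'B': [10, 20]})
+#   pd.eval('df.A + df.B')                 # elementwise sum of the two columns
+#   pd.eval('C = df.A + df.B', target=df)  # copy of df with a new column 'C'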
diff --git a/contrib/python/pandas/py2/pandas/core/computation/expr.py b/contrib/python/pandas/py2/pandas/core/computation/expr.py
new file mode 100644
index 00000000000..d840bf6ae71
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/core/computation/expr.py
@@ -0,0 +1,776 @@
+""":func:`~pandas.eval` parsers
+"""
+
+import ast
+from functools import partial
+import tokenize
+
+import numpy as np
+
+from pandas.compat import StringIO, lmap, reduce, string_types, zip
+
+import pandas as pd
+from pandas import compat
+from pandas.core import common as com
+from pandas.core.base import StringMixin
+from pandas.core.computation.ops import (
+ _LOCAL_TAG, BinOp, Constant, Div, FuncNode, Op, Term, UnaryOp,
+ UndefinedVariableError, _arith_ops_syms, _bool_ops_syms, _cmp_ops_syms,
+ _mathops, _reductions, _unary_ops_syms, is_term)
+from pandas.core.computation.scope import Scope
+
+import pandas.io.formats.printing as printing
+
+
+def tokenize_string(source):
+ """Tokenize a Python source code string.
+
+ Parameters
+ ----------
+ source : str
+ A Python source code string
+ """
+ line_reader = StringIO(source).readline
+ for toknum, tokval, _, _, _ in tokenize.generate_tokens(line_reader):
+ yield toknum, tokval
+
+
+def _rewrite_assign(tok):
+ """Rewrite the assignment operator for PyTables expressions that use ``=``
+ as a substitute for ``==``.
+
+ Parameters
+ ----------
+ tok : tuple of int, str
+ ints correspond to the all caps constants in the tokenize module
+
+ Returns
+ -------
+ t : tuple of int, str
+ Either the input token or the replacement values
+ """
+ toknum, tokval = tok
+ return toknum, '==' if tokval == '=' else tokval
+
+
+def _replace_booleans(tok):
+ """Replace ``&`` with ``and`` and ``|`` with ``or`` so that bitwise
+ precedence is changed to boolean precedence.
+
+ Parameters
+ ----------
+ tok : tuple of int, str
+ ints correspond to the all caps constants in the tokenize module
+
+ Returns
+ -------
+ t : tuple of int, str
+ Either the input token or the replacement values
+ """
+ toknum, tokval = tok
+ if toknum == tokenize.OP:
+ if tokval == '&':
+ return tokenize.NAME, 'and'
+ elif tokval == '|':
+ return tokenize.NAME, 'or'
+ return toknum, tokval
+ return toknum, tokval
+
+
+def _replace_locals(tok):
+ """Replace local variables with a syntactically valid name.
+
+ Parameters
+ ----------
+ tok : tuple of int, str
+ ints correspond to the all caps constants in the tokenize module
+
+ Returns
+ -------
+ t : tuple of int, str
+ Either the input token or the replacement values
+
+ Notes
+ -----
+ This is somewhat of a hack in that we rewrite a string such as ``'@a'`` as
+ ``'__pd_eval_local_a'`` by telling the tokenizer that ``__pd_eval_local_``
+ is a ``tokenize.OP`` and to replace the ``'@'`` symbol with it.
+ """
+ toknum, tokval = tok
+ if toknum == tokenize.OP and tokval == '@':
+ return tokenize.OP, _LOCAL_TAG
+ return toknum, tokval
+
+
+def _compose2(f, g):
+ """Compose 2 callables"""
+ return lambda *args, **kwargs: f(g(*args, **kwargs))
+
+
+def _compose(*funcs):
+ """Compose 2 or more callables"""
+ assert len(funcs) > 1, 'At least 2 callables must be passed to compose'
+ return reduce(_compose2, funcs)
+
+
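+# note: functools.reduce builds the composition so that the default ``f`` runs
+# each token through _rewrite_assign first, then _replace_booleans, then
+# _replace_locals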
+def _preparse(source, f=_compose(_replace_locals, _replace_booleans,
+ _rewrite_assign)):
+ """Compose a collection of tokenization functions
+
+ Parameters
+ ----------
+ source : str
+ A Python source code string
+ f : callable
+ This takes a tuple of (toknum, tokval) as its argument and returns a
+ tuple with the same structure but possibly different elements. Defaults
+ to the composition of ``_rewrite_assign``, ``_replace_booleans``, and
+ ``_replace_locals``.
+
+ Returns
+ -------
+ s : str
+ Valid Python source code
+
+ Notes
+ -----
+ The `f` parameter can be any callable that takes *and* returns input of the
+ form ``(toknum, tokval)``, where ``toknum`` is one of the constants from
+ the ``tokenize`` module and ``tokval`` is a string.
+ """
+ assert callable(f), 'f must be callable'
+ return tokenize.untokenize(lmap(f, tokenize_string(source)))
+
+
+def _is_type(t):
+ """Factory for a type checking function of type ``t`` or tuple of types."""
+ return lambda x: isinstance(x.value, t)
+
+
+_is_list = _is_type(list)
+_is_str = _is_type(string_types)
+
+
+# partition all AST nodes
+_all_nodes = frozenset(filter(lambda x: isinstance(x, type) and
+ issubclass(x, ast.AST),
+ (getattr(ast, node) for node in dir(ast))))
+
+
+def _filter_nodes(superclass, all_nodes=_all_nodes):
+ """Filter out AST nodes that are subclasses of ``superclass``."""
+ node_names = (node.__name__ for node in all_nodes
+ if issubclass(node, superclass))
+ return frozenset(node_names)
+
+
+_all_node_names = frozenset(map(lambda x: x.__name__, _all_nodes))
+_mod_nodes = _filter_nodes(ast.mod)
+_stmt_nodes = _filter_nodes(ast.stmt)
+_expr_nodes = _filter_nodes(ast.expr)
+_expr_context_nodes = _filter_nodes(ast.expr_context)
+_slice_nodes = _filter_nodes(ast.slice)
+_boolop_nodes = _filter_nodes(ast.boolop)
+_operator_nodes = _filter_nodes(ast.operator)
+_unary_op_nodes = _filter_nodes(ast.unaryop)
+_cmp_op_nodes = _filter_nodes(ast.cmpop)
+_comprehension_nodes = _filter_nodes(ast.comprehension)
+_handler_nodes = _filter_nodes(ast.excepthandler)
+_arguments_nodes = _filter_nodes(ast.arguments)
+_keyword_nodes = _filter_nodes(ast.keyword)
+_alias_nodes = _filter_nodes(ast.alias)
+
+
+# nodes that we don't support directly but are needed for parsing
+_hacked_nodes = frozenset(['Assign', 'Module', 'Expr'])
+
+
+_unsupported_expr_nodes = frozenset(['Yield', 'GeneratorExp', 'IfExp',
+ 'DictComp', 'SetComp', 'Repr', 'Lambda',
+ 'Set', 'AST', 'Is', 'IsNot'])
+
+# these nodes are low priority or won't ever be supported (e.g., AST)
+_unsupported_nodes = ((_stmt_nodes | _mod_nodes | _handler_nodes |
+ _arguments_nodes | _keyword_nodes | _alias_nodes |
+ _expr_context_nodes | _unsupported_expr_nodes) -
+ _hacked_nodes)
+
+# we're rewriting assignment to be an equality comparison in some cases, and
+# we don't want `stmt` and friends in there, so keep only the node classes
+# whose names are capitalized
+_base_supported_nodes = (_all_node_names - _unsupported_nodes) | _hacked_nodes
+_msg = 'cannot both support and not support {intersection}'.format(
+ intersection=_unsupported_nodes & _base_supported_nodes)
+assert not _unsupported_nodes & _base_supported_nodes, _msg
+
+
+def _node_not_implemented(node_name, cls):
+ """Return a function that raises a NotImplementedError with a passed node
+ name.
+ """
+
+ def f(self, *args, **kwargs):
+ raise NotImplementedError("{name!r} nodes are not "
+ "implemented".format(name=node_name))
+ return f
+
+
+def disallow(nodes):
+ """Decorator to disallow certain nodes from parsing. Raises a
+ NotImplementedError instead.
+
+ Returns
+ -------
+ disallowed : callable
+ """
+ def disallowed(cls):
+ cls.unsupported_nodes = ()
+ for node in nodes:
+ new_method = _node_not_implemented(node, cls)
+ name = 'visit_{node}'.format(node=node)
+ cls.unsupported_nodes += (name,)
+ setattr(cls, name, new_method)
+ return cls
+ return disallowed
+
+
+def _op_maker(op_class, op_symbol):
+ """Return a function to create an op class with its symbol already passed.
+
+ Returns
+ -------
+ f : callable
+ """
+
+ def f(self, node, *args, **kwargs):
+ """Return a partial function with an Op subclass with an operator
+ already passed.
+
+ Returns
+ -------
+ f : callable
+ """
+ return partial(op_class, op_symbol, *args, **kwargs)
+ return f
+
+
+_op_classes = {'binary': BinOp, 'unary': UnaryOp}
+
+
+def add_ops(op_classes):
+ """Decorator to add default implementation of ops."""
+ def f(cls):
+ for op_attr_name, op_class in compat.iteritems(op_classes):
+ ops = getattr(cls, '{name}_ops'.format(name=op_attr_name))
+ ops_map = getattr(cls, '{name}_op_nodes_map'.format(
+ name=op_attr_name))
+ for op in ops:
+ op_node = ops_map[op]
+ if op_node is not None:
+ made_op = _op_maker(op_class, op)
+ setattr(cls, 'visit_{node}'.format(node=op_node), made_op)
+ return cls
+ return f
+
+
+@disallow(_unsupported_nodes)
+@add_ops(_op_classes)
+class BaseExprVisitor(ast.NodeVisitor):
+
+ """Custom ast walker. Parsers of other engines should subclass this class
+ if necessary.
+
+ Parameters
+ ----------
+ env : Scope
+ engine : str
+ parser : str
+ preparser : callable
+ """
+ const_type = Constant
+ term_type = Term
+
+ binary_ops = _cmp_ops_syms + _bool_ops_syms + _arith_ops_syms
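+ # the ``None`` entry below corresponds to '/': division is dispatched
+ # through visit_Div (so truediv handling can be applied) rather than the
+ # generic operator machinery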
+ binary_op_nodes = ('Gt', 'Lt', 'GtE', 'LtE', 'Eq', 'NotEq', 'In', 'NotIn',
+ 'BitAnd', 'BitOr', 'And', 'Or', 'Add', 'Sub', 'Mult',
+ None, 'Pow', 'FloorDiv', 'Mod')
+ binary_op_nodes_map = dict(zip(binary_ops, binary_op_nodes))
+
+ unary_ops = _unary_ops_syms
+ unary_op_nodes = 'UAdd', 'USub', 'Invert', 'Not'
+ unary_op_nodes_map = dict(zip(unary_ops, unary_op_nodes))
+
+ rewrite_map = {
+ ast.Eq: ast.In,
+ ast.NotEq: ast.NotIn,
+ ast.In: ast.In,
+ ast.NotIn: ast.NotIn
+ }
+
+ def __init__(self, env, engine, parser, preparser=_preparse):
+ self.env = env
+ self.engine = engine
+ self.parser = parser
+ self.preparser = preparser
+ self.assigner = None
+
+ def visit(self, node, **kwargs):
+ if isinstance(node, string_types):
+ clean = self.preparser(node)
+ try:
+ node = ast.fix_missing_locations(ast.parse(clean))
+ except SyntaxError as e:
+ from keyword import iskeyword
+ if any(iskeyword(x) for x in clean.split()):
+ e.msg = ("Python keyword not valid identifier"
+ " in numexpr query")
+ raise e
+
+ method = 'visit_' + node.__class__.__name__
+ visitor = getattr(self, method)
+ return visitor(node, **kwargs)
+
+ def visit_Module(self, node, **kwargs):
+ if len(node.body) != 1:
+ raise SyntaxError('only a single expression is allowed')
+ expr = node.body[0]
+ return self.visit(expr, **kwargs)
+
+ def visit_Expr(self, node, **kwargs):
+ return self.visit(node.value, **kwargs)
+
+ def _rewrite_membership_op(self, node, left, right):
+ # the kind of the operator (is actually an instance)
+ op_instance = node.op
+ op_type = type(op_instance)
+
+ # must be two terms and the comparison operator must be ==/!=/in/not in
+ if is_term(left) and is_term(right) and op_type in self.rewrite_map:
+
+ left_list, right_list = map(_is_list, (left, right))
+ left_str, right_str = map(_is_str, (left, right))
+
+ # if there are any strings or lists in the expression
+ if left_list or right_list or left_str or right_str:
+ op_instance = self.rewrite_map[op_type]()
+
+ # pop the string variable out of locals and replace it with a list
+ # of one string, kind of a hack
+ if right_str:
+ name = self.env.add_tmp([right.value])
+ right = self.term_type(name, self.env)
+
+ if left_str:
+ name = self.env.add_tmp([left.value])
+ left = self.term_type(name, self.env)
+
+ op = self.visit(op_instance)
+ return op, op_instance, left, right
+
+ def _maybe_transform_eq_ne(self, node, left=None, right=None):
+ if left is None:
+ left = self.visit(node.left, side='left')
+ if right is None:
+ right = self.visit(node.right, side='right')
+ op, op_class, left, right = self._rewrite_membership_op(node, left,
+ right)
+ return op, op_class, left, right
+
+ def _maybe_downcast_constants(self, left, right):
+ f32 = np.dtype(np.float32)
+ if left.is_scalar and not right.is_scalar and right.return_type == f32:
+ # right is a float32 array, left is a scalar
+ name = self.env.add_tmp(np.float32(left.value))
+ left = self.term_type(name, self.env)
+ if right.is_scalar and not left.is_scalar and left.return_type == f32:
+ # left is a float32 array, right is a scalar
+ name = self.env.add_tmp(np.float32(right.value))
+ right = self.term_type(name, self.env)
+
+ return left, right
+
+ def _maybe_eval(self, binop, eval_in_python):
+ # eval `in` and `not in` (for now) in "partial" python space
+ # things that can be evaluated in "eval" space will be turned into
+ # temporary variables. for example,
+ # [1,2] in a + 2 * b
+ # in that case a + 2 * b will be evaluated using numexpr, and the "in"
+ # call will be evaluated using isin (in python space)
+ return binop.evaluate(self.env, self.engine, self.parser,
+ self.term_type, eval_in_python)
+
+ def _maybe_evaluate_binop(self, op, op_class, lhs, rhs,
+ eval_in_python=('in', 'not in'),
+ maybe_eval_in_python=('==', '!=', '<', '>',
+ '<=', '>=')):
+ res = op(lhs, rhs)
+
+ if res.has_invalid_return_type:
+ raise TypeError("unsupported operand type(s) for {op}:"
+ " '{lhs}' and '{rhs}'".format(op=res.op,
+ lhs=lhs.type,
+ rhs=rhs.type))
+
+ if self.engine != 'pytables':
+ if (res.op in _cmp_ops_syms and
+ getattr(lhs, 'is_datetime', False) or
+ getattr(rhs, 'is_datetime', False)):
+ # all date ops must be done in python bc numexpr doesn't work
+ # well with NaT
+ return self._maybe_eval(res, self.binary_ops)
+
+ if res.op in eval_in_python:
+ # "in"/"not in" ops are always evaluated in python
+ return self._maybe_eval(res, eval_in_python)
+ elif self.engine != 'pytables':
+ if (getattr(lhs, 'return_type', None) == object or
+ getattr(rhs, 'return_type', None) == object):
+ # evaluate "==" and "!=" in python if either of our operands
+ # has an object return type
+ return self._maybe_eval(res, eval_in_python +
+ maybe_eval_in_python)
+ return res
+
+ def visit_BinOp(self, node, **kwargs):
+ op, op_class, left, right = self._maybe_transform_eq_ne(node)
+ left, right = self._maybe_downcast_constants(left, right)
+ return self._maybe_evaluate_binop(op, op_class, left, right)
+
+ def visit_Div(self, node, **kwargs):
+ truediv = self.env.scope['truediv']
+ return lambda lhs, rhs: Div(lhs, rhs, truediv)
+
+ def visit_UnaryOp(self, node, **kwargs):
+ op = self.visit(node.op)
+ operand = self.visit(node.operand)
+ return op(operand)
+
+ def visit_Name(self, node, **kwargs):
+ return self.term_type(node.id, self.env, **kwargs)
+
+ def visit_NameConstant(self, node, **kwargs):
+ return self.const_type(node.value, self.env)
+
+ def visit_Num(self, node, **kwargs):
+ return self.const_type(node.n, self.env)
+
+ def visit_Str(self, node, **kwargs):
+ name = self.env.add_tmp(node.s)
+ return self.term_type(name, self.env)
+
+ def visit_List(self, node, **kwargs):
+ name = self.env.add_tmp([self.visit(e)(self.env) for e in node.elts])
+ return self.term_type(name, self.env)
+
+ visit_Tuple = visit_List
+
+ def visit_Index(self, node, **kwargs):
+ """ df.index[4] """
+ return self.visit(node.value)
+
+ def visit_Subscript(self, node, **kwargs):
+ value = self.visit(node.value)
+ slobj = self.visit(node.slice)
+ result = pd.eval(slobj, local_dict=self.env, engine=self.engine,
+ parser=self.parser)
+ try:
+ # a Term instance
+ v = value.value[result]
+ except AttributeError:
+ # an Op instance
+ lhs = pd.eval(value, local_dict=self.env, engine=self.engine,
+ parser=self.parser)
+ v = lhs[result]
+ name = self.env.add_tmp(v)
+ return self.term_type(name, env=self.env)
+
+ def visit_Slice(self, node, **kwargs):
+ """ df.index[slice(4,6)] """
+ lower = node.lower
+ if lower is not None:
+ lower = self.visit(lower).value
+ upper = node.upper
+ if upper is not None:
+ upper = self.visit(upper).value
+ step = node.step
+ if step is not None:
+ step = self.visit(step).value
+
+ return slice(lower, upper, step)
+
+ def visit_Assign(self, node, **kwargs):
+ """
+ support a single assignment node, like
+
+ c = a + b
+
+ sets the assigner at the top level; the target must be a Name node
+ which may or may not exist in the resolvers
+
+ """
+
+ if len(node.targets) != 1:
+ raise SyntaxError('can only assign a single expression')
+ if not isinstance(node.targets[0], ast.Name):
+ raise SyntaxError('left hand side of an assignment must be a '
+ 'single name')
+ if self.env.target is None:
+ raise ValueError('cannot assign without a target object')
+
+ try:
+ assigner = self.visit(node.targets[0], **kwargs)
+ except UndefinedVariableError:
+ assigner = node.targets[0].id
+
+ self.assigner = getattr(assigner, 'name', assigner)
+ if self.assigner is None:
+ raise SyntaxError('left hand side of an assignment must be a '
+ 'single resolvable name')
+
+ return self.visit(node.value, **kwargs)
+
+ def visit_Attribute(self, node, **kwargs):
+ attr = node.attr
+ value = node.value
+
+ ctx = node.ctx
+ if isinstance(ctx, ast.Load):
+ # resolve the value
+ resolved = self.visit(value).value
+ try:
+ v = getattr(resolved, attr)
+ name = self.env.add_tmp(v)
+ return self.term_type(name, self.env)
+ except AttributeError:
+ # something like datetime.datetime where scope is overridden
+ if isinstance(value, ast.Name) and value.id == attr:
+ return resolved
+
+ raise ValueError("Invalid Attribute context {name}"
+ .format(name=ctx.__name__))
+
+ def visit_Call_35(self, node, side=None, **kwargs):
+ """ in 3.5 the starargs attribute was changed to be more flexible,
+ #11097 """
+
+ if isinstance(node.func, ast.Attribute):
+ res = self.visit_Attribute(node.func)
+ elif not isinstance(node.func, ast.Name):
+ raise TypeError("Only named functions are supported")
+ else:
+ try:
+ res = self.visit(node.func)
+ except UndefinedVariableError:
+ # Check if this is a supported function name
+ try:
+ res = FuncNode(node.func.id)
+ except ValueError:
+ # Raise original error
+ raise
+
+ if res is None:
+ raise ValueError("Invalid function call {func}"
+ .format(func=node.func.id))
+ if hasattr(res, 'value'):
+ res = res.value
+
+ if isinstance(res, FuncNode):
+
+ new_args = [self.visit(arg) for arg in node.args]
+
+ if node.keywords:
+ raise TypeError("Function \"{name}\" does not support keyword "
+ "arguments".format(name=res.name))
+
+ return res(*new_args, **kwargs)
+
+ else:
+
+ new_args = [self.visit(arg).value for arg in node.args]
+
+ for key in node.keywords:
+ if not isinstance(key, ast.keyword):
+ raise ValueError("keyword error in function call "
+ "'{func}'".format(func=node.func.id))
+
+ if key.arg:
+ kwargs[key.arg] = self.visit(key.value).value
+
+ return self.const_type(res(*new_args, **kwargs), self.env)
+
+ def visit_Call_legacy(self, node, side=None, **kwargs):
+
+ # this can happen with: datetime.datetime
+ if isinstance(node.func, ast.Attribute):
+ res = self.visit_Attribute(node.func)
+ elif not isinstance(node.func, ast.Name):
+ raise TypeError("Only named functions are supported")
+ else:
+ try:
+ res = self.visit(node.func)
+ except UndefinedVariableError:
+ # Check if this is a supported function name
+ try:
+ res = FuncNode(node.func.id)
+ except ValueError:
+ # Raise original error
+ raise
+
+ if res is None:
+ raise ValueError("Invalid function call {func}"
+ .format(func=node.func.id))
+ if hasattr(res, 'value'):
+ res = res.value
+
+ if isinstance(res, FuncNode):
+ args = [self.visit(targ) for targ in node.args]
+
+ if node.starargs is not None:
+ args += self.visit(node.starargs)
+
+ if node.keywords or node.kwargs:
+ raise TypeError("Function \"{name}\" does not support keyword "
+ "arguments".format(name=res.name))
+
+ return res(*args, **kwargs)
+
+ else:
+ args = [self.visit(targ).value for targ in node.args]
+ if node.starargs is not None:
+ args += self.visit(node.starargs).value
+
+ keywords = {}
+ for key in node.keywords:
+ if not isinstance(key, ast.keyword):
+ raise ValueError("keyword error in function call "
+ "'{func}'".format(func=node.func.id))
+ keywords[key.arg] = self.visit(key.value).value
+ if node.kwargs is not None:
+ keywords.update(self.visit(node.kwargs).value)
+
+ return self.const_type(res(*args, **keywords), self.env)
+
+ def translate_In(self, op):
+ return op
+
+ def visit_Compare(self, node, **kwargs):
+ ops = node.ops
+ comps = node.comparators
+
+ # base case: we have something like a CMP b
+ if len(comps) == 1:
+ op = self.translate_In(ops[0])
+ binop = ast.BinOp(op=op, left=node.left, right=comps[0])
+ return self.visit(binop)
+
+ # recursive case: we have a chained comparison, a CMP b CMP c, etc.
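+ # e.g. ``1 < a < 2`` is evaluated as ``(1 < a) & (a < 2)``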
+ left = node.left
+ values = []
+ for op, comp in zip(ops, comps):
+ new_node = self.visit(ast.Compare(comparators=[comp], left=left,
+ ops=[self.translate_In(op)]))
+ left = comp
+ values.append(new_node)
+ return self.visit(ast.BoolOp(op=ast.And(), values=values))
+
+ def _try_visit_binop(self, bop):
+ if isinstance(bop, (Op, Term)):
+ return bop
+ return self.visit(bop)
+
+ def visit_BoolOp(self, node, **kwargs):
+ def visitor(x, y):
+ lhs = self._try_visit_binop(x)
+ rhs = self._try_visit_binop(y)
+
+ op, op_class, lhs, rhs = self._maybe_transform_eq_ne(
+ node, lhs, rhs)
+ return self._maybe_evaluate_binop(op, node.op, lhs, rhs)
+
+ operands = node.values
+ return reduce(visitor, operands)
+
+
+# ast.Call signature changed on 3.5,
+# conditionally change which method is named
+# visit_Call depending on Python version, #11097
+if compat.PY35:
+ BaseExprVisitor.visit_Call = BaseExprVisitor.visit_Call_35
+else:
+ BaseExprVisitor.visit_Call = BaseExprVisitor.visit_Call_legacy
+
+_python_not_supported = frozenset(['Dict', 'BoolOp', 'In', 'NotIn'])
+_numexpr_supported_calls = frozenset(_reductions + _mathops)
+
+
+@disallow((_unsupported_nodes | _python_not_supported) -
+ (_boolop_nodes | frozenset(['BoolOp', 'Attribute', 'In', 'NotIn',
+ 'Tuple'])))
+class PandasExprVisitor(BaseExprVisitor):
+
+ def __init__(self, env, engine, parser,
+ preparser=partial(_preparse, f=_compose(_replace_locals,
+ _replace_booleans))):
+ super(PandasExprVisitor, self).__init__(env, engine, parser, preparser)
+
+
+@disallow(_unsupported_nodes | _python_not_supported | frozenset(['Not']))
+class PythonExprVisitor(BaseExprVisitor):
+
+ def __init__(self, env, engine, parser, preparser=lambda x: x):
+ super(PythonExprVisitor, self).__init__(env, engine, parser,
+ preparser=preparser)
+
+
+class Expr(StringMixin):
+
+ """Object encapsulating an expression.
+
+ Parameters
+ ----------
+ expr : str
+ engine : str, optional, default 'numexpr'
+ parser : str, optional, default 'pandas'
+ env : Scope, optional, default None
+ truediv : bool, optional, default True
+ level : int, optional, default 2
+ """
+
+ def __init__(self, expr, engine='numexpr', parser='pandas', env=None,
+ truediv=True, level=0):
+ self.expr = expr
+ self.env = env or Scope(level=level + 1)
+ self.engine = engine
+ self.parser = parser
+ self.env.scope['truediv'] = truediv
+ self._visitor = _parsers[parser](self.env, self.engine, self.parser)
+ self.terms = self.parse()
+
+ @property
+ def assigner(self):
+ return getattr(self._visitor, 'assigner', None)
+
+ def __call__(self):
+ return self.terms(self.env)
+
+ def __unicode__(self):
+ return printing.pprint_thing(self.terms)
+
+ def __len__(self):
+ return len(self.expr)
+
+ def parse(self):
+ """Parse an expression"""
+ return self._visitor.visit(self.expr)
+
+ @property
+ def names(self):
+ """Get the names in an expression"""
+ if is_term(self.terms):
+ return frozenset([self.terms.name])
+ return frozenset(term.name for term in com.flatten(self.terms))
+
+
+_parsers = {'python': PythonExprVisitor, 'pandas': PandasExprVisitor}
diff --git a/contrib/python/pandas/py2/pandas/core/computation/expressions.py b/contrib/python/pandas/py2/pandas/core/computation/expressions.py
new file mode 100644
index 00000000000..a91ef7592a3
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/core/computation/expressions.py
@@ -0,0 +1,251 @@
+"""
+Expressions
+-----------
+
+Offer fast expression evaluation through numexpr
+
+"""
+
+import warnings
+
+import numpy as np
+
+from pandas.core.dtypes.generic import ABCDataFrame
+
+import pandas.core.common as com
+from pandas.core.computation.check import _NUMEXPR_INSTALLED
+from pandas.core.config import get_option
+
+if _NUMEXPR_INSTALLED:
+ import numexpr as ne
+
+_TEST_MODE = None
+_TEST_RESULT = None
+_USE_NUMEXPR = _NUMEXPR_INSTALLED
+_evaluate = None
+_where = None
+
+# the set of dtypes that we will allow to pass to numexpr
+_ALLOWED_DTYPES = {
+ 'evaluate': {'int64', 'int32', 'float64', 'float32', 'bool'},
+ 'where': {'int64', 'float64', 'bool'}
+}
+
+# the minimum number of elements (product of the shape) at which we will use numexpr
+_MIN_ELEMENTS = 10000
+
+
+def set_use_numexpr(v=True):
+ # set/unset to use numexpr
+ global _USE_NUMEXPR
+ if _NUMEXPR_INSTALLED:
+ _USE_NUMEXPR = v
+
+ # choose what we are going to do
+ global _evaluate, _where
+ if not _USE_NUMEXPR:
+ _evaluate = _evaluate_standard
+ _where = _where_standard
+ else:
+ _evaluate = _evaluate_numexpr
+ _where = _where_numexpr
+
+
+def set_numexpr_threads(n=None):
+ # if we are using numexpr, set the threads to n
+ # otherwise reset
+ if _NUMEXPR_INSTALLED and _USE_NUMEXPR:
+ if n is None:
+ n = ne.detect_number_of_cores()
+ ne.set_num_threads(n)
+
+
+def _evaluate_standard(op, op_str, a, b, **eval_kwargs):
+ """ standard evaluation """
+ if _TEST_MODE:
+ _store_test_result(False)
+ with np.errstate(all='ignore'):
+ return op(a, b)
+
+
+def _can_use_numexpr(op, op_str, a, b, dtype_check):
+ """ return a boolean if we WILL be using numexpr """
+ if op_str is not None:
+
+ # required min elements (otherwise we are adding overhead)
+ if np.prod(a.shape) > _MIN_ELEMENTS:
+
+ # check for dtype compatibility
+ dtypes = set()
+ for o in [a, b]:
+ if hasattr(o, 'get_dtype_counts'):
+ s = o.get_dtype_counts()
+ if len(s) > 1:
+ return False
+ dtypes |= set(s.index)
+ elif isinstance(o, np.ndarray):
+ dtypes |= {o.dtype.name}
+
+ # allowed are a superset
+ if not len(dtypes) or _ALLOWED_DTYPES[dtype_check] >= dtypes:
+ return True
+
+ return False
+
+
+def _evaluate_numexpr(op, op_str, a, b, truediv=True,
+ reversed=False, **eval_kwargs):
+ result = None
+
+ if _can_use_numexpr(op, op_str, a, b, 'evaluate'):
+ try:
+
+ # we were originally called by a reversed op
+ # method
+ if reversed:
+ a, b = b, a
+
+ a_value = getattr(a, "values", a)
+ b_value = getattr(b, "values", b)
+ result = ne.evaluate('a_value {op} b_value'.format(op=op_str),
+ local_dict={'a_value': a_value,
+ 'b_value': b_value},
+ casting='safe', truediv=truediv,
+ **eval_kwargs)
+ except ValueError as detail:
+ if 'unknown type object' in str(detail):
+ pass
+
+ if _TEST_MODE:
+ _store_test_result(result is not None)
+
+ if result is None:
+ result = _evaluate_standard(op, op_str, a, b)
+
+ return result
+
+
+def _where_standard(cond, a, b):
+ return np.where(com.values_from_object(cond), com.values_from_object(a),
+ com.values_from_object(b))
+
+
+def _where_numexpr(cond, a, b):
+ result = None
+
+ if _can_use_numexpr(None, 'where', a, b, 'where'):
+
+ try:
+ cond_value = getattr(cond, 'values', cond)
+ a_value = getattr(a, 'values', a)
+ b_value = getattr(b, 'values', b)
+ result = ne.evaluate('where(cond_value, a_value, b_value)',
+ local_dict={'cond_value': cond_value,
+ 'a_value': a_value,
+ 'b_value': b_value},
+ casting='safe')
+ except ValueError as detail:
+ if 'unknown type object' in str(detail):
+ pass
+ except Exception as detail:
+ raise TypeError(str(detail))
+
+ if result is None:
+ result = _where_standard(cond, a, b)
+
+ return result
+
+
+# turn myself on
+set_use_numexpr(get_option('compute.use_numexpr'))
+
+
+def _has_bool_dtype(x):
+ try:
+ if isinstance(x, ABCDataFrame):
+ return 'bool' in x.dtypes
+ else:
+ return x.dtype == bool
+ except AttributeError:
+ return isinstance(x, (bool, np.bool_))
+
+
+def _bool_arith_check(op_str, a, b, not_allowed=frozenset(('/', '//', '**')),
+ unsupported=None):
+ if unsupported is None:
+ unsupported = {'+': '|', '*': '&', '-': '^'}
+
+ if _has_bool_dtype(a) and _has_bool_dtype(b):
+ if op_str in unsupported:
+ warnings.warn("evaluating in Python space because the {op!r} "
+ "operator is not supported by numexpr for "
+ "the bool dtype, use {alt_op!r} instead"
+ .format(op=op_str, alt_op=unsupported[op_str]))
+ return False
+
+ if op_str in not_allowed:
+ raise NotImplementedError("operator {op!r} not implemented for "
+ "bool dtypes".format(op=op_str))
+ return True
+
+
+def evaluate(op, op_str, a, b, use_numexpr=True,
+ **eval_kwargs):
+ """ evaluate and return the expression of the op on a and b
+
+ Parameters
+ ----------
+
+ op : the actual operator (a callable)
+ op_str: the string version of the op
+ a : left operand
+ b : right operand
+ use_numexpr : whether to try to use numexpr (default True)
+ """
+
+ use_numexpr = use_numexpr and _bool_arith_check(op_str, a, b)
+ if use_numexpr:
+ return _evaluate(op, op_str, a, b, **eval_kwargs)
+ return _evaluate_standard(op, op_str, a, b)
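+
+
+# Illustrative usage sketch (not part of the upstream pandas source):
+#
+#   import operator
+#   import numpy as np
+#   a = np.arange(1e6)
+#   b = np.arange(1e6)
+#   evaluate(operator.add, '+', a, b)  # uses numexpr when available, else numpy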
+
+
+def where(cond, a, b, use_numexpr=True):
+ """ evaluate the where condition cond on a and b
+
+ Parameters
+ ----------
+
+ cond : a boolean array
+ a : return if cond is True
+ b : return if cond is False
+ use_numexpr : whether to try to use numexpr (default True)
+ """
+
+ if use_numexpr:
+ return _where(cond, a, b)
+ return _where_standard(cond, a, b)
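+
+
+# Illustrative usage sketch (not part of the upstream pandas source):
+#
+#   cond = np.array([True, False, True])
+#   where(cond, np.array([1, 2, 3]), np.array([10, 20, 30]))  # -> [1, 20, 3]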
+
+
+def set_test_mode(v=True):
+ """
+ Keeps track of whether numexpr was used. Stores an additional ``True``
+ for every successful use of evaluate with numexpr since the last
+ ``get_test_result``
+ """
+ global _TEST_MODE, _TEST_RESULT
+ _TEST_MODE = v
+ _TEST_RESULT = []
+
+
+def _store_test_result(used_numexpr):
+ global _TEST_RESULT
+ if used_numexpr:
+ _TEST_RESULT.append(used_numexpr)
+
+
+def get_test_result():
+ """get test result and reset test_results"""
+ global _TEST_RESULT
+ res = _TEST_RESULT
+ _TEST_RESULT = []
+ return res
diff --git a/contrib/python/pandas/py2/pandas/core/computation/ops.py b/contrib/python/pandas/py2/pandas/core/computation/ops.py
new file mode 100644
index 00000000000..8c3218a976b
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/core/computation/ops.py
@@ -0,0 +1,561 @@
+"""Operator classes for eval.
+"""
+
+from datetime import datetime
+from distutils.version import LooseVersion
+from functools import partial
+import operator as op
+
+import numpy as np
+
+from pandas.compat import PY3, string_types, text_type
+
+from pandas.core.dtypes.common import is_list_like, is_scalar
+
+import pandas as pd
+from pandas.core.base import StringMixin
+import pandas.core.common as com
+from pandas.core.computation.common import _ensure_decoded, _result_type_many
+from pandas.core.computation.scope import _DEFAULT_GLOBALS
+
+from pandas.io.formats.printing import pprint_thing, pprint_thing_encoded
+
+_reductions = 'sum', 'prod'
+
+_unary_math_ops = ('sin', 'cos', 'exp', 'log', 'expm1', 'log1p',
+ 'sqrt', 'sinh', 'cosh', 'tanh', 'arcsin', 'arccos',
+ 'arctan', 'arccosh', 'arcsinh', 'arctanh', 'abs', 'log10',
+ 'floor', 'ceil'
+ )
+_binary_math_ops = ('arctan2',)
+
+_mathops = _unary_math_ops + _binary_math_ops
+
+
+_LOCAL_TAG = '__pd_eval_local_'
+
+
+class UndefinedVariableError(NameError):
+
+ """NameError subclass for local variables."""
+
+ def __init__(self, name, is_local):
+ if is_local:
+ msg = 'local variable {0!r} is not defined'
+ else:
+ msg = 'name {0!r} is not defined'
+ super(UndefinedVariableError, self).__init__(msg.format(name))
+
+
+class Term(StringMixin):
+
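+ # non-string "names" (i.e. literal values) are dispatched to Constant via
+ # __new__, so Term(name, env) works uniformly for variables and literals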
+ def __new__(cls, name, env, side=None, encoding=None):
+ klass = Constant if not isinstance(name, string_types) else cls
+ supr_new = super(Term, klass).__new__
+ return supr_new(klass)
+
+ def __init__(self, name, env, side=None, encoding=None):
+ self._name = name
+ self.env = env
+ self.side = side
+ tname = text_type(name)
+ self.is_local = (tname.startswith(_LOCAL_TAG) or
+ tname in _DEFAULT_GLOBALS)
+ self._value = self._resolve_name()
+ self.encoding = encoding
+
+ @property
+ def local_name(self):
+ return self.name.replace(_LOCAL_TAG, '')
+
+ def __unicode__(self):
+ return pprint_thing(self.name)
+
+ def __call__(self, *args, **kwargs):
+ return self.value
+
+ def evaluate(self, *args, **kwargs):
+ return self
+
+ def _resolve_name(self):
+ res = self.env.resolve(self.local_name, is_local=self.is_local)
+ self.update(res)
+
+ if hasattr(res, 'ndim') and res.ndim > 2:
+ raise NotImplementedError("N-dimensional objects, where N > 2,"
+ " are not supported with eval")
+ return res
+
+ def update(self, value):
+ """
+ search order for local (i.e., @variable) variables:
+
+ scope, key_variable
+ [('locals', 'local_name'),
+ ('globals', 'local_name'),
+ ('locals', 'key'),
+ ('globals', 'key')]
+ """
+ key = self.name
+
+ # if it's a variable name (otherwise a constant)
+ if isinstance(key, string_types):
+ self.env.swapkey(self.local_name, key, new_value=value)
+
+ self.value = value
+
+ @property
+ def is_scalar(self):
+ return is_scalar(self._value)
+
+ @property
+ def type(self):
+ try:
+ # potentially very slow for large, mixed dtype frames
+ return self._value.values.dtype
+ except AttributeError:
+ try:
+ # ndarray
+ return self._value.dtype
+ except AttributeError:
+ # scalar
+ return type(self._value)
+
+ return_type = type
+
+ @property
+ def raw(self):
+ return pprint_thing('{0}(name={1!r}, type={2})'
+ ''.format(self.__class__.__name__, self.name,
+ self.type))
+
+ @property
+ def is_datetime(self):
+ try:
+ t = self.type.type
+ except AttributeError:
+ t = self.type
+
+ return issubclass(t, (datetime, np.datetime64))
+
+ @property
+ def value(self):
+ return self._value
+
+ @value.setter
+ def value(self, new_value):
+ self._value = new_value
+
+ @property
+ def name(self):
+ return self._name
+
+ @name.setter
+ def name(self, new_name):
+ self._name = new_name
+
+ @property
+ def ndim(self):
+ return self._value.ndim
+
+
+class Constant(Term):
+
+ def __init__(self, value, env, side=None, encoding=None):
+ super(Constant, self).__init__(value, env, side=side,
+ encoding=encoding)
+
+ def _resolve_name(self):
+ return self._name
+
+ @property
+ def name(self):
+ return self.value
+
+ def __unicode__(self):
+ # in python 2 str() of float
+ # can truncate shorter than repr()
+ return repr(self.name)
+
+
+_bool_op_map = {'not': '~', 'and': '&', 'or': '|'}
+
+
+class Op(StringMixin):
+
+ """Hold an operator of arbitrary arity
+ """
+
+ def __init__(self, op, operands, *args, **kwargs):
+ self.op = _bool_op_map.get(op, op)
+ self.operands = operands
+ self.encoding = kwargs.get('encoding', None)
+
+ def __iter__(self):
+ return iter(self.operands)
+
+ def __unicode__(self):
+ """Print a generic n-ary operator and its operands using infix
+ notation"""
+ # recurse over the operands
+ parened = ('({0})'.format(pprint_thing(opr))
+ for opr in self.operands)
+ return pprint_thing(' {0} '.format(self.op).join(parened))
+
+ @property
+ def return_type(self):
+ # clobber types to bool if the op is a boolean operator
+ if self.op in (_cmp_ops_syms + _bool_ops_syms):
+ return np.bool_
+ return _result_type_many(*(term.type for term in com.flatten(self)))
+
+ @property
+ def has_invalid_return_type(self):
+ types = self.operand_types
+ obj_dtype_set = frozenset([np.dtype('object')])
+ return self.return_type == object and types - obj_dtype_set
+
+ @property
+ def operand_types(self):
+ return frozenset(term.type for term in com.flatten(self))
+
+ @property
+ def is_scalar(self):
+ return all(operand.is_scalar for operand in self.operands)
+
+ @property
+ def is_datetime(self):
+ try:
+ t = self.return_type.type
+ except AttributeError:
+ t = self.return_type
+
+ return issubclass(t, (datetime, np.datetime64))
+
+
+def _in(x, y):
+ """Compute the vectorized membership of ``x in y`` if possible, otherwise
+ use Python.
+ """
+ try:
+ return x.isin(y)
+ except AttributeError:
+ if is_list_like(x):
+ try:
+ return y.isin(x)
+ except AttributeError:
+ pass
+ return x in y
+
+
+def _not_in(x, y):
+ """Compute the vectorized membership of ``x not in y`` if possible,
+ otherwise use Python.
+ """
+ try:
+ return ~x.isin(y)
+ except AttributeError:
+ if is_list_like(x):
+ try:
+ return ~y.isin(x)
+ except AttributeError:
+ pass
+ return x not in y
+
+
+_cmp_ops_syms = '>', '<', '>=', '<=', '==', '!=', 'in', 'not in'
+_cmp_ops_funcs = op.gt, op.lt, op.ge, op.le, op.eq, op.ne, _in, _not_in
+_cmp_ops_dict = dict(zip(_cmp_ops_syms, _cmp_ops_funcs))
+
+_bool_ops_syms = '&', '|', 'and', 'or'
+_bool_ops_funcs = op.and_, op.or_, op.and_, op.or_
+_bool_ops_dict = dict(zip(_bool_ops_syms, _bool_ops_funcs))
+
+_arith_ops_syms = '+', '-', '*', '/', '**', '//', '%'
+_arith_ops_funcs = (op.add, op.sub, op.mul, op.truediv if PY3 else op.div,
+ op.pow, op.floordiv, op.mod)
+_arith_ops_dict = dict(zip(_arith_ops_syms, _arith_ops_funcs))
+
+_special_case_arith_ops_syms = '**', '//', '%'
+_special_case_arith_ops_funcs = op.pow, op.floordiv, op.mod
+_special_case_arith_ops_dict = dict(zip(_special_case_arith_ops_syms,
+ _special_case_arith_ops_funcs))
+
+_binary_ops_dict = {}
+
+for d in (_cmp_ops_dict, _bool_ops_dict, _arith_ops_dict):
+ _binary_ops_dict.update(d)
+
+
+def _cast_inplace(terms, acceptable_dtypes, dtype):
+ """Cast an expression inplace.
+
+ Parameters
+ ----------
+ terms : Op
+ The expression that should be cast.
+ acceptable_dtypes : list of acceptable numpy.dtype
+ Will not cast if term's dtype in this list.
+
+ .. versionadded:: 0.19.0
+
+ dtype : str or numpy.dtype
+ The dtype to cast to.
+ """
+ dt = np.dtype(dtype)
+ for term in terms:
+ if term.type in acceptable_dtypes:
+ continue
+
+ try:
+ new_value = term.value.astype(dt)
+ except AttributeError:
+ new_value = dt.type(term.value)
+ term.update(new_value)
+
+
+def is_term(obj):
+ return isinstance(obj, Term)
+
+
+class BinOp(Op):
+
+ """Hold a binary operator and its operands
+
+ Parameters
+ ----------
+ op : str
+ left : Term or Op
+ right : Term or Op
+ """
+
+ def __init__(self, op, lhs, rhs, **kwargs):
+ super(BinOp, self).__init__(op, (lhs, rhs))
+ self.lhs = lhs
+ self.rhs = rhs
+
+ self._disallow_scalar_only_bool_ops()
+
+ self.convert_values()
+
+ try:
+ self.func = _binary_ops_dict[op]
+ except KeyError:
+ # has to be made a list for python3
+ keys = list(_binary_ops_dict.keys())
+ raise ValueError('Invalid binary operator {0!r}, valid'
+ ' operators are {1}'.format(op, keys))
+
+ def __call__(self, env):
+ """Recursively evaluate an expression in Python space.
+
+ Parameters
+ ----------
+ env : Scope
+
+ Returns
+ -------
+ object
+ The result of an evaluated expression.
+ """
+ # handle truediv
+ if self.op == '/' and env.scope['truediv']:
+ self.func = op.truediv
+
+ # recurse over the left/right nodes
+ left = self.lhs(env)
+ right = self.rhs(env)
+
+ return self.func(left, right)
+
+ def evaluate(self, env, engine, parser, term_type, eval_in_python):
+ """Evaluate a binary operation *before* being passed to the engine.
+
+ Parameters
+ ----------
+ env : Scope
+ engine : str
+ parser : str
+ term_type : type
+ eval_in_python : list
+
+ Returns
+ -------
+ term_type
+ The "pre-evaluated" expression as an instance of ``term_type``
+ """
+ if engine == 'python':
+ res = self(env)
+ else:
+ # recurse over the left/right nodes
+ left = self.lhs.evaluate(env, engine=engine, parser=parser,
+ term_type=term_type,
+ eval_in_python=eval_in_python)
+ right = self.rhs.evaluate(env, engine=engine, parser=parser,
+ term_type=term_type,
+ eval_in_python=eval_in_python)
+
+ # base cases
+ if self.op in eval_in_python:
+ res = self.func(left.value, right.value)
+ else:
+ res = pd.eval(self, local_dict=env, engine=engine,
+ parser=parser)
+
+ name = env.add_tmp(res)
+ return term_type(name, env=env)
+
+ def convert_values(self):
+ """Convert datetimes to a comparable value in an expression.
+ """
+ def stringify(value):
+ if self.encoding is not None:
+ encoder = partial(pprint_thing_encoded,
+ encoding=self.encoding)
+ else:
+ encoder = pprint_thing
+ return encoder(value)
+
+ lhs, rhs = self.lhs, self.rhs
+
+ if is_term(lhs) and lhs.is_datetime and is_term(rhs) and rhs.is_scalar:
+ v = rhs.value
+ if isinstance(v, (int, float)):
+ v = stringify(v)
+ v = pd.Timestamp(_ensure_decoded(v))
+ if v.tz is not None:
+ v = v.tz_convert('UTC')
+ self.rhs.update(v)
+
+ if is_term(rhs) and rhs.is_datetime and is_term(lhs) and lhs.is_scalar:
+ v = lhs.value
+ if isinstance(v, (int, float)):
+ v = stringify(v)
+ v = pd.Timestamp(_ensure_decoded(v))
+ if v.tz is not None:
+ v = v.tz_convert('UTC')
+ self.lhs.update(v)
+
+ def _disallow_scalar_only_bool_ops(self):
+ if ((self.lhs.is_scalar or self.rhs.is_scalar) and
+ self.op in _bool_ops_dict and
+ (not (issubclass(self.rhs.return_type, (bool, np.bool_)) and
+ issubclass(self.lhs.return_type, (bool, np.bool_))))):
+ raise NotImplementedError("cannot evaluate scalar only bool ops")
+
+
+def isnumeric(dtype):
+ return issubclass(np.dtype(dtype).type, np.number)
+
+
+class Div(BinOp):
+
+ """Div operator to special case casting.
+
+ Parameters
+ ----------
+ lhs, rhs : Term or Op
+ The Terms or Ops in the ``/`` expression.
+ truediv : bool
+ Whether or not to use true division. With Python 3 this happens
+ regardless of the value of ``truediv``.
+ """
+
+ def __init__(self, lhs, rhs, truediv, *args, **kwargs):
+ super(Div, self).__init__('/', lhs, rhs, *args, **kwargs)
+
+ if not isnumeric(lhs.return_type) or not isnumeric(rhs.return_type):
+ raise TypeError("unsupported operand type(s) for {0}:"
+ " '{1}' and '{2}'".format(self.op,
+ lhs.return_type,
+ rhs.return_type))
+
+ if truediv or PY3:
+ # do not upcast float32s to float64 unnecessarily
+ acceptable_dtypes = [np.float32, np.float_]
+ _cast_inplace(com.flatten(self), acceptable_dtypes, np.float_)
+
+
+_unary_ops_syms = '+', '-', '~', 'not'
+_unary_ops_funcs = op.pos, op.neg, op.invert, op.invert
+_unary_ops_dict = dict(zip(_unary_ops_syms, _unary_ops_funcs))
+
+
+class UnaryOp(Op):
+
+ """Hold a unary operator and its operands
+
+ Parameters
+ ----------
+ op : str
+ The token used to represent the operator.
+ operand : Term or Op
+ The Term or Op operand to the operator.
+
+ Raises
+ ------
+ ValueError
+ * If no function associated with the passed operator token is found.
+ """
+
+ def __init__(self, op, operand):
+ super(UnaryOp, self).__init__(op, (operand,))
+ self.operand = operand
+
+ try:
+ self.func = _unary_ops_dict[op]
+ except KeyError:
+ raise ValueError('Invalid unary operator {0!r}, valid operators '
+ 'are {1}'.format(op, _unary_ops_syms))
+
+ def __call__(self, env):
+ operand = self.operand(env)
+ return self.func(operand)
+
+ def __unicode__(self):
+ return pprint_thing('{0}({1})'.format(self.op, self.operand))
+
+ @property
+ def return_type(self):
+ operand = self.operand
+ if operand.return_type == np.dtype('bool'):
+ return np.dtype('bool')
+ if (isinstance(operand, Op) and
+ (operand.op in _cmp_ops_dict or operand.op in _bool_ops_dict)):
+ return np.dtype('bool')
+ return np.dtype('int')
+
+
+class MathCall(Op):
+
+ def __init__(self, func, args):
+ super(MathCall, self).__init__(func.name, args)
+ self.func = func
+
+ def __call__(self, env):
+ operands = [op(env) for op in self.operands]
+ with np.errstate(all='ignore'):
+ return self.func.func(*operands)
+
+ def __unicode__(self):
+ operands = map(str, self.operands)
+ return pprint_thing('{0}({1})'.format(self.op, ','.join(operands)))
+
+
+class FuncNode(object):
+ def __init__(self, name):
+ from pandas.core.computation.check import (_NUMEXPR_INSTALLED,
+ _NUMEXPR_VERSION)
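+ # unknown names are rejected outright; 'floor' and 'ceil' are additionally
+ # rejected when numexpr is installed but older than 2.6.9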
+ if name not in _mathops or (
+ _NUMEXPR_INSTALLED and
+ _NUMEXPR_VERSION < LooseVersion('2.6.9') and
+ name in ('floor', 'ceil')
+ ):
+ raise ValueError(
+ "\"{0}\" is not a supported function".format(name))
+
+ self.name = name
+ self.func = getattr(np, name)
+
+ def __call__(self, *args):
+ return MathCall(self, args)
diff --git a/contrib/python/pandas/py2/pandas/core/computation/pytables.py b/contrib/python/pandas/py2/pandas/core/computation/pytables.py
new file mode 100644
index 00000000000..00de29b07c7
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/core/computation/pytables.py
@@ -0,0 +1,604 @@
+""" manage PyTables query interface via Expressions """
+
+import ast
+from functools import partial
+
+import numpy as np
+
+from pandas.compat import DeepChainMap, string_types, u
+
+from pandas.core.dtypes.common import is_list_like
+
+import pandas as pd
+from pandas.core.base import StringMixin
+import pandas.core.common as com
+from pandas.core.computation import expr, ops
+from pandas.core.computation.common import _ensure_decoded
+from pandas.core.computation.expr import BaseExprVisitor
+from pandas.core.computation.ops import UndefinedVariableError, is_term
+
+from pandas.io.formats.printing import pprint_thing, pprint_thing_encoded
+
+
+class Scope(expr.Scope):
+ __slots__ = 'queryables',
+
+ def __init__(self, level, global_dict=None, local_dict=None,
+ queryables=None):
+ super(Scope, self).__init__(level + 1, global_dict=global_dict,
+ local_dict=local_dict)
+ self.queryables = queryables or dict()
+
+
+class Term(ops.Term):
+
+ def __new__(cls, name, env, side=None, encoding=None):
+ klass = Constant if not isinstance(name, string_types) else cls
+ supr_new = StringMixin.__new__
+ return supr_new(klass)
+
+ def __init__(self, name, env, side=None, encoding=None):
+ super(Term, self).__init__(name, env, side=side, encoding=encoding)
+
+ def _resolve_name(self):
+ # must be a queryable
+ if self.side == 'left':
+ if self.name not in self.env.queryables:
+ raise NameError('name {name!r} is not defined'
+ .format(name=self.name))
+ return self.name
+
+ # resolve the rhs (and allow it to be None)
+ try:
+ return self.env.resolve(self.name, is_local=False)
+ except UndefinedVariableError:
+ return self.name
+
+ @property
+ def value(self):
+ return self._value
+
+
+class Constant(Term):
+
+ def __init__(self, value, env, side=None, encoding=None):
+ super(Constant, self).__init__(value, env, side=side,
+ encoding=encoding)
+
+ def _resolve_name(self):
+ return self._name
+
+
+class BinOp(ops.BinOp):
+
+ _max_selectors = 31
+
+ def __init__(self, op, lhs, rhs, queryables, encoding):
+ super(BinOp, self).__init__(op, lhs, rhs)
+ self.queryables = queryables
+ self.encoding = encoding
+ self.filter = None
+ self.condition = None
+
+ def _disallow_scalar_only_bool_ops(self):
+ pass
+
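+ # collapse this expression tree into a single op of the requested class
+ # (ConditionBinOp or FilterBinOp), recursively pruning non-term operands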
+ def prune(self, klass):
+
+ def pr(left, right):
+ """ create and return a new specialized BinOp from myself """
+
+ if left is None:
+ return right
+ elif right is None:
+ return left
+
+ k = klass
+ if isinstance(left, ConditionBinOp):
+ if (isinstance(left, ConditionBinOp) and
+ isinstance(right, ConditionBinOp)):
+ k = JointConditionBinOp
+ elif isinstance(left, k):
+ return left
+ elif isinstance(right, k):
+ return right
+
+ elif isinstance(left, FilterBinOp):
+ if (isinstance(left, FilterBinOp) and
+ isinstance(right, FilterBinOp)):
+ k = JointFilterBinOp
+ elif isinstance(left, k):
+ return left
+ elif isinstance(right, k):
+ return right
+
+ return k(self.op, left, right, queryables=self.queryables,
+ encoding=self.encoding).evaluate()
+
+ left, right = self.lhs, self.rhs
+
+ if is_term(left) and is_term(right):
+ res = pr(left.value, right.value)
+ elif not is_term(left) and is_term(right):
+ res = pr(left.prune(klass), right.value)
+ elif is_term(left) and not is_term(right):
+ res = pr(left.value, right.prune(klass))
+ elif not (is_term(left) or is_term(right)):
+ res = pr(left.prune(klass), right.prune(klass))
+
+ return res
+
+ def conform(self, rhs):
+ """ inplace conform rhs """
+ if not is_list_like(rhs):
+ rhs = [rhs]
+ if isinstance(rhs, np.ndarray):
+ rhs = rhs.ravel()
+ return rhs
+
+ @property
+ def is_valid(self):
+ """ return True if this is a valid field """
+ return self.lhs in self.queryables
+
+ @property
+ def is_in_table(self):
+ """ return True if this is a valid column name for generation (e.g. an
+ actual column in the table) """
+ return self.queryables.get(self.lhs) is not None
+
+ @property
+ def kind(self):
+ """ the kind of my field """
+ return getattr(self.queryables.get(self.lhs), 'kind', None)
+
+ @property
+ def meta(self):
+ """ the meta of my field """
+ return getattr(self.queryables.get(self.lhs), 'meta', None)
+
+ @property
+ def metadata(self):
+ """ the metadata of my field """
+ return getattr(self.queryables.get(self.lhs), 'metadata', None)
+
+ def generate(self, v):
+ """ create and return the op string for this TermValue """
+ val = v.tostring(self.encoding)
+ return "({lhs} {op} {val})".format(lhs=self.lhs, op=self.op, val=val)
+
+ def convert_value(self, v):
+ """ convert the expression that is in the term to something that is
+ accepted by pytables """
+
+ def stringify(value):
+ if self.encoding is not None:
+ encoder = partial(pprint_thing_encoded,
+ encoding=self.encoding)
+ else:
+ encoder = pprint_thing
+ return encoder(value)
+
+ kind = _ensure_decoded(self.kind)
+ meta = _ensure_decoded(self.meta)
+ if kind == u('datetime64') or kind == u('datetime'):
+ if isinstance(v, (int, float)):
+ v = stringify(v)
+ v = _ensure_decoded(v)
+ v = pd.Timestamp(v)
+ if v.tz is not None:
+ v = v.tz_convert('UTC')
+ return TermValue(v, v.value, kind)
+ elif kind == u('timedelta64') or kind == u('timedelta'):
+ v = pd.Timedelta(v, unit='s').value
+ return TermValue(int(v), v, kind)
+ elif meta == u('category'):
+ metadata = com.values_from_object(self.metadata)
+ result = metadata.searchsorted(v, side='left')
+
+ # result returns 0 if v is first element or if v is not in metadata
+ # check that metadata contains v
+ if not result and v not in metadata:
+ result = -1
+ return TermValue(result, result, u('integer'))
+ elif kind == u('integer'):
+ v = int(float(v))
+ return TermValue(v, v, kind)
+ elif kind == u('float'):
+ v = float(v)
+ return TermValue(v, v, kind)
+ elif kind == u('bool'):
+ if isinstance(v, string_types):
+ v = not v.strip().lower() in [u('false'), u('f'), u('no'),
+ u('n'), u('none'), u('0'),
+ u('[]'), u('{}'), u('')]
+ else:
+ v = bool(v)
+ return TermValue(v, v, kind)
+ elif isinstance(v, string_types):
+ # string quoting
+ return TermValue(v, stringify(v), u('string'))
+ else:
+ raise TypeError("Cannot compare {v} of type {typ} to {kind} column"
+ .format(v=v, typ=type(v), kind=kind))
+
+ def convert_values(self):
+ pass
+
+
+class FilterBinOp(BinOp):
+
+ def __unicode__(self):
+ return pprint_thing("[Filter : [{lhs}] -> [{op}]"
+ .format(lhs=self.filter[0], op=self.filter[1]))
+
+ def invert(self):
+ """ invert the filter """
+ if self.filter is not None:
+ f = list(self.filter)
+ f[1] = self.generate_filter_op(invert=True)
+ self.filter = tuple(f)
+ return self
+
+ def format(self):
+ """ return the actual filter format """
+ return [self.filter]
+
+ def evaluate(self):
+
+ if not self.is_valid:
+ raise ValueError("query term is not valid [{slf}]"
+ .format(slf=self))
+
+ rhs = self.conform(self.rhs)
+ values = [TermValue(v, v, self.kind) for v in rhs]
+
+ if self.is_in_table:
+
+ # if too many values to create the expression, use a filter instead
+ if self.op in ['==', '!='] and len(values) > self._max_selectors:
+
+ filter_op = self.generate_filter_op()
+ self.filter = (
+ self.lhs,
+ filter_op,
+ pd.Index([v.value for v in values]))
+
+ return self
+ return None
+
+ # equality conditions
+ if self.op in ['==', '!=']:
+
+ filter_op = self.generate_filter_op()
+ self.filter = (
+ self.lhs,
+ filter_op,
+ pd.Index([v.value for v in values]))
+
+ else:
+ raise TypeError("passing a filterable condition to a non-table "
+ "indexer [{slf}]".format(slf=self))
+
+ return self
+
+ def generate_filter_op(self, invert=False):
+ if (self.op == '!=' and not invert) or (self.op == '==' and invert):
+ return lambda axis, vals: ~axis.isin(vals)
+ else:
+ return lambda axis, vals: axis.isin(vals)
+
+
+class JointFilterBinOp(FilterBinOp):
+
+ def format(self):
+ raise NotImplementedError("unable to collapse Joint Filters")
+
+ def evaluate(self):
+ return self
+
+
+class ConditionBinOp(BinOp):
+
+ def __unicode__(self):
+ return pprint_thing("[Condition : [{cond}]]"
+ .format(cond=self.condition))
+
+ def invert(self):
+ """ invert the condition """
+ # if self.condition is not None:
+ # self.condition = "~(%s)" % self.condition
+ # return self
+ raise NotImplementedError("cannot use an invert condition when "
+ "passing to numexpr")
+
+ def format(self):
+ """ return the actual ne format """
+ return self.condition
+
+ def evaluate(self):
+
+ if not self.is_valid:
+ raise ValueError("query term is not valid [{slf}]"
+ .format(slf=self))
+
+ # convert values if we are in the table
+ if not self.is_in_table:
+ return None
+
+ rhs = self.conform(self.rhs)
+ values = [self.convert_value(v) for v in rhs]
+
+ # equality conditions
+ if self.op in ['==', '!=']:
+
+ # too many values to create the expression?
+ if len(values) <= self._max_selectors:
+ vs = [self.generate(v) for v in values]
+ self.condition = "({cond})".format(cond=' | '.join(vs))
+
+ # use a filter after reading
+ else:
+ return None
+ else:
+ self.condition = self.generate(values[0])
+
+ return self
+
+
+class JointConditionBinOp(ConditionBinOp):
+
+ def evaluate(self):
+ self.condition = "({lhs} {op} {rhs})".format(lhs=self.lhs.condition,
+ op=self.op,
+ rhs=self.rhs.condition)
+ return self
+
+
+class UnaryOp(ops.UnaryOp):
+
+ def prune(self, klass):
+
+ if self.op != '~':
+ raise NotImplementedError("UnaryOp only support invert type ops")
+
+ operand = self.operand
+ operand = operand.prune(klass)
+
+ if operand is not None:
+ if issubclass(klass, ConditionBinOp):
+ if operand.condition is not None:
+ return operand.invert()
+ elif issubclass(klass, FilterBinOp):
+ if operand.filter is not None:
+ return operand.invert()
+
+ return None
+
+
+_op_classes = {'unary': UnaryOp}
+
+
+class ExprVisitor(BaseExprVisitor):
+ const_type = Constant
+ term_type = Term
+
+ def __init__(self, env, engine, parser, **kwargs):
+ super(ExprVisitor, self).__init__(env, engine, parser)
+ for bin_op in self.binary_ops:
+ bin_node = self.binary_op_nodes_map[bin_op]
+ setattr(self, 'visit_{node}'.format(node=bin_node),
+ lambda node, bin_op=bin_op: partial(BinOp, bin_op,
+ **kwargs))
+
+ def visit_UnaryOp(self, node, **kwargs):
+ if isinstance(node.op, (ast.Not, ast.Invert)):
+ return UnaryOp('~', self.visit(node.operand))
+ elif isinstance(node.op, ast.USub):
+ return self.const_type(-self.visit(node.operand).value, self.env)
+ elif isinstance(node.op, ast.UAdd):
+ raise NotImplementedError('Unary addition not supported')
+
+ def visit_Index(self, node, **kwargs):
+ return self.visit(node.value).value
+
+ def visit_Assign(self, node, **kwargs):
+ cmpr = ast.Compare(ops=[ast.Eq()], left=node.targets[0],
+ comparators=[node.value])
+ return self.visit(cmpr)
+
+ def visit_Subscript(self, node, **kwargs):
+ # only allow simple subscripts
+
+ value = self.visit(node.value)
+ slobj = self.visit(node.slice)
+ try:
+ value = value.value
+ except AttributeError:
+ pass
+
+ try:
+ return self.const_type(value[slobj], self.env)
+ except TypeError:
+ raise ValueError("cannot subscript {value!r} with "
+ "{slobj!r}".format(value=value, slobj=slobj))
+
+ def visit_Attribute(self, node, **kwargs):
+ attr = node.attr
+ value = node.value
+
+ ctx = node.ctx.__class__
+ if ctx == ast.Load:
+ # resolve the value
+ resolved = self.visit(value)
+
+ # try to get the value to see if we are another expression
+ try:
+ resolved = resolved.value
+ except (AttributeError):
+ pass
+
+ try:
+ return self.term_type(getattr(resolved, attr), self.env)
+ except AttributeError:
+
+ # something like datetime.datetime where scope is overridden
+ if isinstance(value, ast.Name) and value.id == attr:
+ return resolved
+
+ raise ValueError("Invalid Attribute context {name}"
+ .format(name=ctx.__name__))
+
+ def translate_In(self, op):
+ return ast.Eq() if isinstance(op, ast.In) else op
+
+ def _rewrite_membership_op(self, node, left, right):
+ return self.visit(node.op), node.op, left, right
+
+
+def _validate_where(w):
+ """
+ Validate that the where statement is of the right type.
+
+ The type may either be String, Expr, or list-like of Exprs.
+
+ Parameters
+ ----------
+ w : String term expression, Expr, or list-like of Exprs.
+
+ Returns
+ -------
+ where : The original where clause if the check was successful.
+
+ Raises
+ ------
+ TypeError : An invalid data type was passed in for w (e.g. dict).
+ """
+
+ if not (isinstance(w, (Expr, string_types)) or is_list_like(w)):
+ raise TypeError("where must be passed as a string, Expr, "
+ "or list-like of Exprs")
+
+ return w
+
+
+class Expr(expr.Expr):
+
+ """ hold a pytables like expression, comprised of possibly multiple 'terms'
+
+ Parameters
+ ----------
+ where : string term expression, Expr, or list-like of Exprs
+ queryables : a "kinds" map (dict of column name -> kind), or None if column
+ is non-indexable
+ encoding : an encoding that will encode the query terms
+
+ Returns
+ -------
+ an Expr object
+
+ Examples
+ --------
+
+ 'index>=date'
+ "columns=['A', 'D']"
+ 'columns=A'
+ 'columns==A'
+ "~(columns=['A','B'])"
+ 'index>df.index[3] & string="bar"'
+ '(index>df.index[3] & index<=df.index[6]) | string="bar"'
+ "ts>=Timestamp('2012-02-01')"
+ "major_axis>=20130101"
+ """
+
+ def __init__(self, where, queryables=None, encoding=None, scope_level=0):
+
+ where = _validate_where(where)
+
+ self.encoding = encoding
+ self.condition = None
+ self.filter = None
+ self.terms = None
+ self._visitor = None
+
+ # capture the environment if needed
+ local_dict = DeepChainMap()
+
+ if isinstance(where, Expr):
+ local_dict = where.env.scope
+ where = where.expr
+
+ elif isinstance(where, (list, tuple)):
+ for idx, w in enumerate(where):
+ if isinstance(w, Expr):
+ local_dict = w.env.scope
+ else:
+ w = _validate_where(w)
+ where[idx] = w
+ where = ' & '.join(map('({})'.format, com.flatten(where))) # noqa
+
+ self.expr = where
+ self.env = Scope(scope_level + 1, local_dict=local_dict)
+
+ if queryables is not None and isinstance(self.expr, string_types):
+ self.env.queryables.update(queryables)
+ self._visitor = ExprVisitor(self.env, queryables=queryables,
+ parser='pytables', engine='pytables',
+ encoding=encoding)
+ self.terms = self.parse()
+
+ def __unicode__(self):
+ if self.terms is not None:
+ return pprint_thing(self.terms)
+ return pprint_thing(self.expr)
+
+ def evaluate(self):
+ """ create and return the numexpr condition and filter """
+
+ try:
+ self.condition = self.terms.prune(ConditionBinOp)
+ except AttributeError:
+ raise ValueError("cannot process expression [{expr}], [{slf}] "
+ "is not a valid condition".format(expr=self.expr,
+ slf=self))
+ try:
+ self.filter = self.terms.prune(FilterBinOp)
+ except AttributeError:
+ raise ValueError("cannot process expression [{expr}], [{slf}] "
+ "is not a valid filter".format(expr=self.expr,
+ slf=self))
+
+ return self.condition, self.filter
+
+
+class TermValue(object):
+
+ """ hold a term value the we use to construct a condition/filter """
+
+ def __init__(self, value, converted, kind):
+ self.value = value
+ self.converted = converted
+ self.kind = kind
+
+ def tostring(self, encoding):
+ """ quote the string if not encoded
+ else encode and return """
+ if self.kind == u'string':
+ if encoding is not None:
+ return self.converted
+ return '"{converted}"'.format(converted=self.converted)
+ elif self.kind == u'float':
+ # python 2 str(float) is not always
+ # round-trippable so use repr()
+ return repr(self.converted)
+ return self.converted
+
+
+def maybe_expression(s):
+ """ loose checking if s is a pytables-acceptable expression """
+ if not isinstance(s, string_types):
+ return False
+ ops = ExprVisitor.binary_ops + ExprVisitor.unary_ops + ('=',)
+
+ # make sure we have an op at least
+ return any(op in s for op in ops)
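+
+# Illustrative sketch of the loose check above (the strings are made up):
+# only the presence of an operator token is tested, so
+#
+# >>> maybe_expression('index>5')
+# True
+# >>> maybe_expression('foo')
+# False
+#
+# Strings that pass are typically wrapped in Expr(...) together with a
+# table's queryables mapping and then evaluate()d into a numexpr condition
+# and/or a post-read filter, as implemented above.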
diff --git a/contrib/python/pandas/py2/pandas/core/computation/scope.py b/contrib/python/pandas/py2/pandas/core/computation/scope.py
new file mode 100644
index 00000000000..33c5a1c2e0f
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/core/computation/scope.py
@@ -0,0 +1,302 @@
+"""
+Module for scope operations
+"""
+
+import datetime
+import inspect
+import itertools
+import pprint
+import struct
+import sys
+
+import numpy as np
+
+from pandas.compat import DeepChainMap, StringIO, map
+
+import pandas as pd # noqa
+from pandas.core.base import StringMixin
+import pandas.core.computation as compu
+
+
+def _ensure_scope(level, global_dict=None, local_dict=None, resolvers=(),
+ target=None, **kwargs):
+ """Ensure that we are grabbing the correct scope."""
+ return Scope(level + 1, global_dict=global_dict, local_dict=local_dict,
+ resolvers=resolvers, target=target)
+
+
+def _replacer(x):
+ """Replace a number with its hexadecimal representation. Used to tag
+ temporary variables with their calling scope's id.
+ """
+ # get the hex repr of the binary char and remove 0x and pad by pad_size
+ # zeros
+ try:
+ hexin = ord(x)
+ except TypeError:
+ # bytes literals masquerade as ints when iterating in py3
+ hexin = x
+
+ return hex(hexin)
+
+
+def _raw_hex_id(obj):
+ """Return the padded hexadecimal id of ``obj``."""
+ # interpret as a pointer since that's really what id returns
+ packed = struct.pack('@P', id(obj))
+ return ''.join(map(_replacer, packed))
+
+
+_DEFAULT_GLOBALS = {
+ 'Timestamp': pd._libs.tslib.Timestamp,
+ 'datetime': datetime.datetime,
+ 'True': True,
+ 'False': False,
+ 'list': list,
+ 'tuple': tuple,
+ 'inf': np.inf,
+ 'Inf': np.inf,
+}
+
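+# These defaults are merged into every Scope, which is why names like 'inf'
+# or 'Timestamp' resolve inside query/eval expressions without being defined
+# by the caller. A small illustrative check:
+#
+# >>> import pandas as pd
+# >>> pd.eval('1 + inf') # 'inf' is supplied by _DEFAULT_GLOBALS
+# inf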
+
+def _get_pretty_string(obj):
+ """Return a prettier version of obj
+
+ Parameters
+ ----------
+ obj : object
+ Object to pretty print
+
+ Returns
+ -------
+ s : str
+ Pretty print object repr
+ """
+ sio = StringIO()
+ pprint.pprint(obj, stream=sio)
+ return sio.getvalue()
+
+
+class Scope(StringMixin):
+
+ """Object to hold scope, with a few bells to deal with some custom syntax
+ and contexts added by pandas.
+
+ Parameters
+ ----------
+ level : int
+ global_dict : dict or None, optional, default None
+ local_dict : dict or Scope or None, optional, default None
+ resolvers : list-like or None, optional, default None
+ target : object
+
+ Attributes
+ ----------
+ level : int
+ scope : DeepChainMap
+ target : object
+ temps : dict
+ """
+ __slots__ = 'level', 'scope', 'target', 'temps'
+
+ def __init__(self, level, global_dict=None, local_dict=None, resolvers=(),
+ target=None):
+ self.level = level + 1
+
+ # shallow copy because we don't want to keep filling this up with what
+ # was there before if there are multiple calls to Scope/_ensure_scope
+ self.scope = DeepChainMap(_DEFAULT_GLOBALS.copy())
+ self.target = target
+
+ if isinstance(local_dict, Scope):
+ self.scope.update(local_dict.scope)
+ if local_dict.target is not None:
+ self.target = local_dict.target
+ self.update(local_dict.level)
+
+ frame = sys._getframe(self.level)
+
+ try:
+ # shallow copy here because we don't want to replace what's in
+ # scope when we align terms (alignment accesses the underlying
+ # numpy array of pandas objects)
+ self.scope = self.scope.new_child((global_dict or
+ frame.f_globals).copy())
+ if not isinstance(local_dict, Scope):
+ self.scope = self.scope.new_child((local_dict or
+ frame.f_locals).copy())
+ finally:
+ del frame
+
+ # assumes that resolvers are going from outermost scope to inner
+ if isinstance(local_dict, Scope):
+ resolvers += tuple(local_dict.resolvers.maps)
+ self.resolvers = DeepChainMap(*resolvers)
+ self.temps = {}
+
+ def __unicode__(self):
+ scope_keys = _get_pretty_string(list(self.scope.keys()))
+ res_keys = _get_pretty_string(list(self.resolvers.keys()))
+ unicode_str = '{name}(scope={scope_keys}, resolvers={res_keys})'
+ return unicode_str.format(name=type(self).__name__,
+ scope_keys=scope_keys,
+ res_keys=res_keys)
+
+ @property
+ def has_resolvers(self):
+ """Return whether we have any extra scope.
+
+ For example, DataFrames pass their columns as resolvers during calls to
+ ``DataFrame.eval()`` and ``DataFrame.query()``.
+
+ Returns
+ -------
+ hr : bool
+ """
+ return bool(len(self.resolvers))
+
+ def resolve(self, key, is_local):
+ """Resolve a variable name in a possibly local context
+
+ Parameters
+ ----------
+ key : text_type
+ A variable name
+ is_local : bool
+ Flag indicating whether the variable is local or not (prefixed with
+ the '@' symbol)
+
+ Returns
+ -------
+ value : object
+ The value of a particular variable
+ """
+ try:
+ # only look for locals in outer scope
+ if is_local:
+ return self.scope[key]
+
+ # not a local variable so check in resolvers if we have them
+ if self.has_resolvers:
+ return self.resolvers[key]
+
+ # if we're here that means that we have no locals and we also have
+ # no resolvers
+ assert not is_local and not self.has_resolvers
+ return self.scope[key]
+ except KeyError:
+ try:
+ # last ditch effort we look in temporaries
+ # these are created when parsing indexing expressions
+ # e.g., df[df > 0]
+ return self.temps[key]
+ except KeyError:
+ raise compu.ops.UndefinedVariableError(key, is_local)
+
+ def swapkey(self, old_key, new_key, new_value=None):
+ """Replace a variable name, with a potentially new value.
+
+ Parameters
+ ----------
+ old_key : str
+ Current variable name to replace
+ new_key : str
+ New variable name to replace `old_key` with
+ new_value : object
+ Value to be replaced along with the possible renaming
+ """
+ if self.has_resolvers:
+ maps = self.resolvers.maps + self.scope.maps
+ else:
+ maps = self.scope.maps
+
+ maps.append(self.temps)
+
+ for mapping in maps:
+ if old_key in mapping:
+ mapping[new_key] = new_value
+ return
+
+ def _get_vars(self, stack, scopes):
+ """Get specifically scoped variables from a list of stack frames.
+
+ Parameters
+ ----------
+ stack : list
+ A list of stack frames as returned by ``inspect.stack()``
+ scopes : sequence of strings
+ A sequence containing valid stack frame attribute names that
+ evaluate to a dictionary. For example, ('locals', 'globals')
+ """
+ variables = itertools.product(scopes, stack)
+ for scope, (frame, _, _, _, _, _) in variables:
+ try:
+ d = getattr(frame, 'f_' + scope)
+ self.scope = self.scope.new_child(d)
+ finally:
+ # won't remove it, but DECREF it
+ # in Py3 this probably isn't necessary since frame won't be
+ # in scope after the loop
+ del frame
+
+ def update(self, level):
+ """Update the current scope by going back `level` levels.
+
+ Parameters
+ ----------
+ level : int or None, optional, default None
+ """
+ sl = level + 1
+
+ # add sl frames to the scope starting with the
+ # most distant and overwriting with more current
+ # makes sure that we can capture variable scope
+ stack = inspect.stack()
+
+ try:
+ self._get_vars(stack[:sl], scopes=['locals'])
+ finally:
+ del stack[:], stack
+
+ def add_tmp(self, value):
+ """Add a temporary variable to the scope.
+
+ Parameters
+ ----------
+ value : object
+ An arbitrary object to be assigned to a temporary variable.
+
+ Returns
+ -------
+ name : basestring
+ The name of the temporary variable created.
+ """
+ name = '{name}_{num}_{hex_id}'.format(name=type(value).__name__,
+ num=self.ntemps,
+ hex_id=_raw_hex_id(self))
+
+ # add to inner most scope
+ assert name not in self.temps
+ self.temps[name] = value
+ assert name in self.temps
+
+ # only increment if the variable gets put in the scope
+ return name
+
+ @property
+ def ntemps(self):
+ """The number of temporary variables in this scope"""
+ return len(self.temps)
+
+ @property
+ def full_scope(self):
+ """Return the full scope for use with passing to engines transparently
+ as a mapping.
+
+ Returns
+ -------
+ vars : DeepChainMap
+ All variables in this scope.
+ """
+ maps = [self.temps] + self.resolvers.maps + self.scope.maps
+ return DeepChainMap(*maps)
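+
+# A small usage sketch (illustrative only): temporaries added via add_tmp
+# are found by resolve() as a last resort, after the captured local/global
+# scope and any resolvers.
+#
+# >>> s = Scope(level=0)
+# >>> name = s.add_tmp([1, 2, 3])
+# >>> s.resolve(name, is_local=False)
+# [1, 2, 3]
+# >>> s.ntemps
+# 1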
diff --git a/contrib/python/pandas/py2/pandas/core/config.py b/contrib/python/pandas/py2/pandas/core/config.py
new file mode 100644
index 00000000000..0f43ca65d18
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/core/config.py
@@ -0,0 +1,837 @@
+"""
+The config module holds package-wide configurables and provides
+a uniform API for working with them.
+
+Overview
+========
+
+This module supports the following requirements:
+- options are referenced using keys in dot.notation, e.g. "x.y.option - z".
+- keys are case-insensitive.
+- functions should accept partial/regex keys, when unambiguous.
+- options can be registered by modules at import time.
+- options can be registered at init-time (via core.config_init)
+- options have a default value, and (optionally) a description and
+ validation function associated with them.
+- options can be deprecated, in which case referencing them
+ should produce a warning.
+- deprecated options can optionally be rerouted to a replacement
+ so that accessing a deprecated option reroutes to a differently
+ named option.
+- options can be reset to their default value.
+- all options can be reset to their default value at once.
+- all options in a certain sub-namespace can be reset at once.
+- the user can set / get / reset or ask for the description of an option.
+- a developer can register and mark an option as deprecated.
+- you can register a callback to be invoked when the option value
+ is set or reset. Changing the stored value is considered misuse, but
+ is not verboten.
+
+Implementation
+==============
+
+- Data is stored using nested dictionaries, and should be accessed
+ through the provided API.
+
+- "Registered options" and "Deprecated options" have metadata associated
+ with them, which are stored in auxiliary dictionaries keyed on the
+ fully-qualified key, e.g. "x.y.z.option".
+
+- the config_init module is imported by the package's __init__.py file.
+ placing any register_option() calls there will ensure those options
+ are available as soon as pandas is loaded. If you use register_option
+ in a module, it will only be available after that module is imported,
+ which you should be aware of.
+
+- `config_prefix` is a context_manager (for use with the `with` keyword)
+ which can save developers some typing, see the docstring.
+
+"""
+
+from collections import namedtuple
+from contextlib import contextmanager
+import re
+import warnings
+
+import pandas.compat as compat
+from pandas.compat import lmap, map, u
+
+DeprecatedOption = namedtuple('DeprecatedOption', 'key msg rkey removal_ver')
+RegisteredOption = namedtuple('RegisteredOption',
+ 'key defval doc validator cb')
+
+_deprecated_options = {} # holds deprecated option metadata
+_registered_options = {} # holds registered option metadata
+_global_config = {} # holds the current values for registered options
+_reserved_keys = ['all'] # keys which have a special meaning
+
+
+class OptionError(AttributeError, KeyError):
+ """Exception for pandas.options, backwards compatible with KeyError
+ checks
+ """
+
+#
+# User API
+
+
+def _get_single_key(pat, silent):
+ keys = _select_options(pat)
+ if len(keys) == 0:
+ if not silent:
+ _warn_if_deprecated(pat)
+ raise OptionError('No such keys(s): {pat!r}'.format(pat=pat))
+ if len(keys) > 1:
+ raise OptionError('Pattern matched multiple keys')
+ key = keys[0]
+
+ if not silent:
+ _warn_if_deprecated(key)
+
+ key = _translate_key(key)
+
+ return key
+
+
+def _get_option(pat, silent=False):
+ key = _get_single_key(pat, silent)
+
+ # walk the nested dict
+ root, k = _get_root(key)
+ return root[k]
+
+
+def _set_option(*args, **kwargs):
+ # must have at least 1 arg; deal with constraints later
+ nargs = len(args)
+ if not nargs or nargs % 2 != 0:
+ raise ValueError("Must provide an even number of non-keyword "
+ "arguments")
+
+ # default to false
+ silent = kwargs.pop('silent', False)
+
+ if kwargs:
+ msg = '_set_option() got an unexpected keyword argument "{kwarg}"'
+ raise TypeError(msg.format(kwarg=list(kwargs.keys())[0]))
+
+ for k, v in zip(args[::2], args[1::2]):
+ key = _get_single_key(k, silent)
+
+ o = _get_registered_option(key)
+ if o and o.validator:
+ o.validator(v)
+
+ # walk the nested dict
+ root, k = _get_root(key)
+ root[k] = v
+
+ if o.cb:
+ if silent:
+ with warnings.catch_warnings(record=True):
+ o.cb(key)
+ else:
+ o.cb(key)
+
+
+def _describe_option(pat='', _print_desc=True):
+
+ keys = _select_options(pat)
+ if len(keys) == 0:
+ raise OptionError('No such keys(s)')
+
+ s = u('')
+ for k in keys: # filter by pat
+ s += _build_option_description(k)
+
+ if _print_desc:
+ print(s)
+ else:
+ return s
+
+
+def _reset_option(pat, silent=False):
+
+ keys = _select_options(pat)
+
+ if len(keys) == 0:
+ raise OptionError('No such keys(s)')
+
+ if len(keys) > 1 and len(pat) < 4 and pat != 'all':
+ raise ValueError('You must specify at least 4 characters when '
+ 'resetting multiple keys, use the special keyword '
+ '"all" to reset all the options to their default '
+ 'value')
+
+ for k in keys:
+ _set_option(k, _registered_options[k].defval, silent=silent)
+
+
+def get_default_val(pat):
+ key = _get_single_key(pat, silent=True)
+ return _get_registered_option(key).defval
+
+
+class DictWrapper(object):
+ """ provide attribute-style access to a nested dict"""
+
+ def __init__(self, d, prefix=""):
+ object.__setattr__(self, "d", d)
+ object.__setattr__(self, "prefix", prefix)
+
+ def __setattr__(self, key, val):
+ prefix = object.__getattribute__(self, "prefix")
+ if prefix:
+ prefix += "."
+ prefix += key
+ # you can't set new keys
+ # and you can't overwrite subtrees
+ if key in self.d and not isinstance(self.d[key], dict):
+ _set_option(prefix, val)
+ else:
+ raise OptionError("You can only set the value of existing options")
+
+ def __getattr__(self, key):
+ prefix = object.__getattribute__(self, "prefix")
+ if prefix:
+ prefix += "."
+ prefix += key
+ try:
+ v = object.__getattribute__(self, "d")[key]
+ except KeyError:
+ raise OptionError("No such option")
+ if isinstance(v, dict):
+ return DictWrapper(v, prefix)
+ else:
+ return _get_option(prefix)
+
+ def __dir__(self):
+ return list(self.d.keys())
+
+# For user convenience, we'd like to have the available options described
+# in the docstring. For dev convenience we'd like to generate the docstrings
+# dynamically instead of maintaining them by hand. To this end, we use the
+# class below which wraps functions inside a callable, and converts
+# __doc__ into a property function. The docstrings below are templates
+# using the py2.6+ advanced formatting syntax to plug in a concise list
+# of options, and option descriptions.
+
+
+class CallableDynamicDoc(object):
+
+ def __init__(self, func, doc_tmpl):
+ self.__doc_tmpl__ = doc_tmpl
+ self.__func__ = func
+
+ def __call__(self, *args, **kwds):
+ return self.__func__(*args, **kwds)
+
+ @property
+ def __doc__(self):
+ opts_desc = _describe_option('all', _print_desc=False)
+ opts_list = pp_options_list(list(_registered_options.keys()))
+ return self.__doc_tmpl__.format(opts_desc=opts_desc,
+ opts_list=opts_list)
+
+
+_get_option_tmpl = """
+get_option(pat)
+
+Retrieves the value of the specified option.
+
+Available options:
+
+{opts_list}
+
+Parameters
+----------
+pat : str
+ Regexp which should match a single option.
+ Note: partial matches are supported for convenience, but unless you use the
+ full option name (e.g. x.y.z.option_name), your code may break in future
+ versions if new options with similar names are introduced.
+
+Returns
+-------
+result : the value of the option
+
+Raises
+------
+OptionError : if no such option exists
+
+Notes
+-----
+The available options with their descriptions:
+
+{opts_desc}
+"""
+
+_set_option_tmpl = """
+set_option(pat, value)
+
+Sets the value of the specified option.
+
+Available options:
+
+{opts_list}
+
+Parameters
+----------
+pat : str
+ Regexp which should match a single option.
+ Note: partial matches are supported for convenience, but unless you use the
+ full option name (e.g. x.y.z.option_name), your code may break in future
+ versions if new options with similar names are introduced.
+value :
+ new value of option.
+
+Returns
+-------
+None
+
+Raises
+------
+OptionError if no such option exists
+
+Notes
+-----
+The available options with their descriptions:
+
+{opts_desc}
+"""
+
+_describe_option_tmpl = """
+describe_option(pat, _print_desc=True)
+
+Prints the description for one or more registered options.
+
+Call with no arguments to get a listing for all registered options.
+
+Available options:
+
+{opts_list}
+
+Parameters
+----------
+pat : str
+ Regexp pattern. All matching keys will have their description displayed.
+_print_desc : bool, default True
+ If True (default) the description(s) will be printed to stdout.
+ Otherwise, the description(s) will be returned as a unicode string
+ (for testing).
+
+Returns
+-------
+None by default, the description(s) as a unicode string if _print_desc
+is False
+
+Notes
+-----
+The available options with their descriptions:
+
+{opts_desc}
+"""
+
+_reset_option_tmpl = """
+reset_option(pat)
+
+Reset one or more options to their default value.
+
+Pass "all" as argument to reset all options.
+
+Available options:
+
+{opts_list}
+
+Parameters
+----------
+pat : str/regex
+ If specified only options matching `prefix*` will be reset.
+ Note: partial matches are supported for convenience, but unless you
+ use the full option name (e.g. x.y.z.option_name), your code may break
+ in future versions if new options with similar names are introduced.
+
+Returns
+-------
+None
+
+Notes
+-----
+The available options with their descriptions:
+
+{opts_desc}
+"""
+
+# bind the functions with their docstrings into a Callable
+# and use that as the functions exposed in pd.api
+get_option = CallableDynamicDoc(_get_option, _get_option_tmpl)
+set_option = CallableDynamicDoc(_set_option, _set_option_tmpl)
+reset_option = CallableDynamicDoc(_reset_option, _reset_option_tmpl)
+describe_option = CallableDynamicDoc(_describe_option, _describe_option_tmpl)
+options = DictWrapper(_global_config)
+
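+# Example of the resulting user-facing API (the options referenced here are
+# registered later, in pandas/core/config_init.py):
+#
+# >>> import pandas as pd
+# >>> pd.get_option('display.max_rows')
+# 60
+# >>> pd.set_option('display.max_rows', 100)
+# >>> pd.options.display.max_rows # attribute-style access via DictWrapper
+# 100
+# >>> pd.reset_option('display.max_rows')
+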
+#
+# Functions for use by pandas developers, in addition to the user API
+
+
+class option_context(object):
+ """
+ Context manager to temporarily set options in the `with` statement context.
+
+ You need to invoke as ``option_context(pat, val, [(pat, val), ...])``.
+
+ Examples
+ --------
+
+ >>> with option_context('display.max_rows', 10, 'display.max_columns', 5):
+ ... ...
+ """
+
+ def __init__(self, *args):
+ if not (len(args) % 2 == 0 and len(args) >= 2):
+ raise ValueError('Need to invoke as'
+ ' option_context(pat, val, [(pat, val), ...]).')
+
+ self.ops = list(zip(args[::2], args[1::2]))
+
+ def __enter__(self):
+ self.undo = [(pat, _get_option(pat, silent=True))
+ for pat, val in self.ops]
+
+ for pat, val in self.ops:
+ _set_option(pat, val, silent=True)
+
+ def __exit__(self, *args):
+ if self.undo:
+ for pat, val in self.undo:
+ _set_option(pat, val, silent=True)
+
+
+def register_option(key, defval, doc='', validator=None, cb=None):
+ """Register an option in the package-wide pandas config object
+
+ Parameters
+ ----------
+ key - a fully-qualified key, e.g. "x.y.option - z".
+ defval - the default value of the option
+ doc - a string description of the option
+ validator - a function of a single argument, should raise `ValueError` if
+ called with a value which is not a legal value for the option.
+ cb - a function of a single argument "key", which is called
+ immediately after an option value is set/reset. key is
+ the full name of the option.
+
+ Returns
+ -------
+ Nothing.
+
+ Raises
+ ------
+ ValueError if `validator` is specified and `defval` is not a valid value.
+
+ """
+ import tokenize
+ import keyword
+ key = key.lower()
+
+ if key in _registered_options:
+ msg = "Option '{key}' has already been registered"
+ raise OptionError(msg.format(key=key))
+ if key in _reserved_keys:
+ msg = "Option '{key}' is a reserved key"
+ raise OptionError(msg.format(key=key))
+
+ # the default value should be legal
+ if validator:
+ validator(defval)
+
+ # walk the nested dict, creating dicts as needed along the path
+ path = key.split('.')
+
+ for k in path:
+ if not bool(re.match('^' + tokenize.Name + '$', k)):
+ raise ValueError("{k} is not a valid identifier".format(k=k))
+ if keyword.iskeyword(k):
+ raise ValueError("{k} is a python keyword".format(k=k))
+
+ cursor = _global_config
+ msg = "Path prefix to option '{option}' is already an option"
+ for i, p in enumerate(path[:-1]):
+ if not isinstance(cursor, dict):
+ raise OptionError(msg.format(option='.'.join(path[:i])))
+ if p not in cursor:
+ cursor[p] = {}
+ cursor = cursor[p]
+
+ if not isinstance(cursor, dict):
+ raise OptionError(msg.format(option='.'.join(path[:-1])))
+
+ cursor[path[-1]] = defval # initialize
+
+ # save the option metadata
+ _registered_options[key] = RegisteredOption(key=key, defval=defval,
+ doc=doc, validator=validator,
+ cb=cb)
+
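+# A minimal sketch of registering a custom option; the key
+# 'myproject.verbose' is purely hypothetical:
+#
+# >>> import pandas.core.config as cf
+# >>> cf.register_option('myproject.verbose', False, 'emit extra output',
+# ... validator=cf.is_bool)
+# >>> cf.get_option('myproject.verbose')
+# False
+#
+# Setting the option to a non-bool value afterwards raises ValueError from
+# the validator.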
+
+def deprecate_option(key, msg=None, rkey=None, removal_ver=None):
+ """
+ Mark option `key` as deprecated. If code attempts to access this option,
+ a warning will be produced, using `msg` if given, or a default message
+ if not.
+ If `rkey` is given, any access to the key will be re-routed to `rkey`.
+
+ Neither the existence of `key` nor that of `rkey` is checked. If they
+ do not exist, any subsequent access will fail as usual, after the
+ deprecation warning is given.
+
+ Parameters
+ ----------
+ key - the name of the option to be deprecated. must be a fully-qualified
+ option name (e.g "x.y.z.rkey").
+
+ msg - (Optional) a warning message to output when the key is referenced.
+ if no message is given a default message will be emitted.
+
+ rkey - (Optional) the name of an option to reroute access to.
+ If specified, any referenced `key` will be re-routed to `rkey`
+ including set/get/reset.
+ rkey must be a fully-qualified option name (e.g "x.y.z.rkey").
+ used by the default message if no `msg` is specified.
+
+ removal_ver - (Optional) specifies the version in which this option will
+ be removed. used by the default message if no `msg`
+ is specified.
+
+ Returns
+ -------
+ Nothing
+
+ Raises
+ ------
+ OptionError - if key has already been deprecated.
+
+ """
+
+ key = key.lower()
+
+ if key in _deprecated_options:
+ msg = "Option '{key}' has already been defined as deprecated."
+ raise OptionError(msg.format(key=key))
+
+ _deprecated_options[key] = DeprecatedOption(key, msg, rkey, removal_ver)
+
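+# Sketch of the deprecation flow, using a key that core/config_init.py
+# actually deprecates: 'html.border' is re-routed to 'display.html.border'.
+#
+# >>> import pandas.core.config as cf
+# >>> value = cf.get_option('html.border')
+#
+# The call warns (and prints `msg` when one was registered, see
+# _warn_if_deprecated below) and then returns the value of the replacement
+# key via _translate_key.
+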
+#
+# functions internal to the module
+
+
+def _select_options(pat):
+ """returns a list of keys matching `pat`
+
+ if pat=="all", returns all registered options
+ """
+
+ # short-circuit for exact key
+ if pat in _registered_options:
+ return [pat]
+
+ # else look through all of them
+ keys = sorted(_registered_options.keys())
+ if pat == 'all': # reserved key
+ return keys
+
+ return [k for k in keys if re.search(pat, k, re.I)]
+
+
+def _get_root(key):
+ path = key.split('.')
+ cursor = _global_config
+ for p in path[:-1]:
+ cursor = cursor[p]
+ return cursor, path[-1]
+
+
+def _is_deprecated(key):
+ """ Returns True if the given option has been deprecated """
+
+ key = key.lower()
+ return key in _deprecated_options
+
+
+def _get_deprecated_option(key):
+ """
+ Retrieves the metadata for a deprecated option, if `key` is deprecated.
+
+ Returns
+ -------
+ DeprecatedOption (namedtuple) if key is deprecated, None otherwise
+ """
+
+ try:
+ d = _deprecated_options[key]
+ except KeyError:
+ return None
+ else:
+ return d
+
+
+def _get_registered_option(key):
+ """
+ Retrieves the option metadata if `key` is a registered option.
+
+ Returns
+ -------
+ RegisteredOption (namedtuple) if key is a registered option, None otherwise
+ """
+ return _registered_options.get(key)
+
+
+def _translate_key(key):
+ """
+ if key is deprecated and a replacement key is defined, will return the
+ replacement key, otherwise returns `key` as-is
+ """
+
+ d = _get_deprecated_option(key)
+ if d:
+ return d.rkey or key
+ else:
+ return key
+
+
+def _warn_if_deprecated(key):
+ """
+ Checks if `key` is a deprecated option and if so, prints a warning.
+
+ Returns
+ -------
+ bool - True if `key` is deprecated, False otherwise.
+ """
+
+ d = _get_deprecated_option(key)
+ if d:
+ if d.msg:
+ print(d.msg)
+ warnings.warn(d.msg, FutureWarning)
+ else:
+ msg = "'{key}' is deprecated".format(key=key)
+ if d.removal_ver:
+ msg += (' and will be removed in {version}'
+ .format(version=d.removal_ver))
+ if d.rkey:
+ msg += ", please use '{rkey}' instead.".format(rkey=d.rkey)
+ else:
+ msg += ', please refrain from using it.'
+
+ warnings.warn(msg, FutureWarning)
+ return True
+ return False
+
+
+def _build_option_description(k):
+ """ Builds a formatted description of a registered option and prints it """
+
+ o = _get_registered_option(k)
+ d = _get_deprecated_option(k)
+
+ s = u('{k} ').format(k=k)
+
+ if o.doc:
+ s += '\n'.join(o.doc.strip().split('\n'))
+ else:
+ s += 'No description available.'
+
+ if o:
+ s += (u('\n [default: {default}] [currently: {current}]')
+ .format(default=o.defval, current=_get_option(k, True)))
+
+ if d:
+ s += u('\n (Deprecated')
+ s += (u(', use `{rkey}` instead.')
+ .format(rkey=d.rkey if d.rkey else ''))
+ s += u(')')
+
+ s += '\n\n'
+ return s
+
+
+def pp_options_list(keys, width=80, _print=False):
+ """ Builds a concise listing of available options, grouped by prefix """
+
+ from textwrap import wrap
+ from itertools import groupby
+
+ def pp(name, ks):
+ pfx = ('- ' + name + '.[' if name else '')
+ ls = wrap(', '.join(ks), width, initial_indent=pfx,
+ subsequent_indent=' ', break_long_words=False)
+ if ls and ls[-1] and name:
+ ls[-1] = ls[-1] + ']'
+ return ls
+
+ ls = []
+ singles = [x for x in sorted(keys) if x.find('.') < 0]
+ if singles:
+ ls += pp('', singles)
+ keys = [x for x in keys if x.find('.') >= 0]
+
+ for k, g in groupby(sorted(keys), lambda x: x[:x.rfind('.')]):
+ ks = [x[len(k) + 1:] for x in list(g)]
+ ls += pp(k, ks)
+ s = '\n'.join(ls)
+ if _print:
+ print(s)
+ else:
+ return s
+
+#
+# helpers
+
+
+@contextmanager
+def config_prefix(prefix):
+ """contextmanager for multiple invocations of API with a common prefix
+
+ supported API functions: (register / get / set)_option
+
+ Warning: This is not thread-safe, and won't work properly if you import
+ the API functions into your module using the "from x import y" construct.
+
+ Example:
+
+ import pandas.core.config as cf
+ with cf.config_prefix("display.font"):
+ cf.register_option("color", "red")
+ cf.register_option("size", " 5 pt")
+ cf.set_option("size", " 6 pt")
+ cf.get_option("size")
+ ...
+
+ etc.
+
+ will register options "display.font.color", "display.font.size", set the
+ value of "display.font.size"... and so on.
+ """
+
+ # Note: reset_option relies on set_option, and on key directly
+ # it does not fit in to this monkey-patching scheme
+
+ global register_option, get_option, set_option, reset_option
+
+ def wrap(func):
+ def inner(key, *args, **kwds):
+ pkey = '{prefix}.{key}'.format(prefix=prefix, key=key)
+ return func(pkey, *args, **kwds)
+
+ return inner
+
+ _register_option = register_option
+ _get_option = get_option
+ _set_option = set_option
+ set_option = wrap(set_option)
+ get_option = wrap(get_option)
+ register_option = wrap(register_option)
+ yield None
+ set_option = _set_option
+ get_option = _get_option
+ register_option = _register_option
+
+# These factories and methods are handy for use as the validator
+# arg in register_option
+
+
+def is_type_factory(_type):
+ """
+
+ Parameters
+ ----------
+ `_type` - a type to be compared against (e.g. type(x) == `_type`)
+
+ Returns
+ -------
+ validator - a function of a single argument x , which raises
+ ValueError if type(x) is not equal to `_type`
+
+ """
+
+ def inner(x):
+ if type(x) != _type:
+ msg = "Value must have type '{typ!s}'"
+ raise ValueError(msg.format(typ=_type))
+
+ return inner
+
+
+def is_instance_factory(_type):
+ """
+
+ Parameters
+ ----------
+ `_type` - the type to be checked against
+
+ Returns
+ -------
+ validator - a function of a single argument x , which raises
+ ValueError if x is not an instance of `_type`
+
+ """
+ if isinstance(_type, (tuple, list)):
+ _type = tuple(_type)
+ from pandas.io.formats.printing import pprint_thing
+ type_repr = "|".join(map(pprint_thing, _type))
+ else:
+ type_repr = "'{typ}'".format(typ=_type)
+
+ def inner(x):
+ if not isinstance(x, _type):
+ msg = "Value must be an instance of {type_repr}"
+ raise ValueError(msg.format(type_repr=type_repr))
+
+ return inner
+
+
+def is_one_of_factory(legal_values):
+
+ callables = [c for c in legal_values if callable(c)]
+ legal_values = [c for c in legal_values if not callable(c)]
+
+ def inner(x):
+ from pandas.io.formats.printing import pprint_thing as pp
+ if x not in legal_values:
+
+ if not any(c(x) for c in callables):
+ pp_values = pp("|".join(lmap(pp, legal_values)))
+ msg = "Value must be one of {pp_values}"
+ if len(callables):
+ msg += " or a callable"
+ raise ValueError(msg.format(pp_values=pp_values))
+
+ return inner
+
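+# Sketch of how the validator factories behave (the legal values below are
+# illustrative):
+#
+# >>> v = is_one_of_factory(['warn', 'raise', None])
+# >>> v('warn') # accepted, returns None
+# >>> v('ignore') # not in the legal set -> raises ValueError
+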
+
+# common type validators, for convenience
+# usage: register_option(... , validator = is_int)
+is_int = is_type_factory(int)
+is_bool = is_type_factory(bool)
+is_float = is_type_factory(float)
+is_str = is_type_factory(str)
+is_unicode = is_type_factory(compat.text_type)
+is_text = is_instance_factory((str, bytes))
+
+
+def is_callable(obj):
+ """
+
+ Parameters
+ ----------
+ `obj` - the object to be checked
+
+ Returns
+ -------
+ validator - returns True if object is callable
+ raises ValueError otherwise.
+
+ """
+ if not callable(obj):
+ raise ValueError("Value must be a callable")
+ return True
diff --git a/contrib/python/pandas/py2/pandas/core/config_init.py b/contrib/python/pandas/py2/pandas/core/config_init.py
new file mode 100644
index 00000000000..d42a1ab72b1
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/core/config_init.py
@@ -0,0 +1,507 @@
+"""
+This module is imported from the pandas package __init__.py file
+in order to ensure that the core.config options registered here will
+be available as soon as the user loads the package. If register_option
+is invoked inside specific modules, the options will not be registered until
+that module is imported, which may or may not be a problem.
+
+If you need to make sure options are available even before a certain
+module is imported, register them here rather than in the module.
+
+"""
+import pandas.core.config as cf
+from pandas.core.config import (
+ is_bool, is_callable, is_instance_factory, is_int, is_one_of_factory,
+ is_text)
+
+from pandas.io.formats.console import detect_console_encoding
+from pandas.io.formats.terminal import is_terminal
+
+# compute
+
+use_bottleneck_doc = """
+: bool
+ Use the bottleneck library to accelerate if it is installed,
+ the default is True
+ Valid values: False,True
+"""
+
+
+def use_bottleneck_cb(key):
+ from pandas.core import nanops
+ nanops.set_use_bottleneck(cf.get_option(key))
+
+
+use_numexpr_doc = """
+: bool
+ Use the numexpr library to accelerate computation if it is installed,
+ the default is True
+ Valid values: False,True
+"""
+
+
+def use_numexpr_cb(key):
+ from pandas.core.computation import expressions
+ expressions.set_use_numexpr(cf.get_option(key))
+
+
+with cf.config_prefix('compute'):
+ cf.register_option('use_bottleneck', True, use_bottleneck_doc,
+ validator=is_bool, cb=use_bottleneck_cb)
+ cf.register_option('use_numexpr', True, use_numexpr_doc,
+ validator=is_bool, cb=use_numexpr_cb)
+#
+# options from the "display" namespace
+
+pc_precision_doc = """
+: int
+ Floating point output precision (number of significant digits). This is
+ only a suggestion
+"""
+
+pc_colspace_doc = """
+: int
+ Default space for DataFrame columns.
+"""
+
+pc_max_rows_doc = """
+: int
+ If max_rows is exceeded, switch to truncate view. Depending on
+ `large_repr`, objects are either centrally truncated or printed as
+ a summary view. 'None' value means unlimited.
+
+ In case python/IPython is running in a terminal and `large_repr`
+ equals 'truncate' this can be set to 0 and pandas will auto-detect
+ the height of the terminal and print a truncated object which fits
+ the screen height. The IPython notebook, IPython qtconsole, or
+ IDLE do not run in a terminal and hence it is not possible to do
+ correct auto-detection.
+"""
+
+pc_max_cols_doc = """
+: int
+ If max_cols is exceeded, switch to truncate view. Depending on
+ `large_repr`, objects are either centrally truncated or printed as
+ a summary view. 'None' value means unlimited.
+
+ In case python/IPython is running in a terminal and `large_repr`
+ equals 'truncate' this can be set to 0 and pandas will auto-detect
+ the width of the terminal and print a truncated object which fits
+ the screen width. The IPython notebook, IPython qtconsole, or IDLE
+ do not run in a terminal and hence it is not possible to do
+ correct auto-detection.
+"""
+
+pc_max_categories_doc = """
+: int
+ This sets the maximum number of categories pandas should output when
+ printing out a `Categorical` or a Series of dtype "category".
+"""
+
+pc_max_info_cols_doc = """
+: int
+ max_info_columns is used in DataFrame.info method to decide if
+ per column information will be printed.
+"""
+
+pc_nb_repr_h_doc = """
+: boolean
+ When True, IPython notebook will use html representation for
+ pandas objects (if it is available).
+"""
+
+pc_date_dayfirst_doc = """
+: boolean
+ When True, prints and parses dates with the day first, eg 20/01/2005
+"""
+
+pc_date_yearfirst_doc = """
+: boolean
+ When True, prints and parses dates with the year first, eg 2005/01/20
+"""
+
+pc_pprint_nest_depth = """
+: int
+ Controls the number of nested levels to process when pretty-printing
+"""
+
+pc_multi_sparse_doc = """
+: boolean
+ "sparsify" MultiIndex display (don't display repeated
+ elements in outer levels within groups)
+"""
+
+pc_encoding_doc = """
+: str/unicode
+ Defaults to the detected encoding of the console.
+ Specifies the encoding to be used for strings returned by to_string,
+ these are generally strings meant to be displayed on the console.
+"""
+
+float_format_doc = """
+: callable
+ The callable should accept a floating point number and return
+ a string with the desired format of the number. This is used
+ in some places like SeriesFormatter.
+ See formats.format.EngFormatter for an example.
+"""
+
+max_colwidth_doc = """
+: int
+ The maximum width in characters of a column in the repr of
+ a pandas data structure. When the column overflows, a "..."
+ placeholder is embedded in the output.
+"""
+
+colheader_justify_doc = """
+: 'left'/'right'
+ Controls the justification of column headers. used by DataFrameFormatter.
+"""
+
+pc_expand_repr_doc = """
+: boolean
+ Whether to print out the full DataFrame repr for wide DataFrames across
+ multiple lines, `max_columns` is still respected, but the output will
+ wrap-around across multiple "pages" if its width exceeds `display.width`.
+"""
+
+pc_show_dimensions_doc = """
+: boolean or 'truncate'
+ Whether to print out dimensions at the end of DataFrame repr.
+ If 'truncate' is specified, only print out the dimensions if the
+ frame is truncated (e.g. not display all rows and/or columns)
+"""
+
+pc_east_asian_width_doc = """
+: boolean
+ Whether to use the Unicode East Asian Width to calculate the display text
+ width.
+ Enabling this may affect the performance (default: False)
+"""
+
+pc_ambiguous_as_wide_doc = """
+: boolean
+ Whether to handle Unicode characters belonging to Ambiguous as Wide (width=2)
+ (default: False)
+"""
+
+pc_latex_repr_doc = """
+: boolean
+ Whether to produce a latex DataFrame representation for jupyter
+ environments that support it.
+ (default: False)
+"""
+
+pc_table_schema_doc = """
+: boolean
+ Whether to publish a Table Schema representation for frontends
+ that support it.
+ (default: False)
+"""
+
+pc_html_border_doc = """
+: int
+ A ``border=value`` attribute is inserted in the ``<table>`` tag
+ for the DataFrame HTML repr.
+"""
+
+pc_html_border_deprecation_warning = """\
+html.border has been deprecated, use display.html.border instead
+(currently both are identical)
+"""
+
+pc_html_use_mathjax_doc = """\
+: boolean
+ When True, Jupyter notebook will process table contents using MathJax,
+ rendering mathematical expressions enclosed by the dollar symbol.
+ (default: True)
+"""
+
+pc_width_doc = """
+: int
+ Width of the display in characters. In case python/IPython is running in
+ a terminal this can be set to None and pandas will correctly auto-detect
+ the width.
+ Note that the IPython notebook, IPython qtconsole, or IDLE do not run in a
+ terminal and hence it is not possible to correctly detect the width.
+"""
+
+pc_chop_threshold_doc = """
+: float or None
+ if set to a float value, all float values smaller than the given threshold
+ will be displayed as exactly 0 by repr and friends.
+"""
+
+pc_max_seq_items = """
+: int or None
+ when pretty-printing a long sequence, no more than `max_seq_items`
+ will be printed. If items are omitted, they will be denoted by the
+ addition of "..." to the resulting string.
+
+ If set to None, the number of items to be printed is unlimited.
+"""
+
+pc_max_info_rows_doc = """
+: int or None
+ df.info() will usually show null-counts for each column.
+ For large frames this can be quite slow. max_info_rows and max_info_cols
+ limit this null check only to frames with smaller dimensions than
+ specified.
+"""
+
+pc_large_repr_doc = """
+: 'truncate'/'info'
+ For DataFrames exceeding max_rows/max_cols, the repr (and HTML repr) can
+ show a truncated table (the default from 0.13), or switch to the view from
+ df.info() (the behaviour in earlier versions of pandas).
+"""
+
+pc_memory_usage_doc = """
+: bool, string or None
+ This specifies if the memory usage of a DataFrame should be displayed when
+ df.info() is called. Valid values True,False,'deep'
+"""
+
+pc_latex_escape = """
+: bool
+ This specifies if the to_latex method of a Dataframe escapes special
+ characters.
+ Valid values: False,True
+"""
+
+pc_latex_longtable = """
+:bool
+ This specifies if the to_latex method of a Dataframe uses the longtable
+ format.
+ Valid values: False,True
+"""
+
+pc_latex_multicolumn = """
+: bool
+ This specifies if the to_latex method of a Dataframe uses multicolumns
+ to pretty-print MultiIndex columns.
+ Valid values: False,True
+"""
+
+pc_latex_multicolumn_format = """
+: string
+ This specifies the format for multicolumn headers.
+ Can be surrounded with '|'.
+ Valid values: 'l', 'c', 'r', 'p{<width>}'
+"""
+
+pc_latex_multirow = """
+: bool
+ This specifies if the to_latex method of a Dataframe uses multirows
+ to pretty-print MultiIndex rows.
+ Valid values: False,True
+"""
+
+style_backup = dict()
+
+
+def table_schema_cb(key):
+ from pandas.io.formats.printing import _enable_data_resource_formatter
+ _enable_data_resource_formatter(cf.get_option(key))
+
+
+with cf.config_prefix('display'):
+ cf.register_option('precision', 6, pc_precision_doc, validator=is_int)
+ cf.register_option('float_format', None, float_format_doc,
+ validator=is_one_of_factory([None, is_callable]))
+ cf.register_option('column_space', 12, validator=is_int)
+ cf.register_option('max_info_rows', 1690785, pc_max_info_rows_doc,
+ validator=is_instance_factory((int, type(None))))
+ cf.register_option('max_rows', 60, pc_max_rows_doc,
+ validator=is_instance_factory([type(None), int]))
+ cf.register_option('max_categories', 8, pc_max_categories_doc,
+ validator=is_int)
+ cf.register_option('max_colwidth', 50, max_colwidth_doc, validator=is_int)
+ if is_terminal():
+ max_cols = 0 # automatically determine optimal number of columns
+ else:
+ max_cols = 20 # cannot determine optimal number of columns
+ cf.register_option('max_columns', max_cols, pc_max_cols_doc,
+ validator=is_instance_factory([type(None), int]))
+ cf.register_option('large_repr', 'truncate', pc_large_repr_doc,
+ validator=is_one_of_factory(['truncate', 'info']))
+ cf.register_option('max_info_columns', 100, pc_max_info_cols_doc,
+ validator=is_int)
+ cf.register_option('colheader_justify', 'right', colheader_justify_doc,
+ validator=is_text)
+ cf.register_option('notebook_repr_html', True, pc_nb_repr_h_doc,
+ validator=is_bool)
+ cf.register_option('date_dayfirst', False, pc_date_dayfirst_doc,
+ validator=is_bool)
+ cf.register_option('date_yearfirst', False, pc_date_yearfirst_doc,
+ validator=is_bool)
+ cf.register_option('pprint_nest_depth', 3, pc_pprint_nest_depth,
+ validator=is_int)
+ cf.register_option('multi_sparse', True, pc_multi_sparse_doc,
+ validator=is_bool)
+ cf.register_option('encoding', detect_console_encoding(), pc_encoding_doc,
+ validator=is_text)
+ cf.register_option('expand_frame_repr', True, pc_expand_repr_doc)
+ cf.register_option('show_dimensions', 'truncate', pc_show_dimensions_doc,
+ validator=is_one_of_factory([True, False, 'truncate']))
+ cf.register_option('chop_threshold', None, pc_chop_threshold_doc)
+ cf.register_option('max_seq_items', 100, pc_max_seq_items)
+ cf.register_option('width', 80, pc_width_doc,
+ validator=is_instance_factory([type(None), int]))
+ cf.register_option('memory_usage', True, pc_memory_usage_doc,
+ validator=is_one_of_factory([None, True,
+ False, 'deep']))
+ cf.register_option('unicode.east_asian_width', False,
+ pc_east_asian_width_doc, validator=is_bool)
+ cf.register_option('unicode.ambiguous_as_wide', False,
+ pc_ambiguous_as_wide_doc, validator=is_bool)
+ cf.register_option('latex.repr', False,
+ pc_latex_repr_doc, validator=is_bool)
+ cf.register_option('latex.escape', True, pc_latex_escape,
+ validator=is_bool)
+ cf.register_option('latex.longtable', False, pc_latex_longtable,
+ validator=is_bool)
+ cf.register_option('latex.multicolumn', True, pc_latex_multicolumn,
+ validator=is_bool)
+ cf.register_option('latex.multicolumn_format', 'l', pc_latex_multicolumn_format,
+ validator=is_text)
+ cf.register_option('latex.multirow', False, pc_latex_multirow,
+ validator=is_bool)
+ cf.register_option('html.table_schema', False, pc_table_schema_doc,
+ validator=is_bool, cb=table_schema_cb)
+ cf.register_option('html.border', 1, pc_html_border_doc,
+ validator=is_int)
+ cf.register_option('html.use_mathjax', True, pc_html_use_mathjax_doc,
+ validator=is_bool)
+
+with cf.config_prefix('html'):
+ cf.register_option('border', 1, pc_html_border_doc,
+ validator=is_int)
+
+cf.deprecate_option('html.border', msg=pc_html_border_deprecation_warning,
+ rkey='display.html.border')
+
+
+tc_sim_interactive_doc = """
+: boolean
+ Whether to simulate interactive mode for purposes of testing
+"""
+
+with cf.config_prefix('mode'):
+ cf.register_option('sim_interactive', False, tc_sim_interactive_doc)
+
+use_inf_as_null_doc = """
+: boolean
+ use_inf_as_null had been deprecated and will be removed in a future
+ version. Use `use_inf_as_na` instead.
+"""
+
+use_inf_as_na_doc = """
+: boolean
+ True means treat None, NaN, INF, -INF as NA (old way),
+ False means None and NaN are null, but INF, -INF are not NA
+ (new way).
+"""
+
+# We don't want to start importing everything at the global context level
+# or we'll hit circular deps.
+
+
+def use_inf_as_na_cb(key):
+ from pandas.core.dtypes.missing import _use_inf_as_na
+ _use_inf_as_na(key)
+
+
+with cf.config_prefix('mode'):
+ cf.register_option('use_inf_as_na', False, use_inf_as_na_doc,
+ cb=use_inf_as_na_cb)
+ cf.register_option('use_inf_as_null', False, use_inf_as_null_doc,
+ cb=use_inf_as_na_cb)
+
+cf.deprecate_option('mode.use_inf_as_null', msg=use_inf_as_null_doc,
+ rkey='mode.use_inf_as_na')
+
+
+# user warnings
+chained_assignment = """
+: string
+ Raise an exception, warn, or take no action when trying to use chained
+ assignment. The default is warn
+"""
+
+with cf.config_prefix('mode'):
+ cf.register_option('chained_assignment', 'warn', chained_assignment,
+ validator=is_one_of_factory([None, 'warn', 'raise']))
+
+# Set up the io.excel specific configuration.
+writer_engine_doc = """
+: string
+ The default Excel writer engine for '{ext}' files. Available options:
+ auto, {others}.
+"""
+
+_xls_options = ['xlwt']
+_xlsm_options = ['openpyxl']
+_xlsx_options = ['openpyxl', 'xlsxwriter']
+
+
+with cf.config_prefix("io.excel.xls"):
+ cf.register_option("writer", "auto",
+ writer_engine_doc.format(
+ ext='xls',
+ others=', '.join(_xls_options)),
+ validator=str)
+
+with cf.config_prefix("io.excel.xlsm"):
+ cf.register_option("writer", "auto",
+ writer_engine_doc.format(
+ ext='xlsm',
+ others=', '.join(_xlsm_options)),
+ validator=str)
+
+
+with cf.config_prefix("io.excel.xlsx"):
+ cf.register_option("writer", "auto",
+ writer_engine_doc.format(
+ ext='xlsx',
+ others=', '.join(_xlsx_options)),
+ validator=str)
+
+
+# Set up the io.parquet specific configuration.
+parquet_engine_doc = """
+: string
+ The default parquet reader/writer engine. Available options:
+ 'auto', 'pyarrow', 'fastparquet', the default is 'auto'
+"""
+
+with cf.config_prefix('io.parquet'):
+ cf.register_option(
+ 'engine', 'auto', parquet_engine_doc,
+ validator=is_one_of_factory(['auto', 'pyarrow', 'fastparquet']))
+
+# --------
+# Plotting
+# ---------
+
+register_converter_doc = """
+: bool
+ Whether to register converters with matplotlib's units registry for
+ dates, times, datetimes, and Periods. Toggling to False will remove
+ the converters, restoring any converters that pandas overwrote.
+"""
+
+
+def register_converter_cb(key):
+ from pandas.plotting import register_matplotlib_converters
+ from pandas.plotting import deregister_matplotlib_converters
+
+ if cf.get_option(key):
+ register_matplotlib_converters()
+ else:
+ deregister_matplotlib_converters()
+
+
+with cf.config_prefix("plotting.matplotlib"):
+ cf.register_option("register_converters", True, register_converter_doc,
+ validator=bool, cb=register_converter_cb)
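+
+# With the options above registered at import time, they can be read and
+# temporarily overridden through the public API (illustrative usage):
+#
+# >>> import pandas as pd
+# >>> pd.get_option('display.precision')
+# 6
+# >>> with pd.option_context('display.max_rows', 10, 'display.precision', 3):
+# ...     pass # both options are restored when the block exits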
diff --git a/contrib/python/pandas/py2/pandas/core/dtypes/__init__.py b/contrib/python/pandas/py2/pandas/core/dtypes/__init__.py
new file mode 100644
index 00000000000..e69de29bb2d
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/core/dtypes/__init__.py
diff --git a/contrib/python/pandas/py2/pandas/core/dtypes/api.py b/contrib/python/pandas/py2/pandas/core/dtypes/api.py
new file mode 100644
index 00000000000..e9d7b9c4281
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/core/dtypes/api.py
@@ -0,0 +1,14 @@
+# flake8: noqa
+
+from .common import (
+ is_array_like, is_bool, is_bool_dtype, is_categorical,
+ is_categorical_dtype, is_complex, is_complex_dtype,
+ is_datetime64_any_dtype, is_datetime64_dtype, is_datetime64_ns_dtype,
+ is_datetime64tz_dtype, is_datetimetz, is_dict_like, is_dtype_equal,
+ is_extension_array_dtype, is_extension_type, is_file_like, is_float,
+ is_float_dtype, is_hashable, is_int64_dtype, is_integer, is_integer_dtype,
+ is_interval, is_interval_dtype, is_iterator, is_list_like, is_named_tuple,
+ is_number, is_numeric_dtype, is_object_dtype, is_period, is_period_dtype,
+ is_re, is_re_compilable, is_scalar, is_signed_integer_dtype, is_sparse,
+ is_string_dtype, is_timedelta64_dtype, is_timedelta64_ns_dtype,
+ is_unsigned_integer_dtype, pandas_dtype)
diff --git a/contrib/python/pandas/py2/pandas/core/dtypes/base.py b/contrib/python/pandas/py2/pandas/core/dtypes/base.py
new file mode 100644
index 00000000000..ab1cb9cf249
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/core/dtypes/base.py
@@ -0,0 +1,294 @@
+"""Extend pandas with custom array types"""
+import numpy as np
+
+from pandas.errors import AbstractMethodError
+
+from pandas.core.dtypes.generic import ABCDataFrame, ABCIndexClass, ABCSeries
+
+from pandas import compat
+
+
+class _DtypeOpsMixin(object):
+ # Not all of pandas' extension dtypes are compatible with
+ # the new ExtensionArray interface. This means PandasExtensionDtype
+ # can't subclass ExtensionDtype yet, as is_extension_array_dtype would
+ # incorrectly say that these types are extension types.
+ #
+ # In the interim, we put methods that are shared between the two base
+ # classes ExtensionDtype and PandasExtensionDtype here. Both those base
+ # classes will inherit from this Mixin. Once everything is compatible, this
+ # class's methods can be moved to ExtensionDtype and removed.
+
+ # na_value is the default NA value to use for this type. This is used in
+ # e.g. ExtensionArray.take. This should be the user-facing "boxed" version
+ # of the NA value, not the physical NA value for storage.
+ # e.g. for JSONArray, this is an empty dictionary.
+ na_value = np.nan
+ _metadata = ()
+
+ def __eq__(self, other):
+ """Check whether 'other' is equal to self.
+
+ By default, 'other' is considered equal if either
+
+ * it's a string matching 'self.name'.
+ * it's an instance of this type and all of
+ the attributes in ``self._metadata`` are equal between
+ `self` and `other`.
+
+ Parameters
+ ----------
+ other : Any
+
+ Returns
+ -------
+ bool
+ """
+ if isinstance(other, compat.string_types):
+ try:
+ other = self.construct_from_string(other)
+ except TypeError:
+ return False
+ if isinstance(other, type(self)):
+ return all(
+ getattr(self, attr) == getattr(other, attr)
+ for attr in self._metadata
+ )
+ return False
+
+ def __hash__(self):
+ return hash(tuple(getattr(self, attr) for attr in self._metadata))
+
+ def __ne__(self, other):
+ return not self.__eq__(other)
+
+ @property
+ def names(self):
+ # type: () -> Optional[List[str]]
+ """Ordered list of field names, or None if there are no fields.
+
+ This is for compatibility with NumPy arrays, and may be removed in the
+ future.
+ """
+ return None
+
+ @classmethod
+ def is_dtype(cls, dtype):
+ """Check if we match 'dtype'.
+
+ Parameters
+ ----------
+ dtype : object
+ The object to check.
+
+ Returns
+ -------
+ is_dtype : bool
+
+ Notes
+ -----
+ The default implementation is True if
+
+ 1. ``cls.construct_from_string(dtype)`` is an instance
+ of ``cls``.
+ 2. ``dtype`` is an object and is an instance of ``cls``
+ 3. ``dtype`` has a ``dtype`` attribute, and any of the above
+ conditions is true for ``dtype.dtype``.
+ """
+ dtype = getattr(dtype, 'dtype', dtype)
+
+ if isinstance(dtype, (ABCSeries, ABCIndexClass,
+ ABCDataFrame, np.dtype)):
+ # https://github.com/pandas-dev/pandas/issues/22960
+ # avoid passing data to `construct_from_string`. This could
+ # cause a FutureWarning from numpy about failing elementwise
+ # comparison from, e.g., comparing DataFrame == 'category'.
+ return False
+ elif dtype is None:
+ return False
+ elif isinstance(dtype, cls):
+ return True
+ try:
+ return cls.construct_from_string(dtype) is not None
+ except TypeError:
+ return False
+
+ @property
+ def _is_numeric(self):
+ # type: () -> bool
+ """
+ Whether columns with this dtype should be considered numeric.
+
+ By default ExtensionDtypes are assumed to be non-numeric.
+ They'll be excluded from operations that exclude non-numeric
+ columns, like (groupby) reductions, plotting, etc.
+ """
+ return False
+
+ @property
+ def _is_boolean(self):
+ # type: () -> bool
+ """
+ Whether this dtype should be considered boolean.
+
+ By default, ExtensionDtypes are assumed to be non-numeric.
+ Setting this to True will affect the behavior of several places,
+ e.g.
+
+ * is_bool
+ * boolean indexing
+
+ Returns
+ -------
+ bool
+ """
+ return False
+
+
+class ExtensionDtype(_DtypeOpsMixin):
+ """
+ A custom data type, to be paired with an ExtensionArray.
+
+ .. versionadded:: 0.23.0
+
+ See Also
+ --------
+ pandas.api.extensions.register_extension_dtype
+ pandas.api.extensions.ExtensionArray
+
+ Notes
+ -----
+ The interface includes the following abstract methods that must
+ be implemented by subclasses:
+
+ * type
+ * name
+ * construct_from_string
+
+ The following attributes influence the behavior of the dtype in
+ pandas operations
+
+ * _is_numeric
+ * _is_boolean
+
+ Optionally one can override construct_array_type for construction
+ with the name of this dtype via the Registry. See
+ :meth:`pandas.api.extensions.register_extension_dtype`.
+
+ * construct_array_type
+
+ The `na_value` class attribute can be used to set the default NA value
+ for this type. :attr:`numpy.nan` is used by default.
+
+ ExtensionDtypes are required to be hashable. The base class provides
+ a default implementation, which relies on the ``_metadata`` class
+ attribute. ``_metadata`` should be a tuple containing the strings
+ that define your data type. For example, with ``PeriodDtype`` that's
+ the ``freq`` attribute.
+
+ **If you have a parametrized dtype you should set the ``_metadata``
+ class property**.
+
+ Ideally, the attributes in ``_metadata`` will match the
+ parameters to your ``ExtensionDtype.__init__`` (if any). If any of
+ the attributes in ``_metadata`` don't implement the standard
+ ``__eq__`` or ``__hash__``, the default implementations here will not
+ work.
+
+ .. versionchanged:: 0.24.0
+
+ Added ``_metadata``, ``__hash__``, and changed the default definition
+ of ``__eq__``.
+
+ This class does not inherit from 'abc.ABCMeta' for performance reasons.
+ Methods and properties required by the interface raise
+ ``pandas.errors.AbstractMethodError`` and no ``register`` method is
+ provided for registering virtual subclasses.
+ """
+
+ def __str__(self):
+ return self.name
+
+ @property
+ def type(self):
+ # type: () -> type
+ """
+ The scalar type for the array, e.g. ``int``
+
+ It's expected ``ExtensionArray[item]`` returns an instance
+ of ``ExtensionDtype.type`` for scalar ``item``, assuming
+ that value is valid (not NA). NA values do not need to be
+ instances of `type`.
+ """
+ raise AbstractMethodError(self)
+
+ @property
+ def kind(self):
+ # type: () -> str
+ """
+ A character code (one of 'biufcmMOSUV'), default 'O'
+
+ This should match the NumPy dtype used when the array is
+ converted to an ndarray, which is probably 'O' for object if
+ the extension type cannot be represented as a built-in NumPy
+ type.
+
+ See Also
+ --------
+ numpy.dtype.kind
+ """
+ return 'O'
+
+ @property
+ def name(self):
+ # type: () -> str
+ """
+ A string identifying the data type.
+
+ Will be used for display in, e.g. ``Series.dtype``
+ """
+ raise AbstractMethodError(self)
+
+ @classmethod
+ def construct_array_type(cls):
+ """
+ Return the array type associated with this dtype
+
+ Returns
+ -------
+ type
+ """
+ raise NotImplementedError
+
+ @classmethod
+ def construct_from_string(cls, string):
+ """
+ Attempt to construct this type from a string.
+
+ Parameters
+ ----------
+ string : str
+
+ Returns
+ -------
+ self : instance of 'cls'
+
+ Raises
+ ------
+ TypeError
+ If a class cannot be constructed from this 'string'.
+
+ Examples
+ --------
+ If the extension dtype can be constructed without any arguments,
+ the following may be an adequate implementation.
+
+ >>> @classmethod
+ ... def construct_from_string(cls, string):
+ ... if string == cls.name:
+ ... return cls()
+ ... else:
+ ... raise TypeError("Cannot construct a '{}' from "
+ ... "'{}'".format(cls, string))
+ """
+ raise AbstractMethodError(cls)
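+
+
+# Editor's illustration, not part of upstream pandas: a minimal parametrized
+# dtype built on the interface documented in ``ExtensionDtype`` above. The
+# class name ``UnitDtype`` and its ``unit`` parameter are invented for this
+# sketch; ``type``, ``name`` and ``construct_from_string`` are the abstract
+# pieces, and ``_metadata`` is what makes ``__eq__``/``__hash__`` work for a
+# parametrized dtype.
+#
+#     >>> class UnitDtype(ExtensionDtype):
+#     ...     _metadata = ('unit',)
+#     ...     def __init__(self, unit='m'):
+#     ...         self.unit = unit
+#     ...     @property
+#     ...     def type(self):
+#     ...         return float
+#     ...     @property
+#     ...     def name(self):
+#     ...         return 'unit[{}]'.format(self.unit)
+#     ...     @classmethod
+#     ...     def construct_from_string(cls, string):
+#     ...         if string.startswith('unit[') and string.endswith(']'):
+#     ...             return cls(unit=string[5:-1])
+#     ...         raise TypeError("Cannot construct a 'UnitDtype' from "
+#     ...                         "'{}'".format(string))
+#
+#     >>> UnitDtype('s') == 'unit[s]'    # string comparison via _metadata
+#     True
+#     >>> UnitDtype.is_dtype('unit[kg]')
+#     True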
diff --git a/contrib/python/pandas/py2/pandas/core/dtypes/cast.py b/contrib/python/pandas/py2/pandas/core/dtypes/cast.py
new file mode 100644
index 00000000000..ad62146dda2
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/core/dtypes/cast.py
@@ -0,0 +1,1328 @@
+""" routings for casting """
+
+from datetime import datetime, timedelta
+
+import numpy as np
+
+from pandas._libs import lib, tslib, tslibs
+from pandas._libs.tslibs import NaT, OutOfBoundsDatetime, Period, iNaT
+from pandas.compat import PY3, string_types, text_type, to_str
+
+from .common import (
+ _INT64_DTYPE, _NS_DTYPE, _POSSIBLY_CAST_DTYPES, _TD_DTYPE, ensure_int8,
+ ensure_int16, ensure_int32, ensure_int64, ensure_object, is_bool,
+ is_bool_dtype, is_categorical_dtype, is_complex, is_complex_dtype,
+ is_datetime64_dtype, is_datetime64_ns_dtype, is_datetime64tz_dtype,
+ is_datetime_or_timedelta_dtype, is_datetimelike, is_dtype_equal,
+ is_extension_array_dtype, is_extension_type, is_float, is_float_dtype,
+ is_integer, is_integer_dtype, is_object_dtype, is_scalar, is_string_dtype,
+ is_timedelta64_dtype, is_timedelta64_ns_dtype, is_unsigned_integer_dtype,
+ pandas_dtype)
+from .dtypes import (
+ DatetimeTZDtype, ExtensionDtype, PandasExtensionDtype, PeriodDtype)
+from .generic import (
+ ABCDatetimeArray, ABCDatetimeIndex, ABCPeriodArray, ABCPeriodIndex,
+ ABCSeries)
+from .inference import is_list_like
+from .missing import isna, notna
+
+_int8_max = np.iinfo(np.int8).max
+_int16_max = np.iinfo(np.int16).max
+_int32_max = np.iinfo(np.int32).max
+_int64_max = np.iinfo(np.int64).max
+
+
+def maybe_convert_platform(values):
+ """ try to do platform conversion, allow ndarray or list here """
+
+ if isinstance(values, (list, tuple)):
+ values = construct_1d_object_array_from_listlike(list(values))
+ if getattr(values, 'dtype', None) == np.object_:
+ if hasattr(values, '_values'):
+ values = values._values
+ values = lib.maybe_convert_objects(values)
+
+ return values
+
+
+def is_nested_object(obj):
+ """
+ return a boolean if we have a nested object, e.g. a Series with 1 or
+ more Series elements
+
+ This may not necessarily be performant.
+
+ """
+
+ if isinstance(obj, ABCSeries) and is_object_dtype(obj):
+
+ if any(isinstance(v, ABCSeries) for v in obj.values):
+ return True
+
+ return False
+
+
+def maybe_downcast_to_dtype(result, dtype):
+ """ try to cast to the specified dtype (e.g. convert back to bool/int
+ or could be an astype of float64->float32)
+ """
+
+ if is_scalar(result):
+ return result
+
+ def trans(x):
+ return x
+
+ if isinstance(dtype, string_types):
+ if dtype == 'infer':
+ inferred_type = lib.infer_dtype(ensure_object(result.ravel()),
+ skipna=False)
+ if inferred_type == 'boolean':
+ dtype = 'bool'
+ elif inferred_type == 'integer':
+ dtype = 'int64'
+ elif inferred_type == 'datetime64':
+ dtype = 'datetime64[ns]'
+ elif inferred_type == 'timedelta64':
+ dtype = 'timedelta64[ns]'
+
+ # try to upcast here
+ elif inferred_type == 'floating':
+ dtype = 'int64'
+ if issubclass(result.dtype.type, np.number):
+
+ def trans(x): # noqa
+ return x.round()
+ else:
+ dtype = 'object'
+
+ if isinstance(dtype, string_types):
+ dtype = np.dtype(dtype)
+
+ try:
+
+ # don't allow upcasts here (except if empty)
+ if dtype.kind == result.dtype.kind:
+ if (result.dtype.itemsize <= dtype.itemsize and
+ np.prod(result.shape)):
+ return result
+
+ if is_bool_dtype(dtype) or is_integer_dtype(dtype):
+
+ # if we don't have any elements, just astype it
+ if not np.prod(result.shape):
+ return trans(result).astype(dtype)
+
+ # do a test on the first element, if it fails then we are done
+ r = result.ravel()
+ arr = np.array([r[0]])
+
+ # if we have any nulls, then we are done
+ if (isna(arr).any() or
+ not np.allclose(arr, trans(arr).astype(dtype), rtol=0)):
+ return result
+
+ # a comparable, e.g. a Decimal may slip in here
+ elif not isinstance(r[0], (np.integer, np.floating, np.bool, int,
+ float, bool)):
+ return result
+
+ if (issubclass(result.dtype.type, (np.object_, np.number)) and
+ notna(result).all()):
+ new_result = trans(result).astype(dtype)
+ try:
+ if np.allclose(new_result, result, rtol=0):
+ return new_result
+ except Exception:
+
+ # comparison of an object dtype with a number type could
+ # hit here
+ if (new_result == result).all():
+ return new_result
+ elif (issubclass(dtype.type, np.floating) and
+ not is_bool_dtype(result.dtype)):
+ return result.astype(dtype)
+
+ # a datetimelike
+ # GH12821, iNaT is casted to float
+ elif dtype.kind in ['M', 'm'] and result.dtype.kind in ['i', 'f']:
+ try:
+ result = result.astype(dtype)
+ except Exception:
+ if dtype.tz:
+ # convert to datetime and change timezone
+ from pandas import to_datetime
+ result = to_datetime(result).tz_localize('utc')
+ result = result.tz_convert(dtype.tz)
+
+ elif dtype.type == Period:
+ # TODO(DatetimeArray): merge with previous elif
+ from pandas.core.arrays import PeriodArray
+
+ return PeriodArray(result, freq=dtype.freq)
+
+ except Exception:
+ pass
+
+ return result
+
+
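+# Editor's note, a hedged usage sketch that is not part of upstream pandas:
+# with the 'infer' option above, floats that are really integers get
+# downcast, while floats that would lose precision are passed through
+# unchanged.
+#
+#     >>> maybe_downcast_to_dtype(np.array([1.0, 2.0]), 'infer').dtype
+#     dtype('int64')
+#     >>> maybe_downcast_to_dtype(np.array([1.5, 2.0]), 'infer').dtype
+#     dtype('float64')
+
+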
+def maybe_upcast_putmask(result, mask, other):
+ """
+ A safe version of putmask that potentially upcasts the result
+
+ Parameters
+ ----------
+ result : ndarray
+ The destination array. This will be mutated in-place if no upcasting is
+ necessary.
+ mask : boolean ndarray
+ other : ndarray or scalar
+ The source array or value
+
+ Returns
+ -------
+ result : ndarray
+ changed : boolean
+ Set to true if the result array was upcasted
+ """
+
+ if mask.any():
+ # Two conversions for date-like dtypes that can't be done automatically
+ # in np.place:
+ # NaN -> NaT
+ # integer or integer array -> date-like array
+ if is_datetimelike(result.dtype):
+ if is_scalar(other):
+ if isna(other):
+ other = result.dtype.type('nat')
+ elif is_integer(other):
+ other = np.array(other, dtype=result.dtype)
+ elif is_integer_dtype(other):
+ other = np.array(other, dtype=result.dtype)
+
+ def changeit():
+
+ # try to directly set by expanding our array to full
+ # length of the boolean
+ try:
+ om = other[mask]
+ om_at = om.astype(result.dtype)
+ if (om == om_at).all():
+ new_result = result.values.copy()
+ new_result[mask] = om_at
+ result[:] = new_result
+ return result, False
+ except Exception:
+ pass
+
+ # we are forced to change the dtype of the result as the input
+ # isn't compatible
+ r, _ = maybe_upcast(result, fill_value=other, copy=True)
+ np.place(r, mask, other)
+
+ return r, True
+
+ # we want to decide whether place will work
+ # if we have nans in the False portion of our mask then we need to
+ # upcast (possibly), otherwise we DON'T want to upcast (e.g. if we
+ # have values, say integers, in the success portion then it's ok to not
+ # upcast)
+ new_dtype, _ = maybe_promote(result.dtype, other)
+ if new_dtype != result.dtype:
+
+ # we have a scalar or len 0 ndarray
+ # and it's nan and we are changing some values
+ if (is_scalar(other) or
+ (isinstance(other, np.ndarray) and other.ndim < 1)):
+ if isna(other):
+ return changeit()
+
+ # we have an ndarray and the masking has nans in it
+ else:
+
+ if isna(other[mask]).any():
+ return changeit()
+
+ try:
+ np.place(result, mask, other)
+ except Exception:
+ return changeit()
+
+ return result, False
+
+
+def maybe_promote(dtype, fill_value=np.nan):
+ # if we passed an array here, determine the fill value by dtype
+ if isinstance(fill_value, np.ndarray):
+ if issubclass(fill_value.dtype.type, (np.datetime64, np.timedelta64)):
+ fill_value = iNaT
+ else:
+
+ # we need to change to object type as our
+ # fill_value is of object type
+ if fill_value.dtype == np.object_:
+ dtype = np.dtype(np.object_)
+ fill_value = np.nan
+
+ # returns tuple of (dtype, fill_value)
+ if issubclass(dtype.type, np.datetime64):
+ fill_value = tslibs.Timestamp(fill_value).value
+ elif issubclass(dtype.type, np.timedelta64):
+ fill_value = tslibs.Timedelta(fill_value).value
+ elif is_datetime64tz_dtype(dtype):
+ if isna(fill_value):
+ fill_value = NaT
+ elif is_extension_array_dtype(dtype) and isna(fill_value):
+ fill_value = dtype.na_value
+ elif is_float(fill_value):
+ if issubclass(dtype.type, np.bool_):
+ dtype = np.object_
+ elif issubclass(dtype.type, np.integer):
+ dtype = np.float64
+ elif is_bool(fill_value):
+ if not issubclass(dtype.type, np.bool_):
+ dtype = np.object_
+ elif is_integer(fill_value):
+ if issubclass(dtype.type, np.bool_):
+ dtype = np.object_
+ elif issubclass(dtype.type, np.integer):
+ # upcast to prevent overflow
+ arr = np.asarray(fill_value)
+ if arr != arr.astype(dtype):
+ dtype = arr.dtype
+ elif is_complex(fill_value):
+ if issubclass(dtype.type, np.bool_):
+ dtype = np.object_
+ elif issubclass(dtype.type, (np.integer, np.floating)):
+ dtype = np.complex128
+ elif fill_value is None:
+ if is_float_dtype(dtype) or is_complex_dtype(dtype):
+ fill_value = np.nan
+ elif is_integer_dtype(dtype):
+ dtype = np.float64
+ fill_value = np.nan
+ elif is_datetime_or_timedelta_dtype(dtype):
+ fill_value = iNaT
+ else:
+ dtype = np.object_
+ fill_value = np.nan
+ else:
+ dtype = np.object_
+
+ # in case we have a string that looked like a number
+ if is_extension_array_dtype(dtype):
+ pass
+ elif is_datetime64tz_dtype(dtype):
+ pass
+ elif issubclass(np.dtype(dtype).type, string_types):
+ dtype = np.object_
+
+ return dtype, fill_value
+
+
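+# Editor's note, a hedged usage sketch that is not part of upstream pandas:
+# the promotion rules above mean a NaN fill value pushes integer dtypes to
+# float64, while datetime64[ns] keeps its dtype and maps the fill value to
+# iNaT.
+#
+#     >>> dtype, fill = maybe_promote(np.dtype('int64'), np.nan)
+#     >>> np.dtype(dtype), np.isnan(fill)
+#     (dtype('float64'), True)
+#     >>> maybe_promote(np.dtype('M8[ns]'), np.nan)
+#     (dtype('<M8[ns]'), -9223372036854775808)
+
+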
+def infer_dtype_from(val, pandas_dtype=False):
+ """
+ interpret the dtype from a scalar or array. This is a convenience
+ routines to infer dtype from a scalar or an array
+
+ Parameters
+ ----------
+ pandas_dtype : bool, default False
+ whether to infer dtype including pandas extension types.
+ If False, a scalar/array belonging to a pandas extension type is
+ inferred as object
+ """
+ if is_scalar(val):
+ return infer_dtype_from_scalar(val, pandas_dtype=pandas_dtype)
+ return infer_dtype_from_array(val, pandas_dtype=pandas_dtype)
+
+
+def infer_dtype_from_scalar(val, pandas_dtype=False):
+ """
+ interpret the dtype from a scalar
+
+ Parameters
+ ----------
+ pandas_dtype : bool, default False
+ whether to infer dtype including pandas extension types.
+ If False, a scalar belonging to a pandas extension type is inferred as
+ object
+ """
+
+ dtype = np.object_
+
+ # a 1-element ndarray
+ if isinstance(val, np.ndarray):
+ msg = "invalid ndarray passed to infer_dtype_from_scalar"
+ if val.ndim != 0:
+ raise ValueError(msg)
+
+ dtype = val.dtype
+ val = val.item()
+
+ elif isinstance(val, string_types):
+
+ # If we create an empty array using a string to infer
+ # the dtype, NumPy will only allocate one character per entry
+ # so this is kind of bad. Alternatively, we could use np.repeat
+ # instead of np.empty (but then you still don't want things
+ # coming out as np.str_!)
+
+ dtype = np.object_
+
+ elif isinstance(val, (np.datetime64, datetime)):
+ val = tslibs.Timestamp(val)
+ if val is tslibs.NaT or val.tz is None:
+ dtype = np.dtype('M8[ns]')
+ else:
+ if pandas_dtype:
+ dtype = DatetimeTZDtype(unit='ns', tz=val.tz)
+ else:
+ # return datetimetz as object
+ return np.object_, val
+ val = val.value
+
+ elif isinstance(val, (np.timedelta64, timedelta)):
+ val = tslibs.Timedelta(val).value
+ dtype = np.dtype('m8[ns]')
+
+ elif is_bool(val):
+ dtype = np.bool_
+
+ elif is_integer(val):
+ if isinstance(val, np.integer):
+ dtype = type(val)
+ else:
+ dtype = np.int64
+
+ elif is_float(val):
+ if isinstance(val, np.floating):
+ dtype = type(val)
+ else:
+ dtype = np.float64
+
+ elif is_complex(val):
+ dtype = np.complex_
+
+ elif pandas_dtype:
+ if lib.is_period(val):
+ dtype = PeriodDtype(freq=val.freq)
+ val = val.ordinal
+
+ return dtype, val
+
+
+def infer_dtype_from_array(arr, pandas_dtype=False):
+ """
+ infer the dtype from a scalar or array
+
+ Parameters
+ ----------
+ arr : scalar or array
+ pandas_dtype : bool, default False
+ whether to infer dtype including pandas extension types.
+ If False, an array belonging to a pandas extension type
+ is inferred as object
+
+ Returns
+ -------
+ tuple (numpy-compat/pandas-compat dtype, array)
+
+ Notes
+ -----
+ If pandas_dtype=False, these infer to numpy dtypes
+ exactly, with the exception that mixed / object dtypes
+ are not coerced by stringifying or conversion.
+
+ If pandas_dtype=True, datetime64tz-aware/categorical
+ types will retain their character.
+
+ Examples
+ --------
+ >>> np.asarray([1, '1'])
+ array(['1', '1'], dtype='<U21')
+
+ >>> infer_dtype_from_array([1, '1'])
+ (numpy.object_, [1, '1'])
+
+ """
+
+ if isinstance(arr, np.ndarray):
+ return arr.dtype, arr
+
+ if not is_list_like(arr):
+ arr = [arr]
+
+ if pandas_dtype and is_extension_type(arr):
+ return arr.dtype, arr
+
+ elif isinstance(arr, ABCSeries):
+ return arr.dtype, np.asarray(arr)
+
+ # don't force numpy coerce with nan's
+ inferred = lib.infer_dtype(arr, skipna=False)
+ if inferred in ['string', 'bytes', 'unicode',
+ 'mixed', 'mixed-integer']:
+ return (np.object_, arr)
+
+ arr = np.asarray(arr)
+ return arr.dtype, arr
+
+
+def maybe_infer_dtype_type(element):
+ """Try to infer an object's dtype, for use in arithmetic ops
+
+ Uses `element.dtype` if that's available.
+ Objects implementing the iterator protocol are cast to a NumPy array,
+ and from there the array's type is used.
+
+ Parameters
+ ----------
+ element : object
+ Possibly has a `.dtype` attribute, and possibly the iterator
+ protocol.
+
+ Returns
+ -------
+ tipo : type
+
+ Examples
+ --------
+ >>> from collections import namedtuple
+ >>> Foo = namedtuple("Foo", "dtype")
+ >>> maybe_infer_dtype_type(Foo(np.dtype("i8")))
+ numpy.int64
+ """
+ tipo = None
+ if hasattr(element, 'dtype'):
+ tipo = element.dtype
+ elif is_list_like(element):
+ element = np.asarray(element)
+ tipo = element.dtype
+ return tipo
+
+
+def maybe_upcast(values, fill_value=np.nan, dtype=None, copy=False):
+ """ provide explicit type promotion and coercion
+
+ Parameters
+ ----------
+ values : the ndarray that we want to maybe upcast
+ fill_value : what we want to fill with
+ dtype : if None, then use the dtype of the values, else coerce to this type
+ copy : if True always make a copy even if no upcast is required
+ """
+
+ if is_extension_type(values):
+ if copy:
+ values = values.copy()
+ else:
+ if dtype is None:
+ dtype = values.dtype
+ new_dtype, fill_value = maybe_promote(dtype, fill_value)
+ if new_dtype != values.dtype:
+ values = values.astype(new_dtype)
+ elif copy:
+ values = values.copy()
+
+ return values, fill_value
+
+
+def maybe_cast_item(obj, item, dtype):
+ chunk = obj[item]
+
+ if chunk.values.dtype != dtype:
+ if dtype in (np.object_, np.bool_):
+ obj[item] = chunk.astype(np.object_)
+ elif not issubclass(dtype, (np.integer, np.bool_)): # pragma: no cover
+ raise ValueError("Unexpected dtype encountered: {dtype}"
+ .format(dtype=dtype))
+
+
+def invalidate_string_dtypes(dtype_set):
+ """Change string like dtypes to object for
+ ``DataFrame.select_dtypes()``.
+ """
+ non_string_dtypes = dtype_set - {np.dtype('S').type, np.dtype('<U').type}
+ if non_string_dtypes != dtype_set:
+ raise TypeError("string dtypes are not allowed, use 'object' instead")
+
+
+def coerce_indexer_dtype(indexer, categories):
+ """ coerce the indexer input array to the smallest dtype possible """
+ length = len(categories)
+ if length < _int8_max:
+ return ensure_int8(indexer)
+ elif length < _int16_max:
+ return ensure_int16(indexer)
+ elif length < _int32_max:
+ return ensure_int32(indexer)
+ return ensure_int64(indexer)
+
+
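+# Editor's note, an illustrative sketch that is not part of upstream pandas:
+# ``coerce_indexer_dtype`` above picks the narrowest integer dtype able to
+# index the given categories.
+#
+#     >>> coerce_indexer_dtype(np.array([0, 1, 2]), range(3)).dtype
+#     dtype('int8')
+#     >>> coerce_indexer_dtype(np.array([0, 1, 2]), range(1000)).dtype
+#     dtype('int16')
+
+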
+def coerce_to_dtypes(result, dtypes):
+ """
+ given a dtypes and a result set, coerce the result elements to the
+ dtypes
+ """
+ if len(result) != len(dtypes):
+ raise AssertionError("_coerce_to_dtypes requires equal len arrays")
+
+ def conv(r, dtype):
+ try:
+ if isna(r):
+ pass
+ elif dtype == _NS_DTYPE:
+ r = tslibs.Timestamp(r)
+ elif dtype == _TD_DTYPE:
+ r = tslibs.Timedelta(r)
+ elif dtype == np.bool_:
+ # messy. non 0/1 integers do not get converted.
+ if is_integer(r) and r not in [0, 1]:
+ return int(r)
+ r = bool(r)
+ elif dtype.kind == 'f':
+ r = float(r)
+ elif dtype.kind == 'i':
+ r = int(r)
+ except Exception:
+ pass
+
+ return r
+
+ return [conv(r, dtype) for r, dtype in zip(result, dtypes)]
+
+
+def astype_nansafe(arr, dtype, copy=True, skipna=False):
+ """
+ Cast the elements of an array to a given dtype in a nan-safe manner.
+
+ Parameters
+ ----------
+ arr : ndarray
+ dtype : np.dtype
+ copy : bool, default True
+ If False, a view will be attempted but may fail, if
+ e.g. the item sizes don't align.
+ skipna : bool, default False
+ Whether or not we should skip NaN when casting as a string-type.
+
+ Raises
+ ------
+ ValueError
+ The dtype was a datetime64/timedelta64 dtype, but it had no unit.
+ """
+
+ # dispatch on extension dtype if needed
+ if is_extension_array_dtype(dtype):
+ return dtype.construct_array_type()._from_sequence(
+ arr, dtype=dtype, copy=copy)
+
+ if not isinstance(dtype, np.dtype):
+ dtype = pandas_dtype(dtype)
+
+ if issubclass(dtype.type, text_type):
+ # in Py3 that's str, in Py2 that's unicode
+ return lib.astype_unicode(arr.ravel(),
+ skipna=skipna).reshape(arr.shape)
+
+ elif issubclass(dtype.type, string_types):
+ return lib.astype_str(arr.ravel(),
+ skipna=skipna).reshape(arr.shape)
+
+ elif is_datetime64_dtype(arr):
+ if is_object_dtype(dtype):
+ return tslib.ints_to_pydatetime(arr.view(np.int64))
+ elif dtype == np.int64:
+ return arr.view(dtype)
+
+ # allow frequency conversions
+ if dtype.kind == 'M':
+ return arr.astype(dtype)
+
+ raise TypeError("cannot astype a datetimelike from [{from_dtype}] "
+ "to [{to_dtype}]".format(from_dtype=arr.dtype,
+ to_dtype=dtype))
+
+ elif is_timedelta64_dtype(arr):
+ if is_object_dtype(dtype):
+ return tslibs.ints_to_pytimedelta(arr.view(np.int64))
+ elif dtype == np.int64:
+ return arr.view(dtype)
+
+ # in py3, timedelta64[ns] are int64
+ if ((PY3 and dtype not in [_INT64_DTYPE, _TD_DTYPE]) or
+ (not PY3 and dtype != _TD_DTYPE)):
+
+ # allow frequency conversions
+ # we return a float here!
+ if dtype.kind == 'm':
+ mask = isna(arr)
+ result = arr.astype(dtype).astype(np.float64)
+ result[mask] = np.nan
+ return result
+ elif dtype == _TD_DTYPE:
+ return arr.astype(_TD_DTYPE, copy=copy)
+
+ raise TypeError("cannot astype a timedelta from [{from_dtype}] "
+ "to [{to_dtype}]".format(from_dtype=arr.dtype,
+ to_dtype=dtype))
+
+ elif (np.issubdtype(arr.dtype, np.floating) and
+ np.issubdtype(dtype, np.integer)):
+
+ if not np.isfinite(arr).all():
+ raise ValueError('Cannot convert non-finite values (NA or inf) to '
+ 'integer')
+
+ elif is_object_dtype(arr):
+
+ # work around NumPy brokenness, #1987
+ if np.issubdtype(dtype.type, np.integer):
+ return lib.astype_intsafe(arr.ravel(), dtype).reshape(arr.shape)
+
+ # if we have a datetime/timedelta array of objects
+ # then coerce to a proper dtype and recall astype_nansafe
+
+ elif is_datetime64_dtype(dtype):
+ from pandas import to_datetime
+ return astype_nansafe(to_datetime(arr).values, dtype, copy=copy)
+ elif is_timedelta64_dtype(dtype):
+ from pandas import to_timedelta
+ return astype_nansafe(to_timedelta(arr).values, dtype, copy=copy)
+
+ if dtype.name in ("datetime64", "timedelta64"):
+ msg = ("The '{dtype}' dtype has no unit. "
+ "Please pass in '{dtype}[ns]' instead.")
+ raise ValueError(msg.format(dtype=dtype.name))
+
+ if copy or is_object_dtype(arr) or is_object_dtype(dtype):
+ # Explicit copy, or required since NumPy can't view from / to object.
+ return arr.astype(dtype, copy=True)
+
+ return arr.view(dtype)
+
+
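+# Editor's note, an illustrative sketch that is not part of upstream pandas:
+# one of the guard rails in ``astype_nansafe`` above -- non-finite floats
+# refuse to cast to integer instead of silently producing garbage values.
+#
+#     >>> astype_nansafe(np.array([1.0, np.nan]), np.dtype('int64'))
+#     Traceback (most recent call last):
+#         ...
+#     ValueError: Cannot convert non-finite values (NA or inf) to integer
+
+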
+def maybe_convert_objects(values, convert_dates=True, convert_numeric=True,
+ convert_timedeltas=True, copy=True):
+ """ if we have an object dtype, try to coerce dates and/or numbers """
+
+ # if we have passed in a list or scalar
+ if isinstance(values, (list, tuple)):
+ values = np.array(values, dtype=np.object_)
+ if not hasattr(values, 'dtype'):
+ values = np.array([values], dtype=np.object_)
+
+ # convert dates
+ if convert_dates and values.dtype == np.object_:
+
+ # we take an aggressive stance and convert to datetime64[ns]
+ if convert_dates == 'coerce':
+ new_values = maybe_cast_to_datetime(
+ values, 'M8[ns]', errors='coerce')
+
+ # if we are all nans then leave me alone
+ if not isna(new_values).all():
+ values = new_values
+
+ else:
+ values = lib.maybe_convert_objects(values,
+ convert_datetime=convert_dates)
+
+ # convert timedeltas
+ if convert_timedeltas and values.dtype == np.object_:
+
+ if convert_timedeltas == 'coerce':
+ from pandas.core.tools.timedeltas import to_timedelta
+ new_values = to_timedelta(values, errors='coerce')
+
+ # if we are all nans then leave me alone
+ if not isna(new_values).all():
+ values = new_values
+
+ else:
+ values = lib.maybe_convert_objects(
+ values, convert_timedelta=convert_timedeltas)
+
+ # convert to numeric
+ if values.dtype == np.object_:
+ if convert_numeric:
+ try:
+ new_values = lib.maybe_convert_numeric(values, set(),
+ coerce_numeric=True)
+
+ # if we are all nans then leave me alone
+ if not isna(new_values).all():
+ values = new_values
+
+ except Exception:
+ pass
+ else:
+ # soft-conversion
+ values = lib.maybe_convert_objects(values)
+
+ values = values.copy() if copy else values
+
+ return values
+
+
+def soft_convert_objects(values, datetime=True, numeric=True, timedelta=True,
+ coerce=False, copy=True):
+ """ if we have an object dtype, try to coerce dates and/or numbers """
+
+ conversion_count = sum((datetime, numeric, timedelta))
+ if conversion_count == 0:
+ raise ValueError('At least one of datetime, numeric or timedelta must '
+ 'be True.')
+ elif conversion_count > 1 and coerce:
+ raise ValueError("Only one of 'datetime', 'numeric' or "
+ "'timedelta' can be True when when coerce=True.")
+
+ if isinstance(values, (list, tuple)):
+ # List or scalar
+ values = np.array(values, dtype=np.object_)
+ elif not hasattr(values, 'dtype'):
+ values = np.array([values], dtype=np.object_)
+ elif not is_object_dtype(values.dtype):
+ # If not object, do not attempt conversion
+ values = values.copy() if copy else values
+ return values
+
+ # If 1 flag is coerce, ensure 2 others are False
+ if coerce:
+ # Immediate return if coerce
+ if datetime:
+ from pandas import to_datetime
+ return to_datetime(values, errors='coerce', box=False)
+ elif timedelta:
+ from pandas import to_timedelta
+ return to_timedelta(values, errors='coerce', box=False)
+ elif numeric:
+ from pandas import to_numeric
+ return to_numeric(values, errors='coerce')
+
+ # Soft conversions
+ if datetime:
+ # GH 20380: datetimes beyond the year 2262 fall outside the
+ # bounds of nanosecond-resolution 64-bit integers.
+ try:
+ values = lib.maybe_convert_objects(values,
+ convert_datetime=datetime)
+ except OutOfBoundsDatetime:
+ pass
+
+ if timedelta and is_object_dtype(values.dtype):
+ # Object check to ensure only run if previous did not convert
+ values = lib.maybe_convert_objects(values, convert_timedelta=timedelta)
+
+ if numeric and is_object_dtype(values.dtype):
+ try:
+ converted = lib.maybe_convert_numeric(values, set(),
+ coerce_numeric=True)
+ # If all NaNs, then do not alter
+ values = converted if not isna(converted).all() else values
+ values = values.copy() if copy else values
+ except Exception:
+ pass
+
+ return values
+
+
+def maybe_castable(arr):
+ # return False to force a non-fastpath
+
+ # check datetime64[ns]/timedelta64[ns] are valid
+ # otherwise try to coerce
+ kind = arr.dtype.kind
+ if kind == 'M':
+ return is_datetime64_ns_dtype(arr.dtype)
+ elif kind == 'm':
+ return is_timedelta64_ns_dtype(arr.dtype)
+
+ return arr.dtype.name not in _POSSIBLY_CAST_DTYPES
+
+
+def maybe_infer_to_datetimelike(value, convert_dates=False):
+ """
+ we might have an array (or single object) that is datetime-like
+ and no dtype was passed; don't change the value unless we find a
+ datetime/timedelta set
+
+ this is pretty strict in that a datetime/timedelta is REQUIRED
+ in addition to possible nulls/string likes
+
+ Parameters
+ ----------
+ value : np.array / Series / Index / list-like
+ convert_dates : boolean, default False
+ if True, try really hard to convert dates (such as datetime.date);
+ otherwise leave the inferred dtype 'date' alone
+
+ """
+
+ # TODO: why not timedelta?
+ if isinstance(value, (ABCDatetimeIndex, ABCPeriodIndex,
+ ABCDatetimeArray, ABCPeriodArray)):
+ return value
+ elif isinstance(value, ABCSeries):
+ if isinstance(value._values, ABCDatetimeIndex):
+ return value._values
+
+ v = value
+
+ if not is_list_like(v):
+ v = [v]
+ v = np.array(v, copy=False)
+
+ # we only care about object dtypes
+ if not is_object_dtype(v):
+ return value
+
+ shape = v.shape
+ if not v.ndim == 1:
+ v = v.ravel()
+
+ if not len(v):
+ return value
+
+ def try_datetime(v):
+ # safe coerce to datetime64
+ try:
+ # GH19671
+ v = tslib.array_to_datetime(v,
+ require_iso8601=True,
+ errors='raise')[0]
+ except ValueError:
+
+ # we might have a sequence of the same-datetimes with tz's
+ # if so coerce to a DatetimeIndex; if they are not the same,
+ # then these stay as object dtype, xref GH19671
+ try:
+ from pandas._libs.tslibs import conversion
+ from pandas import DatetimeIndex
+
+ values, tz = conversion.datetime_to_datetime64(v)
+ return DatetimeIndex(values).tz_localize(
+ 'UTC').tz_convert(tz=tz)
+ except (ValueError, TypeError):
+ pass
+
+ except Exception:
+ pass
+
+ return v.reshape(shape)
+
+ def try_timedelta(v):
+ # safe coerce to timedelta64
+
+ # will try first with a string & object conversion
+ from pandas import to_timedelta
+ try:
+ return to_timedelta(v)._ndarray_values.reshape(shape)
+ except Exception:
+ return v.reshape(shape)
+
+ inferred_type = lib.infer_datetimelike_array(ensure_object(v))
+
+ if inferred_type == 'date' and convert_dates:
+ value = try_datetime(v)
+ elif inferred_type == 'datetime':
+ value = try_datetime(v)
+ elif inferred_type == 'timedelta':
+ value = try_timedelta(v)
+ elif inferred_type == 'nat':
+
+ # if all NaT, return as datetime
+ if isna(v).all():
+ value = try_datetime(v)
+ else:
+
+ # We have at least a NaT and a string
+ # try timedelta first to avoid spurious datetime conversions
+ # e.g. '00:00:01' is a timedelta but technically is also a datetime
+ value = try_timedelta(v)
+ if lib.infer_dtype(value, skipna=False) in ['mixed']:
+ # cannot skip missing values, as NaT implies that the string
+ # is actually a datetime
+ value = try_datetime(v)
+
+ return value
+
+
+def maybe_cast_to_datetime(value, dtype, errors='raise'):
+ """ try to cast the array/value to a datetimelike dtype, converting float
+ nan to iNaT
+ """
+ from pandas.core.tools.timedeltas import to_timedelta
+ from pandas.core.tools.datetimes import to_datetime
+
+ if dtype is not None:
+ if isinstance(dtype, string_types):
+ dtype = np.dtype(dtype)
+
+ is_datetime64 = is_datetime64_dtype(dtype)
+ is_datetime64tz = is_datetime64tz_dtype(dtype)
+ is_timedelta64 = is_timedelta64_dtype(dtype)
+
+ if is_datetime64 or is_datetime64tz or is_timedelta64:
+
+ # Force the dtype if needed.
+ msg = ("The '{dtype}' dtype has no unit. "
+ "Please pass in '{dtype}[ns]' instead.")
+
+ if is_datetime64 and not is_dtype_equal(dtype, _NS_DTYPE):
+ if dtype.name in ('datetime64', 'datetime64[ns]'):
+ if dtype.name == 'datetime64':
+ raise ValueError(msg.format(dtype=dtype.name))
+ dtype = _NS_DTYPE
+ else:
+ raise TypeError("cannot convert datetimelike to "
+ "dtype [{dtype}]".format(dtype=dtype))
+ elif is_datetime64tz:
+
+ # our NaT doesn't support tz's
+ # this will coerce to DatetimeIndex with
+ # a matching dtype below
+ if is_scalar(value) and isna(value):
+ value = [value]
+
+ elif is_timedelta64 and not is_dtype_equal(dtype, _TD_DTYPE):
+ if dtype.name in ('timedelta64', 'timedelta64[ns]'):
+ if dtype.name == 'timedelta64':
+ raise ValueError(msg.format(dtype=dtype.name))
+ dtype = _TD_DTYPE
+ else:
+ raise TypeError("cannot convert timedeltalike to "
+ "dtype [{dtype}]".format(dtype=dtype))
+
+ if is_scalar(value):
+ if value == iNaT or isna(value):
+ value = iNaT
+ else:
+ value = np.array(value, copy=False)
+
+ # have a scalar array-like (e.g. NaT)
+ if value.ndim == 0:
+ value = iNaT
+
+ # we have an array of datetime or timedeltas & nulls
+ elif np.prod(value.shape) or not is_dtype_equal(value.dtype,
+ dtype):
+ try:
+ if is_datetime64:
+ value = to_datetime(value, errors=errors)._values
+ elif is_datetime64tz:
+ # The string check can be removed once issue #13712
+ # is solved. String data that is passed with a
+ # datetime64tz is assumed to be naive which should
+ # be localized to the timezone.
+ is_dt_string = is_string_dtype(value)
+ value = to_datetime(value, errors=errors).array
+ if is_dt_string:
+ # Strings here are naive, so directly localize
+ value = value.tz_localize(dtype.tz)
+ else:
+ # Numeric values are UTC at this point,
+ # so localize and convert
+ value = (value.tz_localize('UTC')
+ .tz_convert(dtype.tz))
+ elif is_timedelta64:
+ value = to_timedelta(value, errors=errors)._values
+ except (AttributeError, ValueError, TypeError):
+ pass
+
+ # coerce datetimelike to object
+ elif is_datetime64_dtype(value) and not is_datetime64_dtype(dtype):
+ if is_object_dtype(dtype):
+ if value.dtype != _NS_DTYPE:
+ value = value.astype(_NS_DTYPE)
+ ints = np.asarray(value).view('i8')
+ return tslib.ints_to_pydatetime(ints)
+
+ # we have a non-castable dtype that was passed
+ raise TypeError('Cannot cast datetime64 to {dtype}'
+ .format(dtype=dtype))
+
+ else:
+
+ is_array = isinstance(value, np.ndarray)
+
+ # catch a datetime/timedelta that is not of ns variety
+ # and no coercion specified
+ if is_array and value.dtype.kind in ['M', 'm']:
+ dtype = value.dtype
+
+ if dtype.kind == 'M' and dtype != _NS_DTYPE:
+ value = value.astype(_NS_DTYPE)
+
+ elif dtype.kind == 'm' and dtype != _TD_DTYPE:
+ value = to_timedelta(value)
+
+ # only do this if we have an array and the dtype of the array is not
+ # setup already we are not an integer/object, so don't bother with this
+ # conversion
+ elif not (is_array and not (issubclass(value.dtype.type, np.integer) or
+ value.dtype == np.object_)):
+ value = maybe_infer_to_datetimelike(value)
+
+ return value
+
+
+def find_common_type(types):
+ """
+ Find a common data type among the given dtypes.
+
+ Parameters
+ ----------
+ types : list of dtypes
+
+ Returns
+ -------
+ pandas extension or numpy dtype
+
+ See Also
+ --------
+ numpy.find_common_type
+
+ """
+
+ if len(types) == 0:
+ raise ValueError('no types given')
+
+ first = types[0]
+
+ # workaround for find_common_type([np.dtype('datetime64[ns]')] * 2)
+ # => object
+ if all(is_dtype_equal(first, t) for t in types[1:]):
+ return first
+
+ if any(isinstance(t, (PandasExtensionDtype, ExtensionDtype))
+ for t in types):
+ return np.object
+
+ # take lowest unit
+ if all(is_datetime64_dtype(t) for t in types):
+ return np.dtype('datetime64[ns]')
+ if all(is_timedelta64_dtype(t) for t in types):
+ return np.dtype('timedelta64[ns]')
+
+ # don't mix bool / int or float or complex
+ # this is different from numpy, which casts bool with float/int as int
+ has_bools = any(is_bool_dtype(t) for t in types)
+ if has_bools:
+ has_ints = any(is_integer_dtype(t) for t in types)
+ has_floats = any(is_float_dtype(t) for t in types)
+ has_complex = any(is_complex_dtype(t) for t in types)
+ if has_ints or has_floats or has_complex:
+ return np.object
+
+ return np.find_common_type(types, [])
+
+
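+# Editor's note, an illustrative sketch that is not part of upstream pandas:
+# the bool handling above deliberately differs from NumPy, which would
+# promote bool together with int/float rather than falling back to object.
+#
+#     >>> find_common_type([np.dtype('int64'), np.dtype('float32')])
+#     dtype('float64')
+#     >>> find_common_type([np.dtype('bool'), np.dtype('int64')]) is np.object
+#     True
+
+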
+def cast_scalar_to_array(shape, value, dtype=None):
+ """
+ create np.ndarray of specified shape and dtype, filled with values
+
+ Parameters
+ ----------
+ shape : tuple
+ value : scalar value
+ dtype : np.dtype, optional
+ dtype to coerce
+
+ Returns
+ -------
+ ndarray of shape, filled with value, of specified / inferred dtype
+
+ """
+
+ if dtype is None:
+ dtype, fill_value = infer_dtype_from_scalar(value)
+ else:
+ fill_value = value
+
+ values = np.empty(shape, dtype=dtype)
+ values.fill(fill_value)
+
+ return values
+
+
+def construct_1d_arraylike_from_scalar(value, length, dtype):
+ """
+ create a np.ndarray / pandas type of specified shape and dtype
+ filled with values
+
+ Parameters
+ ----------
+ value : scalar value
+ length : int
+ dtype : pandas_dtype / np.dtype
+
+ Returns
+ -------
+ np.ndarray / pandas type of length, filled with value
+
+ """
+ if is_datetime64tz_dtype(dtype):
+ from pandas import DatetimeIndex
+ subarr = DatetimeIndex([value] * length, dtype=dtype)
+ elif is_categorical_dtype(dtype):
+ from pandas import Categorical
+ subarr = Categorical([value] * length, dtype=dtype)
+ else:
+ if not isinstance(dtype, (np.dtype, type(np.dtype))):
+ dtype = dtype.dtype
+
+ if length and is_integer_dtype(dtype) and isna(value):
+ # coerce if we have nan for an integer dtype
+ dtype = np.dtype('float64')
+ elif isinstance(dtype, np.dtype) and dtype.kind in ("U", "S"):
+ # coerce to object dtype so that numpy takes our string as a
+ # scalar value instead of truncating it to a fixed-width string
+ dtype = object
+ if not isna(value):
+ value = to_str(value)
+
+ subarr = np.empty(length, dtype=dtype)
+ subarr.fill(value)
+
+ return subarr
+
+
+def construct_1d_object_array_from_listlike(values):
+ """
+ Transform any list-like object in a 1-dimensional numpy array of object
+ dtype.
+
+ Parameters
+ ----------
+ values : any iterable which has a len()
+
+ Raises
+ ------
+ TypeError
+ * If `values` does not have a len()
+
+ Returns
+ -------
+ 1-dimensional numpy array of dtype object
+ """
+ # numpy will try to interpret nested lists as further dimensions, hence
+ # making a 1D array that contains list-likes is a bit tricky:
+ result = np.empty(len(values), dtype='object')
+ result[:] = values
+ return result
+
+
+def construct_1d_ndarray_preserving_na(values, dtype=None, copy=False):
+ """
+ Construct a new ndarray, coercing `values` to `dtype`, preserving NA.
+
+ Parameters
+ ----------
+ values : Sequence
+ dtype : numpy.dtype, optional
+ copy : bool, default False
+ Note that copies may still be made with ``copy=False`` if casting
+ is required.
+
+ Returns
+ -------
+ arr : ndarray[dtype]
+
+ Examples
+ --------
+ >>> np.array([1.0, 2.0, None], dtype='str')
+ array(['1.0', '2.0', 'None'], dtype='<U4')
+
+ >>> construct_1d_ndarray_preserving_na([1.0, 2.0, None], dtype='str')
+
+
+ """
+ subarr = np.array(values, dtype=dtype, copy=copy)
+
+ if dtype is not None and dtype.kind in ("U", "S"):
+ # GH-21083
+ # We can't just return np.array(subarr, dtype='str') since
+ # NumPy will convert the non-string objects into strings,
+ # including NA values. So we have to go
+ # string -> object -> update NA, which requires an
+ # additional pass over the data.
+ na_values = isna(values)
+ subarr2 = subarr.astype(object)
+ subarr2[na_values] = np.asarray(values, dtype=object)[na_values]
+ subarr = subarr2
+
+ return subarr
+
+
+def maybe_cast_to_integer_array(arr, dtype, copy=False):
+ """
+ Takes any dtype and returns the casted version, raising for when data is
+ incompatible with integer/unsigned integer dtypes.
+
+ .. versionadded:: 0.24.0
+
+ Parameters
+ ----------
+ arr : array-like
+ The array to cast.
+ dtype : str, np.dtype
+ The integer dtype to cast the array to.
+ copy : bool, default False
+ Whether to make a copy of the array before returning.
+
+ Returns
+ -------
+ int_arr : ndarray
+ An array of integer or unsigned integer dtype
+
+ Raises
+ ------
+ OverflowError : the dtype is incompatible with the data
+ ValueError : loss of precision has occurred during casting
+
+ Examples
+ --------
+ If you try to coerce negative values to unsigned integers, it raises:
+
+ >>> Series([-1], dtype="uint64")
+ Traceback (most recent call last):
+ ...
+ OverflowError: Trying to coerce negative values to unsigned integers
+
+ Also, if you try to coerce float values to integers, it raises:
+
+ >>> Series([1, 2, 3.5], dtype="int64")
+ Traceback (most recent call last):
+ ...
+ ValueError: Trying to coerce float values to integers
+ """
+
+ try:
+ if not hasattr(arr, "astype"):
+ casted = np.array(arr, dtype=dtype, copy=copy)
+ else:
+ casted = arr.astype(dtype, copy=copy)
+ except OverflowError:
+ raise OverflowError("The elements provided in the data cannot all be "
+ "casted to the dtype {dtype}".format(dtype=dtype))
+
+ if np.array_equal(arr, casted):
+ return casted
+
+ # We do this casting to allow for proper
+ # data and dtype checking.
+ #
+ # We didn't do this earlier because NumPy
+ # doesn't handle `uint64` correctly.
+ arr = np.asarray(arr)
+
+ if is_unsigned_integer_dtype(dtype) and (arr < 0).any():
+ raise OverflowError("Trying to coerce negative values "
+ "to unsigned integers")
+
+ if is_integer_dtype(dtype) and (is_float_dtype(arr) or
+ is_object_dtype(arr)):
+ raise ValueError("Trying to coerce float values to integers")
diff --git a/contrib/python/pandas/py2/pandas/core/dtypes/common.py b/contrib/python/pandas/py2/pandas/core/dtypes/common.py
new file mode 100644
index 00000000000..e9bf0f87088
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/core/dtypes/common.py
@@ -0,0 +1,2031 @@
+""" common type operations """
+import warnings
+
+import numpy as np
+
+from pandas._libs import algos, lib
+from pandas._libs.tslibs import conversion
+from pandas.compat import PY3, PY36, string_types
+
+from pandas.core.dtypes.dtypes import (
+ CategoricalDtype, DatetimeTZDtype, ExtensionDtype, IntervalDtype,
+ PandasExtensionDtype, PeriodDtype, registry)
+from pandas.core.dtypes.generic import (
+ ABCCategorical, ABCDateOffset, ABCDatetimeIndex, ABCIndexClass,
+ ABCPeriodArray, ABCPeriodIndex, ABCSeries)
+from pandas.core.dtypes.inference import ( # noqa:F401
+ is_array_like, is_bool, is_complex, is_decimal, is_dict_like, is_file_like,
+ is_float, is_hashable, is_integer, is_interval, is_iterator, is_list_like,
+ is_named_tuple, is_nested_list_like, is_number, is_re, is_re_compilable,
+ is_scalar, is_sequence, is_string_like)
+
+_POSSIBLY_CAST_DTYPES = {np.dtype(t).name
+ for t in ['O', 'int8', 'uint8', 'int16', 'uint16',
+ 'int32', 'uint32', 'int64', 'uint64']}
+
+_NS_DTYPE = conversion.NS_DTYPE
+_TD_DTYPE = conversion.TD_DTYPE
+_INT64_DTYPE = np.dtype(np.int64)
+
+# oh the troubles to reduce import time
+_is_scipy_sparse = None
+
+ensure_float64 = algos.ensure_float64
+ensure_float32 = algos.ensure_float32
+
+_ensure_datetime64ns = conversion.ensure_datetime64ns
+_ensure_timedelta64ns = conversion.ensure_timedelta64ns
+
+
+def ensure_float(arr):
+ """
+ Ensure that an array object has a float dtype if possible.
+
+ Parameters
+ ----------
+ arr : array-like
+ The array whose data type we want to enforce as float.
+
+ Returns
+ -------
+ float_arr : The original array cast to the float dtype if
+ possible. Otherwise, the original array is returned.
+ """
+
+ if issubclass(arr.dtype.type, (np.integer, np.bool_)):
+ arr = arr.astype(float)
+ return arr
+
+
+ensure_uint64 = algos.ensure_uint64
+ensure_int64 = algos.ensure_int64
+ensure_int32 = algos.ensure_int32
+ensure_int16 = algos.ensure_int16
+ensure_int8 = algos.ensure_int8
+ensure_platform_int = algos.ensure_platform_int
+ensure_object = algos.ensure_object
+
+
+def ensure_categorical(arr):
+ """
+ Ensure that an array-like object is a Categorical (if not already).
+
+ Parameters
+ ----------
+ arr : array-like
+ The array that we want to convert into a Categorical.
+
+ Returns
+ -------
+ cat_arr : The original array cast as a Categorical. If it already
+ is a Categorical, we return as is.
+ """
+
+ if not is_categorical(arr):
+ from pandas import Categorical
+ arr = Categorical(arr)
+ return arr
+
+
+def ensure_int64_or_float64(arr, copy=False):
+ """
+ Ensure that an array of some integer dtype
+ has an int64 dtype if possible.
+ If it's not possible, potentially because of overflow,
+ convert the array to float64 instead.
+
+ Parameters
+ ----------
+ arr : array-like
+ The array whose data type we want to enforce.
+ copy: boolean
+ Whether to copy the original array or reuse
+ it in place, if possible.
+
+ Returns
+ -------
+ out_arr : The input array cast as int64 if
+ possible without overflow.
+ Otherwise the input array cast to float64.
+ """
+ try:
+ return arr.astype('int64', copy=copy, casting='safe')
+ except TypeError:
+ return arr.astype('float64', copy=copy)
+
+
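+# Editor's note, an illustrative sketch that is not part of upstream pandas:
+# with ``casting='safe'`` NumPy refuses uint64 -> int64, so such arrays take
+# the float64 fallback, while smaller integer dtypes are widened to int64.
+#
+#     >>> ensure_int64_or_float64(np.array([1, 2], dtype='int32')).dtype
+#     dtype('int64')
+#     >>> ensure_int64_or_float64(np.array([1, 2], dtype='uint64')).dtype
+#     dtype('float64')
+
+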
+def classes(*klasses):
+ """ evaluate if the tipo is a subclass of the klasses """
+ return lambda tipo: issubclass(tipo, klasses)
+
+
+def classes_and_not_datetimelike(*klasses):
+ """
+ evaluate if the tipo is a subclass of the klasses
+ and not a datetimelike
+ """
+ return lambda tipo: (issubclass(tipo, klasses) and
+ not issubclass(tipo, (np.datetime64, np.timedelta64)))
+
+
+def is_object_dtype(arr_or_dtype):
+ """
+ Check whether an array-like or dtype is of the object dtype.
+
+ Parameters
+ ----------
+ arr_or_dtype : array-like
+ The array-like or dtype to check.
+
+ Returns
+ -------
+ boolean : Whether or not the array-like or dtype is of the object dtype.
+
+ Examples
+ --------
+ >>> is_object_dtype(object)
+ True
+ >>> is_object_dtype(int)
+ False
+ >>> is_object_dtype(np.array([], dtype=object))
+ True
+ >>> is_object_dtype(np.array([], dtype=int))
+ False
+ >>> is_object_dtype([1, 2, 3])
+ False
+ """
+ return _is_dtype_type(arr_or_dtype, classes(np.object_))
+
+
+def is_sparse(arr):
+ """
+ Check whether an array-like is a 1-D pandas sparse array.
+
+ Check that the one-dimensional array-like is a pandas sparse array.
+ Returns True if it is a pandas sparse array, not another type of
+ sparse array.
+
+ Parameters
+ ----------
+ arr : array-like
+ Array-like to check.
+
+ Returns
+ -------
+ bool
+ Whether or not the array-like is a pandas sparse array.
+
+ See Also
+ --------
+ DataFrame.to_sparse : Convert DataFrame to a SparseDataFrame.
+ Series.to_sparse : Convert Series to SparseSeries.
+ Series.to_dense : Return dense representation of a Series.
+
+ Examples
+ --------
+ Returns `True` if the parameter is a 1-D pandas sparse array.
+
+ >>> is_sparse(pd.SparseArray([0, 0, 1, 0]))
+ True
+ >>> is_sparse(pd.SparseSeries([0, 0, 1, 0]))
+ True
+
+ Returns `False` if the parameter is not sparse.
+
+ >>> is_sparse(np.array([0, 0, 1, 0]))
+ False
+ >>> is_sparse(pd.Series([0, 1, 0, 0]))
+ False
+
+ Returns `False` if the parameter is not a pandas sparse array.
+
+ >>> from scipy.sparse import bsr_matrix
+ >>> is_sparse(bsr_matrix([0, 1, 0, 0]))
+ False
+
+ Returns `False` if the parameter has more than one dimension.
+
+ >>> df = pd.SparseDataFrame([389., 24., 80.5, np.nan],
+ ...                         columns=['max_speed'],
+ ...                         index=['falcon', 'parrot', 'lion', 'monkey'])
+ >>> is_sparse(df)
+ False
+ >>> is_sparse(df.max_speed)
+ True
+ """
+ from pandas.core.arrays.sparse import SparseDtype
+
+ dtype = getattr(arr, 'dtype', arr)
+ return isinstance(dtype, SparseDtype)
+
+
+def is_scipy_sparse(arr):
+ """
+ Check whether an array-like is a scipy.sparse.spmatrix instance.
+
+ Parameters
+ ----------
+ arr : array-like
+ The array-like to check.
+
+ Returns
+ -------
+ boolean : Whether or not the array-like is a
+ scipy.sparse.spmatrix instance.
+
+ Notes
+ -----
+ If scipy is not installed, this function will always return False.
+
+ Examples
+ --------
+ >>> from scipy.sparse import bsr_matrix
+ >>> is_scipy_sparse(bsr_matrix([1, 2, 3]))
+ True
+ >>> is_scipy_sparse(pd.SparseArray([1, 2, 3]))
+ False
+ >>> is_scipy_sparse(pd.SparseSeries([1, 2, 3]))
+ False
+ """
+
+ global _is_scipy_sparse
+
+ if _is_scipy_sparse is None:
+ try:
+ from scipy.sparse import issparse as _is_scipy_sparse
+ except ImportError:
+ _is_scipy_sparse = lambda _: False
+
+ return _is_scipy_sparse(arr)
+
+
+def is_categorical(arr):
+ """
+ Check whether an array-like is a Categorical instance.
+
+ Parameters
+ ----------
+ arr : array-like
+ The array-like to check.
+
+ Returns
+ -------
+ boolean : Whether or not the array-like is of a Categorical instance.
+
+ Examples
+ --------
+ >>> is_categorical([1, 2, 3])
+ False
+
+ Categoricals, Series Categoricals, and CategoricalIndex will return True.
+
+ >>> cat = pd.Categorical([1, 2, 3])
+ >>> is_categorical(cat)
+ True
+ >>> is_categorical(pd.Series(cat))
+ True
+ >>> is_categorical(pd.CategoricalIndex([1, 2, 3]))
+ True
+ """
+
+ return isinstance(arr, ABCCategorical) or is_categorical_dtype(arr)
+
+
+def is_datetimetz(arr):
+ """
+ Check whether an array-like is a datetime array-like with a timezone
+ component in its dtype.
+
+ .. deprecated:: 0.24.0
+
+ Parameters
+ ----------
+ arr : array-like
+ The array-like to check.
+
+ Returns
+ -------
+ boolean : Whether or not the array-like is a datetime array-like with
+ a timezone component in its dtype.
+
+ Examples
+ --------
+ >>> is_datetimetz([1, 2, 3])
+ False
+
+ Although the following examples are both DatetimeIndex objects,
+ the first one returns False because it has no timezone component
+ unlike the second one, which returns True.
+
+ >>> is_datetimetz(pd.DatetimeIndex([1, 2, 3]))
+ False
+ >>> is_datetimetz(pd.DatetimeIndex([1, 2, 3], tz="US/Eastern"))
+ True
+
+ The object need not be a DatetimeIndex object. It just needs to have
+ a dtype which has a timezone component.
+
+ >>> dtype = DatetimeTZDtype("ns", tz="US/Eastern")
+ >>> s = pd.Series([], dtype=dtype)
+ >>> is_datetimetz(s)
+ True
+ """
+
+ warnings.warn("'is_datetimetz' is deprecated and will be removed in a "
+ "future version. Use 'is_datetime64tz_dtype' instead.",
+ FutureWarning, stacklevel=2)
+ return is_datetime64tz_dtype(arr)
+
+
+def is_offsetlike(arr_or_obj):
+ """
+ Check if obj or all elements of list-like is DateOffset
+
+ Parameters
+ ----------
+ arr_or_obj : object
+
+ Returns
+ -------
+ boolean : Whether the object is a DateOffset or listlike of DateOffsets
+
+ Examples
+ --------
+ >>> is_offsetlike(pd.DateOffset(days=1))
+ True
+ >>> is_offsetlike('offset')
+ False
+ >>> is_offsetlike([pd.offsets.Minute(4), pd.offsets.MonthEnd()])
+ True
+ >>> is_offsetlike(np.array([pd.DateOffset(months=3), pd.Timestamp.now()]))
+ False
+ """
+ if isinstance(arr_or_obj, ABCDateOffset):
+ return True
+ elif (is_list_like(arr_or_obj) and len(arr_or_obj) and
+ is_object_dtype(arr_or_obj)):
+ return all(isinstance(x, ABCDateOffset) for x in arr_or_obj)
+ return False
+
+
+def is_period(arr):
+ """
+ Check whether an array-like is a periodical index.
+
+ .. deprecated:: 0.24.0
+
+ Parameters
+ ----------
+ arr : array-like
+ The array-like to check.
+
+ Returns
+ -------
+ boolean : Whether or not the array-like is a periodical index.
+
+ Examples
+ --------
+ >>> is_period([1, 2, 3])
+ False
+ >>> is_period(pd.Index([1, 2, 3]))
+ False
+ >>> is_period(pd.PeriodIndex(["2017-01-01"], freq="D"))
+ True
+ """
+
+ warnings.warn("'is_period' is deprecated and will be removed in a future "
+ "version. Use 'is_period_dtype' or is_period_arraylike' "
+ "instead.", FutureWarning, stacklevel=2)
+
+ return isinstance(arr, ABCPeriodIndex) or is_period_arraylike(arr)
+
+
+def is_datetime64_dtype(arr_or_dtype):
+ """
+ Check whether an array-like or dtype is of the datetime64 dtype.
+
+ Parameters
+ ----------
+ arr_or_dtype : array-like
+ The array-like or dtype to check.
+
+ Returns
+ -------
+ boolean : Whether or not the array-like or dtype is of
+ the datetime64 dtype.
+
+ Examples
+ --------
+ >>> is_datetime64_dtype(object)
+ False
+ >>> is_datetime64_dtype(np.datetime64)
+ True
+ >>> is_datetime64_dtype(np.array([], dtype=int))
+ False
+ >>> is_datetime64_dtype(np.array([], dtype=np.datetime64))
+ True
+ >>> is_datetime64_dtype([1, 2, 3])
+ False
+ """
+
+ return _is_dtype_type(arr_or_dtype, classes(np.datetime64))
+
+
+def is_datetime64tz_dtype(arr_or_dtype):
+ """
+ Check whether an array-like or dtype is of a DatetimeTZDtype dtype.
+
+ Parameters
+ ----------
+ arr_or_dtype : array-like
+ The array-like or dtype to check.
+
+ Returns
+ -------
+ boolean : Whether or not the array-like or dtype is of
+ a DatetimeTZDtype dtype.
+
+ Examples
+ --------
+ >>> is_datetime64tz_dtype(object)
+ False
+ >>> is_datetime64tz_dtype([1, 2, 3])
+ False
+ >>> is_datetime64tz_dtype(pd.DatetimeIndex([1, 2, 3])) # tz-naive
+ False
+ >>> is_datetime64tz_dtype(pd.DatetimeIndex([1, 2, 3], tz="US/Eastern"))
+ True
+
+ >>> dtype = DatetimeTZDtype("ns", tz="US/Eastern")
+ >>> s = pd.Series([], dtype=dtype)
+ >>> is_datetime64tz_dtype(dtype)
+ True
+ >>> is_datetime64tz_dtype(s)
+ True
+ """
+
+ if arr_or_dtype is None:
+ return False
+ return DatetimeTZDtype.is_dtype(arr_or_dtype)
+
+
+def is_timedelta64_dtype(arr_or_dtype):
+ """
+ Check whether an array-like or dtype is of the timedelta64 dtype.
+
+ Parameters
+ ----------
+ arr_or_dtype : array-like
+ The array-like or dtype to check.
+
+ Returns
+ -------
+ boolean : Whether or not the array-like or dtype is
+ of the timedelta64 dtype.
+
+ Examples
+ --------
+ >>> is_timedelta64_dtype(object)
+ False
+ >>> is_timedelta64_dtype(np.timedelta64)
+ True
+ >>> is_timedelta64_dtype([1, 2, 3])
+ False
+ >>> is_timedelta64_dtype(pd.Series([], dtype="timedelta64[ns]"))
+ True
+ >>> is_timedelta64_dtype('0 days')
+ False
+ """
+
+ return _is_dtype_type(arr_or_dtype, classes(np.timedelta64))
+
+
+def is_period_dtype(arr_or_dtype):
+ """
+ Check whether an array-like or dtype is of the Period dtype.
+
+ Parameters
+ ----------
+ arr_or_dtype : array-like
+ The array-like or dtype to check.
+
+ Returns
+ -------
+ boolean : Whether or not the array-like or dtype is of the Period dtype.
+
+ Examples
+ --------
+ >>> is_period_dtype(object)
+ False
+ >>> is_period_dtype(PeriodDtype(freq="D"))
+ True
+ >>> is_period_dtype([1, 2, 3])
+ False
+ >>> is_period_dtype(pd.Period("2017-01-01"))
+ False
+ >>> is_period_dtype(pd.PeriodIndex([], freq="A"))
+ True
+ """
+
+ # TODO: Consider making Period an instance of PeriodDtype
+ if arr_or_dtype is None:
+ return False
+ return PeriodDtype.is_dtype(arr_or_dtype)
+
+
+def is_interval_dtype(arr_or_dtype):
+ """
+ Check whether an array-like or dtype is of the Interval dtype.
+
+ Parameters
+ ----------
+ arr_or_dtype : array-like
+ The array-like or dtype to check.
+
+ Returns
+ -------
+ boolean : Whether or not the array-like or dtype is
+ of the Interval dtype.
+
+ Examples
+ --------
+ >>> is_interval_dtype(object)
+ False
+ >>> is_interval_dtype(IntervalDtype())
+ True
+ >>> is_interval_dtype([1, 2, 3])
+ False
+ >>>
+ >>> interval = pd.Interval(1, 2, closed="right")
+ >>> is_interval_dtype(interval)
+ False
+ >>> is_interval_dtype(pd.IntervalIndex([interval]))
+ True
+ """
+
+ # TODO: Consider making Interval an instance of IntervalDtype
+ if arr_or_dtype is None:
+ return False
+ return IntervalDtype.is_dtype(arr_or_dtype)
+
+
+def is_categorical_dtype(arr_or_dtype):
+ """
+ Check whether an array-like or dtype is of the Categorical dtype.
+
+ Parameters
+ ----------
+ arr_or_dtype : array-like
+ The array-like or dtype to check.
+
+ Returns
+ -------
+ boolean : Whether or not the array-like or dtype is
+ of the Categorical dtype.
+
+ Examples
+ --------
+ >>> is_categorical_dtype(object)
+ False
+ >>> is_categorical_dtype(CategoricalDtype())
+ True
+ >>> is_categorical_dtype([1, 2, 3])
+ False
+ >>> is_categorical_dtype(pd.Categorical([1, 2, 3]))
+ True
+ >>> is_categorical_dtype(pd.CategoricalIndex([1, 2, 3]))
+ True
+ """
+
+ if arr_or_dtype is None:
+ return False
+ return CategoricalDtype.is_dtype(arr_or_dtype)
+
+
+def is_string_dtype(arr_or_dtype):
+ """
+ Check whether the provided array or dtype is of the string dtype.
+
+ Parameters
+ ----------
+ arr_or_dtype : array-like
+ The array or dtype to check.
+
+ Returns
+ -------
+ boolean : Whether or not the array or dtype is of the string dtype.
+
+ Examples
+ --------
+ >>> is_string_dtype(str)
+ True
+ >>> is_string_dtype(object)
+ True
+ >>> is_string_dtype(int)
+ False
+ >>>
+ >>> is_string_dtype(np.array(['a', 'b']))
+ True
+ >>> is_string_dtype(pd.Series([1, 2]))
+ False
+ """
+
+ # TODO: gh-15585: consider making the checks stricter.
+ def condition(dtype):
+ return dtype.kind in ('O', 'S', 'U') and not is_period_dtype(dtype)
+ return _is_dtype(arr_or_dtype, condition)
+
+
+def is_period_arraylike(arr):
+ """
+ Check whether an array-like is a periodical array-like or PeriodIndex.
+
+ Parameters
+ ----------
+ arr : array-like
+ The array-like to check.
+
+ Returns
+ -------
+ boolean : Whether or not the array-like is a periodical
+ array-like or PeriodIndex instance.
+
+ Examples
+ --------
+ >>> is_period_arraylike([1, 2, 3])
+ False
+ >>> is_period_arraylike(pd.Index([1, 2, 3]))
+ False
+ >>> is_period_arraylike(pd.PeriodIndex(["2017-01-01"], freq="D"))
+ True
+ """
+
+ if isinstance(arr, (ABCPeriodIndex, ABCPeriodArray)):
+ return True
+ elif isinstance(arr, (np.ndarray, ABCSeries)):
+ return is_period_dtype(arr.dtype)
+ return getattr(arr, 'inferred_type', None) == 'period'
+
+
+def is_datetime_arraylike(arr):
+ """
+ Check whether an array-like is a datetime array-like or DatetimeIndex.
+
+ Parameters
+ ----------
+ arr : array-like
+ The array-like to check.
+
+ Returns
+ -------
+ boolean : Whether or not the array-like is a datetime
+ array-like or DatetimeIndex.
+
+ Examples
+ --------
+ >>> is_datetime_arraylike([1, 2, 3])
+ False
+ >>> is_datetime_arraylike(pd.Index([1, 2, 3]))
+ False
+ >>> is_datetime_arraylike(pd.DatetimeIndex([1, 2, 3]))
+ True
+ """
+
+ if isinstance(arr, ABCDatetimeIndex):
+ return True
+ elif isinstance(arr, (np.ndarray, ABCSeries)):
+ return (is_object_dtype(arr.dtype)
+ and lib.infer_dtype(arr, skipna=False) == 'datetime')
+ return getattr(arr, 'inferred_type', None) == 'datetime'
+
+
+def is_datetimelike(arr):
+ """
+ Check whether an array-like is a datetime-like array-like.
+
+ Acceptable datetime-like objects are (but not limited to) datetime
+ indices, periodic indices, and timedelta indices.
+
+ Parameters
+ ----------
+ arr : array-like
+ The array-like to check.
+
+ Returns
+ -------
+ boolean : Whether or not the array-like is a datetime-like array-like.
+
+ Examples
+ --------
+ >>> is_datetimelike([1, 2, 3])
+ False
+ >>> is_datetimelike(pd.Index([1, 2, 3]))
+ False
+ >>> is_datetimelike(pd.DatetimeIndex([1, 2, 3]))
+ True
+ >>> is_datetimelike(pd.DatetimeIndex([1, 2, 3], tz="US/Eastern"))
+ True
+ >>> is_datetimelike(pd.PeriodIndex([], freq="A"))
+ True
+ >>> is_datetimelike(np.array([], dtype=np.datetime64))
+ True
+ >>> is_datetimelike(pd.Series([], dtype="timedelta64[ns]"))
+ True
+ >>>
+ >>> dtype = DatetimeTZDtype("ns", tz="US/Eastern")
+ >>> s = pd.Series([], dtype=dtype)
+ >>> is_datetimelike(s)
+ True
+ """
+
+ return (is_datetime64_dtype(arr) or is_datetime64tz_dtype(arr) or
+ is_timedelta64_dtype(arr) or
+ isinstance(arr, ABCPeriodIndex))
+
+
+def is_dtype_equal(source, target):
+ """
+ Check if two dtypes are equal.
+
+ Parameters
+ ----------
+ source : The first dtype to compare
+ target : The second dtype to compare
+
+ Returns
+    -------
+ boolean : Whether or not the two dtypes are equal.
+
+ Examples
+ --------
+ >>> is_dtype_equal(int, float)
+ False
+ >>> is_dtype_equal("int", int)
+ True
+ >>> is_dtype_equal(object, "category")
+ False
+ >>> is_dtype_equal(CategoricalDtype(), "category")
+ True
+ >>> is_dtype_equal(DatetimeTZDtype(), "datetime64")
+ False
+ """
+
+ try:
+ source = _get_dtype(source)
+ target = _get_dtype(target)
+ return source == target
+ except (TypeError, AttributeError):
+
+ # invalid comparison
+ # object == category will hit this
+ return False
+
+
+def is_dtype_union_equal(source, target):
+ """
+ Check whether two arrays have compatible dtypes to do a union.
+ numpy types are checked with ``is_dtype_equal``. Extension types are
+ checked separately.
+
+ Parameters
+ ----------
+ source : The first dtype to compare
+ target : The second dtype to compare
+
+    Returns
+    -------
+    boolean : Whether or not the two dtypes are compatible for a union.
+
+    Examples
+    --------
+    >>> is_dtype_union_equal("int", int)
+    True
+
+    >>> is_dtype_union_equal(CategoricalDtype(['a', 'b']),
+    ...                      CategoricalDtype(['b', 'c']))
+    True
+
+    >>> is_dtype_union_equal(CategoricalDtype(['a', 'b']),
+    ...                      CategoricalDtype(['b', 'c'], ordered=True))
+    False
+ """
+ source = _get_dtype(source)
+ target = _get_dtype(target)
+ if is_categorical_dtype(source) and is_categorical_dtype(target):
+ # ordered False for both
+ return source.ordered is target.ordered
+ return is_dtype_equal(source, target)
+
+
+def is_any_int_dtype(arr_or_dtype):
+ """Check whether the provided array or dtype is of an integer dtype.
+
+ In this function, timedelta64 instances are also considered "any-integer"
+ type objects and will return True.
+
+ This function is internal and should not be exposed in the public API.
+
+ .. versionchanged:: 0.24.0
+
+ The nullable Integer dtypes (e.g. pandas.Int64Dtype) are also considered
+ as integer by this function.
+
+ Parameters
+ ----------
+ arr_or_dtype : array-like
+ The array or dtype to check.
+
+ Returns
+ -------
+ boolean : Whether or not the array or dtype is of an integer dtype.
+
+ Examples
+ --------
+ >>> is_any_int_dtype(str)
+ False
+ >>> is_any_int_dtype(int)
+ True
+ >>> is_any_int_dtype(float)
+ False
+ >>> is_any_int_dtype(np.uint64)
+ True
+ >>> is_any_int_dtype(np.datetime64)
+ False
+ >>> is_any_int_dtype(np.timedelta64)
+ True
+ >>> is_any_int_dtype(np.array(['a', 'b']))
+ False
+ >>> is_any_int_dtype(pd.Series([1, 2]))
+ True
+ >>> is_any_int_dtype(np.array([], dtype=np.timedelta64))
+ True
+ >>> is_any_int_dtype(pd.Index([1, 2.])) # float
+ False
+ """
+
+ return _is_dtype_type(
+ arr_or_dtype, classes(np.integer, np.timedelta64))
+
+
+def is_integer_dtype(arr_or_dtype):
+ """
+ Check whether the provided array or dtype is of an integer dtype.
+
+    Unlike in `is_any_int_dtype`, timedelta64 instances will return False.
+
+ .. versionchanged:: 0.24.0
+
+ The nullable Integer dtypes (e.g. pandas.Int64Dtype) are also considered
+ as integer by this function.
+
+ Parameters
+ ----------
+ arr_or_dtype : array-like
+ The array or dtype to check.
+
+ Returns
+ -------
+ boolean : Whether or not the array or dtype is of an integer dtype
+ and not an instance of timedelta64.
+
+ Examples
+ --------
+ >>> is_integer_dtype(str)
+ False
+ >>> is_integer_dtype(int)
+ True
+ >>> is_integer_dtype(float)
+ False
+ >>> is_integer_dtype(np.uint64)
+ True
+ >>> is_integer_dtype('int8')
+ True
+ >>> is_integer_dtype('Int8')
+ True
+ >>> is_integer_dtype(pd.Int8Dtype)
+ True
+ >>> is_integer_dtype(np.datetime64)
+ False
+ >>> is_integer_dtype(np.timedelta64)
+ False
+ >>> is_integer_dtype(np.array(['a', 'b']))
+ False
+ >>> is_integer_dtype(pd.Series([1, 2]))
+ True
+ >>> is_integer_dtype(np.array([], dtype=np.timedelta64))
+ False
+ >>> is_integer_dtype(pd.Index([1, 2.])) # float
+ False
+ """
+
+ return _is_dtype_type(
+ arr_or_dtype, classes_and_not_datetimelike(np.integer))
+
+
+def is_signed_integer_dtype(arr_or_dtype):
+ """
+ Check whether the provided array or dtype is of a signed integer dtype.
+
+    Unlike in `is_any_int_dtype`, timedelta64 instances will return False.
+
+ .. versionchanged:: 0.24.0
+
+ The nullable Integer dtypes (e.g. pandas.Int64Dtype) are also considered
+ as integer by this function.
+
+ Parameters
+ ----------
+ arr_or_dtype : array-like
+ The array or dtype to check.
+
+ Returns
+ -------
+ boolean : Whether or not the array or dtype is of a signed integer dtype
+ and not an instance of timedelta64.
+
+ Examples
+ --------
+ >>> is_signed_integer_dtype(str)
+ False
+ >>> is_signed_integer_dtype(int)
+ True
+ >>> is_signed_integer_dtype(float)
+ False
+ >>> is_signed_integer_dtype(np.uint64) # unsigned
+ False
+ >>> is_signed_integer_dtype('int8')
+ True
+ >>> is_signed_integer_dtype('Int8')
+ True
+    >>> is_signed_integer_dtype(pd.Int8Dtype)
+ True
+ >>> is_signed_integer_dtype(np.datetime64)
+ False
+ >>> is_signed_integer_dtype(np.timedelta64)
+ False
+ >>> is_signed_integer_dtype(np.array(['a', 'b']))
+ False
+ >>> is_signed_integer_dtype(pd.Series([1, 2]))
+ True
+ >>> is_signed_integer_dtype(np.array([], dtype=np.timedelta64))
+ False
+ >>> is_signed_integer_dtype(pd.Index([1, 2.])) # float
+ False
+ >>> is_signed_integer_dtype(np.array([1, 2], dtype=np.uint32)) # unsigned
+ False
+ """
+
+ return _is_dtype_type(
+ arr_or_dtype, classes_and_not_datetimelike(np.signedinteger))
+
+
+def is_unsigned_integer_dtype(arr_or_dtype):
+ """
+ Check whether the provided array or dtype is of an unsigned integer dtype.
+
+ .. versionchanged:: 0.24.0
+
+ The nullable Integer dtypes (e.g. pandas.UInt64Dtype) are also
+ considered as integer by this function.
+
+ Parameters
+ ----------
+ arr_or_dtype : array-like
+ The array or dtype to check.
+
+ Returns
+ -------
+ boolean : Whether or not the array or dtype is of an
+ unsigned integer dtype.
+
+ Examples
+ --------
+ >>> is_unsigned_integer_dtype(str)
+ False
+ >>> is_unsigned_integer_dtype(int) # signed
+ False
+ >>> is_unsigned_integer_dtype(float)
+ False
+ >>> is_unsigned_integer_dtype(np.uint64)
+ True
+ >>> is_unsigned_integer_dtype('uint8')
+ True
+ >>> is_unsigned_integer_dtype('UInt8')
+ True
+ >>> is_unsigned_integer_dtype(pd.UInt8Dtype)
+ True
+ >>> is_unsigned_integer_dtype(np.array(['a', 'b']))
+ False
+ >>> is_unsigned_integer_dtype(pd.Series([1, 2])) # signed
+ False
+ >>> is_unsigned_integer_dtype(pd.Index([1, 2.])) # float
+ False
+ >>> is_unsigned_integer_dtype(np.array([1, 2], dtype=np.uint32))
+ True
+ """
+ return _is_dtype_type(
+ arr_or_dtype, classes_and_not_datetimelike(np.unsignedinteger))
+
+
+def is_int64_dtype(arr_or_dtype):
+ """
+ Check whether the provided array or dtype is of the int64 dtype.
+
+ Parameters
+ ----------
+ arr_or_dtype : array-like
+ The array or dtype to check.
+
+ Returns
+ -------
+ boolean : Whether or not the array or dtype is of the int64 dtype.
+
+ Notes
+ -----
+ Depending on system architecture, the return value of `is_int64_dtype(
+ int)` will be True if the OS uses 64-bit integers and False if the OS
+ uses 32-bit integers.
+
+ Examples
+ --------
+ >>> is_int64_dtype(str)
+ False
+ >>> is_int64_dtype(np.int32)
+ False
+ >>> is_int64_dtype(np.int64)
+ True
+ >>> is_int64_dtype('int8')
+ False
+ >>> is_int64_dtype('Int8')
+ False
+ >>> is_int64_dtype(pd.Int64Dtype)
+ True
+ >>> is_int64_dtype(float)
+ False
+ >>> is_int64_dtype(np.uint64) # unsigned
+ False
+ >>> is_int64_dtype(np.array(['a', 'b']))
+ False
+ >>> is_int64_dtype(np.array([1, 2], dtype=np.int64))
+ True
+ >>> is_int64_dtype(pd.Index([1, 2.])) # float
+ False
+ >>> is_int64_dtype(np.array([1, 2], dtype=np.uint32)) # unsigned
+ False
+ """
+
+ return _is_dtype_type(arr_or_dtype, classes(np.int64))
+
+
+def is_datetime64_any_dtype(arr_or_dtype):
+ """
+ Check whether the provided array or dtype is of the datetime64 dtype.
+
+ Parameters
+ ----------
+ arr_or_dtype : array-like
+ The array or dtype to check.
+
+ Returns
+ -------
+ boolean : Whether or not the array or dtype is of the datetime64 dtype.
+
+ Examples
+ --------
+ >>> is_datetime64_any_dtype(str)
+ False
+ >>> is_datetime64_any_dtype(int)
+ False
+ >>> is_datetime64_any_dtype(np.datetime64) # can be tz-naive
+ True
+ >>> is_datetime64_any_dtype(DatetimeTZDtype("ns", "US/Eastern"))
+ True
+ >>> is_datetime64_any_dtype(np.array(['a', 'b']))
+ False
+ >>> is_datetime64_any_dtype(np.array([1, 2]))
+ False
+ >>> is_datetime64_any_dtype(np.array([], dtype=np.datetime64))
+ True
+    >>> is_datetime64_any_dtype(pd.DatetimeIndex([1, 2, 3],
+    ...                                          dtype=np.datetime64))
+ True
+ """
+
+ if arr_or_dtype is None:
+ return False
+ return (is_datetime64_dtype(arr_or_dtype) or
+ is_datetime64tz_dtype(arr_or_dtype))
+
+
+def is_datetime64_ns_dtype(arr_or_dtype):
+ """
+ Check whether the provided array or dtype is of the datetime64[ns] dtype.
+
+ Parameters
+ ----------
+ arr_or_dtype : array-like
+ The array or dtype to check.
+
+ Returns
+ -------
+ boolean : Whether or not the array or dtype is of the datetime64[ns] dtype.
+
+ Examples
+ --------
+ >>> is_datetime64_ns_dtype(str)
+ False
+ >>> is_datetime64_ns_dtype(int)
+ False
+ >>> is_datetime64_ns_dtype(np.datetime64) # no unit
+ False
+ >>> is_datetime64_ns_dtype(DatetimeTZDtype("ns", "US/Eastern"))
+ True
+ >>> is_datetime64_ns_dtype(np.array(['a', 'b']))
+ False
+ >>> is_datetime64_ns_dtype(np.array([1, 2]))
+ False
+ >>> is_datetime64_ns_dtype(np.array([], dtype=np.datetime64)) # no unit
+ False
+    >>> is_datetime64_ns_dtype(np.array([],
+    ...                        dtype="datetime64[ps]"))  # wrong unit
+ False
+    >>> is_datetime64_ns_dtype(pd.DatetimeIndex([1, 2, 3],
+    ...                        dtype=np.datetime64))  # has 'ns' unit
+ True
+ """
+
+ if arr_or_dtype is None:
+ return False
+ try:
+ tipo = _get_dtype(arr_or_dtype)
+ except TypeError:
+ if is_datetime64tz_dtype(arr_or_dtype):
+ tipo = _get_dtype(arr_or_dtype.dtype)
+ else:
+ return False
+ return tipo == _NS_DTYPE or getattr(tipo, 'base', None) == _NS_DTYPE
+
+
+def is_timedelta64_ns_dtype(arr_or_dtype):
+ """
+ Check whether the provided array or dtype is of the timedelta64[ns] dtype.
+
+ This is a very specific dtype, so generic ones like `np.timedelta64`
+ will return False if passed into this function.
+
+ Parameters
+ ----------
+ arr_or_dtype : array-like
+ The array or dtype to check.
+
+ Returns
+ -------
+ boolean : Whether or not the array or dtype is of the
+ timedelta64[ns] dtype.
+
+ Examples
+ --------
+ >>> is_timedelta64_ns_dtype(np.dtype('m8[ns]'))
+ True
+ >>> is_timedelta64_ns_dtype(np.dtype('m8[ps]')) # Wrong frequency
+ False
+ >>> is_timedelta64_ns_dtype(np.array([1, 2], dtype='m8[ns]'))
+ True
+ >>> is_timedelta64_ns_dtype(np.array([1, 2], dtype=np.timedelta64))
+ False
+ """
+ return _is_dtype(arr_or_dtype, lambda dtype: dtype == _TD_DTYPE)
+
+
+def is_datetime_or_timedelta_dtype(arr_or_dtype):
+ """
+ Check whether the provided array or dtype is of
+ a timedelta64 or datetime64 dtype.
+
+ Parameters
+ ----------
+ arr_or_dtype : array-like
+ The array or dtype to check.
+
+ Returns
+ -------
+ boolean : Whether or not the array or dtype is of a
+ timedelta64, or datetime64 dtype.
+
+ Examples
+ --------
+ >>> is_datetime_or_timedelta_dtype(str)
+ False
+ >>> is_datetime_or_timedelta_dtype(int)
+ False
+ >>> is_datetime_or_timedelta_dtype(np.datetime64)
+ True
+ >>> is_datetime_or_timedelta_dtype(np.timedelta64)
+ True
+ >>> is_datetime_or_timedelta_dtype(np.array(['a', 'b']))
+ False
+ >>> is_datetime_or_timedelta_dtype(pd.Series([1, 2]))
+ False
+ >>> is_datetime_or_timedelta_dtype(np.array([], dtype=np.timedelta64))
+ True
+ >>> is_datetime_or_timedelta_dtype(np.array([], dtype=np.datetime64))
+ True
+ """
+
+ return _is_dtype_type(
+ arr_or_dtype, classes(np.datetime64, np.timedelta64))
+
+
+def _is_unorderable_exception(e):
+ """
+ Check if the exception raised is an unorderable exception.
+
+ The error message differs for 3 <= PY <= 3.5 and PY >= 3.6, so
+ we need to condition based on Python version.
+
+ Parameters
+ ----------
+ e : Exception or sub-class
+ The exception object to check.
+
+ Returns
+ -------
+ boolean : Whether or not the exception raised is an unorderable exception.
+ """
+
+ if PY36:
+ return "'>' not supported between instances of" in str(e)
+
+ elif PY3:
+ return 'unorderable' in str(e)
+ return False
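+
+# Illustrative sketch (not from the original module): this helper is meant
+# to wrap comparisons that may raise on Python 3, e.g. for object arrays of
+# mixed types; the fallback shown here is an assumption, not pandas' code.
+#
+#   try:
+#       result = left > right            # may raise TypeError on PY3
+#   except TypeError as err:
+#       if _is_unorderable_exception(err):
+#           result = np.zeros(len(left), dtype=bool)
+#       else:
+#           raise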
+
+
+def is_numeric_v_string_like(a, b):
+ """
+ Check if we are comparing a string-like object to a numeric ndarray.
+
+ NumPy doesn't like to compare such objects, especially numeric arrays
+ and scalar string-likes.
+
+ Parameters
+ ----------
+ a : array-like, scalar
+ The first object to check.
+ b : array-like, scalar
+ The second object to check.
+
+ Returns
+ -------
+    boolean : Whether we are comparing a string-like
+        object to a numeric array.
+
+ Examples
+ --------
+ >>> is_numeric_v_string_like(1, 1)
+ False
+ >>> is_numeric_v_string_like("foo", "foo")
+ False
+ >>> is_numeric_v_string_like(1, "foo") # non-array numeric
+ False
+ >>> is_numeric_v_string_like(np.array([1]), "foo")
+ True
+ >>> is_numeric_v_string_like("foo", np.array([1])) # symmetric check
+ True
+ >>> is_numeric_v_string_like(np.array([1, 2]), np.array(["foo"]))
+ True
+ >>> is_numeric_v_string_like(np.array(["foo"]), np.array([1, 2]))
+ True
+ >>> is_numeric_v_string_like(np.array([1]), np.array([2]))
+ False
+ >>> is_numeric_v_string_like(np.array(["foo"]), np.array(["foo"]))
+ False
+ """
+
+ is_a_array = isinstance(a, np.ndarray)
+ is_b_array = isinstance(b, np.ndarray)
+
+ is_a_numeric_array = is_a_array and is_numeric_dtype(a)
+ is_b_numeric_array = is_b_array and is_numeric_dtype(b)
+ is_a_string_array = is_a_array and is_string_like_dtype(a)
+ is_b_string_array = is_b_array and is_string_like_dtype(b)
+
+ is_a_scalar_string_like = not is_a_array and is_string_like(a)
+ is_b_scalar_string_like = not is_b_array and is_string_like(b)
+
+ return ((is_a_numeric_array and is_b_scalar_string_like) or
+ (is_b_numeric_array and is_a_scalar_string_like) or
+ (is_a_numeric_array and is_b_string_array) or
+ (is_b_numeric_array and is_a_string_array))
+
+
+def is_datetimelike_v_numeric(a, b):
+ """
+ Check if we are comparing a datetime-like object to a numeric object.
+
+ By "numeric," we mean an object that is either of an int or float dtype.
+
+ Parameters
+ ----------
+ a : array-like, scalar
+ The first object to check.
+ b : array-like, scalar
+ The second object to check.
+
+ Returns
+ -------
+    boolean : Whether we are comparing a datetime-like
+        object to a numeric object.
+
+ Examples
+ --------
+ >>> dt = np.datetime64(pd.datetime(2017, 1, 1))
+ >>>
+ >>> is_datetimelike_v_numeric(1, 1)
+ False
+ >>> is_datetimelike_v_numeric(dt, dt)
+ False
+ >>> is_datetimelike_v_numeric(1, dt)
+ True
+ >>> is_datetimelike_v_numeric(dt, 1) # symmetric check
+ True
+ >>> is_datetimelike_v_numeric(np.array([dt]), 1)
+ True
+ >>> is_datetimelike_v_numeric(np.array([1]), dt)
+ True
+ >>> is_datetimelike_v_numeric(np.array([dt]), np.array([1]))
+ True
+ >>> is_datetimelike_v_numeric(np.array([1]), np.array([2]))
+ False
+ >>> is_datetimelike_v_numeric(np.array([dt]), np.array([dt]))
+ False
+ """
+
+ if not hasattr(a, 'dtype'):
+ a = np.asarray(a)
+ if not hasattr(b, 'dtype'):
+ b = np.asarray(b)
+
+ def is_numeric(x):
+ """
+ Check if an object has a numeric dtype (i.e. integer or float).
+ """
+ return is_integer_dtype(x) or is_float_dtype(x)
+
+ is_datetimelike = needs_i8_conversion
+ return ((is_datetimelike(a) and is_numeric(b)) or
+ (is_datetimelike(b) and is_numeric(a)))
+
+
+def is_datetimelike_v_object(a, b):
+ """
+ Check if we are comparing a datetime-like object to an object instance.
+
+ Parameters
+ ----------
+ a : array-like, scalar
+ The first object to check.
+ b : array-like, scalar
+ The second object to check.
+
+ Returns
+ -------
+    boolean : Whether we are comparing a datetime-like
+        object to an object instance.
+
+ Examples
+ --------
+ >>> obj = object()
+ >>> dt = np.datetime64(pd.datetime(2017, 1, 1))
+ >>>
+ >>> is_datetimelike_v_object(obj, obj)
+ False
+ >>> is_datetimelike_v_object(dt, dt)
+ False
+ >>> is_datetimelike_v_object(obj, dt)
+ True
+ >>> is_datetimelike_v_object(dt, obj) # symmetric check
+ True
+ >>> is_datetimelike_v_object(np.array([dt]), obj)
+ True
+ >>> is_datetimelike_v_object(np.array([obj]), dt)
+ True
+ >>> is_datetimelike_v_object(np.array([dt]), np.array([obj]))
+ True
+ >>> is_datetimelike_v_object(np.array([obj]), np.array([obj]))
+ False
+ >>> is_datetimelike_v_object(np.array([dt]), np.array([1]))
+ False
+ >>> is_datetimelike_v_object(np.array([dt]), np.array([dt]))
+ False
+ """
+
+ if not hasattr(a, 'dtype'):
+ a = np.asarray(a)
+ if not hasattr(b, 'dtype'):
+ b = np.asarray(b)
+
+ is_datetimelike = needs_i8_conversion
+ return ((is_datetimelike(a) and is_object_dtype(b)) or
+ (is_datetimelike(b) and is_object_dtype(a)))
+
+
+def needs_i8_conversion(arr_or_dtype):
+ """
+ Check whether the array or dtype should be converted to int64.
+
+ An array-like or dtype "needs" such a conversion if the array-like
+    or dtype is of a datetime-like dtype.
+
+ Parameters
+ ----------
+ arr_or_dtype : array-like
+ The array or dtype to check.
+
+ Returns
+ -------
+ boolean : Whether or not the array or dtype should be converted to int64.
+
+ Examples
+ --------
+ >>> needs_i8_conversion(str)
+ False
+ >>> needs_i8_conversion(np.int64)
+ False
+ >>> needs_i8_conversion(np.datetime64)
+ True
+ >>> needs_i8_conversion(np.array(['a', 'b']))
+ False
+ >>> needs_i8_conversion(pd.Series([1, 2]))
+ False
+ >>> needs_i8_conversion(pd.Series([], dtype="timedelta64[ns]"))
+ True
+ >>> needs_i8_conversion(pd.DatetimeIndex([1, 2, 3], tz="US/Eastern"))
+ True
+ """
+
+ if arr_or_dtype is None:
+ return False
+ return (is_datetime_or_timedelta_dtype(arr_or_dtype) or
+ is_datetime64tz_dtype(arr_or_dtype) or
+ is_period_dtype(arr_or_dtype))
+
+
+def is_numeric_dtype(arr_or_dtype):
+ """
+ Check whether the provided array or dtype is of a numeric dtype.
+
+ Parameters
+ ----------
+ arr_or_dtype : array-like
+ The array or dtype to check.
+
+ Returns
+ -------
+ boolean : Whether or not the array or dtype is of a numeric dtype.
+
+ Examples
+ --------
+ >>> is_numeric_dtype(str)
+ False
+ >>> is_numeric_dtype(int)
+ True
+ >>> is_numeric_dtype(float)
+ True
+ >>> is_numeric_dtype(np.uint64)
+ True
+ >>> is_numeric_dtype(np.datetime64)
+ False
+ >>> is_numeric_dtype(np.timedelta64)
+ False
+ >>> is_numeric_dtype(np.array(['a', 'b']))
+ False
+ >>> is_numeric_dtype(pd.Series([1, 2]))
+ True
+ >>> is_numeric_dtype(pd.Index([1, 2.]))
+ True
+ >>> is_numeric_dtype(np.array([], dtype=np.timedelta64))
+ False
+ """
+
+ return _is_dtype_type(
+ arr_or_dtype, classes_and_not_datetimelike(np.number, np.bool_))
+
+
+def is_string_like_dtype(arr_or_dtype):
+ """
+ Check whether the provided array or dtype is of a string-like dtype.
+
+ Unlike `is_string_dtype`, the object dtype is excluded because it
+ is a mixed dtype.
+
+ Parameters
+ ----------
+ arr_or_dtype : array-like
+ The array or dtype to check.
+
+ Returns
+ -------
+ boolean : Whether or not the array or dtype is of the string dtype.
+
+ Examples
+ --------
+ >>> is_string_like_dtype(str)
+ True
+ >>> is_string_like_dtype(object)
+ False
+ >>> is_string_like_dtype(np.array(['a', 'b']))
+ True
+ >>> is_string_like_dtype(pd.Series([1, 2]))
+ False
+ """
+
+ return _is_dtype(
+ arr_or_dtype, lambda dtype: dtype.kind in ('S', 'U'))
+
+
+def is_float_dtype(arr_or_dtype):
+ """
+ Check whether the provided array or dtype is of a float dtype.
+
+ This function is internal and should not be exposed in the public API.
+
+ Parameters
+ ----------
+ arr_or_dtype : array-like
+ The array or dtype to check.
+
+ Returns
+ -------
+ boolean : Whether or not the array or dtype is of a float dtype.
+
+ Examples
+ --------
+ >>> is_float_dtype(str)
+ False
+ >>> is_float_dtype(int)
+ False
+ >>> is_float_dtype(float)
+ True
+ >>> is_float_dtype(np.array(['a', 'b']))
+ False
+ >>> is_float_dtype(pd.Series([1, 2]))
+ False
+ >>> is_float_dtype(pd.Index([1, 2.]))
+ True
+ """
+ return _is_dtype_type(arr_or_dtype, classes(np.floating))
+
+
+def is_bool_dtype(arr_or_dtype):
+ """
+ Check whether the provided array or dtype is of a boolean dtype.
+
+ Parameters
+ ----------
+ arr_or_dtype : array-like
+ The array or dtype to check.
+
+ Returns
+ -------
+ boolean : Whether or not the array or dtype is of a boolean dtype.
+
+ Notes
+ -----
+ An ExtensionArray is considered boolean when the ``_is_boolean``
+ attribute is set to True.
+
+ Examples
+ --------
+ >>> is_bool_dtype(str)
+ False
+ >>> is_bool_dtype(int)
+ False
+ >>> is_bool_dtype(bool)
+ True
+ >>> is_bool_dtype(np.bool)
+ True
+ >>> is_bool_dtype(np.array(['a', 'b']))
+ False
+ >>> is_bool_dtype(pd.Series([1, 2]))
+ False
+ >>> is_bool_dtype(np.array([True, False]))
+ True
+ >>> is_bool_dtype(pd.Categorical([True, False]))
+ True
+ >>> is_bool_dtype(pd.SparseArray([True, False]))
+ True
+ """
+ if arr_or_dtype is None:
+ return False
+ try:
+ dtype = _get_dtype(arr_or_dtype)
+ except TypeError:
+ return False
+
+ if isinstance(arr_or_dtype, CategoricalDtype):
+ arr_or_dtype = arr_or_dtype.categories
+ # now we use the special definition for Index
+
+ if isinstance(arr_or_dtype, ABCIndexClass):
+
+ # TODO(jreback)
+ # we don't have a boolean Index class
+        # so it's object, we need to infer to
+ # guess this
+ return (arr_or_dtype.is_object and
+ arr_or_dtype.inferred_type == 'boolean')
+ elif is_extension_array_dtype(arr_or_dtype):
+ dtype = getattr(arr_or_dtype, 'dtype', arr_or_dtype)
+ return dtype._is_boolean
+
+ return issubclass(dtype.type, np.bool_)
+
+
+def is_extension_type(arr):
+ """
+ Check whether an array-like is of a pandas extension class instance.
+
+ Extension classes include categoricals, pandas sparse objects (i.e.
+ classes represented within the pandas library and not ones external
+ to it like scipy sparse matrices), and datetime-like arrays.
+
+ Parameters
+ ----------
+ arr : array-like
+ The array-like to check.
+
+ Returns
+ -------
+ boolean : Whether or not the array-like is of a pandas
+ extension class instance.
+
+ Examples
+ --------
+ >>> is_extension_type([1, 2, 3])
+ False
+ >>> is_extension_type(np.array([1, 2, 3]))
+ False
+ >>>
+ >>> cat = pd.Categorical([1, 2, 3])
+ >>>
+ >>> is_extension_type(cat)
+ True
+ >>> is_extension_type(pd.Series(cat))
+ True
+ >>> is_extension_type(pd.SparseArray([1, 2, 3]))
+ True
+ >>> is_extension_type(pd.SparseSeries([1, 2, 3]))
+ True
+ >>>
+ >>> from scipy.sparse import bsr_matrix
+ >>> is_extension_type(bsr_matrix([1, 2, 3]))
+ False
+ >>> is_extension_type(pd.DatetimeIndex([1, 2, 3]))
+ False
+ >>> is_extension_type(pd.DatetimeIndex([1, 2, 3], tz="US/Eastern"))
+ True
+ >>>
+ >>> dtype = DatetimeTZDtype("ns", tz="US/Eastern")
+ >>> s = pd.Series([], dtype=dtype)
+ >>> is_extension_type(s)
+ True
+ """
+
+ if is_categorical(arr):
+ return True
+ elif is_sparse(arr):
+ return True
+ elif is_datetime64tz_dtype(arr):
+ return True
+ return False
+
+
+def is_extension_array_dtype(arr_or_dtype):
+ """
+ Check if an object is a pandas extension array type.
+
+    See the :ref:`User Guide <extending.extension-types>` for more.
+
+ Parameters
+ ----------
+ arr_or_dtype : object
+ For array-like input, the ``.dtype`` attribute will
+ be extracted.
+
+ Returns
+ -------
+ bool
+ Whether the `arr_or_dtype` is an extension array type.
+
+ Notes
+ -----
+ This checks whether an object implements the pandas extension
+ array interface. In pandas, this includes:
+
+ * Categorical
+ * Sparse
+ * Interval
+ * Period
+ * DatetimeArray
+ * TimedeltaArray
+
+ Third-party libraries may implement arrays or types satisfying
+ this interface as well.
+
+ Examples
+ --------
+ >>> from pandas.api.types import is_extension_array_dtype
+ >>> arr = pd.Categorical(['a', 'b'])
+ >>> is_extension_array_dtype(arr)
+ True
+ >>> is_extension_array_dtype(arr.dtype)
+ True
+
+ >>> arr = np.array(['a', 'b'])
+ >>> is_extension_array_dtype(arr.dtype)
+ False
+ """
+ dtype = getattr(arr_or_dtype, 'dtype', arr_or_dtype)
+ return (isinstance(dtype, ExtensionDtype) or
+ registry.find(dtype) is not None)
+
+
+def is_complex_dtype(arr_or_dtype):
+ """
+ Check whether the provided array or dtype is of a complex dtype.
+
+ Parameters
+ ----------
+ arr_or_dtype : array-like
+ The array or dtype to check.
+
+ Returns
+ -------
+    boolean : Whether or not the array or dtype is of a complex dtype.
+
+ Examples
+ --------
+ >>> is_complex_dtype(str)
+ False
+ >>> is_complex_dtype(int)
+ False
+ >>> is_complex_dtype(np.complex)
+ True
+ >>> is_complex_dtype(np.array(['a', 'b']))
+ False
+ >>> is_complex_dtype(pd.Series([1, 2]))
+ False
+ >>> is_complex_dtype(np.array([1 + 1j, 5]))
+ True
+ """
+
+ return _is_dtype_type(arr_or_dtype, classes(np.complexfloating))
+
+
+def _is_dtype(arr_or_dtype, condition):
+ """
+ Return a boolean if the condition is satisfied for the arr_or_dtype.
+
+ Parameters
+ ----------
+ arr_or_dtype : array-like, str, np.dtype, or ExtensionArrayType
+ The array-like or dtype object whose dtype we want to extract.
+ condition : callable[Union[np.dtype, ExtensionDtype]]
+
+ Returns
+ -------
+ bool
+
+ """
+
+ if arr_or_dtype is None:
+ return False
+ try:
+ dtype = _get_dtype(arr_or_dtype)
+ except (TypeError, ValueError, UnicodeEncodeError):
+ return False
+ return condition(dtype)
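+
+# Illustrative sketch (not from the original module): _is_dtype applies a
+# predicate to the resolved dtype and falls back to False when no dtype can
+# be extracted; expected results assuming the logic above:
+#
+#   _is_dtype(np.array([1, 2]), lambda dtype: dtype.kind == 'i')   # True
+#   _is_dtype('not-a-dtype', lambda dtype: dtype.kind == 'i')      # False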
+
+
+def _get_dtype(arr_or_dtype):
+ """
+ Get the dtype instance associated with an array
+ or dtype object.
+
+ Parameters
+ ----------
+ arr_or_dtype : array-like
+ The array-like or dtype object whose dtype we want to extract.
+
+ Returns
+ -------
+    obj_dtype : The extracted dtype instance from the
+ passed in array or dtype object.
+
+ Raises
+ ------
+ TypeError : The passed in object is None.
+ """
+
+ if arr_or_dtype is None:
+ raise TypeError("Cannot deduce dtype from null object")
+
+ # fastpath
+ elif isinstance(arr_or_dtype, np.dtype):
+ return arr_or_dtype
+ elif isinstance(arr_or_dtype, type):
+ return np.dtype(arr_or_dtype)
+
+ # if we have an array-like
+ elif hasattr(arr_or_dtype, 'dtype'):
+ arr_or_dtype = arr_or_dtype.dtype
+
+ return pandas_dtype(arr_or_dtype)
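+
+# Illustrative sketch (not from the original module): the kinds of input
+# _get_dtype is expected to normalize, assuming the branches above:
+#
+#   _get_dtype(np.int64)            # type object  -> np.dtype('int64')
+#   _get_dtype(pd.Series([1, 2]))   # array-like   -> its .dtype
+#   _get_dtype('category')          # string alias -> CategoricalDtype()
+#   _get_dtype(None)                # raises TypeError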
+
+
+def _is_dtype_type(arr_or_dtype, condition):
+ """
+ Return a boolean if the condition is satisfied for the arr_or_dtype.
+
+ Parameters
+ ----------
+ arr_or_dtype : array-like
+ The array-like or dtype object whose dtype we want to extract.
+ condition : callable[Union[np.dtype, ExtensionDtypeType]]
+
+ Returns
+ -------
+    bool : if the condition is satisfied for the arr_or_dtype
+ """
+
+ if arr_or_dtype is None:
+ return condition(type(None))
+
+ # fastpath
+ if isinstance(arr_or_dtype, np.dtype):
+ return condition(arr_or_dtype.type)
+ elif isinstance(arr_or_dtype, type):
+ if issubclass(arr_or_dtype, (PandasExtensionDtype, ExtensionDtype)):
+ arr_or_dtype = arr_or_dtype.type
+ return condition(np.dtype(arr_or_dtype).type)
+ elif arr_or_dtype is None:
+ return condition(type(None))
+
+ # if we have an array-like
+ if hasattr(arr_or_dtype, 'dtype'):
+ arr_or_dtype = arr_or_dtype.dtype
+
+ # we are not possibly a dtype
+ elif is_list_like(arr_or_dtype):
+ return condition(type(None))
+
+ try:
+ tipo = pandas_dtype(arr_or_dtype).type
+ except (TypeError, ValueError, UnicodeEncodeError):
+ if is_scalar(arr_or_dtype):
+ return condition(type(None))
+
+ return False
+
+ return condition(tipo)
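+
+# Illustrative sketch (not from the original module): _is_dtype_type reduces
+# arr_or_dtype to a dtype.type and applies the condition, handing non-dtype
+# inputs (lists, scalars) ``type(None)`` instead; for example, with the
+# ``classes`` helper defined earlier in this module:
+#
+#   _is_dtype_type(np.int32, classes(np.integer))     # True
+#   _is_dtype_type([1, 2, 3], classes(np.integer))    # False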
+
+
+def infer_dtype_from_object(dtype):
+ """
+ Get a numpy dtype.type-style object for a dtype object.
+
+    This method also includes handling of the datetime64[ns] and
+ datetime64[ns, TZ] objects.
+
+ If no dtype can be found, we return ``object``.
+
+ Parameters
+ ----------
+ dtype : dtype, type
+ The dtype object whose numpy dtype.type-style
+ object we want to extract.
+
+ Returns
+ -------
+ dtype_object : The extracted numpy dtype.type-style object.
+ """
+
+ if isinstance(dtype, type) and issubclass(dtype, np.generic):
+ # Type object from a dtype
+ return dtype
+ elif isinstance(dtype, (np.dtype, PandasExtensionDtype, ExtensionDtype)):
+ # dtype object
+ try:
+ _validate_date_like_dtype(dtype)
+ except TypeError:
+ # Should still pass if we don't have a date-like
+ pass
+ return dtype.type
+
+ try:
+ dtype = pandas_dtype(dtype)
+ except TypeError:
+ pass
+
+ if is_extension_array_dtype(dtype):
+ return dtype.type
+ elif isinstance(dtype, string_types):
+
+ # TODO(jreback)
+ # should deprecate these
+ if dtype in ['datetimetz', 'datetime64tz']:
+ return DatetimeTZDtype.type
+ elif dtype in ['period']:
+ raise NotImplementedError
+
+ if dtype == 'datetime' or dtype == 'timedelta':
+ dtype += '64'
+ try:
+ return infer_dtype_from_object(getattr(np, dtype))
+ except (AttributeError, TypeError):
+ # Handles cases like _get_dtype(int) i.e.,
+ # Python objects that are valid dtypes
+ # (unlike user-defined types, in general)
+ #
+ # TypeError handles the float16 type code of 'e'
+ # further handle internal types
+ pass
+
+ return infer_dtype_from_object(np.dtype(dtype))
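+
+# Illustrative sketch (not from the original module): expected resolutions,
+# assuming the branches above:
+#
+#   infer_dtype_from_object('int64')           # -> np.int64
+#   infer_dtype_from_object('datetime64[ns]')  # -> np.datetime64
+#   infer_dtype_from_object('category')        # -> CategoricalDtypeType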
+
+
+def _validate_date_like_dtype(dtype):
+ """
+ Check whether the dtype is a date-like dtype. Raises an error if invalid.
+
+ Parameters
+ ----------
+ dtype : dtype, type
+ The dtype to check.
+
+ Raises
+ ------
+ TypeError : The dtype could not be casted to a date-like dtype.
+    ValueError : The dtype is an illegal date-like dtype (e.g. the
+        frequency provided is too specific)
+ """
+
+ try:
+ typ = np.datetime_data(dtype)[0]
+ except ValueError as e:
+ raise TypeError('{error}'.format(error=e))
+ if typ != 'generic' and typ != 'ns':
+ msg = '{name!r} is too specific of a frequency, try passing {type!r}'
+ raise ValueError(msg.format(name=dtype.name, type=dtype.type.__name__))
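+
+# Illustrative sketch (not from the original module): expected outcomes,
+# assuming the checks above:
+#
+#   _validate_date_like_dtype(np.dtype('datetime64[ns]'))  # passes
+#   _validate_date_like_dtype(np.dtype('datetime64[D]'))   # ValueError
+#   _validate_date_like_dtype(np.dtype('int64'))           # TypeError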
+
+
+def pandas_dtype(dtype):
+ """
+ Converts input into a pandas only dtype object or a numpy dtype object.
+
+ Parameters
+ ----------
+ dtype : object to be converted
+
+ Returns
+ -------
+ np.dtype or a pandas dtype
+
+ Raises
+ ------
+ TypeError if not a dtype
+ """
+ # short-circuit
+ if isinstance(dtype, np.ndarray):
+ return dtype.dtype
+ elif isinstance(dtype, (np.dtype, PandasExtensionDtype, ExtensionDtype)):
+ return dtype
+
+ # registered extension types
+ result = registry.find(dtype)
+ if result is not None:
+ return result
+
+ # try a numpy dtype
+ # raise a consistent TypeError if failed
+ try:
+ npdtype = np.dtype(dtype)
+ except Exception:
+ # we don't want to force a repr of the non-string
+ if not isinstance(dtype, string_types):
+ raise TypeError("data type not understood")
+ raise TypeError("data type '{}' not understood".format(
+ dtype))
+
+ # Any invalid dtype (such as pd.Timestamp) should raise an error.
+ # np.dtype(invalid_type).kind = 0 for such objects. However, this will
+ # also catch some valid dtypes such as object, np.object_ and 'object'
+ # which we safeguard against by catching them earlier and returning
+ # np.dtype(valid_dtype) before this condition is evaluated.
+ if is_hashable(dtype) and dtype in [object, np.object_, 'object', 'O']:
+ # check hashability to avoid errors/DeprecationWarning when we get
+ # here and `dtype` is an array
+ return npdtype
+ elif npdtype.kind == 'O':
+ raise TypeError("dtype '{}' not understood".format(dtype))
+
+ return npdtype
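+
+# Illustrative sketch (not from the original module): resolution goes through
+# the registered extension dtypes first, then numpy; expected behaviour,
+# assuming the branches above:
+#
+#   pandas_dtype('int64')                # -> np.dtype('int64')
+#   pandas_dtype('category')             # -> CategoricalDtype()
+#   pandas_dtype('datetime64[ns, UTC]')  # -> DatetimeTZDtype (once registered)
+#   pandas_dtype(pd.Timestamp)           # raises TypeError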
diff --git a/contrib/python/pandas/py2/pandas/core/dtypes/concat.py b/contrib/python/pandas/py2/pandas/core/dtypes/concat.py
new file mode 100644
index 00000000000..aada777deca
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/core/dtypes/concat.py
@@ -0,0 +1,583 @@
+"""
+Utility functions related to concat
+"""
+
+import numpy as np
+
+from pandas._libs import tslib, tslibs
+
+from pandas.core.dtypes.common import (
+ _NS_DTYPE, _TD_DTYPE, is_bool_dtype, is_categorical_dtype,
+ is_datetime64_dtype, is_datetime64tz_dtype, is_dtype_equal,
+ is_extension_array_dtype, is_object_dtype, is_sparse, is_timedelta64_dtype)
+from pandas.core.dtypes.generic import (
+ ABCDatetimeArray, ABCDatetimeIndex, ABCIndexClass, ABCPeriodIndex,
+ ABCRangeIndex, ABCSparseDataFrame, ABCTimedeltaIndex)
+
+from pandas import compat
+
+
+def get_dtype_kinds(l):
+ """
+ Parameters
+ ----------
+ l : list of arrays
+
+ Returns
+ -------
+ a set of kinds that exist in this list of arrays
+ """
+
+ typs = set()
+ for arr in l:
+
+ dtype = arr.dtype
+ if is_categorical_dtype(dtype):
+ typ = 'category'
+ elif is_sparse(arr):
+ typ = 'sparse'
+ elif isinstance(arr, ABCRangeIndex):
+ typ = 'range'
+ elif is_datetime64tz_dtype(arr):
+ # if to_concat contains different tz,
+ # the result must be object dtype
+ typ = str(arr.dtype)
+ elif is_datetime64_dtype(dtype):
+ typ = 'datetime'
+ elif is_timedelta64_dtype(dtype):
+ typ = 'timedelta'
+ elif is_object_dtype(dtype):
+ typ = 'object'
+ elif is_bool_dtype(dtype):
+ typ = 'bool'
+ elif is_extension_array_dtype(dtype):
+ typ = str(arr.dtype)
+ else:
+ typ = dtype.kind
+ typs.add(typ)
+ return typs
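+
+# Illustrative sketch (not from the original module): the kind labels
+# produced for a mixed list, assuming the branches above:
+#
+#   get_dtype_kinds([np.array([1, 2]), np.array([1.5])])
+#   # -> {'i', 'f'}
+#   get_dtype_kinds([pd.Categorical(['a']), np.array(['a'], dtype=object)])
+#   # -> {'category', 'object'}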
+
+
+def _get_series_result_type(result, objs=None):
+ """
+ return appropriate class of Series concat
+ input is either dict or array-like
+ """
+ from pandas import SparseSeries, SparseDataFrame, DataFrame
+
+ # concat Series with axis 1
+ if isinstance(result, dict):
+ # concat Series with axis 1
+ if all(isinstance(c, (SparseSeries, SparseDataFrame))
+ for c in compat.itervalues(result)):
+ return SparseDataFrame
+ else:
+ return DataFrame
+
+ # otherwise it is a SingleBlockManager (axis = 0)
+ if result._block.is_sparse:
+ return SparseSeries
+ else:
+ return objs[0]._constructor
+
+
+def _get_frame_result_type(result, objs):
+ """
+ return appropriate class of DataFrame-like concat
+ if all blocks are sparse, return SparseDataFrame
+ otherwise, return 1st obj
+ """
+
+ if (result.blocks and (
+ all(is_sparse(b) for b in result.blocks) or
+ all(isinstance(obj, ABCSparseDataFrame) for obj in objs))):
+ from pandas.core.sparse.api import SparseDataFrame
+ return SparseDataFrame
+ else:
+ return next(obj for obj in objs if not isinstance(obj,
+ ABCSparseDataFrame))
+
+
+def _concat_compat(to_concat, axis=0):
+ """
+    provide concatenation of an array of arrays, each of which is a single
+    'normalized' dtype (in that, for example, if it's object, then it is
+    non-datetimelike), and provide a combined dtype for the resulting array
+    that preserves the overall dtype if possible
+
+ Parameters
+ ----------
+ to_concat : array of arrays
+ axis : axis to provide concatenation
+
+ Returns
+ -------
+ a single array, preserving the combined dtypes
+ """
+
+ # filter empty arrays
+ # 1-d dtypes always are included here
+ def is_nonempty(x):
+ try:
+ return x.shape[axis] > 0
+ except Exception:
+ return True
+
+ nonempty = [x for x in to_concat if is_nonempty(x)]
+
+ # If all arrays are empty, there's nothing to convert, just short-cut to
+ # the concatenation, #3121.
+ #
+ # Creating an empty array directly is tempting, but the winnings would be
+ # marginal given that it would still require shape & dtype calculation and
+ # np.concatenate which has them both implemented is compiled.
+
+ typs = get_dtype_kinds(to_concat)
+ _contains_datetime = any(typ.startswith('datetime') for typ in typs)
+ _contains_period = any(typ.startswith('period') for typ in typs)
+
+ if 'category' in typs:
+        # this must be prior to _concat_datetime,
+ # to support Categorical + datetime-like
+ return _concat_categorical(to_concat, axis=axis)
+
+ elif _contains_datetime or 'timedelta' in typs or _contains_period:
+ return _concat_datetime(to_concat, axis=axis, typs=typs)
+
+ # these are mandated to handle empties as well
+ elif 'sparse' in typs:
+ return _concat_sparse(to_concat, axis=axis, typs=typs)
+
+ extensions = [is_extension_array_dtype(x) for x in to_concat]
+ if any(extensions) and axis == 1:
+ to_concat = [np.atleast_2d(x.astype('object')) for x in to_concat]
+
+ if not nonempty:
+ # we have all empties, but may need to coerce the result dtype to
+ # object if we have non-numeric type operands (numpy would otherwise
+ # cast this to float)
+ typs = get_dtype_kinds(to_concat)
+ if len(typs) != 1:
+
+ if (not len(typs - {'i', 'u', 'f'}) or
+ not len(typs - {'bool', 'i', 'u'})):
+ # let numpy coerce
+ pass
+ else:
+ # coerce to object
+ to_concat = [x.astype('object') for x in to_concat]
+
+ return np.concatenate(to_concat, axis=axis)
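+
+# Illustrative sketch (not from the original module): expected dtype
+# outcomes, assuming the dispatch above:
+#
+#   _concat_compat([np.array([1, 2]), np.array([3.5])])
+#   # numpy coerces -> float64 result
+#   _concat_compat([np.array([1, 2]), np.array(['a'], dtype=object)])
+#   # mixed kinds -> object-dtype result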
+
+
+def _concat_categorical(to_concat, axis=0):
+ """Concatenate an object/categorical array of arrays, each of which is a
+ single dtype
+
+ Parameters
+ ----------
+ to_concat : array of arrays
+ axis : int
+        Axis to provide concatenation; in the current implementation this is
+        always 0, e.g. we only have 1D categoricals
+
+ Returns
+ -------
+ Categorical
+ A single array, preserving the combined dtypes
+ """
+
+ # we could have object blocks and categoricals here
+    # if we only have categoricals then combine everything
+    # else it's a non-compat categorical
+ categoricals = [x for x in to_concat if is_categorical_dtype(x.dtype)]
+
+ # validate the categories
+ if len(categoricals) != len(to_concat):
+ pass
+ else:
+ # when all categories are identical
+ first = to_concat[0]
+ if all(first.is_dtype_equal(other) for other in to_concat[1:]):
+ return union_categoricals(categoricals)
+
+ # extract the categoricals & coerce to object if needed
+ to_concat = [x.get_values() if is_categorical_dtype(x.dtype)
+ else np.asarray(x).ravel() if not is_datetime64tz_dtype(x)
+ else np.asarray(x.astype(object)) for x in to_concat]
+ result = _concat_compat(to_concat)
+ if axis == 1:
+ result = result.reshape(1, len(result))
+ return result
+
+
+def union_categoricals(to_union, sort_categories=False, ignore_order=False):
+ """
+ Combine list-like of Categorical-like, unioning categories. All
+ categories must have the same dtype.
+
+ .. versionadded:: 0.19.0
+
+ Parameters
+ ----------
+ to_union : list-like of Categorical, CategoricalIndex,
+ or Series with dtype='category'
+ sort_categories : boolean, default False
+ If true, resulting categories will be lexsorted, otherwise
+ they will be ordered as they appear in the data.
+ ignore_order : boolean, default False
+ If true, the ordered attribute of the Categoricals will be ignored.
+ Results in an unordered categorical.
+
+ .. versionadded:: 0.20.0
+
+ Returns
+ -------
+ result : Categorical
+
+ Raises
+ ------
+ TypeError
+ - all inputs do not have the same dtype
+ - all inputs do not have the same ordered property
+ - all inputs are ordered and their categories are not identical
+ - sort_categories=True and Categoricals are ordered
+ ValueError
+ Empty list of categoricals passed
+
+ Notes
+ -----
+
+ To learn more about categories, see `link
+ <http://pandas.pydata.org/pandas-docs/stable/categorical.html#unioning>`__
+
+ Examples
+ --------
+
+ >>> from pandas.api.types import union_categoricals
+
+ If you want to combine categoricals that do not necessarily have
+ the same categories, `union_categoricals` will combine a list-like
+ of categoricals. The new categories will be the union of the
+ categories being combined.
+
+ >>> a = pd.Categorical(["b", "c"])
+ >>> b = pd.Categorical(["a", "b"])
+ >>> union_categoricals([a, b])
+ [b, c, a, b]
+ Categories (3, object): [b, c, a]
+
+ By default, the resulting categories will be ordered as they appear
+ in the `categories` of the data. If you want the categories to be
+ lexsorted, use `sort_categories=True` argument.
+
+ >>> union_categoricals([a, b], sort_categories=True)
+ [b, c, a, b]
+ Categories (3, object): [a, b, c]
+
+ `union_categoricals` also works with the case of combining two
+ categoricals of the same categories and order information (e.g. what
+ you could also `append` for).
+
+ >>> a = pd.Categorical(["a", "b"], ordered=True)
+ >>> b = pd.Categorical(["a", "b", "a"], ordered=True)
+ >>> union_categoricals([a, b])
+ [a, b, a, b, a]
+ Categories (2, object): [a < b]
+
+ Raises `TypeError` because the categories are ordered and not identical.
+
+ >>> a = pd.Categorical(["a", "b"], ordered=True)
+ >>> b = pd.Categorical(["a", "b", "c"], ordered=True)
+ >>> union_categoricals([a, b])
+ TypeError: to union ordered Categoricals, all categories must be the same
+
+ New in version 0.20.0
+
+ Ordered categoricals with different categories or orderings can be
+    combined by using the `ignore_order=True` argument.
+
+ >>> a = pd.Categorical(["a", "b", "c"], ordered=True)
+ >>> b = pd.Categorical(["c", "b", "a"], ordered=True)
+ >>> union_categoricals([a, b], ignore_order=True)
+ [a, b, c, c, b, a]
+ Categories (3, object): [a, b, c]
+
+ `union_categoricals` also works with a `CategoricalIndex`, or `Series`
+ containing categorical data, but note that the resulting array will
+ always be a plain `Categorical`
+
+ >>> a = pd.Series(["b", "c"], dtype='category')
+ >>> b = pd.Series(["a", "b"], dtype='category')
+ >>> union_categoricals([a, b])
+ [b, c, a, b]
+ Categories (3, object): [b, c, a]
+ """
+ from pandas import Index, Categorical, CategoricalIndex, Series
+ from pandas.core.arrays.categorical import _recode_for_categories
+
+ if len(to_union) == 0:
+ raise ValueError('No Categoricals to union')
+
+ def _maybe_unwrap(x):
+ if isinstance(x, (CategoricalIndex, Series)):
+ return x.values
+ elif isinstance(x, Categorical):
+ return x
+ else:
+ raise TypeError("all components to combine must be Categorical")
+
+ to_union = [_maybe_unwrap(x) for x in to_union]
+ first = to_union[0]
+
+ if not all(is_dtype_equal(other.categories.dtype, first.categories.dtype)
+ for other in to_union[1:]):
+ raise TypeError("dtype of categories must be the same")
+
+ ordered = False
+ if all(first.is_dtype_equal(other) for other in to_union[1:]):
+ # identical categories - fastpath
+ categories = first.categories
+ ordered = first.ordered
+
+ if all(first.categories.equals(other.categories)
+ for other in to_union[1:]):
+ new_codes = np.concatenate([c.codes for c in to_union])
+ else:
+ codes = [first.codes] + [_recode_for_categories(other.codes,
+ other.categories,
+ first.categories)
+ for other in to_union[1:]]
+ new_codes = np.concatenate(codes)
+
+ if sort_categories and not ignore_order and ordered:
+ raise TypeError("Cannot use sort_categories=True with "
+ "ordered Categoricals")
+
+ if sort_categories and not categories.is_monotonic_increasing:
+ categories = categories.sort_values()
+ indexer = categories.get_indexer(first.categories)
+
+ from pandas.core.algorithms import take_1d
+ new_codes = take_1d(indexer, new_codes, fill_value=-1)
+ elif ignore_order or all(not c.ordered for c in to_union):
+ # different categories - union and recode
+ cats = first.categories.append([c.categories for c in to_union[1:]])
+ categories = Index(cats.unique())
+ if sort_categories:
+ categories = categories.sort_values()
+
+ new_codes = [_recode_for_categories(c.codes, c.categories, categories)
+ for c in to_union]
+ new_codes = np.concatenate(new_codes)
+ else:
+ # ordered - to show a proper error message
+ if all(c.ordered for c in to_union):
+ msg = ("to union ordered Categoricals, "
+ "all categories must be the same")
+ raise TypeError(msg)
+ else:
+ raise TypeError('Categorical.ordered must be the same')
+
+ if ignore_order:
+ ordered = False
+
+ return Categorical(new_codes, categories=categories, ordered=ordered,
+ fastpath=True)
+
+
+def _concatenate_2d(to_concat, axis):
+ # coerce to 2d if needed & concatenate
+ if axis == 1:
+ to_concat = [np.atleast_2d(x) for x in to_concat]
+ return np.concatenate(to_concat, axis=axis)
+
+
+def _concat_datetime(to_concat, axis=0, typs=None):
+ """
+    provide concatenation of a datetimelike array of arrays, each of which is
+    a single M8[ns], datetime64[ns, tz] or m8[ns] dtype
+
+ Parameters
+ ----------
+ to_concat : array of arrays
+ axis : axis to provide concatenation
+ typs : set of to_concat dtypes
+
+ Returns
+ -------
+ a single array, preserving the combined dtypes
+ """
+
+ if typs is None:
+ typs = get_dtype_kinds(to_concat)
+
+ # multiple types, need to coerce to object
+ if len(typs) != 1:
+ return _concatenate_2d([_convert_datetimelike_to_object(x)
+ for x in to_concat],
+ axis=axis)
+
+ # must be single dtype
+ if any(typ.startswith('datetime') for typ in typs):
+
+ if 'datetime' in typs:
+ to_concat = [x.astype(np.int64, copy=False) for x in to_concat]
+ return _concatenate_2d(to_concat, axis=axis).view(_NS_DTYPE)
+ else:
+ # when to_concat has different tz, len(typs) > 1.
+ # thus no need to care
+ return _concat_datetimetz(to_concat)
+
+ elif 'timedelta' in typs:
+ return _concatenate_2d([x.view(np.int64) for x in to_concat],
+ axis=axis).view(_TD_DTYPE)
+
+ elif any(typ.startswith('period') for typ in typs):
+ assert len(typs) == 1
+ cls = to_concat[0]
+ new_values = cls._concat_same_type(to_concat)
+ return new_values
+
+
+def _convert_datetimelike_to_object(x):
+ # coerce datetimelike array to object dtype
+
+ # if dtype is of datetimetz or timezone
+ if x.dtype.kind == _NS_DTYPE.kind:
+ if getattr(x, 'tz', None) is not None:
+ x = np.asarray(x.astype(object))
+ else:
+ shape = x.shape
+ x = tslib.ints_to_pydatetime(x.view(np.int64).ravel(),
+ box="timestamp")
+ x = x.reshape(shape)
+
+ elif x.dtype == _TD_DTYPE:
+ shape = x.shape
+ x = tslibs.ints_to_pytimedelta(x.view(np.int64).ravel(), box=True)
+ x = x.reshape(shape)
+
+ return x
+
+
+def _concat_datetimetz(to_concat, name=None):
+ """
+ concat DatetimeIndex with the same tz
+ all inputs must be DatetimeIndex
+ it is used in DatetimeIndex.append also
+ """
+ # Right now, internals will pass a List[DatetimeArray] here
+ # for reductions like quantile. I would like to disentangle
+ # all this before we get here.
+ sample = to_concat[0]
+
+ if isinstance(sample, ABCIndexClass):
+ return sample._concat_same_dtype(to_concat, name=name)
+ elif isinstance(sample, ABCDatetimeArray):
+ return sample._concat_same_type(to_concat)
+
+
+def _concat_index_same_dtype(indexes, klass=None):
+ klass = klass if klass is not None else indexes[0].__class__
+ return klass(np.concatenate([x._values for x in indexes]))
+
+
+def _concat_index_asobject(to_concat, name=None):
+ """
+ concat all inputs as object. DatetimeIndex, TimedeltaIndex and
+ PeriodIndex are converted to object dtype before concatenation
+ """
+ from pandas import Index
+ from pandas.core.arrays import ExtensionArray
+
+ klasses = (ABCDatetimeIndex, ABCTimedeltaIndex, ABCPeriodIndex,
+ ExtensionArray)
+ to_concat = [x.astype(object) if isinstance(x, klasses) else x
+ for x in to_concat]
+
+ self = to_concat[0]
+ attribs = self._get_attributes_dict()
+ attribs['name'] = name
+
+ to_concat = [x._values if isinstance(x, Index) else x
+ for x in to_concat]
+
+ return self._shallow_copy_with_infer(np.concatenate(to_concat), **attribs)
+
+
+def _concat_sparse(to_concat, axis=0, typs=None):
+ """
+    provide concatenation of a sparse/dense array of arrays, each of which is a
+ single dtype
+
+ Parameters
+ ----------
+ to_concat : array of arrays
+ axis : axis to provide concatenation
+ typs : set of to_concat dtypes
+
+ Returns
+ -------
+ a single array, preserving the combined dtypes
+ """
+
+ from pandas.core.arrays import SparseArray
+
+ fill_values = [x.fill_value for x in to_concat
+ if isinstance(x, SparseArray)]
+ fill_value = fill_values[0]
+
+ # TODO: Fix join unit generation so we aren't passed this.
+ to_concat = [x if isinstance(x, SparseArray)
+ else SparseArray(x.squeeze(), fill_value=fill_value)
+ for x in to_concat]
+
+ return SparseArray._concat_same_type(to_concat)
+
+
+def _concat_rangeindex_same_dtype(indexes):
+ """
+ Concatenates multiple RangeIndex instances. All members of "indexes" must
+ be of type RangeIndex; result will be RangeIndex if possible, Int64Index
+ otherwise. E.g.:
+ indexes = [RangeIndex(3), RangeIndex(3, 6)] -> RangeIndex(6)
+ indexes = [RangeIndex(3), RangeIndex(4, 6)] -> Int64Index([0,1,2,4,5])
+ """
+ from pandas import Int64Index, RangeIndex
+
+ start = step = next = None
+
+ # Filter the empty indexes
+ non_empty_indexes = [obj for obj in indexes if len(obj)]
+
+ for obj in non_empty_indexes:
+
+ if start is None:
+ # This is set by the first non-empty index
+ start = obj._start
+ if step is None and len(obj) > 1:
+ step = obj._step
+ elif step is None:
+ # First non-empty index had only one element
+ if obj._start == start:
+ return _concat_index_same_dtype(indexes, klass=Int64Index)
+ step = obj._start - start
+
+ non_consecutive = ((step != obj._step and len(obj) > 1) or
+ (next is not None and obj._start != next))
+ if non_consecutive:
+ return _concat_index_same_dtype(indexes, klass=Int64Index)
+
+ if step is not None:
+ next = obj[-1] + step
+
+ if non_empty_indexes:
+ # Get the stop value from "next" or alternatively
+ # from the last non-empty index
+ stop = non_empty_indexes[-1]._stop if next is None else next
+ return RangeIndex(start, stop, step)
+
+ # Here all "indexes" had 0 length, i.e. were empty.
+ # In this case return an empty range index.
+ return RangeIndex(0, 0)
diff --git a/contrib/python/pandas/py2/pandas/core/dtypes/dtypes.py b/contrib/python/pandas/py2/pandas/core/dtypes/dtypes.py
new file mode 100644
index 00000000000..b73f55329e2
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/core/dtypes/dtypes.py
@@ -0,0 +1,991 @@
+""" define extension dtypes """
+import re
+import warnings
+
+import numpy as np
+import pytz
+
+from pandas._libs.interval import Interval
+from pandas._libs.tslibs import NaT, Period, Timestamp, timezones
+
+from pandas.core.dtypes.generic import ABCCategoricalIndex, ABCIndexClass
+
+from pandas import compat
+
+from .base import ExtensionDtype, _DtypeOpsMixin
+from .inference import is_list_like
+
+
+def register_extension_dtype(cls):
+ """Class decorator to register an ExtensionType with pandas.
+
+ .. versionadded:: 0.24.0
+
+ This enables operations like ``.astype(name)`` for the name
+ of the ExtensionDtype.
+
+ Examples
+ --------
+ >>> from pandas.api.extensions import register_extension_dtype
+ >>> from pandas.api.extensions import ExtensionDtype
+ >>> @register_extension_dtype
+ ... class MyExtensionDtype(ExtensionDtype):
+ ... pass
+ """
+ registry.register(cls)
+ return cls
+
+
+class Registry(object):
+ """
+ Registry for dtype inference
+
+    The registry allows one to map a string repr of an extension
+ dtype to an extension dtype. The string alias can be used in several
+ places, including
+
+ * Series and Index constructors
+ * :meth:`pandas.array`
+ * :meth:`pandas.Series.astype`
+
+ Multiple extension types can be registered.
+ These are tried in order.
+ """
+ def __init__(self):
+ self.dtypes = []
+
+ def register(self, dtype):
+ """
+ Parameters
+ ----------
+ dtype : ExtensionDtype
+ """
+ if not issubclass(dtype, (PandasExtensionDtype, ExtensionDtype)):
+ raise ValueError("can only register pandas extension dtypes")
+
+ self.dtypes.append(dtype)
+
+ def find(self, dtype):
+ """
+ Parameters
+ ----------
+ dtype : PandasExtensionDtype or string
+
+ Returns
+ -------
+ return the first matching dtype, otherwise return None
+ """
+ if not isinstance(dtype, compat.string_types):
+ dtype_type = dtype
+ if not isinstance(dtype, type):
+ dtype_type = type(dtype)
+ if issubclass(dtype_type, ExtensionDtype):
+ return dtype
+
+ return None
+
+ for dtype_type in self.dtypes:
+ try:
+ return dtype_type.construct_from_string(dtype)
+ except TypeError:
+ pass
+
+ return None
+
+
+registry = Registry()
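+
+# Illustrative sketch (not from the original module): once pandas' extension
+# dtypes register themselves (e.g. CategoricalDtype below), the registry
+# resolves string aliases; expected behaviour, assuming the lookup above:
+#
+#   registry.find('category')             # -> CategoricalDtype()
+#   registry.find('datetime64[ns, UTC]')  # -> DatetimeTZDtype (once registered)
+#   registry.find('int64')                # -> None (a plain numpy dtype string)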
+
+
+class PandasExtensionDtype(_DtypeOpsMixin):
+ """
+ A np.dtype duck-typed class, suitable for holding a custom dtype.
+
+ THIS IS NOT A REAL NUMPY DTYPE
+ """
+ type = None
+ subdtype = None
+ kind = None
+ str = None
+ num = 100
+ shape = tuple()
+ itemsize = 8
+ base = None
+ isbuiltin = 0
+ isnative = 0
+ _cache = {}
+
+ def __unicode__(self):
+ return self.name
+
+ def __str__(self):
+ """
+ Return a string representation for a particular Object
+
+ Invoked by str(df) in both py2/py3.
+ Yields Bytestring in Py2, Unicode String in py3.
+ """
+
+ if compat.PY3:
+ return self.__unicode__()
+ return self.__bytes__()
+
+ def __bytes__(self):
+ """
+ Return a string representation for a particular object.
+
+ Invoked by bytes(obj) in py3 only.
+ Yields a bytestring in both py2/py3.
+ """
+ from pandas.core.config import get_option
+
+ encoding = get_option("display.encoding")
+ return self.__unicode__().encode(encoding, 'replace')
+
+ def __repr__(self):
+ """
+ Return a string representation for a particular object.
+
+ Yields Bytestring in Py2, Unicode String in py3.
+ """
+ return str(self)
+
+ def __hash__(self):
+ raise NotImplementedError("sub-classes should implement an __hash__ "
+ "method")
+
+ def __getstate__(self):
+ # pickle support; we don't want to pickle the cache
+ return {k: getattr(self, k, None) for k in self._metadata}
+
+ @classmethod
+ def reset_cache(cls):
+ """ clear the cache """
+ cls._cache = {}
+
+
+class CategoricalDtypeType(type):
+ """
+ the type of CategoricalDtype, this metaclass determines subclass ability
+ """
+ pass
+
+
+@register_extension_dtype
+class CategoricalDtype(PandasExtensionDtype, ExtensionDtype):
+ """
+ Type for categorical data with the categories and orderedness
+
+ .. versionchanged:: 0.21.0
+
+ Parameters
+ ----------
+ categories : sequence, optional
+ Must be unique, and must not contain any nulls.
+ ordered : bool, default False
+
+ Attributes
+ ----------
+ categories
+ ordered
+
+ Methods
+ -------
+ None
+
+ See Also
+ --------
+ pandas.Categorical
+
+ Notes
+ -----
+ This class is useful for specifying the type of a ``Categorical``
+ independent of the values. See :ref:`categorical.categoricaldtype`
+ for more.
+
+ Examples
+ --------
+ >>> t = pd.CategoricalDtype(categories=['b', 'a'], ordered=True)
+ >>> pd.Series(['a', 'b', 'a', 'c'], dtype=t)
+ 0 a
+ 1 b
+ 2 a
+ 3 NaN
+ dtype: category
+ Categories (2, object): [b < a]
+ """
+ # TODO: Document public vs. private API
+ name = 'category'
+ type = CategoricalDtypeType
+ kind = 'O'
+ str = '|O08'
+ base = np.dtype('O')
+ _metadata = ('categories', 'ordered')
+ _cache = {}
+
+ def __init__(self, categories=None, ordered=None):
+ self._finalize(categories, ordered, fastpath=False)
+
+ @classmethod
+ def _from_fastpath(cls, categories=None, ordered=None):
+ self = cls.__new__(cls)
+ self._finalize(categories, ordered, fastpath=True)
+ return self
+
+ @classmethod
+ def _from_categorical_dtype(cls, dtype, categories=None, ordered=None):
+ if categories is ordered is None:
+ return dtype
+ if categories is None:
+ categories = dtype.categories
+ if ordered is None:
+ ordered = dtype.ordered
+ return cls(categories, ordered)
+
+ @classmethod
+ def _from_values_or_dtype(cls, values=None, categories=None, ordered=None,
+ dtype=None):
+ """
+ Construct dtype from the input parameters used in :class:`Categorical`.
+
+ This constructor method specifically does not do the factorization
+ step, if that is needed to find the categories. This constructor may
+ therefore return ``CategoricalDtype(categories=None, ordered=None)``,
+ which may not be useful. Additional steps may therefore have to be
+ taken to create the final dtype.
+
+ The return dtype is specified from the inputs in this prioritized
+ order:
+ 1. if dtype is a CategoricalDtype, return dtype
+ 2. if dtype is the string 'category', create a CategoricalDtype from
+ the supplied categories and ordered parameters, and return that.
+ 3. if values is a categorical, use values.dtype, but override it with
+ categories and ordered if either/both of those are not None.
+ 4. if dtype is None and values is not a categorical, construct the
+ dtype from categories and ordered, even if either of those is None.
+
+ Parameters
+ ----------
+ values : list-like, optional
+ The list-like must be 1-dimensional.
+ categories : list-like, optional
+ Categories for the CategoricalDtype.
+ ordered : bool, optional
+ Designating if the categories are ordered.
+ dtype : CategoricalDtype or the string "category", optional
+ If ``CategoricalDtype``, cannot be used together with
+ `categories` or `ordered`.
+
+ Returns
+ -------
+ CategoricalDtype
+
+ Examples
+ --------
+ >>> CategoricalDtype._from_values_or_dtype()
+ CategoricalDtype(categories=None, ordered=None)
+ >>> CategoricalDtype._from_values_or_dtype(categories=['a', 'b'],
+ ... ordered=True)
+ CategoricalDtype(categories=['a', 'b'], ordered=True)
+ >>> dtype1 = CategoricalDtype(['a', 'b'], ordered=True)
+ >>> dtype2 = CategoricalDtype(['x', 'y'], ordered=False)
+ >>> c = Categorical([0, 1], dtype=dtype1, fastpath=True)
+ >>> CategoricalDtype._from_values_or_dtype(c, ['x', 'y'], ordered=True,
+ ... dtype=dtype2)
+ ValueError: Cannot specify `categories` or `ordered` together with
+ `dtype`.
+
+ The supplied dtype takes precedence over values' dtype:
+
+ >>> CategoricalDtype._from_values_or_dtype(c, dtype=dtype2)
+ CategoricalDtype(['x', 'y'], ordered=False)
+ """
+ from pandas.core.dtypes.common import is_categorical
+
+ if dtype is not None:
+ # The dtype argument takes precedence over values.dtype (if any)
+ if isinstance(dtype, compat.string_types):
+ if dtype == 'category':
+ dtype = CategoricalDtype(categories, ordered)
+ else:
+ msg = "Unknown dtype {dtype!r}"
+ raise ValueError(msg.format(dtype=dtype))
+ elif categories is not None or ordered is not None:
+ raise ValueError("Cannot specify `categories` or `ordered` "
+ "together with `dtype`.")
+ elif is_categorical(values):
+ # If no "dtype" was passed, use the one from "values", but honor
+ # the "ordered" and "categories" arguments
+ dtype = values.dtype._from_categorical_dtype(values.dtype,
+ categories, ordered)
+ else:
+ # If dtype=None and values is not categorical, create a new dtype.
+ # Note: This could potentially have categories=None and
+ # ordered=None.
+ dtype = CategoricalDtype(categories, ordered)
+
+ return dtype
+
+ def _finalize(self, categories, ordered, fastpath=False):
+
+ if ordered is not None:
+ self.validate_ordered(ordered)
+
+ if categories is not None:
+ categories = self.validate_categories(categories,
+ fastpath=fastpath)
+
+ self._categories = categories
+ self._ordered = ordered
+
+ def __setstate__(self, state):
+ self._categories = state.pop('categories', None)
+ self._ordered = state.pop('ordered', False)
+
+ def __hash__(self):
+ # _hash_categories returns a uint64, so use the negative
+ # space for when we have unknown categories to avoid a conflict
+ if self.categories is None:
+ if self.ordered:
+ return -1
+ else:
+ return -2
+ # We *do* want to include the real self.ordered here
+ return int(self._hash_categories(self.categories, self.ordered))
+
+ def __eq__(self, other):
+ """
+ Rules for CDT equality:
+ 1) Any CDT is equal to the string 'category'
+ 2) Any CDT is equal to itself
+ 3) Any CDT is equal to a CDT with categories=None regardless of ordered
+ 4) A CDT with ordered=True is only equal to another CDT with
+ ordered=True and identical categories in the same order
+ 5) A CDT with ordered={False, None} is only equal to another CDT with
+ ordered={False, None} and identical categories, but same order is
+ not required. There is no distinction between False/None.
+ 6) Any other comparison returns False
+ """
+ if isinstance(other, compat.string_types):
+ return other == self.name
+ elif other is self:
+ return True
+ elif not (hasattr(other, 'ordered') and hasattr(other, 'categories')):
+ return False
+ elif self.categories is None or other.categories is None:
+ # We're forced into a suboptimal corner thanks to math and
+ # backwards compatibility. We require that `CDT(...) == 'category'`
+ # for all CDTs **including** `CDT(None, ...)`. Therefore, *all*
+ # CDT(., .) = CDT(None, False) and *all*
+ # CDT(., .) = CDT(None, True).
+ return True
+ elif self.ordered or other.ordered:
+ # At least one has ordered=True; equal if both have ordered=True
+ # and the same values for categories in the same order.
+ return ((self.ordered == other.ordered) and
+ self.categories.equals(other.categories))
+ else:
+ # Neither has ordered=True; equal if both have the same categories,
+ # but same order is not necessary. There is no distinction between
+ # ordered=False and ordered=None: CDT(., False) and CDT(., None)
+ # will be equal if they have the same categories.
+ return hash(self) == hash(other)
+
+ def __repr__(self):
+ tpl = u'CategoricalDtype(categories={}ordered={})'
+ if self.categories is None:
+ data = u"None, "
+ else:
+ data = self.categories._format_data(name=self.__class__.__name__)
+ return tpl.format(data, self.ordered)
+
+ @staticmethod
+ def _hash_categories(categories, ordered=True):
+ from pandas.core.util.hashing import (
+ hash_array, _combine_hash_arrays, hash_tuples
+ )
+ from pandas.core.dtypes.common import is_datetime64tz_dtype, _NS_DTYPE
+
+ if len(categories) and isinstance(categories[0], tuple):
+ # assumes that if any individual category is a tuple, then all are. ATM
+ # I don't really want to support just some of the categories being
+ # tuples.
+ categories = list(categories) # breaks if a np.array of categories
+ cat_array = hash_tuples(categories)
+ else:
+ if categories.dtype == 'O':
+ types = [type(x) for x in categories]
+ if not len(set(types)) == 1:
+ # TODO: hash_array doesn't handle mixed types. It casts
+ # everything to a str first, which means we treat
+ # {'1', '2'} the same as {'1', 2}
+ # find a better solution
+ hashed = hash((tuple(categories), ordered))
+ return hashed
+
+ if is_datetime64tz_dtype(categories.dtype):
+ # Avoid future warning.
+ categories = categories.astype(_NS_DTYPE)
+
+ cat_array = hash_array(np.asarray(categories), categorize=False)
+ if ordered:
+ cat_array = np.vstack([
+ cat_array, np.arange(len(cat_array), dtype=cat_array.dtype)
+ ])
+ else:
+ cat_array = [cat_array]
+ hashed = _combine_hash_arrays(iter(cat_array),
+ num_items=len(cat_array))
+ return np.bitwise_xor.reduce(hashed)
+
+ @classmethod
+ def construct_array_type(cls):
+ """
+ Return the array type associated with this dtype
+
+ Returns
+ -------
+ type
+ """
+ from pandas import Categorical
+ return Categorical
+
+ @classmethod
+ def construct_from_string(cls, string):
+ """
+ attempt to construct this type from a string, raise a TypeError if
+ it's not possible """
+ try:
+ if string == 'category':
+ return cls()
+ else:
+ raise TypeError("cannot construct a CategoricalDtype")
+ except AttributeError:
+ pass
+
+ @staticmethod
+ def validate_ordered(ordered):
+ """
+ Validates that we have a valid ordered parameter. If
+ it is not a boolean, a TypeError will be raised.
+
+ Parameters
+ ----------
+ ordered : object
+ The parameter to be verified.
+
+ Raises
+ ------
+ TypeError
+ If 'ordered' is not a boolean.
+ """
+ from pandas.core.dtypes.common import is_bool
+ if not is_bool(ordered):
+ raise TypeError("'ordered' must either be 'True' or 'False'")
+
+ @staticmethod
+ def validate_categories(categories, fastpath=False):
+ """
+ Validates that we have good categories
+
+ Parameters
+ ----------
+ categories : array-like
+ fastpath : bool
+ Whether to skip nan and uniqueness checks
+
+ Returns
+ -------
+ categories : Index
+ """
+ from pandas import Index
+
+ if not fastpath and not is_list_like(categories):
+ msg = "Parameter 'categories' must be list-like, was {!r}"
+ raise TypeError(msg.format(categories))
+ elif not isinstance(categories, ABCIndexClass):
+ categories = Index(categories, tupleize_cols=False)
+
+ if not fastpath:
+
+ if categories.hasnans:
+ raise ValueError('Categorical categories cannot be null')
+
+ if not categories.is_unique:
+ raise ValueError('Categorical categories must be unique')
+
+ if isinstance(categories, ABCCategoricalIndex):
+ categories = categories.categories
+
+ return categories
+
+ def update_dtype(self, dtype):
+ """
+ Returns a CategoricalDtype with categories and ordered taken from dtype
+ if specified, otherwise falling back to self if unspecified
+
+ Parameters
+ ----------
+ dtype : CategoricalDtype
+
+ Returns
+ -------
+ new_dtype : CategoricalDtype
+ """
+ if isinstance(dtype, compat.string_types) and dtype == 'category':
+ # dtype='category' should not change anything
+ return self
+ elif not self.is_dtype(dtype):
+ msg = ('a CategoricalDtype must be passed to perform an update, '
+ 'got {dtype!r}').format(dtype=dtype)
+ raise ValueError(msg)
+ elif dtype.categories is not None and dtype.ordered is self.ordered:
+ return dtype
+
+ # dtype is CDT: keep current categories/ordered if None
+ new_categories = dtype.categories
+ if new_categories is None:
+ new_categories = self.categories
+
+ new_ordered = dtype.ordered
+ if new_ordered is None:
+ new_ordered = self.ordered
+
+ return CategoricalDtype(new_categories, new_ordered)
+
+ @property
+ def categories(self):
+ """
+ An ``Index`` containing the unique categories allowed.
+ """
+ return self._categories
+
+ @property
+ def ordered(self):
+ """
+ Whether the categories have an ordered relationship.
+ """
+ return self._ordered
+
+ @property
+ def _is_boolean(self):
+ from pandas.core.dtypes.common import is_bool_dtype
+
+ return is_bool_dtype(self.categories)
+
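+# Illustrative sketch of the CategoricalDtype equality rules documented in
+# __eq__ above:
+#
+#   >>> a = CategoricalDtype(['a', 'b'], ordered=False)
+#   >>> b = CategoricalDtype(['b', 'a'], ordered=False)
+#   >>> a == b                      # unordered: category order is ignored
+#   True
+#   >>> a == 'category'             # any CategoricalDtype matches the alias
+#   True
+#   >>> a == CategoricalDtype(['a', 'b'], ordered=True)
+#   False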
+
+@register_extension_dtype
+class DatetimeTZDtype(PandasExtensionDtype, ExtensionDtype):
+
+ """
+ A np.dtype duck-typed class, suitable for holding a custom datetime with tz
+ dtype.
+
+ THIS IS NOT A REAL NUMPY DTYPE, but essentially a sub-class of
+ np.datetime64[ns]
+ """
+ type = Timestamp
+ kind = 'M'
+ str = '|M8[ns]'
+ num = 101
+ base = np.dtype('M8[ns]')
+ na_value = NaT
+ _metadata = ('unit', 'tz')
+ _match = re.compile(r"(datetime64|M8)\[(?P<unit>.+), (?P<tz>.+)\]")
+ _cache = {}
+
+ def __init__(self, unit="ns", tz=None):
+ """
+ An ExtensionDtype for timezone-aware datetime data.
+
+ Parameters
+ ----------
+ unit : str, default "ns"
+ The precision of the datetime data. Currently limited
+ to ``"ns"``.
+ tz : str, int, or datetime.tzinfo
+ The timezone.
+
+ Raises
+ ------
+ pytz.UnknownTimeZoneError
+ When the requested timezone cannot be found.
+
+ Examples
+ --------
+ >>> pd.core.dtypes.dtypes.DatetimeTZDtype(tz='UTC')
+ datetime64[ns, UTC]
+
+ >>> pd.core.dtypes.dtypes.DatetimeTZDtype(tz='dateutil/US/Central')
+ datetime64[ns, tzfile('/usr/share/zoneinfo/US/Central')]
+ """
+ if isinstance(unit, DatetimeTZDtype):
+ unit, tz = unit.unit, unit.tz
+
+ if unit != 'ns':
+ if isinstance(unit, compat.string_types) and tz is None:
+ # maybe a string like datetime64[ns, tz], which we support for
+ # now.
+ result = type(self).construct_from_string(unit)
+ unit = result.unit
+ tz = result.tz
+ msg = (
+ "Passing a dtype alias like 'datetime64[ns, {tz}]' "
+ "to DatetimeTZDtype is deprecated. Use "
+ "'DatetimeTZDtype.construct_from_string()' instead."
+ )
+ warnings.warn(msg.format(tz=tz), FutureWarning, stacklevel=2)
+ else:
+ raise ValueError("DatetimeTZDtype only supports ns units")
+
+ if tz:
+ tz = timezones.maybe_get_tz(tz)
+ elif tz is not None:
+ raise pytz.UnknownTimeZoneError(tz)
+ elif tz is None:
+ raise TypeError("A 'tz' is required.")
+
+ self._unit = unit
+ self._tz = tz
+
+ @property
+ def unit(self):
+ """The precision of the datetime data."""
+ return self._unit
+
+ @property
+ def tz(self):
+ """The timezone."""
+ return self._tz
+
+ @classmethod
+ def construct_array_type(cls):
+ """
+ Return the array type associated with this dtype
+
+ Returns
+ -------
+ type
+ """
+ from pandas.core.arrays import DatetimeArray
+ return DatetimeArray
+
+ @classmethod
+ def construct_from_string(cls, string):
+ """
+ Construct a DatetimeTZDtype from a string.
+
+ Parameters
+ ----------
+ string : str
+ The string alias for this DatetimeTZDtype.
+ Should be formatted like ``datetime64[ns, <tz>]``,
+ where ``<tz>`` is the timezone name.
+
+ Examples
+ --------
+ >>> DatetimeTZDtype.construct_from_string('datetime64[ns, UTC]')
+ datetime64[ns, UTC]
+ """
+ if isinstance(string, compat.string_types):
+ msg = "Could not construct DatetimeTZDtype from '{}'"
+ try:
+ match = cls._match.match(string)
+ if match:
+ d = match.groupdict()
+ return cls(unit=d['unit'], tz=d['tz'])
+ except Exception:
+ # TODO(py3): Change this pass to `raise TypeError(msg) from e`
+ pass
+ raise TypeError(msg.format(string))
+
+ raise TypeError("Could not construct DatetimeTZDtype")
+
+ def __unicode__(self):
+ return "datetime64[{unit}, {tz}]".format(unit=self.unit, tz=self.tz)
+
+ @property
+ def name(self):
+ """A string representation of the dtype."""
+ return str(self)
+
+ def __hash__(self):
+ # make myself hashable
+ # TODO: update this.
+ return hash(str(self))
+
+ def __eq__(self, other):
+ if isinstance(other, compat.string_types):
+ return other == self.name
+
+ return (isinstance(other, DatetimeTZDtype) and
+ self.unit == other.unit and
+ str(self.tz) == str(other.tz))
+
+ def __setstate__(self, state):
+ # for pickle compat.
+ self._tz = state['tz']
+ self._unit = state['unit']
+
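+# Illustrative sketch of the two supported construction paths (assuming a
+# timezone name that pytz/dateutil can resolve):
+#
+#   >>> DatetimeTZDtype(tz='UTC')
+#   datetime64[ns, UTC]
+#   >>> DatetimeTZDtype.construct_from_string('datetime64[ns, UTC]')
+#   datetime64[ns, UTC]
+#   >>> DatetimeTZDtype(tz='UTC') == 'datetime64[ns, UTC]'
+#   True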
+
+@register_extension_dtype
+class PeriodDtype(ExtensionDtype, PandasExtensionDtype):
+ """
+ A Period duck-typed class, suitable for holding a period with freq dtype.
+
+ THIS IS NOT A REAL NUMPY DTYPE, but essentially a sub-class of np.int64.
+ """
+ type = Period
+ kind = 'O'
+ str = '|O08'
+ base = np.dtype('O')
+ num = 102
+ _metadata = ('freq',)
+ _match = re.compile(r"(P|p)eriod\[(?P<freq>.+)\]")
+ _cache = {}
+
+ def __new__(cls, freq=None):
+ """
+ Parameters
+ ----------
+ freq : frequency
+ """
+
+ if isinstance(freq, PeriodDtype):
+ return freq
+
+ elif freq is None:
+ # empty constructor for pickle compat
+ return object.__new__(cls)
+
+ from pandas.tseries.offsets import DateOffset
+ if not isinstance(freq, DateOffset):
+ freq = cls._parse_dtype_strict(freq)
+
+ try:
+ return cls._cache[freq.freqstr]
+ except KeyError:
+ u = object.__new__(cls)
+ u.freq = freq
+ cls._cache[freq.freqstr] = u
+ return u
+
+ @classmethod
+ def _parse_dtype_strict(cls, freq):
+ if isinstance(freq, compat.string_types):
+ if freq.startswith('period[') or freq.startswith('Period['):
+ m = cls._match.search(freq)
+ if m is not None:
+ freq = m.group('freq')
+ from pandas.tseries.frequencies import to_offset
+ freq = to_offset(freq)
+ if freq is not None:
+ return freq
+
+ raise ValueError("could not construct PeriodDtype")
+
+ @classmethod
+ def construct_from_string(cls, string):
+ """
+ Strict construction from a string, raise a TypeError if not
+ possible
+ """
+ from pandas.tseries.offsets import DateOffset
+
+ if (isinstance(string, compat.string_types) and
+ (string.startswith('period[') or
+ string.startswith('Period[')) or
+ isinstance(string, DateOffset)):
+ # do not parse string like U as period[U]
+ # avoid tuple to be regarded as freq
+ try:
+ return cls(freq=string)
+ except ValueError:
+ pass
+ raise TypeError("could not construct PeriodDtype")
+
+ def __unicode__(self):
+ return compat.text_type(self.name)
+
+ @property
+ def name(self):
+ return str("period[{freq}]".format(freq=self.freq.freqstr))
+
+ @property
+ def na_value(self):
+ return NaT
+
+ def __hash__(self):
+ # make myself hashable
+ return hash(str(self))
+
+ def __eq__(self, other):
+ if isinstance(other, compat.string_types):
+ return other == self.name or other == self.name.title()
+
+ return isinstance(other, PeriodDtype) and self.freq == other.freq
+
+ @classmethod
+ def is_dtype(cls, dtype):
+ """
+ Return a boolean if the passed type is an actual dtype that we
+ can match (via string or type).
+ """
+
+ if isinstance(dtype, compat.string_types):
+ # PeriodDtype can be instantiated from freq string like "U",
+ # but doesn't regard freq str like "U" as dtype.
+ if dtype.startswith('period[') or dtype.startswith('Period['):
+ try:
+ if cls._parse_dtype_strict(dtype) is not None:
+ return True
+ else:
+ return False
+ except ValueError:
+ return False
+ else:
+ return False
+ return super(PeriodDtype, cls).is_dtype(dtype)
+
+ @classmethod
+ def construct_array_type(cls):
+ from pandas.core.arrays import PeriodArray
+
+ return PeriodArray
+
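+# Illustrative sketch of the freqstr-based caching and string equality
+# implemented above:
+#
+#   >>> PeriodDtype('D') is PeriodDtype(freq='D')   # cached per freqstr
+#   True
+#   >>> PeriodDtype('D') == 'period[D]'
+#   True
+#   >>> PeriodDtype.is_dtype('period[D]'), PeriodDtype.is_dtype('D')
+#   (True, False)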
+
+@register_extension_dtype
+class IntervalDtype(PandasExtensionDtype, ExtensionDtype):
+ """
+ An Interval duck-typed class, suitable for holding an interval.
+
+ THIS IS NOT A REAL NUMPY DTYPE
+ """
+ name = 'interval'
+ kind = None
+ str = '|O08'
+ base = np.dtype('O')
+ num = 103
+ _metadata = ('subtype',)
+ _match = re.compile(r"(I|i)nterval\[(?P<subtype>.+)\]")
+ _cache = {}
+
+ def __new__(cls, subtype=None):
+ """
+ Parameters
+ ----------
+ subtype : the dtype of the Interval
+ """
+ from pandas.core.dtypes.common import (
+ is_categorical_dtype, is_string_dtype, pandas_dtype)
+
+ if isinstance(subtype, IntervalDtype):
+ return subtype
+ elif subtype is None:
+ # we are called as an empty constructor
+ # generally for pickle compat
+ u = object.__new__(cls)
+ u.subtype = None
+ return u
+ elif (isinstance(subtype, compat.string_types) and
+ subtype.lower() == 'interval'):
+ subtype = None
+ else:
+ if isinstance(subtype, compat.string_types):
+ m = cls._match.search(subtype)
+ if m is not None:
+ subtype = m.group('subtype')
+
+ try:
+ subtype = pandas_dtype(subtype)
+ except TypeError:
+ raise TypeError("could not construct IntervalDtype")
+
+ if is_categorical_dtype(subtype) or is_string_dtype(subtype):
+ # GH 19016
+ msg = ('category, object, and string subtypes are not supported '
+ 'for IntervalDtype')
+ raise TypeError(msg)
+
+ try:
+ return cls._cache[str(subtype)]
+ except KeyError:
+ u = object.__new__(cls)
+ u.subtype = subtype
+ cls._cache[str(subtype)] = u
+ return u
+
+ @classmethod
+ def construct_array_type(cls):
+ """
+ Return the array type associated with this dtype
+
+ Returns
+ -------
+ type
+ """
+ from pandas.core.arrays import IntervalArray
+ return IntervalArray
+
+ @classmethod
+ def construct_from_string(cls, string):
+ """
+ attempt to construct this type from a string, raise a TypeError
+ if it's not possible
+ """
+ if not isinstance(string, compat.string_types):
+ msg = "a string needs to be passed, got type {typ}"
+ raise TypeError(msg.format(typ=type(string)))
+
+ if (string.lower() == 'interval' or
+ cls._match.search(string) is not None):
+ return cls(string)
+
+ msg = ('Incorrectly formatted string passed to constructor. '
+ 'Valid formats include Interval or Interval[dtype] '
+ 'where dtype is numeric, datetime, or timedelta')
+ raise TypeError(msg)
+
+ @property
+ def type(self):
+ return Interval
+
+ def __unicode__(self):
+ if self.subtype is None:
+ return "interval"
+ return "interval[{subtype}]".format(subtype=self.subtype)
+
+ def __hash__(self):
+ # make myself hashable
+ return hash(str(self))
+
+ def __eq__(self, other):
+ if isinstance(other, compat.string_types):
+ return other.lower() in (self.name.lower(), str(self).lower())
+ elif not isinstance(other, IntervalDtype):
+ return False
+ elif self.subtype is None or other.subtype is None:
+ # None should match any subtype
+ return True
+ else:
+ from pandas.core.dtypes.common import is_dtype_equal
+ return is_dtype_equal(self.subtype, other.subtype)
+
+ @classmethod
+ def is_dtype(cls, dtype):
+ """
+ Return a boolean if the passed type is an actual dtype that we
+ can match (via string or type).
+ """
+
+ if isinstance(dtype, compat.string_types):
+ if dtype.lower().startswith('interval'):
+ try:
+ if cls.construct_from_string(dtype) is not None:
+ return True
+ else:
+ return False
+ except (ValueError, TypeError):
+ return False
+ else:
+ return False
+ return super(IntervalDtype, cls).is_dtype(dtype)
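+
+# Illustrative sketch of IntervalDtype equality: a missing subtype acts as a
+# wildcard and the lowercase string alias also compares equal:
+#
+#   >>> IntervalDtype('int64') == IntervalDtype()
+#   True
+#   >>> IntervalDtype('int64') == 'interval'
+#   True
+#   >>> IntervalDtype('int64') == IntervalDtype('float64')
+#   False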
diff --git a/contrib/python/pandas/py2/pandas/core/dtypes/generic.py b/contrib/python/pandas/py2/pandas/core/dtypes/generic.py
new file mode 100644
index 00000000000..134ec957298
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/core/dtypes/generic.py
@@ -0,0 +1,84 @@
+""" define generic base classes for pandas objects """
+
+
+# define abstract base classes to enable isinstance type checking on our
+# objects
+def create_pandas_abc_type(name, attr, comp):
+ @classmethod
+ def _check(cls, inst):
+ return getattr(inst, attr, '_typ') in comp
+
+ dct = dict(__instancecheck__=_check, __subclasscheck__=_check)
+ meta = type("ABCBase", (type, ), dct)
+ return meta(name, tuple(), dct)
+
+
+ABCIndex = create_pandas_abc_type("ABCIndex", "_typ", ("index", ))
+ABCInt64Index = create_pandas_abc_type("ABCInt64Index", "_typ",
+ ("int64index", ))
+ABCUInt64Index = create_pandas_abc_type("ABCUInt64Index", "_typ",
+ ("uint64index", ))
+ABCRangeIndex = create_pandas_abc_type("ABCRangeIndex", "_typ",
+ ("rangeindex", ))
+ABCFloat64Index = create_pandas_abc_type("ABCFloat64Index", "_typ",
+ ("float64index", ))
+ABCMultiIndex = create_pandas_abc_type("ABCMultiIndex", "_typ",
+ ("multiindex", ))
+ABCDatetimeIndex = create_pandas_abc_type("ABCDatetimeIndex", "_typ",
+ ("datetimeindex", ))
+ABCTimedeltaIndex = create_pandas_abc_type("ABCTimedeltaIndex", "_typ",
+ ("timedeltaindex", ))
+ABCPeriodIndex = create_pandas_abc_type("ABCPeriodIndex", "_typ",
+ ("periodindex", ))
+ABCCategoricalIndex = create_pandas_abc_type("ABCCategoricalIndex", "_typ",
+ ("categoricalindex", ))
+ABCIntervalIndex = create_pandas_abc_type("ABCIntervalIndex", "_typ",
+ ("intervalindex", ))
+ABCIndexClass = create_pandas_abc_type("ABCIndexClass", "_typ",
+ ("index", "int64index", "rangeindex",
+ "float64index", "uint64index",
+ "multiindex", "datetimeindex",
+ "timedeltaindex", "periodindex",
+ "categoricalindex", "intervalindex"))
+
+ABCSeries = create_pandas_abc_type("ABCSeries", "_typ", ("series", ))
+ABCDataFrame = create_pandas_abc_type("ABCDataFrame", "_typ", ("dataframe", ))
+ABCSparseDataFrame = create_pandas_abc_type("ABCSparseDataFrame", "_subtyp",
+ ("sparse_frame", ))
+ABCPanel = create_pandas_abc_type("ABCPanel", "_typ", ("panel",))
+ABCSparseSeries = create_pandas_abc_type("ABCSparseSeries", "_subtyp",
+ ('sparse_series',
+ 'sparse_time_series'))
+ABCSparseArray = create_pandas_abc_type("ABCSparseArray", "_subtyp",
+ ('sparse_array', 'sparse_series'))
+ABCCategorical = create_pandas_abc_type("ABCCategorical", "_typ",
+ ("categorical"))
+ABCDatetimeArray = create_pandas_abc_type("ABCDatetimeArray", "_typ",
+ ("datetimearray"))
+ABCTimedeltaArray = create_pandas_abc_type("ABCTimedeltaArray", "_typ",
+ ("timedeltaarray"))
+ABCPeriodArray = create_pandas_abc_type("ABCPeriodArray", "_typ",
+ ("periodarray", ))
+ABCPeriod = create_pandas_abc_type("ABCPeriod", "_typ", ("period", ))
+ABCDateOffset = create_pandas_abc_type("ABCDateOffset", "_typ",
+ ("dateoffset",))
+ABCInterval = create_pandas_abc_type("ABCInterval", "_typ", ("interval", ))
+ABCExtensionArray = create_pandas_abc_type("ABCExtensionArray", "_typ",
+ ("extension",
+ "categorical",
+ "periodarray",
+ "datetimearray",
+ "timedeltaarray",
+ ))
+ABCPandasArray = create_pandas_abc_type("ABCPandasArray",
+ "_typ",
+ ("npy_extension",))
+
+
+class _ABCGeneric(type):
+
+ def __instancecheck__(cls, inst):
+ return hasattr(inst, "_data")
+
+
+ABCGeneric = _ABCGeneric("ABCGeneric", tuple(), {})
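+
+# A minimal illustrative sketch of the attribute-based checks above: the ABCs
+# look at a '_typ' (or '_subtyp') marker instead of real inheritance, so any
+# object carrying the marker passes the isinstance check:
+#
+#   >>> class Fake(object):
+#   ...     _typ = 'dataframe'
+#   >>> isinstance(Fake(), ABCDataFrame)
+#   True
+#   >>> isinstance(object(), ABCDataFrame)
+#   False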
diff --git a/contrib/python/pandas/py2/pandas/core/dtypes/inference.py b/contrib/python/pandas/py2/pandas/core/dtypes/inference.py
new file mode 100644
index 00000000000..dd05e2022f0
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/core/dtypes/inference.py
@@ -0,0 +1,499 @@
+""" basic inference routines """
+
+from numbers import Number
+import re
+
+import numpy as np
+
+from pandas._libs import lib
+from pandas.compat import (
+ PY2, Set, re_type, string_and_binary_types, string_types, text_type)
+
+from pandas import compat
+
+is_bool = lib.is_bool
+
+is_integer = lib.is_integer
+
+is_float = lib.is_float
+
+is_complex = lib.is_complex
+
+is_scalar = lib.is_scalar
+
+is_decimal = lib.is_decimal
+
+is_interval = lib.is_interval
+
+
+def is_number(obj):
+ """
+ Check if the object is a number.
+
+ Returns True when the object is a number, and False if it is not.
+
+ Parameters
+ ----------
+ obj : any type
+ The object to check if is a number.
+
+ Returns
+ -------
+ is_number : bool
+ Whether `obj` is a number or not.
+
+ See Also
+ --------
+ pandas.api.types.is_integer: Checks a subgroup of numbers.
+
+ Examples
+ --------
+ >>> pd.api.types.is_number(1)
+ True
+ >>> pd.api.types.is_number(7.15)
+ True
+
+ Booleans are valid because they are a subclass of int.
+
+ >>> pd.api.types.is_number(False)
+ True
+
+ >>> pd.api.types.is_number("foo")
+ False
+ >>> pd.api.types.is_number("5")
+ False
+ """
+
+ return isinstance(obj, (Number, np.number))
+
+
+def is_string_like(obj):
+ """
+ Check if the object is a string.
+
+ Parameters
+ ----------
+ obj : The object to check
+
+ Examples
+ --------
+ >>> is_string_like("foo")
+ True
+ >>> is_string_like(1)
+ False
+
+ Returns
+ -------
+ is_str_like : bool
+ Whether `obj` is a string or not.
+ """
+
+ return isinstance(obj, (text_type, string_types))
+
+
+def _iterable_not_string(obj):
+ """
+ Check if the object is an iterable but not a string.
+
+ Parameters
+ ----------
+ obj : The object to check.
+
+ Returns
+ -------
+ is_iter_not_string : bool
+ Whether `obj` is a non-string iterable.
+
+ Examples
+ --------
+ >>> _iterable_not_string([1, 2, 3])
+ True
+ >>> _iterable_not_string("foo")
+ False
+ >>> _iterable_not_string(1)
+ False
+ """
+
+ return (isinstance(obj, compat.Iterable) and
+ not isinstance(obj, string_types))
+
+
+def is_iterator(obj):
+ """
+ Check if the object is an iterator.
+
+ For example, generators and other objects exposing ``__next__``
+ (``next`` in py2) are iterators, while lists, strings and datetime
+ objects are not.
+
+ Parameters
+ ----------
+ obj : The object to check
+
+ Returns
+ -------
+ is_iter : bool
+ Whether `obj` is an iterator.
+
+ Examples
+ --------
+ >>> is_iterator(iter([1, 2, 3]))
+ True
+ >>> is_iterator([1, 2, 3])
+ False
+ >>> is_iterator(datetime(2017, 1, 1))
+ False
+ >>> is_iterator("foo")
+ False
+ >>> is_iterator(1)
+ False
+ """
+
+ if not hasattr(obj, '__iter__'):
+ return False
+
+ if PY2:
+ return hasattr(obj, 'next')
+ else:
+ # Python 3 generators have
+ # __next__ instead of next
+ return hasattr(obj, '__next__')
+
+
+def is_file_like(obj):
+ """
+ Check if the object is a file-like object.
+
+ For objects to be considered file-like, they must
+ be iterable AND have a `read` and/or `write`
+ method as an attribute.
+
+ Note: file-like objects must be iterable, but
+ iterable objects need not be file-like.
+
+ .. versionadded:: 0.20.0
+
+ Parameters
+ ----------
+ obj : The object to check
+
+ Returns
+ -------
+ is_file_like : bool
+ Whether `obj` has file-like properties.
+
+ Examples
+ --------
+ >>> buffer = StringIO("data")
+ >>> is_file_like(buffer)
+ True
+ >>> is_file_like([1, 2, 3])
+ False
+ """
+
+ if not (hasattr(obj, 'read') or hasattr(obj, 'write')):
+ return False
+
+ if not hasattr(obj, "__iter__"):
+ return False
+
+ return True
+
+
+def is_re(obj):
+ """
+ Check if the object is a regex pattern instance.
+
+ Parameters
+ ----------
+ obj : The object to check
+
+ Returns
+ -------
+ is_regex : bool
+ Whether `obj` is a regex pattern.
+
+ Examples
+ --------
+ >>> is_re(re.compile(".*"))
+ True
+ >>> is_re("foo")
+ False
+ """
+
+ return isinstance(obj, re_type)
+
+
+def is_re_compilable(obj):
+ """
+ Check if the object can be compiled into a regex pattern instance.
+
+ Parameters
+ ----------
+ obj : The object to check
+
+ Returns
+ -------
+ is_regex_compilable : bool
+ Whether `obj` can be compiled as a regex pattern.
+
+ Examples
+ --------
+ >>> is_re_compilable(".*")
+ True
+ >>> is_re_compilable(1)
+ False
+ """
+
+ try:
+ re.compile(obj)
+ except TypeError:
+ return False
+ else:
+ return True
+
+
+def is_list_like(obj, allow_sets=True):
+ """
+ Check if the object is list-like.
+
+ Objects that are considered list-like are for example Python
+ lists, tuples, sets, NumPy arrays, and Pandas Series.
+
+ Strings and datetime objects, however, are not considered list-like.
+
+ Parameters
+ ----------
+ obj : The object to check
+ allow_sets : boolean, default True
+ If this parameter is False, sets will not be considered list-like
+
+ .. versionadded:: 0.24.0
+
+ Returns
+ -------
+ is_list_like : bool
+ Whether `obj` has list-like properties.
+
+ Examples
+ --------
+ >>> is_list_like([1, 2, 3])
+ True
+ >>> is_list_like({1, 2, 3})
+ True
+ >>> is_list_like(datetime(2017, 1, 1))
+ False
+ >>> is_list_like("foo")
+ False
+ >>> is_list_like(1)
+ False
+ >>> is_list_like(np.array([2]))
+ True
+ >>> is_list_like(np.array(2))
+ False
+ """
+
+ return (isinstance(obj, compat.Iterable)
+ # we do not count strings/unicode/bytes as list-like
+ and not isinstance(obj, string_and_binary_types)
+
+ # exclude zero-dimensional numpy arrays, effectively scalars
+ and not (isinstance(obj, np.ndarray) and obj.ndim == 0)
+
+ # exclude sets if allow_sets is False
+ and not (allow_sets is False and isinstance(obj, Set)))
+
+
+def is_array_like(obj):
+ """
+ Check if the object is array-like.
+
+ For an object to be considered array-like, it must be list-like and
+ have a `dtype` attribute.
+
+ Parameters
+ ----------
+ obj : The object to check
+
+ Returns
+ -------
+ is_array_like : bool
+ Whether `obj` has array-like properties.
+
+ Examples
+ --------
+ >>> is_array_like(np.array([1, 2, 3]))
+ True
+ >>> is_array_like(pd.Series(["a", "b"]))
+ True
+ >>> is_array_like(pd.Index(["2016-01-01"]))
+ True
+ >>> is_array_like([1, 2, 3])
+ False
+ >>> is_array_like(("a", "b"))
+ False
+ """
+
+ return is_list_like(obj) and hasattr(obj, "dtype")
+
+
+def is_nested_list_like(obj):
+ """
+ Check if the object is list-like, and that all of its elements
+ are also list-like.
+
+ .. versionadded:: 0.20.0
+
+ Parameters
+ ----------
+ obj : The object to check
+
+ Returns
+ -------
+ is_list_like : bool
+ Whether `obj` has list-like properties.
+
+ Examples
+ --------
+ >>> is_nested_list_like([[1, 2, 3]])
+ True
+ >>> is_nested_list_like([{1, 2, 3}, {1, 2, 3}])
+ True
+ >>> is_nested_list_like(["foo"])
+ False
+ >>> is_nested_list_like([])
+ False
+ >>> is_nested_list_like([[1, 2, 3], 1])
+ False
+
+ Notes
+ -----
+ This won't reliably detect whether a consumable iterator (e.g.
+ a generator) is a nested-list-like without consuming the iterator.
+ To avoid consuming it, we always return False if the outer container
+ doesn't define `__len__`.
+
+ See Also
+ --------
+ is_list_like
+ """
+ return (is_list_like(obj) and hasattr(obj, '__len__') and
+ len(obj) > 0 and all(is_list_like(item) for item in obj))
+
+
+def is_dict_like(obj):
+ """
+ Check if the object is dict-like.
+
+ Parameters
+ ----------
+ obj : The object to check
+
+ Returns
+ -------
+ is_dict_like : bool
+ Whether `obj` has dict-like properties.
+
+ Examples
+ --------
+ >>> is_dict_like({1: 2})
+ True
+ >>> is_dict_like([1, 2, 3])
+ False
+ >>> is_dict_like(dict)
+ False
+ >>> is_dict_like(dict())
+ True
+ """
+ dict_like_attrs = ("__getitem__", "keys", "__contains__")
+ return (all(hasattr(obj, attr) for attr in dict_like_attrs)
+ # [GH 25196] exclude classes
+ and not isinstance(obj, type))
+
+
+def is_named_tuple(obj):
+ """
+ Check if the object is a named tuple.
+
+ Parameters
+ ----------
+ obj : The object to check
+
+ Returns
+ -------
+ is_named_tuple : bool
+ Whether `obj` is a named tuple.
+
+ Examples
+ --------
+ >>> Point = namedtuple("Point", ["x", "y"])
+ >>> p = Point(1, 2)
+ >>>
+ >>> is_named_tuple(p)
+ True
+ >>> is_named_tuple((1, 2))
+ False
+ """
+
+ return isinstance(obj, tuple) and hasattr(obj, '_fields')
+
+
+def is_hashable(obj):
+ """Return True if hash(obj) will succeed, False otherwise.
+
+ Some types will pass a test against collections.Hashable but fail when they
+ are actually hashed with hash().
+
+ Distinguish between these and other types by trying the call to hash() and
+ seeing if they raise TypeError.
+
+ Examples
+ --------
+ >>> a = ([],)
+ >>> isinstance(a, collections.Hashable)
+ True
+ >>> is_hashable(a)
+ False
+ """
+ # Unfortunately, we can't use isinstance(obj, collections.Hashable), which
+ # can be faster than calling hash. That is because numpy scalars on Python
+ # 3 fail this test.
+
+ # Reconsider this decision once this numpy bug is fixed:
+ # https://github.com/numpy/numpy/issues/5562
+
+ try:
+ hash(obj)
+ except TypeError:
+ return False
+ else:
+ return True
+
+
+def is_sequence(obj):
+ """
+ Check if the object is a sequence of objects.
+ String types are not included as sequences here.
+
+ Parameters
+ ----------
+ obj : The object to check
+
+ Returns
+ -------
+ is_sequence : bool
+ Whether `obj` is a sequence of objects.
+
+ Examples
+ --------
+ >>> l = [1, 2, 3]
+ >>>
+ >>> is_sequence(l)
+ True
+ >>> is_sequence(iter(l))
+ False
+ """
+
+ try:
+ iter(obj) # Can iterate over it.
+ len(obj) # Has a length associated with it.
+ return not isinstance(obj, string_and_binary_types)
+ except (TypeError, AttributeError):
+ return False
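+
+# Illustrative sketch contrasting the container predicates defined above:
+#
+#   >>> is_list_like([1, 2]), is_sequence([1, 2]), is_iterator([1, 2])
+#   (True, True, False)
+#   >>> gen = (x for x in [1, 2])
+#   >>> is_list_like(gen), is_sequence(gen), is_iterator(gen)
+#   (True, False, True)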
diff --git a/contrib/python/pandas/py2/pandas/core/dtypes/missing.py b/contrib/python/pandas/py2/pandas/core/dtypes/missing.py
new file mode 100644
index 00000000000..3c6d3f21234
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/core/dtypes/missing.py
@@ -0,0 +1,529 @@
+"""
+missing types & inference
+"""
+import numpy as np
+
+from pandas._libs import lib, missing as libmissing
+from pandas._libs.tslibs import NaT, iNaT
+
+from .common import (
+ _NS_DTYPE, _TD_DTYPE, ensure_object, is_bool_dtype, is_complex_dtype,
+ is_datetime64_dtype, is_datetime64tz_dtype, is_datetimelike,
+ is_datetimelike_v_numeric, is_dtype_equal, is_extension_array_dtype,
+ is_float_dtype, is_integer_dtype, is_object_dtype, is_period_dtype,
+ is_scalar, is_string_dtype, is_string_like_dtype, is_timedelta64_dtype,
+ needs_i8_conversion, pandas_dtype)
+from .generic import (
+ ABCDatetimeArray, ABCExtensionArray, ABCGeneric, ABCIndexClass,
+ ABCMultiIndex, ABCSeries, ABCTimedeltaArray)
+from .inference import is_list_like
+
+isposinf_scalar = libmissing.isposinf_scalar
+isneginf_scalar = libmissing.isneginf_scalar
+
+
+def isna(obj):
+ """
+ Detect missing values for an array-like object.
+
+ This function takes a scalar or array-like object and indicates
+ whether values are missing (``NaN`` in numeric arrays, ``None`` or ``NaN``
+ in object arrays, ``NaT`` in datetimelike).
+
+ Parameters
+ ----------
+ obj : scalar or array-like
+ Object to check for null or missing values.
+
+ Returns
+ -------
+ bool or array-like of bool
+ For scalar input, returns a scalar boolean.
+ For array input, returns an array of boolean indicating whether each
+ corresponding element is missing.
+
+ See Also
+ --------
+ notna : Boolean inverse of pandas.isna.
+ Series.isna : Detect missing values in a Series.
+ DataFrame.isna : Detect missing values in a DataFrame.
+ Index.isna : Detect missing values in an Index.
+
+ Examples
+ --------
+ Scalar arguments (including strings) result in a scalar boolean.
+
+ >>> pd.isna('dog')
+ False
+
+ >>> pd.isna(np.nan)
+ True
+
+ ndarrays result in an ndarray of booleans.
+
+ >>> array = np.array([[1, np.nan, 3], [4, 5, np.nan]])
+ >>> array
+ array([[ 1., nan, 3.],
+ [ 4., 5., nan]])
+ >>> pd.isna(array)
+ array([[False, True, False],
+ [False, False, True]])
+
+ For indexes, an ndarray of booleans is returned.
+
+ >>> index = pd.DatetimeIndex(["2017-07-05", "2017-07-06", None,
+ ... "2017-07-08"])
+ >>> index
+ DatetimeIndex(['2017-07-05', '2017-07-06', 'NaT', '2017-07-08'],
+ dtype='datetime64[ns]', freq=None)
+ >>> pd.isna(index)
+ array([False, False, True, False])
+
+ For Series and DataFrame, the same type is returned, containing booleans.
+
+ >>> df = pd.DataFrame([['ant', 'bee', 'cat'], ['dog', None, 'fly']])
+ >>> df
+ 0 1 2
+ 0 ant bee cat
+ 1 dog None fly
+ >>> pd.isna(df)
+ 0 1 2
+ 0 False False False
+ 1 False True False
+
+ >>> pd.isna(df[1])
+ 0 False
+ 1 True
+ Name: 1, dtype: bool
+ """
+ return _isna(obj)
+
+
+isnull = isna
+
+
+def _isna_new(obj):
+ if is_scalar(obj):
+ return libmissing.checknull(obj)
+ # hack (for now) because MI registers as ndarray
+ elif isinstance(obj, ABCMultiIndex):
+ raise NotImplementedError("isna is not defined for MultiIndex")
+ elif isinstance(obj, (ABCSeries, np.ndarray, ABCIndexClass,
+ ABCExtensionArray,
+ ABCDatetimeArray, ABCTimedeltaArray)):
+ return _isna_ndarraylike(obj)
+ elif isinstance(obj, ABCGeneric):
+ return obj._constructor(obj._data.isna(func=isna))
+ elif isinstance(obj, list):
+ return _isna_ndarraylike(np.asarray(obj, dtype=object))
+ elif hasattr(obj, '__array__'):
+ return _isna_ndarraylike(np.asarray(obj))
+ else:
+ return obj is None
+
+
+def _isna_old(obj):
+ """Detect missing values. Treat None, NaN, INF, -INF as null.
+
+ Parameters
+ ----------
+ obj: ndarray or object value
+
+ Returns
+ -------
+ boolean ndarray or boolean
+ """
+ if is_scalar(obj):
+ return libmissing.checknull_old(obj)
+ # hack (for now) because MI registers as ndarray
+ elif isinstance(obj, ABCMultiIndex):
+ raise NotImplementedError("isna is not defined for MultiIndex")
+ elif isinstance(obj, (ABCSeries, np.ndarray, ABCIndexClass)):
+ return _isna_ndarraylike_old(obj)
+ elif isinstance(obj, ABCGeneric):
+ return obj._constructor(obj._data.isna(func=_isna_old))
+ elif isinstance(obj, list):
+ return _isna_ndarraylike_old(np.asarray(obj, dtype=object))
+ elif hasattr(obj, '__array__'):
+ return _isna_ndarraylike_old(np.asarray(obj))
+ else:
+ return obj is None
+
+
+_isna = _isna_new
+
+
+def _use_inf_as_na(key):
+ """Option change callback for na/inf behaviour
+ Choose which replacement for numpy.isnan / -numpy.isfinite is used.
+
+ Parameters
+ ----------
+ key : str
+ The option name; its boolean value selects the behaviour:
+ True means treat None, NaN, INF, -INF as null (old way),
+ False means None and NaN are null, but INF, -INF are not null
+ (new way).
+
+ Notes
+ -----
+ This approach to setting global module values is discussed and
+ approved here:
+
+ * http://stackoverflow.com/questions/4859217/
+ programmatically-creating-variables-in-python/4859312#4859312
+ """
+ from pandas.core.config import get_option
+ flag = get_option(key)
+ if flag:
+ globals()['_isna'] = _isna_old
+ else:
+ globals()['_isna'] = _isna_new
+
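+# Illustrative sketch of the option hook above (assuming the standard pandas
+# option 'mode.use_inf_as_na' is registered with this callback):
+#
+#   >>> pd.set_option('mode.use_inf_as_na', True)    # swaps in _isna_old
+#   >>> pd.isna(np.inf)
+#   True
+#   >>> pd.set_option('mode.use_inf_as_na', False)   # back to _isna_new
+#   >>> pd.isna(np.inf)
+#   False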
+
+def _isna_ndarraylike(obj):
+ is_extension = is_extension_array_dtype(obj)
+
+ if not is_extension:
+ # Avoid accessing `.values` on things like
+ # PeriodIndex, which may be expensive.
+ values = getattr(obj, 'values', obj)
+ else:
+ values = obj
+
+ dtype = values.dtype
+
+ if is_extension:
+ if isinstance(obj, (ABCIndexClass, ABCSeries)):
+ values = obj._values
+ else:
+ values = obj
+ result = values.isna()
+ elif isinstance(obj, ABCDatetimeArray):
+ return obj.isna()
+ elif is_string_dtype(dtype):
+ # Working around NumPy ticket 1542
+ shape = values.shape
+
+ if is_string_like_dtype(dtype):
+ # object array of strings
+ result = np.zeros(values.shape, dtype=bool)
+ else:
+ # object array of non-strings
+ result = np.empty(shape, dtype=bool)
+ vec = libmissing.isnaobj(values.ravel())
+ result[...] = vec.reshape(shape)
+
+ elif needs_i8_conversion(dtype):
+ # this is the NaT pattern
+ result = values.view('i8') == iNaT
+ else:
+ result = np.isnan(values)
+
+ # box
+ if isinstance(obj, ABCSeries):
+ from pandas import Series
+ result = Series(result, index=obj.index, name=obj.name, copy=False)
+
+ return result
+
+
+def _isna_ndarraylike_old(obj):
+ values = getattr(obj, 'values', obj)
+ dtype = values.dtype
+
+ if is_string_dtype(dtype):
+ # Working around NumPy ticket 1542
+ shape = values.shape
+
+ if is_string_like_dtype(dtype):
+ result = np.zeros(values.shape, dtype=bool)
+ else:
+ result = np.empty(shape, dtype=bool)
+ vec = libmissing.isnaobj_old(values.ravel())
+ result[:] = vec.reshape(shape)
+
+ elif is_datetime64_dtype(dtype):
+ # this is the NaT pattern
+ result = values.view('i8') == iNaT
+ else:
+ result = ~np.isfinite(values)
+
+ # box
+ if isinstance(obj, ABCSeries):
+ from pandas import Series
+ result = Series(result, index=obj.index, name=obj.name, copy=False)
+
+ return result
+
+
+def notna(obj):
+ """
+ Detect non-missing values for an array-like object.
+
+ This function takes a scalar or array-like object and indicates
+ whether values are valid (not missing, which is ``NaN`` in numeric
+ arrays, ``None`` or ``NaN`` in object arrays, ``NaT`` in datetimelike).
+
+ Parameters
+ ----------
+ obj : array-like or object value
+ Object to check for *not* null or *non*-missing values.
+
+ Returns
+ -------
+ bool or array-like of bool
+ For scalar input, returns a scalar boolean.
+ For array input, returns an array of boolean indicating whether each
+ corresponding element is valid.
+
+ See Also
+ --------
+ isna : Boolean inverse of pandas.notna.
+ Series.notna : Detect valid values in a Series.
+ DataFrame.notna : Detect valid values in a DataFrame.
+ Index.notna : Detect valid values in an Index.
+
+ Examples
+ --------
+ Scalar arguments (including strings) result in a scalar boolean.
+
+ >>> pd.notna('dog')
+ True
+
+ >>> pd.notna(np.nan)
+ False
+
+ ndarrays result in an ndarray of booleans.
+
+ >>> array = np.array([[1, np.nan, 3], [4, 5, np.nan]])
+ >>> array
+ array([[ 1., nan, 3.],
+ [ 4., 5., nan]])
+ >>> pd.notna(array)
+ array([[ True, False, True],
+ [ True, True, False]])
+
+ For indexes, an ndarray of booleans is returned.
+
+ >>> index = pd.DatetimeIndex(["2017-07-05", "2017-07-06", None,
+ ... "2017-07-08"])
+ >>> index
+ DatetimeIndex(['2017-07-05', '2017-07-06', 'NaT', '2017-07-08'],
+ dtype='datetime64[ns]', freq=None)
+ >>> pd.notna(index)
+ array([ True, True, False, True])
+
+ For Series and DataFrame, the same type is returned, containing booleans.
+
+ >>> df = pd.DataFrame([['ant', 'bee', 'cat'], ['dog', None, 'fly']])
+ >>> df
+ 0 1 2
+ 0 ant bee cat
+ 1 dog None fly
+ >>> pd.notna(df)
+ 0 1 2
+ 0 True True True
+ 1 True False True
+
+ >>> pd.notna(df[1])
+ 0 True
+ 1 False
+ Name: 1, dtype: bool
+ """
+ res = isna(obj)
+ if is_scalar(res):
+ return not res
+ return ~res
+
+
+notnull = notna
+
+
+def _isna_compat(arr, fill_value=np.nan):
+ """
+ Parameters
+ ----------
+ arr: a numpy array
+ fill_value: fill value, default to np.nan
+
+ Returns
+ -------
+ True if we can fill using this fill_value
+ """
+ dtype = arr.dtype
+ if isna(fill_value):
+ return not (is_bool_dtype(dtype) or
+ is_integer_dtype(dtype))
+ return True
+
+
+def array_equivalent(left, right, strict_nan=False):
+ """
+ True if two arrays, left and right, have equal non-NaN elements, and NaNs
+ in corresponding locations. False otherwise. It is assumed that left and
+ right are NumPy arrays of the same dtype. The behavior of this function
+ (particularly with respect to NaNs) is not defined if the dtypes are
+ different.
+
+ Parameters
+ ----------
+ left, right : ndarrays
+ strict_nan : bool, default False
+ If True, consider NaN and None to be different.
+
+ Returns
+ -------
+ b : bool
+ Returns True if the arrays are equivalent.
+
+ Examples
+ --------
+ >>> array_equivalent(
+ ... np.array([1, 2, np.nan]),
+ ... np.array([1, 2, np.nan]))
+ True
+ >>> array_equivalent(
+ ... np.array([1, np.nan, 2]),
+ ... np.array([1, 2, np.nan]))
+ False
+ """
+
+ left, right = np.asarray(left), np.asarray(right)
+
+ # shape compat
+ if left.shape != right.shape:
+ return False
+
+ # Object arrays can contain None, NaN and NaT.
+ # string dtypes must come to this path for NumPy 1.7.1 compat
+ if is_string_dtype(left) or is_string_dtype(right):
+
+ if not strict_nan:
+ # isna considers NaN and None to be equivalent.
+ return lib.array_equivalent_object(
+ ensure_object(left.ravel()), ensure_object(right.ravel()))
+
+ for left_value, right_value in zip(left, right):
+ if left_value is NaT and right_value is not NaT:
+ return False
+
+ elif isinstance(left_value, float) and np.isnan(left_value):
+ if (not isinstance(right_value, float) or
+ not np.isnan(right_value)):
+ return False
+ else:
+ if left_value != right_value:
+ return False
+ return True
+
+ # NaNs can occur in float and complex arrays.
+ if is_float_dtype(left) or is_complex_dtype(left):
+
+ # empty
+ if not (np.prod(left.shape) and np.prod(right.shape)):
+ return True
+ return ((left == right) | (isna(left) & isna(right))).all()
+
+ # numpy will not allow this type of datetimelike vs integer comparison
+ elif is_datetimelike_v_numeric(left, right):
+ return False
+
+ # M8/m8
+ elif needs_i8_conversion(left) and needs_i8_conversion(right):
+ if not is_dtype_equal(left.dtype, right.dtype):
+ return False
+
+ left = left.view('i8')
+ right = right.view('i8')
+
+ # if we have structured dtypes, compare first
+ if (left.dtype.type is np.void or
+ right.dtype.type is np.void):
+ if left.dtype != right.dtype:
+ return False
+
+ return np.array_equal(left, right)
+
+
+def _infer_fill_value(val):
+ """
+ infer the fill value for the nan/NaT from the provided
+ scalar/ndarray/list-like; if we are a NaT, return the correct dtyped
+ element to provide proper block construction
+ """
+
+ if not is_list_like(val):
+ val = [val]
+ val = np.array(val, copy=False)
+ if is_datetimelike(val):
+ return np.array('NaT', dtype=val.dtype)
+ elif is_object_dtype(val.dtype):
+ dtype = lib.infer_dtype(ensure_object(val), skipna=False)
+ if dtype in ['datetime', 'datetime64']:
+ return np.array('NaT', dtype=_NS_DTYPE)
+ elif dtype in ['timedelta', 'timedelta64']:
+ return np.array('NaT', dtype=_TD_DTYPE)
+ return np.nan
+
+
+def _maybe_fill(arr, fill_value=np.nan):
+ """
+ if we have a compatible fill_value and arr dtype, then fill
+ """
+ if _isna_compat(arr, fill_value):
+ arr.fill(fill_value)
+ return arr
+
+
+def na_value_for_dtype(dtype, compat=True):
+ """
+ Return a dtype compat na value
+
+ Parameters
+ ----------
+ dtype : string / dtype
+ compat : boolean, default True
+
+ Returns
+ -------
+ scalar
+ The NA value for this dtype (e.g. np.nan, NaT, 0 or False).
+
+ Examples
+ --------
+ >>> na_value_for_dtype(np.dtype('int64'))
+ 0
+ >>> na_value_for_dtype(np.dtype('int64'), compat=False)
+ nan
+ >>> na_value_for_dtype(np.dtype('float64'))
+ nan
+ >>> na_value_for_dtype(np.dtype('bool'))
+ False
+ >>> na_value_for_dtype(np.dtype('datetime64[ns]'))
+ NaT
+ """
+ dtype = pandas_dtype(dtype)
+
+ if is_extension_array_dtype(dtype):
+ return dtype.na_value
+ if (is_datetime64_dtype(dtype) or is_datetime64tz_dtype(dtype) or
+ is_timedelta64_dtype(dtype) or is_period_dtype(dtype)):
+ return NaT
+ elif is_float_dtype(dtype):
+ return np.nan
+ elif is_integer_dtype(dtype):
+ if compat:
+ return 0
+ return np.nan
+ elif is_bool_dtype(dtype):
+ return False
+ return np.nan
+
+
+def remove_na_arraylike(arr):
+ """
+ Return array-like containing only true/non-NaN values, possibly empty.
+ """
+ if is_extension_array_dtype(arr):
+ return arr[notna(arr)]
+ else:
+ return arr[notna(lib.values_from_object(arr))]
diff --git a/contrib/python/pandas/py2/pandas/core/frame.py b/contrib/python/pandas/py2/pandas/core/frame.py
new file mode 100644
index 00000000000..f5535096c96
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/core/frame.py
@@ -0,0 +1,7976 @@
+# pylint: disable=E1101
+# pylint: disable=W0212,W0703,W0622
+"""
+DataFrame
+---------
+An efficient 2D container for potentially mixed-type time series or other
+labeled data series.
+
+Similar to its R counterpart, data.frame, except providing automatic data
+alignment and a host of useful data manipulation methods having to do with the
+labeling information.
+"""
+from __future__ import division
+
+import collections
+import functools
+import itertools
+import sys
+import warnings
+from distutils.version import LooseVersion
+from textwrap import dedent
+
+import numpy as np
+import numpy.ma as ma
+
+from pandas._libs import lib, algos as libalgos
+
+from pandas.util._decorators import (Appender, Substitution,
+ rewrite_axis_style_signature,
+ deprecate_kwarg)
+from pandas.util._validators import (validate_bool_kwarg,
+ validate_axis_style_args)
+
+from pandas import compat
+from pandas.compat import (range, map, zip, lmap, lzip, StringIO, u,
+ OrderedDict, PY36, raise_with_traceback,
+ string_and_binary_types)
+from pandas.compat.numpy import function as nv
+from pandas.core.dtypes.cast import (
+ maybe_upcast,
+ cast_scalar_to_array,
+ infer_dtype_from_scalar,
+ maybe_cast_to_datetime,
+ maybe_infer_to_datetimelike,
+ maybe_convert_platform,
+ maybe_downcast_to_dtype,
+ invalidate_string_dtypes,
+ coerce_to_dtypes,
+ maybe_upcast_putmask,
+ find_common_type)
+from pandas.core.dtypes.common import (
+ is_dict_like,
+ is_datetime64tz_dtype,
+ is_object_dtype,
+ is_extension_type,
+ is_extension_array_dtype,
+ is_datetime64_any_dtype,
+ is_bool_dtype,
+ is_integer_dtype,
+ is_float_dtype,
+ is_integer,
+ is_scalar,
+ is_dtype_equal,
+ needs_i8_conversion,
+ infer_dtype_from_object,
+ ensure_float64,
+ ensure_int64,
+ ensure_platform_int,
+ is_list_like,
+ is_nested_list_like,
+ is_iterator,
+ is_sequence,
+ is_named_tuple)
+from pandas.core.dtypes.generic import ABCSeries, ABCIndexClass
+from pandas.core.dtypes.missing import isna, notna
+
+from pandas.core import algorithms
+from pandas.core import common as com
+from pandas.core import nanops
+from pandas.core import ops
+from pandas.core.accessor import CachedAccessor
+from pandas.core.arrays import Categorical, ExtensionArray
+from pandas.core.arrays.datetimelike import (
+ DatetimeLikeArrayMixin as DatetimeLikeArray
+)
+from pandas.core.config import get_option
+from pandas.core.generic import NDFrame, _shared_docs
+from pandas.core.index import (Index, MultiIndex, ensure_index,
+ ensure_index_from_sequences)
+from pandas.core.indexes import base as ibase
+from pandas.core.indexes.datetimes import DatetimeIndex
+from pandas.core.indexes.period import PeriodIndex
+from pandas.core.indexing import (maybe_droplevels, convert_to_index_sliceable,
+ check_bool_indexer)
+from pandas.core.internals import BlockManager
+from pandas.core.internals.construction import (
+ masked_rec_array_to_mgr, get_names_from_index, to_arrays,
+ reorder_arrays, init_ndarray, init_dict,
+ arrays_to_mgr, sanitize_index)
+from pandas.core.series import Series
+
+from pandas.io.formats import console
+from pandas.io.formats import format as fmt
+from pandas.io.formats.printing import pprint_thing
+
+import pandas.plotting._core as gfx
+
+# ---------------------------------------------------------------------
+# Docstring templates
+
+_shared_doc_kwargs = dict(
+ axes='index, columns', klass='DataFrame',
+ axes_single_arg="{0 or 'index', 1 or 'columns'}",
+ axis="""axis : {0 or 'index', 1 or 'columns'}, default 0
+ If 0 or 'index': apply function to each column.
+ If 1 or 'columns': apply function to each row.""",
+ optional_by="""
+ by : str or list of str
+ Name or list of names to sort by.
+
+ - if `axis` is 0 or `'index'` then `by` may contain index
+ levels and/or column labels
+ - if `axis` is 1 or `'columns'` then `by` may contain column
+ levels and/or index labels
+
+ .. versionchanged:: 0.23.0
+ Allow specifying index or column level names.""",
+ versionadded_to_excel='',
+ optional_labels="""labels : array-like, optional
+ New labels / index to conform the axis specified by 'axis' to.""",
+ optional_axis="""axis : int or str, optional
+ Axis to target. Can be either the axis name ('index', 'columns')
+ or number (0, 1).""",
+)
+
+_numeric_only_doc = """numeric_only : boolean, default None
+ Include only float, int, boolean data. If None, will attempt to use
+ everything, then use only numeric data
+"""
+
+_merge_doc = """
+Merge DataFrame or named Series objects with a database-style join.
+
+The join is done on columns or indexes. If joining columns on
+columns, the DataFrame indexes *will be ignored*. Otherwise if joining indexes
+on indexes or indexes on a column or columns, the index will be passed on.
+
+Parameters
+----------%s
+right : DataFrame or named Series
+ Object to merge with.
+how : {'left', 'right', 'outer', 'inner'}, default 'inner'
+ Type of merge to be performed.
+
+ * left: use only keys from left frame, similar to a SQL left outer join;
+ preserve key order.
+ * right: use only keys from right frame, similar to a SQL right outer join;
+ preserve key order.
+ * outer: use union of keys from both frames, similar to a SQL full outer
+ join; sort keys lexicographically.
+ * inner: use intersection of keys from both frames, similar to a SQL inner
+ join; preserve the order of the left keys.
+on : label or list
+ Column or index level names to join on. These must be found in both
+ DataFrames. If `on` is None and not merging on indexes then this defaults
+ to the intersection of the columns in both DataFrames.
+left_on : label or list, or array-like
+ Column or index level names to join on in the left DataFrame. Can also
+ be an array or list of arrays of the length of the left DataFrame.
+ These arrays are treated as if they are columns.
+right_on : label or list, or array-like
+ Column or index level names to join on in the right DataFrame. Can also
+ be an array or list of arrays of the length of the right DataFrame.
+ These arrays are treated as if they are columns.
+left_index : bool, default False
+ Use the index from the left DataFrame as the join key(s). If it is a
+ MultiIndex, the number of keys in the other DataFrame (either the index
+ or a number of columns) must match the number of levels.
+right_index : bool, default False
+ Use the index from the right DataFrame as the join key. Same caveats as
+ left_index.
+sort : bool, default False
+ Sort the join keys lexicographically in the result DataFrame. If False,
+ the order of the join keys depends on the join type (how keyword).
+suffixes : tuple of (str, str), default ('_x', '_y')
+ Suffix to apply to overlapping column names in the left and right
+ side, respectively. To raise an exception on overlapping columns use
+ (False, False).
+copy : bool, default True
+ If False, avoid copy if possible.
+indicator : bool or str, default False
+ If True, adds a column to output DataFrame called "_merge" with
+ information on the source of each row.
+ If string, column with information on source of each row will be added to
+ output DataFrame, and column will be named value of string.
+ Information column is Categorical-type and takes on a value of "left_only"
+ for observations whose merge key only appears in 'left' DataFrame,
+ "right_only" for observations whose merge key only appears in 'right'
+ DataFrame, and "both" if the observation's merge key is found in both.
+
+validate : str, optional
+ If specified, checks if merge is of specified type.
+
+ * "one_to_one" or "1:1": check if merge keys are unique in both
+ left and right datasets.
+ * "one_to_many" or "1:m": check if merge keys are unique in left
+ dataset.
+ * "many_to_one" or "m:1": check if merge keys are unique in right
+ dataset.
+ * "many_to_many" or "m:m": allowed, but does not result in checks.
+
+ .. versionadded:: 0.21.0
+
+Returns
+-------
+DataFrame
+ A DataFrame of the two merged objects.
+
+See Also
+--------
+merge_ordered : Merge with optional filling/interpolation.
+merge_asof : Merge on nearest keys.
+DataFrame.join : Similar method using indices.
+
+Notes
+-----
+Support for specifying index levels as the `on`, `left_on`, and
+`right_on` parameters was added in version 0.23.0.
+Support for merging named Series objects was added in version 0.24.0.
+
+Examples
+--------
+
+>>> df1 = pd.DataFrame({'lkey': ['foo', 'bar', 'baz', 'foo'],
+... 'value': [1, 2, 3, 5]})
+>>> df2 = pd.DataFrame({'rkey': ['foo', 'bar', 'baz', 'foo'],
+... 'value': [5, 6, 7, 8]})
+>>> df1
+ lkey value
+0 foo 1
+1 bar 2
+2 baz 3
+3 foo 5
+>>> df2
+ rkey value
+0 foo 5
+1 bar 6
+2 baz 7
+3 foo 8
+
+Merge df1 and df2 on the lkey and rkey columns. The value columns have
+the default suffixes, _x and _y, appended.
+
+>>> df1.merge(df2, left_on='lkey', right_on='rkey')
+ lkey value_x rkey value_y
+0 foo 1 foo 5
+1 foo 1 foo 8
+2 foo 5 foo 5
+3 foo 5 foo 8
+4 bar 2 bar 6
+5 baz 3 baz 7
+
+Merge DataFrames df1 and df2 with specified left and right suffixes
+appended to any overlapping columns.
+
+>>> df1.merge(df2, left_on='lkey', right_on='rkey',
+... suffixes=('_left', '_right'))
+ lkey value_left rkey value_right
+0 foo 1 foo 5
+1 foo 1 foo 8
+2 foo 5 foo 5
+3 foo 5 foo 8
+4 bar 2 bar 6
+5 baz 3 baz 7
+
+Merge DataFrames df1 and df2, but raise an exception if the DataFrames have
+any overlapping columns.
+
+>>> df1.merge(df2, left_on='lkey', right_on='rkey', suffixes=(False, False))
+Traceback (most recent call last):
+...
+ValueError: columns overlap but no suffix specified:
+ Index(['value'], dtype='object')
+"""
+
+# -----------------------------------------------------------------------
+# DataFrame class
+
+
+class DataFrame(NDFrame):
+ """
+ Two-dimensional size-mutable, potentially heterogeneous tabular data
+ structure with labeled axes (rows and columns). Arithmetic operations
+ align on both row and column labels. Can be thought of as a dict-like
+ container for Series objects. The primary pandas data structure.
+
+ Parameters
+ ----------
+ data : ndarray (structured or homogeneous), Iterable, dict, or DataFrame
+        Dict can contain Series, arrays, constants, or list-like objects.
+
+ .. versionchanged :: 0.23.0
+ If data is a dict, argument order is maintained for Python 3.6
+ and later.
+
+    index : Index or array-like
+        Index to use for the resulting frame. Will default to RangeIndex if
+        no indexing information is part of the input data and no index is
+        provided.
+    columns : Index or array-like
+        Column labels to use for the resulting frame. Will default to
+        RangeIndex (0, 1, 2, ..., n) if no column labels are provided.
+    dtype : dtype, default None
+        Data type to force. Only a single dtype is allowed. If None, infer.
+    copy : boolean, default False
+        Copy data from inputs. Only affects DataFrame / 2d ndarray input.
+
+ See Also
+ --------
+ DataFrame.from_records : Constructor from tuples, also record arrays.
+ DataFrame.from_dict : From dicts of Series, arrays, or dicts.
+    DataFrame.from_items : From sequence of (key, value) pairs.
+ pandas.read_csv, pandas.read_table, pandas.read_clipboard.
+
+ Examples
+ --------
+ Constructing DataFrame from a dictionary.
+
+ >>> d = {'col1': [1, 2], 'col2': [3, 4]}
+ >>> df = pd.DataFrame(data=d)
+ >>> df
+ col1 col2
+ 0 1 3
+ 1 2 4
+
+ Notice that the inferred dtype is int64.
+
+ >>> df.dtypes
+ col1 int64
+ col2 int64
+ dtype: object
+
+ To enforce a single dtype:
+
+ >>> df = pd.DataFrame(data=d, dtype=np.int8)
+ >>> df.dtypes
+ col1 int8
+ col2 int8
+ dtype: object
+
+ Constructing DataFrame from numpy ndarray:
+
+ >>> df2 = pd.DataFrame(np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]]),
+ ... columns=['a', 'b', 'c'])
+ >>> df2
+ a b c
+ 0 1 2 3
+ 1 4 5 6
+ 2 7 8 9
+ """
+
+ @property
+ def _constructor(self):
+ return DataFrame
+
+ _constructor_sliced = Series
+ _deprecations = NDFrame._deprecations | frozenset(
+ ['get_value', 'set_value', 'from_csv', 'from_items'])
+ _accessors = set()
+
+ @property
+ def _constructor_expanddim(self):
+ from pandas.core.panel import Panel
+ return Panel
+
+ # ----------------------------------------------------------------------
+ # Constructors
+
+ def __init__(self, data=None, index=None, columns=None, dtype=None,
+ copy=False):
+ if data is None:
+ data = {}
+ if dtype is not None:
+ dtype = self._validate_dtype(dtype)
+
+ if isinstance(data, DataFrame):
+ data = data._data
+
+ if isinstance(data, BlockManager):
+ mgr = self._init_mgr(data, axes=dict(index=index, columns=columns),
+ dtype=dtype, copy=copy)
+ elif isinstance(data, dict):
+ mgr = init_dict(data, index, columns, dtype=dtype)
+ elif isinstance(data, ma.MaskedArray):
+ import numpy.ma.mrecords as mrecords
+ # masked recarray
+ if isinstance(data, mrecords.MaskedRecords):
+ mgr = masked_rec_array_to_mgr(data, index, columns, dtype,
+ copy)
+
+ # a masked array
+ else:
+ mask = ma.getmaskarray(data)
+ if mask.any():
+ data, fill_value = maybe_upcast(data, copy=True)
+ data.soften_mask() # set hardmask False if it was True
+ data[mask] = fill_value
+ else:
+ data = data.copy()
+ mgr = init_ndarray(data, index, columns, dtype=dtype,
+ copy=copy)
+
+ elif isinstance(data, (np.ndarray, Series, Index)):
+ if data.dtype.names:
+ data_columns = list(data.dtype.names)
+ data = {k: data[k] for k in data_columns}
+ if columns is None:
+ columns = data_columns
+ mgr = init_dict(data, index, columns, dtype=dtype)
+ elif getattr(data, 'name', None) is not None:
+ mgr = init_dict({data.name: data}, index, columns,
+ dtype=dtype)
+ else:
+ mgr = init_ndarray(data, index, columns, dtype=dtype,
+ copy=copy)
+
+ # For data is list-like, or Iterable (will consume into list)
+ elif (isinstance(data, compat.Iterable)
+ and not isinstance(data, string_and_binary_types)):
+ if not isinstance(data, compat.Sequence):
+ data = list(data)
+ if len(data) > 0:
+ if is_list_like(data[0]) and getattr(data[0], 'ndim', 1) == 1:
+ if is_named_tuple(data[0]) and columns is None:
+ columns = data[0]._fields
+ arrays, columns = to_arrays(data, columns, dtype=dtype)
+ columns = ensure_index(columns)
+
+ # set the index
+ if index is None:
+ if isinstance(data[0], Series):
+ index = get_names_from_index(data)
+ elif isinstance(data[0], Categorical):
+ index = ibase.default_index(len(data[0]))
+ else:
+ index = ibase.default_index(len(data))
+
+ mgr = arrays_to_mgr(arrays, columns, index, columns,
+ dtype=dtype)
+ else:
+ mgr = init_ndarray(data, index, columns, dtype=dtype,
+ copy=copy)
+ else:
+ mgr = init_dict({}, index, columns, dtype=dtype)
+ else:
+ try:
+ arr = np.array(data, dtype=dtype, copy=copy)
+ except (ValueError, TypeError) as e:
+ exc = TypeError('DataFrame constructor called with '
+ 'incompatible data and dtype: {e}'.format(e=e))
+ raise_with_traceback(exc)
+
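+            # A 0-d result means ``data`` was a scalar; broadcast it to the
+            # full (len(index), len(columns)) shape.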
+ if arr.ndim == 0 and index is not None and columns is not None:
+ values = cast_scalar_to_array((len(index), len(columns)),
+ data, dtype=dtype)
+ mgr = init_ndarray(values, index, columns,
+ dtype=values.dtype, copy=False)
+ else:
+ raise ValueError('DataFrame constructor not properly called!')
+
+ NDFrame.__init__(self, mgr, fastpath=True)
+
+ # ----------------------------------------------------------------------
+
+ @property
+ def axes(self):
+ """
+ Return a list representing the axes of the DataFrame.
+
+ It has the row axis labels and column axis labels as the only members.
+ They are returned in that order.
+
+ Examples
+ --------
+ >>> df = pd.DataFrame({'col1': [1, 2], 'col2': [3, 4]})
+ >>> df.axes
+        [RangeIndex(start=0, stop=2, step=1), Index(['col1', 'col2'],
+ dtype='object')]
+ """
+ return [self.index, self.columns]
+
+ @property
+ def shape(self):
+ """
+ Return a tuple representing the dimensionality of the DataFrame.
+
+ See Also
+ --------
+ ndarray.shape
+
+ Examples
+ --------
+ >>> df = pd.DataFrame({'col1': [1, 2], 'col2': [3, 4]})
+ >>> df.shape
+ (2, 2)
+
+ >>> df = pd.DataFrame({'col1': [1, 2], 'col2': [3, 4],
+ ... 'col3': [5, 6]})
+ >>> df.shape
+ (2, 3)
+ """
+ return len(self.index), len(self.columns)
+
+ @property
+ def _is_homogeneous_type(self):
+ """
+ Whether all the columns in a DataFrame have the same type.
+
+ Returns
+ -------
+ bool
+
+ Examples
+ --------
+ >>> DataFrame({"A": [1, 2], "B": [3, 4]})._is_homogeneous_type
+ True
+ >>> DataFrame({"A": [1, 2], "B": [3.0, 4.0]})._is_homogeneous_type
+ False
+
+ Items with the same type but different sizes are considered
+ different types.
+
+ >>> DataFrame({
+ ... "A": np.array([1, 2], dtype=np.int32),
+ ... "B": np.array([1, 2], dtype=np.int64)})._is_homogeneous_type
+ False
+ """
+ if self._data.any_extension_types:
+ return len({block.dtype for block in self._data.blocks}) == 1
+ else:
+ return not self._data.is_mixed_type
+
+ # ----------------------------------------------------------------------
+ # Rendering Methods
+
+ def _repr_fits_vertical_(self):
+ """
+ Check length against max_rows.
+ """
+ max_rows = get_option("display.max_rows")
+ return len(self) <= max_rows
+
+ def _repr_fits_horizontal_(self, ignore_width=False):
+ """
+ Check if full repr fits in horizontal boundaries imposed by the display
+ options width and max_columns.
+
+        In case of a non-interactive session, no boundaries apply.
+
+        `ignore_width` is here so ipynb+HTML output can behave the way
+ users expect. display.max_columns remains in effect.
+ GH3541, GH3573
+ """
+
+ width, height = console.get_console_size()
+ max_columns = get_option("display.max_columns")
+ nb_columns = len(self.columns)
+
+ # exceed max columns
+ if ((max_columns and nb_columns > max_columns) or
+ ((not ignore_width) and width and nb_columns > (width // 2))):
+ return False
+
+ # used by repr_html under IPython notebook or scripts ignore terminal
+ # dims
+ if ignore_width or not console.in_interactive_session():
+ return True
+
+ if (get_option('display.width') is not None or
+ console.in_ipython_frontend()):
+ # check at least the column row for excessive width
+ max_rows = 1
+ else:
+ max_rows = get_option("display.max_rows")
+
+ # when auto-detecting, so width=None and not in ipython front end
+ # check whether repr fits horizontal by actually checking
+ # the width of the rendered repr
+ buf = StringIO()
+
+ # only care about the stuff we'll actually print out
+ # and to_string on entire frame may be expensive
+ d = self
+
+        if max_rows is not None:
+            # min of two, where one may be None
+            d = d.iloc[:min(max_rows, len(d))]
+        else:
+            # with unlimited rows, rendering the entire frame to measure its
+            # width could be expensive, so assume the repr fits
+            return True
+
+ d.to_string(buf=buf)
+ value = buf.getvalue()
+ repr_width = max(len(l) for l in value.split('\n'))
+
+ return repr_width < width
+
+ def _info_repr(self):
+ """
+ True if the repr should show the info view.
+ """
+ info_repr_option = (get_option("display.large_repr") == "info")
+ return info_repr_option and not (self._repr_fits_horizontal_() and
+ self._repr_fits_vertical_())
+
+ def __unicode__(self):
+ """
+ Return a string representation for a particular DataFrame.
+
+        Invoked by unicode(df) in py2 only. Returns a Unicode string in both
+        py2/py3.
+ """
+ buf = StringIO(u(""))
+ if self._info_repr():
+ self.info(buf=buf)
+ return buf.getvalue()
+
+ max_rows = get_option("display.max_rows")
+ max_cols = get_option("display.max_columns")
+ show_dimensions = get_option("display.show_dimensions")
+ if get_option("display.expand_frame_repr"):
+ width, _ = console.get_console_size()
+ else:
+ width = None
+ self.to_string(buf=buf, max_rows=max_rows, max_cols=max_cols,
+ line_width=width, show_dimensions=show_dimensions)
+
+ return buf.getvalue()
+
+ def _repr_html_(self):
+ """
+        Return an HTML representation for a particular DataFrame.
+
+ Mainly for IPython notebook.
+ """
+ # qtconsole doesn't report its line width, and also
+ # behaves badly when outputting an HTML table
+ # that doesn't fit the window, so disable it.
+ # XXX: In IPython 3.x and above, the Qt console will not attempt to
+ # display HTML, so this check can be removed when support for
+ # IPython 2.x is no longer needed.
+ try:
+ import IPython
+ except ImportError:
+ pass
+ else:
+ if LooseVersion(IPython.__version__) < LooseVersion('3.0'):
+ if console.in_qtconsole():
+ # 'HTML output is disabled in QtConsole'
+ return None
+
+ if self._info_repr():
+ buf = StringIO(u(""))
+ self.info(buf=buf)
+ # need to escape the <class>, should be the first line.
+ val = buf.getvalue().replace('<', r'&lt;', 1)
+ val = val.replace('>', r'&gt;', 1)
+ return '<pre>' + val + '</pre>'
+
+ if get_option("display.notebook_repr_html"):
+ max_rows = get_option("display.max_rows")
+ max_cols = get_option("display.max_columns")
+ show_dimensions = get_option("display.show_dimensions")
+
+ return self.to_html(max_rows=max_rows, max_cols=max_cols,
+ show_dimensions=show_dimensions, notebook=True)
+ else:
+ return None
+
+ @Substitution(header='Write out the column names. If a list of strings '
+ 'is given, it is assumed to be aliases for the '
+ 'column names')
+ @Substitution(shared_params=fmt.common_docstring,
+ returns=fmt.return_docstring)
+ def to_string(self, buf=None, columns=None, col_space=None, header=True,
+ index=True, na_rep='NaN', formatters=None, float_format=None,
+ sparsify=None, index_names=True, justify=None,
+ max_rows=None, max_cols=None, show_dimensions=False,
+ decimal='.', line_width=None):
+ """
+ Render a DataFrame to a console-friendly tabular output.
+ %(shared_params)s
+ line_width : int, optional
+ Width to wrap a line in characters.
+ %(returns)s
+ See Also
+ --------
+ to_html : Convert DataFrame to HTML.
+
+ Examples
+ --------
+ >>> d = {'col1': [1, 2, 3], 'col2': [4, 5, 6]}
+ >>> df = pd.DataFrame(d)
+ >>> print(df.to_string())
+ col1 col2
+ 0 1 4
+ 1 2 5
+ 2 3 6
+ """
+
+ formatter = fmt.DataFrameFormatter(self, buf=buf, columns=columns,
+ col_space=col_space, na_rep=na_rep,
+ formatters=formatters,
+ float_format=float_format,
+ sparsify=sparsify, justify=justify,
+ index_names=index_names,
+ header=header, index=index,
+ max_rows=max_rows,
+ max_cols=max_cols,
+ show_dimensions=show_dimensions,
+ decimal=decimal,
+ line_width=line_width)
+ formatter.to_string()
+
+ if buf is None:
+ result = formatter.buf.getvalue()
+ return result
+
+ # ----------------------------------------------------------------------
+
+ @property
+ def style(self):
+ """
+ Property returning a Styler object containing methods for
+        building a styled HTML representation of the DataFrame.
+
+ See Also
+ --------
+ pandas.io.formats.style.Styler
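+
+        Examples
+        --------
+        A minimal illustrative sketch; the styled table renders in an HTML
+        frontend such as a Jupyter notebook, so the output is omitted here.
+
+        >>> df = pd.DataFrame({'col1': [1, 2], 'col2': [3, 4]})
+        >>> df.style.highlight_max()  # doctest: +SKIP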
+ """
+ from pandas.io.formats.style import Styler
+ return Styler(self)
+
+ def iteritems(self):
+ r"""
+ Iterator over (column name, Series) pairs.
+
+ Iterates over the DataFrame columns, returning a tuple with
+ the column name and the content as a Series.
+
+ Yields
+ ------
+ label : object
+ The column names for the DataFrame being iterated over.
+ content : Series
+ The column entries belonging to each label, as a Series.
+
+ See Also
+ --------
+ DataFrame.iterrows : Iterate over DataFrame rows as
+ (index, Series) pairs.
+ DataFrame.itertuples : Iterate over DataFrame rows as namedtuples
+ of the values.
+
+ Examples
+ --------
+ >>> df = pd.DataFrame({'species': ['bear', 'bear', 'marsupial'],
+ ... 'population': [1864, 22000, 80000]},
+ ... index=['panda', 'polar', 'koala'])
+ >>> df
+ species population
+ panda bear 1864
+ polar bear 22000
+ koala marsupial 80000
+ >>> for label, content in df.iteritems():
+ ... print('label:', label)
+ ... print('content:', content, sep='\n')
+ ...
+ label: species
+ content:
+ panda bear
+ polar bear
+ koala marsupial
+ Name: species, dtype: object
+ label: population
+ content:
+ panda 1864
+ polar 22000
+ koala 80000
+ Name: population, dtype: int64
+ """
+ if self.columns.is_unique and hasattr(self, '_item_cache'):
+ for k in self.columns:
+ yield k, self._get_item_cache(k)
+ else:
+ for i, k in enumerate(self.columns):
+ yield k, self._ixs(i, axis=1)
+
+ def iterrows(self):
+ """
+ Iterate over DataFrame rows as (index, Series) pairs.
+
+ Yields
+ ------
+ index : label or tuple of label
+ The index of the row. A tuple for a `MultiIndex`.
+ data : Series
+ The data of the row as a Series.
+
+ it : generator
+ A generator that iterates over the rows of the frame.
+
+ See Also
+ --------
+ itertuples : Iterate over DataFrame rows as namedtuples of the values.
+ iteritems : Iterate over (column name, Series) pairs.
+
+ Notes
+ -----
+
+ 1. Because ``iterrows`` returns a Series for each row,
+ it does **not** preserve dtypes across the rows (dtypes are
+ preserved across columns for DataFrames). For example,
+
+ >>> df = pd.DataFrame([[1, 1.5]], columns=['int', 'float'])
+ >>> row = next(df.iterrows())[1]
+ >>> row
+ int 1.0
+ float 1.5
+ Name: 0, dtype: float64
+ >>> print(row['int'].dtype)
+ float64
+ >>> print(df['int'].dtype)
+ int64
+
+ To preserve dtypes while iterating over the rows, it is better
+ to use :meth:`itertuples` which returns namedtuples of the values
+ and which is generally faster than ``iterrows``.
+
+ 2. You should **never modify** something you are iterating over.
+ This is not guaranteed to work in all cases. Depending on the
+ data types, the iterator returns a copy and not a view, and writing
+ to it will have no effect.
+ """
+ columns = self.columns
+ klass = self._constructor_sliced
+ for k, v in zip(self.index, self.values):
+ s = klass(v, index=columns, name=k)
+ yield k, s
+
+ def itertuples(self, index=True, name="Pandas"):
+ """
+ Iterate over DataFrame rows as namedtuples.
+
+ Parameters
+ ----------
+ index : bool, default True
+ If True, return the index as the first element of the tuple.
+ name : str or None, default "Pandas"
+ The name of the returned namedtuples or None to return regular
+ tuples.
+
+ Yields
+        ------
+ collections.namedtuple
+ Yields a namedtuple for each row in the DataFrame with the first
+ field possibly being the index and following fields being the
+ column values.
+
+ See Also
+ --------
+ DataFrame.iterrows : Iterate over DataFrame rows as (index, Series)
+ pairs.
+ DataFrame.iteritems : Iterate over (column name, Series) pairs.
+
+ Notes
+ -----
+ The column names will be renamed to positional names if they are
+ invalid Python identifiers, repeated, or start with an underscore.
+ With a large number of columns (>255), regular tuples are returned.
+
+ Examples
+ --------
+ >>> df = pd.DataFrame({'num_legs': [4, 2], 'num_wings': [0, 2]},
+ ... index=['dog', 'hawk'])
+ >>> df
+ num_legs num_wings
+ dog 4 0
+ hawk 2 2
+ >>> for row in df.itertuples():
+ ... print(row)
+ ...
+ Pandas(Index='dog', num_legs=4, num_wings=0)
+ Pandas(Index='hawk', num_legs=2, num_wings=2)
+
+ By setting the `index` parameter to False we can remove the index
+ as the first element of the tuple:
+
+ >>> for row in df.itertuples(index=False):
+ ... print(row)
+ ...
+ Pandas(num_legs=4, num_wings=0)
+ Pandas(num_legs=2, num_wings=2)
+
+ With the `name` parameter set we set a custom name for the yielded
+ namedtuples:
+
+ >>> for row in df.itertuples(name='Animal'):
+ ... print(row)
+ ...
+ Animal(Index='dog', num_legs=4, num_wings=0)
+ Animal(Index='hawk', num_legs=2, num_wings=2)
+ """
+ arrays = []
+ fields = list(self.columns)
+ if index:
+ arrays.append(self.index)
+ fields.insert(0, "Index")
+
+ # use integer indexing because of possible duplicate column names
+ arrays.extend(self.iloc[:, k] for k in range(len(self.columns)))
+
+ # Python 3 supports at most 255 arguments to constructor, and
+ # things get slow with this many fields in Python 2
+ if name is not None and len(self.columns) + index < 256:
+ # `rename` is unsupported in Python 2.6
+ try:
+ itertuple = collections.namedtuple(name, fields, rename=True)
+ return map(itertuple._make, zip(*arrays))
+
+ except Exception:
+ pass
+
+ # fallback to regular tuples
+ return zip(*arrays)
+
+ items = iteritems
+
+ def __len__(self):
+ """
+ Returns length of info axis, but here we use the index.
+ """
+ return len(self.index)
+
+ def dot(self, other):
+ """
+        Compute the matrix multiplication between the DataFrame and other.
+
+        This method computes the matrix product between the DataFrame and the
+        values of another Series, DataFrame or a numpy array.
+
+ It can also be called using ``self @ other`` in Python >= 3.5.
+
+ Parameters
+ ----------
+ other : Series, DataFrame or array-like
+ The other object to compute the matrix product with.
+
+ Returns
+ -------
+ Series or DataFrame
+            If other is a Series, return the matrix product between self and
+            other as a Series. If other is a DataFrame or a numpy.array,
+            return the matrix product of self and other in a DataFrame or a
+            np.array.
+
+ See Also
+ --------
+ Series.dot: Similar method for Series.
+
+ Notes
+ -----
+ The dimensions of DataFrame and other must be compatible in order to
+ compute the matrix multiplication.
+
+ The dot method for Series computes the inner product, instead of the
+ matrix product here.
+
+ Examples
+ --------
+ Here we multiply a DataFrame with a Series.
+
+ >>> df = pd.DataFrame([[0, 1, -2, -1], [1, 1, 1, 1]])
+ >>> s = pd.Series([1, 1, 2, 1])
+ >>> df.dot(s)
+ 0 -4
+ 1 5
+ dtype: int64
+
+ Here we multiply a DataFrame with another DataFrame.
+
+ >>> other = pd.DataFrame([[0, 1], [1, 2], [-1, -1], [2, 0]])
+ >>> df.dot(other)
+ 0 1
+ 0 1 4
+ 1 2 2
+
+        Note that the dot method gives the same result as ``@``.
+
+ >>> df @ other
+ 0 1
+ 0 1 4
+ 1 2 2
+
+        The dot method also works if other is an np.array.
+
+ >>> arr = np.array([[0, 1], [1, 2], [-1, -1], [2, 0]])
+ >>> df.dot(arr)
+ 0 1
+ 0 1 4
+ 1 2 2
+ """
+ if isinstance(other, (Series, DataFrame)):
+ common = self.columns.union(other.index)
+ if (len(common) > len(self.columns) or
+ len(common) > len(other.index)):
+ raise ValueError('matrices are not aligned')
+
+ left = self.reindex(columns=common, copy=False)
+ right = other.reindex(index=common, copy=False)
+ lvals = left.values
+ rvals = right.values
+ else:
+ left = self
+ lvals = self.values
+ rvals = np.asarray(other)
+ if lvals.shape[1] != rvals.shape[0]:
+ raise ValueError('Dot product shape mismatch, '
+ '{s} vs {r}'.format(s=lvals.shape,
+ r=rvals.shape))
+
+ if isinstance(other, DataFrame):
+ return self._constructor(np.dot(lvals, rvals), index=left.index,
+ columns=other.columns)
+ elif isinstance(other, Series):
+ return Series(np.dot(lvals, rvals), index=left.index)
+ elif isinstance(rvals, (np.ndarray, Index)):
+ result = np.dot(lvals, rvals)
+ if result.ndim == 2:
+ return self._constructor(result, index=left.index)
+ else:
+ return Series(result, index=left.index)
+ else: # pragma: no cover
+ raise TypeError('unsupported type: {oth}'.format(oth=type(other)))
+
+ def __matmul__(self, other):
+ """
+ Matrix multiplication using binary `@` operator in Python>=3.5.
+ """
+ return self.dot(other)
+
+ def __rmatmul__(self, other):
+ """
+ Matrix multiplication using binary `@` operator in Python>=3.5.
+ """
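+        # other @ self is evaluated as (self.T @ other.T).T, using the
+        # identity (A @ B).T == B.T @ A.T, so the work is delegated to dot.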
+ return self.T.dot(np.transpose(other)).T
+
+ # ----------------------------------------------------------------------
+ # IO methods (to / from other formats)
+
+ @classmethod
+ def from_dict(cls, data, orient='columns', dtype=None, columns=None):
+ """
+ Construct DataFrame from dict of array-like or dicts.
+
+ Creates DataFrame object from dictionary by columns or by index
+ allowing dtype specification.
+
+ Parameters
+ ----------
+ data : dict
+ Of the form {field : array-like} or {field : dict}.
+ orient : {'columns', 'index'}, default 'columns'
+ The "orientation" of the data. If the keys of the passed dict
+ should be the columns of the resulting DataFrame, pass 'columns'
+ (default). Otherwise if the keys should be rows, pass 'index'.
+ dtype : dtype, default None
+ Data type to force, otherwise infer.
+ columns : list, default None
+ Column labels to use when ``orient='index'``. Raises a ValueError
+ if used with ``orient='columns'``.
+
+ .. versionadded:: 0.23.0
+
+ Returns
+ -------
+ pandas.DataFrame
+
+ See Also
+ --------
+ DataFrame.from_records : DataFrame from ndarray (structured
+ dtype), list of tuples, dict, or DataFrame.
+ DataFrame : DataFrame object creation using constructor.
+
+ Examples
+ --------
+ By default the keys of the dict become the DataFrame columns:
+
+ >>> data = {'col_1': [3, 2, 1, 0], 'col_2': ['a', 'b', 'c', 'd']}
+ >>> pd.DataFrame.from_dict(data)
+ col_1 col_2
+ 0 3 a
+ 1 2 b
+ 2 1 c
+ 3 0 d
+
+ Specify ``orient='index'`` to create the DataFrame using dictionary
+ keys as rows:
+
+ >>> data = {'row_1': [3, 2, 1, 0], 'row_2': ['a', 'b', 'c', 'd']}
+ >>> pd.DataFrame.from_dict(data, orient='index')
+ 0 1 2 3
+ row_1 3 2 1 0
+ row_2 a b c d
+
+ When using the 'index' orientation, the column names can be
+ specified manually:
+
+ >>> pd.DataFrame.from_dict(data, orient='index',
+ ... columns=['A', 'B', 'C', 'D'])
+ A B C D
+ row_1 3 2 1 0
+ row_2 a b c d
+ """
+ index = None
+ orient = orient.lower()
+ if orient == 'index':
+ if len(data) > 0:
+ # TODO speed up Series case
+ if isinstance(list(data.values())[0], (Series, dict)):
+ data = _from_nested_dict(data)
+ else:
+ data, index = list(data.values()), list(data.keys())
+ elif orient == 'columns':
+ if columns is not None:
+ raise ValueError("cannot use columns parameter with "
+ "orient='columns'")
+ else: # pragma: no cover
+ raise ValueError('only recognize index or columns for orient')
+
+ return cls(data, index=index, columns=columns, dtype=dtype)
+
+ def to_numpy(self, dtype=None, copy=False):
+ """
+ Convert the DataFrame to a NumPy array.
+
+ .. versionadded:: 0.24.0
+
+ By default, the dtype of the returned array will be the common NumPy
+ dtype of all types in the DataFrame. For example, if the dtypes are
+        ``float16`` and ``float32``, the resulting dtype will be ``float32``.
+ This may require copying data and coercing values, which may be
+ expensive.
+
+ Parameters
+ ----------
+ dtype : str or numpy.dtype, optional
+            The dtype to pass to :func:`numpy.asarray`.
+        copy : bool, default False
+            Whether to ensure that the returned value is not a view on
+            another array. Note that ``copy=False`` does not *ensure* that
+            ``to_numpy()`` is no-copy. Rather, ``copy=True`` ensures that
+ a copy is made, even if not strictly necessary.
+
+ Returns
+ -------
+ array : numpy.ndarray
+
+ See Also
+ --------
+ Series.to_numpy : Similar method for Series.
+
+ Examples
+ --------
+ >>> pd.DataFrame({"A": [1, 2], "B": [3, 4]}).to_numpy()
+ array([[1, 3],
+ [2, 4]])
+
+        With heterogeneous data, the lowest common type will have to
+ be used.
+
+ >>> df = pd.DataFrame({"A": [1, 2], "B": [3.0, 4.5]})
+ >>> df.to_numpy()
+ array([[1. , 3. ],
+ [2. , 4.5]])
+
+ For a mix of numeric and non-numeric types, the output array will
+ have object dtype.
+
+ >>> df['C'] = pd.date_range('2000', periods=2)
+ >>> df.to_numpy()
+ array([[1, 3.0, Timestamp('2000-01-01 00:00:00')],
+ [2, 4.5, Timestamp('2000-01-02 00:00:00')]], dtype=object)
+ """
+ result = np.array(self.values, dtype=dtype, copy=copy)
+ return result
+
+ def to_dict(self, orient='dict', into=dict):
+ """
+ Convert the DataFrame to a dictionary.
+
+ The type of the key-value pairs can be customized with the parameters
+ (see below).
+
+ Parameters
+ ----------
+ orient : str {'dict', 'list', 'series', 'split', 'records', 'index'}
+ Determines the type of the values of the dictionary.
+
+ - 'dict' (default) : dict like {column -> {index -> value}}
+ - 'list' : dict like {column -> [values]}
+ - 'series' : dict like {column -> Series(values)}
+ - 'split' : dict like
+ {'index' -> [index], 'columns' -> [columns], 'data' -> [values]}
+ - 'records' : list like
+ [{column -> value}, ... , {column -> value}]
+ - 'index' : dict like {index -> {column -> value}}
+
+ Abbreviations are allowed. `s` indicates `series` and `sp`
+ indicates `split`.
+
+ into : class, default dict
+ The collections.Mapping subclass used for all Mappings
+ in the return value. Can be the actual class or an empty
+ instance of the mapping type you want. If you want a
+ collections.defaultdict, you must pass it initialized.
+
+ .. versionadded:: 0.21.0
+
+ Returns
+ -------
+ dict, list or collections.Mapping
+ Return a collections.Mapping object representing the DataFrame.
+ The resulting transformation depends on the `orient` parameter.
+
+ See Also
+ --------
+ DataFrame.from_dict: Create a DataFrame from a dictionary.
+ DataFrame.to_json: Convert a DataFrame to JSON format.
+
+ Examples
+ --------
+ >>> df = pd.DataFrame({'col1': [1, 2],
+ ... 'col2': [0.5, 0.75]},
+ ... index=['row1', 'row2'])
+ >>> df
+ col1 col2
+ row1 1 0.50
+ row2 2 0.75
+ >>> df.to_dict()
+ {'col1': {'row1': 1, 'row2': 2}, 'col2': {'row1': 0.5, 'row2': 0.75}}
+
+ You can specify the return orientation.
+
+ >>> df.to_dict('series')
+ {'col1': row1 1
+ row2 2
+ Name: col1, dtype: int64,
+ 'col2': row1 0.50
+ row2 0.75
+ Name: col2, dtype: float64}
+
+ >>> df.to_dict('split')
+ {'index': ['row1', 'row2'], 'columns': ['col1', 'col2'],
+ 'data': [[1, 0.5], [2, 0.75]]}
+
+ >>> df.to_dict('records')
+ [{'col1': 1, 'col2': 0.5}, {'col1': 2, 'col2': 0.75}]
+
+ >>> df.to_dict('index')
+ {'row1': {'col1': 1, 'col2': 0.5}, 'row2': {'col1': 2, 'col2': 0.75}}
+
+ You can also specify the mapping type.
+
+ >>> from collections import OrderedDict, defaultdict
+ >>> df.to_dict(into=OrderedDict)
+ OrderedDict([('col1', OrderedDict([('row1', 1), ('row2', 2)])),
+ ('col2', OrderedDict([('row1', 0.5), ('row2', 0.75)]))])
+
+ If you want a `defaultdict`, you need to initialize it:
+
+ >>> dd = defaultdict(list)
+ >>> df.to_dict('records', into=dd)
+ [defaultdict(<class 'list'>, {'col1': 1, 'col2': 0.5}),
+ defaultdict(<class 'list'>, {'col1': 2, 'col2': 0.75})]
+ """
+ if not self.columns.is_unique:
+ warnings.warn("DataFrame columns are not unique, some "
+ "columns will be omitted.", UserWarning,
+ stacklevel=2)
+ # GH16122
+ into_c = com.standardize_mapping(into)
+ if orient.lower().startswith('d'):
+ return into_c(
+ (k, v.to_dict(into)) for k, v in compat.iteritems(self))
+ elif orient.lower().startswith('l'):
+ return into_c((k, v.tolist()) for k, v in compat.iteritems(self))
+ elif orient.lower().startswith('sp'):
+ return into_c((('index', self.index.tolist()),
+ ('columns', self.columns.tolist()),
+ ('data', [
+ list(map(com.maybe_box_datetimelike, t))
+ for t in self.itertuples(index=False, name=None)
+ ])))
+ elif orient.lower().startswith('s'):
+ return into_c((k, com.maybe_box_datetimelike(v))
+ for k, v in compat.iteritems(self))
+ elif orient.lower().startswith('r'):
+ columns = self.columns.tolist()
+ rows = (dict(zip(columns, row))
+ for row in self.itertuples(index=False, name=None))
+ return [
+ into_c((k, com.maybe_box_datetimelike(v))
+ for k, v in compat.iteritems(row))
+ for row in rows]
+ elif orient.lower().startswith('i'):
+ if not self.index.is_unique:
+ raise ValueError(
+ "DataFrame index must be unique for orient='index'."
+ )
+ return into_c((t[0], dict(zip(self.columns, t[1:])))
+ for t in self.itertuples(name=None))
+ else:
+ raise ValueError("orient '{o}' not understood".format(o=orient))
+
+ def to_gbq(self, destination_table, project_id=None, chunksize=None,
+ reauth=False, if_exists='fail', auth_local_webserver=False,
+ table_schema=None, location=None, progress_bar=True,
+ credentials=None, verbose=None, private_key=None):
+ """
+ Write a DataFrame to a Google BigQuery table.
+
+ This function requires the `pandas-gbq package
+ <https://pandas-gbq.readthedocs.io>`__.
+
+ See the `How to authenticate with Google BigQuery
+ <https://pandas-gbq.readthedocs.io/en/latest/howto/authentication.html>`__
+ guide for authentication instructions.
+
+ Parameters
+ ----------
+ destination_table : str
+ Name of table to be written, in the form ``dataset.tablename``.
+ project_id : str, optional
+ Google BigQuery Account project ID. Optional when available from
+ the environment.
+ chunksize : int, optional
+ Number of rows to be inserted in each chunk from the dataframe.
+ Set to ``None`` to load the whole dataframe at once.
+ reauth : bool, default False
+ Force Google BigQuery to re-authenticate the user. This is useful
+ if multiple accounts are used.
+ if_exists : str, default 'fail'
+ Behavior when the destination table exists. Value can be one of:
+
+ ``'fail'``
+                If table exists, raise an error.
+ ``'replace'``
+ If table exists, drop it, recreate it, and insert data.
+ ``'append'``
+ If table exists, insert data. Create if does not exist.
+ auth_local_webserver : bool, default False
+ Use the `local webserver flow`_ instead of the `console flow`_
+ when getting user credentials.
+
+ .. _local webserver flow:
+ http://google-auth-oauthlib.readthedocs.io/en/latest/reference/google_auth_oauthlib.flow.html#google_auth_oauthlib.flow.InstalledAppFlow.run_local_server
+ .. _console flow:
+ http://google-auth-oauthlib.readthedocs.io/en/latest/reference/google_auth_oauthlib.flow.html#google_auth_oauthlib.flow.InstalledAppFlow.run_console
+
+ *New in version 0.2.0 of pandas-gbq*.
+ table_schema : list of dicts, optional
+            List of BigQuery table fields to which the DataFrame columns
+            conform, e.g. ``[{'name': 'col1', 'type':
+ 'STRING'},...]``. If schema is not provided, it will be
+ generated according to dtypes of DataFrame columns. See
+ BigQuery API documentation on available names of a field.
+
+ *New in version 0.3.1 of pandas-gbq*.
+ location : str, optional
+ Location where the load job should run. See the `BigQuery locations
+ documentation
+ <https://cloud.google.com/bigquery/docs/dataset-locations>`__ for a
+ list of available locations. The location must match that of the
+ target dataset.
+
+ *New in version 0.5.0 of pandas-gbq*.
+ progress_bar : bool, default True
+ Use the library `tqdm` to show the progress bar for the upload,
+ chunk by chunk.
+
+ *New in version 0.5.0 of pandas-gbq*.
+ credentials : google.auth.credentials.Credentials, optional
+ Credentials for accessing Google APIs. Use this parameter to
+ override default credentials, such as to use Compute Engine
+ :class:`google.auth.compute_engine.Credentials` or Service
+ Account :class:`google.oauth2.service_account.Credentials`
+ directly.
+
+ *New in version 0.8.0 of pandas-gbq*.
+
+ .. versionadded:: 0.24.0
+ verbose : bool, deprecated
+ Deprecated in pandas-gbq version 0.4.0. Use the `logging module
+ to adjust verbosity instead
+ <https://pandas-gbq.readthedocs.io/en/latest/intro.html#logging>`__.
+ private_key : str, deprecated
+ Deprecated in pandas-gbq version 0.8.0. Use the ``credentials``
+ parameter and
+ :func:`google.oauth2.service_account.Credentials.from_service_account_info`
+ or
+ :func:`google.oauth2.service_account.Credentials.from_service_account_file`
+ instead.
+
+ Service account private key in JSON format. Can be file path
+ or string contents. This is useful for remote server
+ authentication (eg. Jupyter/IPython notebook on remote host).
+
+ See Also
+ --------
+ pandas_gbq.to_gbq : This function in the pandas-gbq library.
+ pandas.read_gbq : Read a DataFrame from Google BigQuery.
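+
+        Examples
+        --------
+        A hedged sketch only: the table name and project ID below are
+        placeholders, and the call requires the pandas-gbq package plus
+        valid Google credentials.
+
+        >>> df = pd.DataFrame({'col1': [1, 2], 'col2': [3, 4]})
+        >>> df.to_gbq('my_dataset.my_table',
+        ...           project_id='my-project')  # doctest: +SKIP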
+ """
+ from pandas.io import gbq
+ return gbq.to_gbq(
+ self, destination_table, project_id=project_id,
+ chunksize=chunksize, reauth=reauth, if_exists=if_exists,
+ auth_local_webserver=auth_local_webserver,
+ table_schema=table_schema, location=location,
+ progress_bar=progress_bar, credentials=credentials,
+ verbose=verbose, private_key=private_key)
+
+ @classmethod
+ def from_records(cls, data, index=None, exclude=None, columns=None,
+ coerce_float=False, nrows=None):
+ """
+ Convert structured or record ndarray to DataFrame.
+
+ Parameters
+ ----------
+ data : ndarray (structured dtype), list of tuples, dict, or DataFrame
+ index : string, list of fields, array-like
+ Field of array to use as the index, alternately a specific set of
+ input labels to use
+ exclude : sequence, default None
+ Columns or fields to exclude
+ columns : sequence, default None
+ Column names to use. If the passed data do not have names
+ associated with them, this argument provides names for the
+ columns. Otherwise this argument indicates the order of the columns
+ in the result (any names not found in the data will become all-NA
+ columns)
+ coerce_float : boolean, default False
+ Attempt to convert values of non-string, non-numeric objects (like
+ decimal.Decimal) to floating point, useful for SQL result sets
+ nrows : int, default None
+ Number of rows to read if data is an iterator
+
+ Returns
+ -------
+ df : DataFrame
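+
+        Examples
+        --------
+        A small illustrative example using a list of tuples with explicit
+        column names:
+
+        >>> pd.DataFrame.from_records([(1, 'a'), (2, 'b')],
+        ...                           columns=['x', 'y'])
+           x  y
+        0  1  a
+        1  2  b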
+ """
+
+ # Make a copy of the input columns so we can modify it
+ if columns is not None:
+ columns = ensure_index(columns)
+
+ if is_iterator(data):
+ if nrows == 0:
+ return cls()
+
+ try:
+ first_row = next(data)
+ except StopIteration:
+ return cls(index=index, columns=columns)
+
+ dtype = None
+ if hasattr(first_row, 'dtype') and first_row.dtype.names:
+ dtype = first_row.dtype
+
+ values = [first_row]
+
+ if nrows is None:
+ values += data
+ else:
+ values.extend(itertools.islice(data, nrows - 1))
+
+ if dtype is not None:
+ data = np.array(values, dtype=dtype)
+ else:
+ data = values
+
+ if isinstance(data, dict):
+ if columns is None:
+ columns = arr_columns = ensure_index(sorted(data))
+ arrays = [data[k] for k in columns]
+ else:
+ arrays = []
+ arr_columns = []
+ for k, v in compat.iteritems(data):
+ if k in columns:
+ arr_columns.append(k)
+ arrays.append(v)
+
+ arrays, arr_columns = reorder_arrays(arrays, arr_columns,
+ columns)
+
+ elif isinstance(data, (np.ndarray, DataFrame)):
+ arrays, columns = to_arrays(data, columns)
+ if columns is not None:
+ columns = ensure_index(columns)
+ arr_columns = columns
+ else:
+ arrays, arr_columns = to_arrays(data, columns,
+ coerce_float=coerce_float)
+
+ arr_columns = ensure_index(arr_columns)
+ if columns is not None:
+ columns = ensure_index(columns)
+ else:
+ columns = arr_columns
+
+ if exclude is None:
+ exclude = set()
+ else:
+ exclude = set(exclude)
+
+ result_index = None
+ if index is not None:
+ if (isinstance(index, compat.string_types) or
+ not hasattr(index, "__iter__")):
+ i = columns.get_loc(index)
+ exclude.add(index)
+ if len(arrays) > 0:
+ result_index = Index(arrays[i], name=index)
+ else:
+ result_index = Index([], name=index)
+ else:
+ try:
+ to_remove = [arr_columns.get_loc(field) for field in index]
+ index_data = [arrays[i] for i in to_remove]
+ result_index = ensure_index_from_sequences(index_data,
+ names=index)
+
+ exclude.update(index)
+ except Exception:
+ result_index = index
+
+ if any(exclude):
+ arr_exclude = [x for x in exclude if x in arr_columns]
+ to_remove = [arr_columns.get_loc(col) for col in arr_exclude]
+ arrays = [v for i, v in enumerate(arrays) if i not in to_remove]
+
+ arr_columns = arr_columns.drop(arr_exclude)
+ columns = columns.drop(exclude)
+
+ mgr = arrays_to_mgr(arrays, arr_columns, result_index, columns)
+
+ return cls(mgr)
+
+ def to_records(self, index=True, convert_datetime64=None,
+ column_dtypes=None, index_dtypes=None):
+ """
+ Convert DataFrame to a NumPy record array.
+
+ Index will be included as the first field of the record array if
+ requested.
+
+ Parameters
+ ----------
+ index : bool, default True
+ Include index in resulting record array, stored in 'index'
+ field or using the index label, if set.
+ convert_datetime64 : bool, default None
+ .. deprecated:: 0.23.0
+
+ Whether to convert the index to datetime.datetime if it is a
+ DatetimeIndex.
+ column_dtypes : str, type, dict, default None
+ .. versionadded:: 0.24.0
+
+ If a string or type, the data type to store all columns. If
+ a dictionary, a mapping of column names and indices (zero-indexed)
+ to specific data types.
+ index_dtypes : str, type, dict, default None
+ .. versionadded:: 0.24.0
+
+ If a string or type, the data type to store all index levels. If
+ a dictionary, a mapping of index level names and indices
+ (zero-indexed) to specific data types.
+
+ This mapping is applied only if `index=True`.
+
+ Returns
+ -------
+ numpy.recarray
+ NumPy ndarray with the DataFrame labels as fields and each row
+ of the DataFrame as entries.
+
+ See Also
+ --------
+ DataFrame.from_records: Convert structured or record ndarray
+ to DataFrame.
+ numpy.recarray: An ndarray that allows field access using
+ attributes, analogous to typed columns in a
+ spreadsheet.
+
+ Examples
+ --------
+ >>> df = pd.DataFrame({'A': [1, 2], 'B': [0.5, 0.75]},
+ ... index=['a', 'b'])
+ >>> df
+ A B
+ a 1 0.50
+ b 2 0.75
+ >>> df.to_records()
+ rec.array([('a', 1, 0.5 ), ('b', 2, 0.75)],
+ dtype=[('index', 'O'), ('A', '<i8'), ('B', '<f8')])
+
+ If the DataFrame index has no label then the recarray field name
+ is set to 'index'. If the index has a label then this is used as the
+ field name:
+
+ >>> df.index = df.index.rename("I")
+ >>> df.to_records()
+ rec.array([('a', 1, 0.5 ), ('b', 2, 0.75)],
+ dtype=[('I', 'O'), ('A', '<i8'), ('B', '<f8')])
+
+ The index can be excluded from the record array:
+
+ >>> df.to_records(index=False)
+ rec.array([(1, 0.5 ), (2, 0.75)],
+ dtype=[('A', '<i8'), ('B', '<f8')])
+
+ Data types can be specified for the columns:
+
+ >>> df.to_records(column_dtypes={"A": "int32"})
+ rec.array([('a', 1, 0.5 ), ('b', 2, 0.75)],
+ dtype=[('I', 'O'), ('A', '<i4'), ('B', '<f8')])
+
+ As well as for the index:
+
+ >>> df.to_records(index_dtypes="<S2")
+ rec.array([(b'a', 1, 0.5 ), (b'b', 2, 0.75)],
+ dtype=[('I', 'S2'), ('A', '<i8'), ('B', '<f8')])
+
+ >>> index_dtypes = "<S{}".format(df.index.str.len().max())
+ >>> df.to_records(index_dtypes=index_dtypes)
+ rec.array([(b'a', 1, 0.5 ), (b'b', 2, 0.75)],
+ dtype=[('I', 'S1'), ('A', '<i8'), ('B', '<f8')])
+ """
+
+ if convert_datetime64 is not None:
+ warnings.warn("The 'convert_datetime64' parameter is "
+ "deprecated and will be removed in a future "
+ "version",
+ FutureWarning, stacklevel=2)
+
+ if index:
+ if is_datetime64_any_dtype(self.index) and convert_datetime64:
+ ix_vals = [self.index.to_pydatetime()]
+ else:
+ if isinstance(self.index, MultiIndex):
+ # array of tuples to numpy cols. copy copy copy
+ ix_vals = lmap(np.array, zip(*self.index.values))
+ else:
+ ix_vals = [self.index.values]
+
+ arrays = ix_vals + [self[c].get_values() for c in self.columns]
+
+ count = 0
+ index_names = list(self.index.names)
+
+ if isinstance(self.index, MultiIndex):
+ for i, n in enumerate(index_names):
+ if n is None:
+ index_names[i] = 'level_%d' % count
+ count += 1
+ elif index_names[0] is None:
+ index_names = ['index']
+
+ names = (lmap(compat.text_type, index_names) +
+ lmap(compat.text_type, self.columns))
+ else:
+ arrays = [self[c].get_values() for c in self.columns]
+ names = lmap(compat.text_type, self.columns)
+ index_names = []
+
+ index_len = len(index_names)
+ formats = []
+
+ for i, v in enumerate(arrays):
+ index = i
+
+ # When the names and arrays are collected, we
+ # first collect those in the DataFrame's index,
+ # followed by those in its columns.
+ #
+ # Thus, the total length of the array is:
+ # len(index_names) + len(DataFrame.columns).
+ #
+ # This check allows us to see whether we are
+ # handling a name / array in the index or column.
+ if index < index_len:
+ dtype_mapping = index_dtypes
+ name = index_names[index]
+ else:
+ index -= index_len
+ dtype_mapping = column_dtypes
+ name = self.columns[index]
+
+ # We have a dictionary, so we get the data type
+ # associated with the index or column (which can
+ # be denoted by its name in the DataFrame or its
+ # position in DataFrame's array of indices or
+ # columns, whichever is applicable.
+ if is_dict_like(dtype_mapping):
+ if name in dtype_mapping:
+ dtype_mapping = dtype_mapping[name]
+ elif index in dtype_mapping:
+ dtype_mapping = dtype_mapping[index]
+ else:
+ dtype_mapping = None
+
+ # If no mapping can be found, use the array's
+ # dtype attribute for formatting.
+ #
+ # A valid dtype must either be a type or
+ # string naming a type.
+ if dtype_mapping is None:
+ formats.append(v.dtype)
+ elif isinstance(dtype_mapping, (type, compat.string_types)):
+ formats.append(dtype_mapping)
+ else:
+ element = "row" if i < index_len else "column"
+ msg = ("Invalid dtype {dtype} specified for "
+ "{element} {name}").format(dtype=dtype_mapping,
+ element=element, name=name)
+ raise ValueError(msg)
+
+ return np.rec.fromarrays(
+ arrays,
+ dtype={'names': names, 'formats': formats}
+ )
+
+ @classmethod
+ def from_items(cls, items, columns=None, orient='columns'):
+ """
+ Construct a DataFrame from a list of tuples.
+
+ .. deprecated:: 0.23.0
+ `from_items` is deprecated and will be removed in a future version.
+ Use :meth:`DataFrame.from_dict(dict(items)) <DataFrame.from_dict>`
+ instead.
+ :meth:`DataFrame.from_dict(OrderedDict(items)) <DataFrame.from_dict>`
+ may be used to preserve the key order.
+
+ Convert (key, value) pairs to DataFrame. The keys will be the axis
+ index (usually the columns, but depends on the specified
+ orientation). The values should be arrays or Series.
+
+ Parameters
+ ----------
+ items : sequence of (key, value) pairs
+ Values should be arrays or Series.
+ columns : sequence of column labels, optional
+ Must be passed if orient='index'.
+ orient : {'columns', 'index'}, default 'columns'
+ The "orientation" of the data. If the keys of the
+ input correspond to column labels, pass 'columns'
+ (default). Otherwise if the keys correspond to the index,
+ pass 'index'.
+
+ Returns
+ -------
+ frame : DataFrame
+ """
+
+ warnings.warn("from_items is deprecated. Please use "
+ "DataFrame.from_dict(dict(items), ...) instead. "
+ "DataFrame.from_dict(OrderedDict(items)) may be used to "
+ "preserve the key order.",
+ FutureWarning, stacklevel=2)
+
+ keys, values = lzip(*items)
+
+ if orient == 'columns':
+ if columns is not None:
+ columns = ensure_index(columns)
+
+ idict = dict(items)
+ if len(idict) < len(items):
+ if not columns.equals(ensure_index(keys)):
+ raise ValueError('With non-unique item names, passed '
+ 'columns must be identical')
+ arrays = values
+ else:
+ arrays = [idict[k] for k in columns if k in idict]
+ else:
+ columns = ensure_index(keys)
+ arrays = values
+
+ # GH 17312
+ # Provide more informative error msg when scalar values passed
+ try:
+ return cls._from_arrays(arrays, columns, None)
+
+ except ValueError:
+ if not is_nested_list_like(values):
+ raise ValueError('The value in each (key, value) pair '
+ 'must be an array, Series, or dict')
+
+ elif orient == 'index':
+ if columns is None:
+ raise TypeError("Must pass columns with orient='index'")
+
+ keys = ensure_index(keys)
+
+ # GH 17312
+ # Provide more informative error msg when scalar values passed
+ try:
+ arr = np.array(values, dtype=object).T
+ data = [lib.maybe_convert_objects(v) for v in arr]
+ return cls._from_arrays(data, columns, keys)
+
+ except TypeError:
+ if not is_nested_list_like(values):
+ raise ValueError('The value in each (key, value) pair '
+ 'must be an array, Series, or dict')
+
+ else: # pragma: no cover
+ raise ValueError("'orient' must be either 'columns' or 'index'")
+
+ @classmethod
+ def _from_arrays(cls, arrays, columns, index, dtype=None):
+ mgr = arrays_to_mgr(arrays, columns, index, columns, dtype=dtype)
+ return cls(mgr)
+
+ @classmethod
+ def from_csv(cls, path, header=0, sep=',', index_col=0, parse_dates=True,
+ encoding=None, tupleize_cols=None,
+ infer_datetime_format=False):
+ """
+ Read CSV file.
+
+ .. deprecated:: 0.21.0
+ Use :func:`pandas.read_csv` instead.
+
+ It is preferable to use the more powerful :func:`pandas.read_csv`
+ for most general purposes, but ``from_csv`` makes for an easy
+ roundtrip to and from a file (the exact counterpart of
+ ``to_csv``), especially with a DataFrame of time series data.
+
+ This method only differs from the preferred :func:`pandas.read_csv`
+ in some defaults:
+
+ - `index_col` is ``0`` instead of ``None`` (take first column as index
+ by default)
+ - `parse_dates` is ``True`` instead of ``False`` (try parsing the index
+ as datetime by default)
+
+ So a ``pd.DataFrame.from_csv(path)`` can be replaced by
+ ``pd.read_csv(path, index_col=0, parse_dates=True)``.
+
+ Parameters
+ ----------
+ path : string file path or file handle / StringIO
+ header : int, default 0
+ Row to use as header (skip prior rows)
+ sep : string, default ','
+ Field delimiter
+ index_col : int or sequence, default 0
+ Column to use for index. If a sequence is given, a MultiIndex
+ is used. Different default from read_table
+ parse_dates : boolean, default True
+ Parse dates. Different default from read_table
+ tupleize_cols : boolean, default False
+            Write multi_index columns as a list of tuples (if True) or in
+            the new, expanded format (if False).
+ infer_datetime_format : boolean, default False
+ If True and `parse_dates` is True for a column, try to infer the
+ datetime format based on the first datetime string. If the format
+ can be inferred, there often will be a large parsing speed-up.
+
+ Returns
+ -------
+ y : DataFrame
+
+ See Also
+ --------
+ pandas.read_csv
+ """
+
+ warnings.warn("from_csv is deprecated. Please use read_csv(...) "
+ "instead. Note that some of the default arguments are "
+ "different, so please refer to the documentation "
+ "for from_csv when changing your function calls",
+ FutureWarning, stacklevel=2)
+
+ from pandas.io.parsers import read_csv
+ return read_csv(path, header=header, sep=sep,
+ parse_dates=parse_dates, index_col=index_col,
+ encoding=encoding, tupleize_cols=tupleize_cols,
+ infer_datetime_format=infer_datetime_format)
+
+ def to_sparse(self, fill_value=None, kind='block'):
+ """
+ Convert to SparseDataFrame.
+
+        Implement the sparse version of the DataFrame, meaning that any data
+        matching a specific value is omitted from the representation.
+        The sparse DataFrame allows for more efficient storage.
+
+ Parameters
+ ----------
+ fill_value : float, default None
+ The specific value that should be omitted in the representation.
+ kind : {'block', 'integer'}, default 'block'
+ The kind of the SparseIndex tracking where data is not equal to
+ the fill value:
+
+ - 'block' tracks only the locations and sizes of blocks of data.
+ - 'integer' keeps an array with all the locations of the data.
+
+ In most cases 'block' is recommended, since it's more memory
+ efficient.
+
+ Returns
+ -------
+ SparseDataFrame
+ The sparse representation of the DataFrame.
+
+ See Also
+ --------
+ DataFrame.to_dense :
+            Converts the DataFrame back to its dense form.
+
+ Examples
+ --------
+ >>> df = pd.DataFrame([(np.nan, np.nan),
+ ... (1., np.nan),
+ ... (np.nan, 1.)])
+ >>> df
+ 0 1
+ 0 NaN NaN
+ 1 1.0 NaN
+ 2 NaN 1.0
+ >>> type(df)
+ <class 'pandas.core.frame.DataFrame'>
+
+ >>> sdf = df.to_sparse()
+ >>> sdf
+ 0 1
+ 0 NaN NaN
+ 1 1.0 NaN
+ 2 NaN 1.0
+ >>> type(sdf)
+ <class 'pandas.core.sparse.frame.SparseDataFrame'>
+ """
+ from pandas.core.sparse.api import SparseDataFrame
+ return SparseDataFrame(self._series, index=self.index,
+ columns=self.columns, default_kind=kind,
+ default_fill_value=fill_value)
+
+ def to_panel(self):
+ """
+ Transform long (stacked) format (DataFrame) into wide (3D, Panel)
+ format.
+
+ .. deprecated:: 0.20.0
+
+ Currently the index of the DataFrame must be a 2-level MultiIndex. This
+        may be generalized later.
+
+ Returns
+ -------
+ panel : Panel
+ """
+ # only support this kind for now
+ if (not isinstance(self.index, MultiIndex) or # pragma: no cover
+ len(self.index.levels) != 2):
+ raise NotImplementedError('Only 2-level MultiIndex are supported.')
+
+ if not self.index.is_unique:
+ raise ValueError("Can't convert non-uniquely indexed "
+ "DataFrame to Panel")
+
+ self._consolidate_inplace()
+
+ # minor axis must be sorted
+ if self.index.lexsort_depth < 2:
+ selfsorted = self.sort_index(level=0)
+ else:
+ selfsorted = self
+
+ major_axis, minor_axis = selfsorted.index.levels
+ major_codes, minor_codes = selfsorted.index.codes
+ shape = len(major_axis), len(minor_axis)
+
+ # preserve names, if any
+ major_axis = major_axis.copy()
+ major_axis.name = self.index.names[0]
+
+ minor_axis = minor_axis.copy()
+ minor_axis.name = self.index.names[1]
+
+ # create new axes
+ new_axes = [selfsorted.columns, major_axis, minor_axis]
+
+ # create new manager
+ new_mgr = selfsorted._data.reshape_nd(axes=new_axes,
+ labels=[major_codes,
+ minor_codes],
+ shape=shape,
+ ref_items=selfsorted.columns)
+
+ return self._constructor_expanddim(new_mgr)
+
+ @deprecate_kwarg(old_arg_name='encoding', new_arg_name=None)
+ def to_stata(self, fname, convert_dates=None, write_index=True,
+ encoding="latin-1", byteorder=None, time_stamp=None,
+ data_label=None, variable_labels=None, version=114,
+ convert_strl=None):
+ """
+ Export DataFrame object to Stata dta format.
+
+ Writes the DataFrame to a Stata dataset file.
+ "dta" files contain a Stata dataset.
+
+ Parameters
+ ----------
+ fname : str, buffer or path object
+ String, path object (pathlib.Path or py._path.local.LocalPath) or
+ object implementing a binary write() function. If using a buffer
+ then the buffer will not be automatically closed after the file
+ data has been written.
+ convert_dates : dict
+ Dictionary mapping columns containing datetime types to stata
+ internal format to use when writing the dates. Options are 'tc',
+ 'td', 'tm', 'tw', 'th', 'tq', 'ty'. Column can be either an integer
+ or a name. Datetime columns that do not have a conversion type
+ specified will be converted to 'tc'. Raises NotImplementedError if
+ a datetime column has timezone information.
+ write_index : bool
+ Write the index to Stata dataset.
+ encoding : str
+ Default is latin-1. Unicode is not supported.
+ byteorder : str
+            Can be ">", "<", "little", or "big". Default is `sys.byteorder`.
+ time_stamp : datetime
+ A datetime to use as file creation date. Default is the current
+ time.
+ data_label : str, optional
+ A label for the data set. Must be 80 characters or smaller.
+ variable_labels : dict
+ Dictionary containing columns as keys and variable labels as
+ values. Each label must be 80 characters or smaller.
+
+ .. versionadded:: 0.19.0
+
+ version : {114, 117}, default 114
+            Version to use in the output dta file. Version 114 can be read
+            by Stata 10 and later. Version 117 can be read by Stata 13
+ or later. Version 114 limits string variables to 244 characters or
+ fewer while 117 allows strings with lengths up to 2,000,000
+ characters.
+
+ .. versionadded:: 0.23.0
+
+ convert_strl : list, optional
+            List of column names to convert to the Stata StrL
+ format. Only available if version is 117. Storing strings in the
+ StrL format can produce smaller dta files if strings have more than
+ 8 characters and values are repeated.
+
+ .. versionadded:: 0.23.0
+
+ Raises
+ ------
+ NotImplementedError
+ * If datetimes contain timezone information
+ * Column dtype is not representable in Stata
+ ValueError
+            * Columns listed in convert_dates are neither datetime64[ns]
+              nor datetime.datetime
+ * Column listed in convert_dates is not in DataFrame
+ * Categorical label contains more than 32,000 characters
+
+ .. versionadded:: 0.19.0
+
+ See Also
+ --------
+ read_stata : Import Stata data files.
+ io.stata.StataWriter : Low-level writer for Stata data files.
+ io.stata.StataWriter117 : Low-level writer for version 117 files.
+
+ Examples
+ --------
+ >>> df = pd.DataFrame({'animal': ['falcon', 'parrot', 'falcon',
+ ... 'parrot'],
+ ... 'speed': [350, 18, 361, 15]})
+ >>> df.to_stata('animals.dta') # doctest: +SKIP
+ """
+ kwargs = {}
+ if version not in (114, 117):
+ raise ValueError('Only formats 114 and 117 supported.')
+ if version == 114:
+ if convert_strl is not None:
+ raise ValueError('strl support is only available when using '
+ 'format 117')
+ from pandas.io.stata import StataWriter as statawriter
+ else:
+ from pandas.io.stata import StataWriter117 as statawriter
+ kwargs['convert_strl'] = convert_strl
+
+ writer = statawriter(fname, self, convert_dates=convert_dates,
+ byteorder=byteorder, time_stamp=time_stamp,
+ data_label=data_label, write_index=write_index,
+ variable_labels=variable_labels, **kwargs)
+ writer.write_file()
+
+ def to_feather(self, fname):
+ """
+ Write out the binary feather-format for DataFrames.
+
+ .. versionadded:: 0.20.0
+
+ Parameters
+ ----------
+ fname : str
+ string file path
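+
+        Examples
+        --------
+        A minimal round-trip sketch; the file name is a placeholder and a
+        feather backend such as ``pyarrow`` must be installed.
+
+        >>> df = pd.DataFrame({'col1': [1, 2], 'col2': [3, 4]})
+        >>> df.to_feather('df.feather')  # doctest: +SKIP
+        >>> pd.read_feather('df.feather')  # doctest: +SKIP
+           col1  col2
+        0     1     3
+        1     2     4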
+ """
+ from pandas.io.feather_format import to_feather
+ to_feather(self, fname)
+
+ def to_parquet(self, fname, engine='auto', compression='snappy',
+ index=None, partition_cols=None, **kwargs):
+ """
+ Write a DataFrame to the binary parquet format.
+
+ .. versionadded:: 0.21.0
+
+ This function writes the dataframe as a `parquet file
+ <https://parquet.apache.org/>`_. You can choose different parquet
+ backends, and have the option of compression. See
+ :ref:`the user guide <io.parquet>` for more details.
+
+ Parameters
+ ----------
+ fname : str
+ File path or Root Directory path. Will be used as Root Directory
+ path while writing a partitioned dataset.
+
+ .. versionchanged:: 0.24.0
+
+ engine : {'auto', 'pyarrow', 'fastparquet'}, default 'auto'
+ Parquet library to use. If 'auto', then the option
+ ``io.parquet.engine`` is used. The default ``io.parquet.engine``
+ behavior is to try 'pyarrow', falling back to 'fastparquet' if
+ 'pyarrow' is unavailable.
+ compression : {'snappy', 'gzip', 'brotli', None}, default 'snappy'
+ Name of the compression to use. Use ``None`` for no compression.
+ index : bool, default None
+ If ``True``, include the dataframe's index(es) in the file output.
+ If ``False``, they will not be written to the file. If ``None``,
+ the behavior depends on the chosen engine.
+
+ .. versionadded:: 0.24.0
+
+ partition_cols : list, optional, default None
+            Column names by which to partition the dataset.
+            Columns are partitioned in the order they are given.
+
+ .. versionadded:: 0.24.0
+
+ **kwargs
+ Additional arguments passed to the parquet library. See
+ :ref:`pandas io <io.parquet>` for more details.
+
+ See Also
+ --------
+ read_parquet : Read a parquet file.
+ DataFrame.to_csv : Write a csv file.
+ DataFrame.to_sql : Write to a sql table.
+ DataFrame.to_hdf : Write to hdf.
+
+ Notes
+ -----
+ This function requires either the `fastparquet
+ <https://pypi.org/project/fastparquet>`_ or `pyarrow
+ <https://arrow.apache.org/docs/python/>`_ library.
+
+ Examples
+ --------
+ >>> df = pd.DataFrame(data={'col1': [1, 2], 'col2': [3, 4]})
+ >>> df.to_parquet('df.parquet.gzip',
+ ... compression='gzip') # doctest: +SKIP
+ >>> pd.read_parquet('df.parquet.gzip') # doctest: +SKIP
+ col1 col2
+ 0 1 3
+ 1 2 4
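+
+ Writing a partitioned dataset places the data in a directory, with
+ one subdirectory per value of each partition column (a minimal
+ sketch; the output path is illustrative):
+
+ >>> df.to_parquet('df_partitioned',
+ ...               partition_cols=['col1'])  # doctest: +SKIP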
+ """
+ from pandas.io.parquet import to_parquet
+ to_parquet(self, fname, engine,
+ compression=compression, index=index,
+ partition_cols=partition_cols, **kwargs)
+
+ @Substitution(header='Whether to print column labels, default True')
+ @Substitution(shared_params=fmt.common_docstring,
+ returns=fmt.return_docstring)
+ def to_html(self, buf=None, columns=None, col_space=None, header=True,
+ index=True, na_rep='NaN', formatters=None, float_format=None,
+ sparsify=None, index_names=True, justify=None, max_rows=None,
+ max_cols=None, show_dimensions=False, decimal='.',
+ bold_rows=True, classes=None, escape=True, notebook=False,
+ border=None, table_id=None, render_links=False):
+ """
+ Render a DataFrame as an HTML table.
+ %(shared_params)s
+ bold_rows : bool, default True
+ Make the row labels bold in the output.
+ classes : str or list or tuple, default None
+ CSS class(es) to apply to the resulting html table.
+ escape : bool, default True
+ Convert the characters <, >, and & to HTML-safe sequences.
+ notebook : {True, False}, default False
+ Whether the generated HTML is for IPython Notebook.
+ border : int
+ A ``border=border`` attribute is included in the opening
+ `<table>` tag. Default ``pd.options.html.border``.
+
+ .. versionadded:: 0.19.0
+
+ table_id : str, optional
+ A CSS id is included in the opening `<table>` tag if specified.
+
+ .. versionadded:: 0.23.0
+
+ render_links : bool, default False
+ Convert URLs to HTML links.
+
+ .. versionadded:: 0.24.0
+
+ %(returns)s
+ See Also
+ --------
+ to_string : Convert DataFrame to a string.
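+
+ Examples
+ --------
+ A minimal sketch; when ``buf`` is None the rendered HTML is returned
+ as a string (the column name is illustrative):
+
+ >>> df = pd.DataFrame({'col1': [1, 2]})
+ >>> html = df.to_html(index=False)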
+ """
+
+ if (justify is not None and
+ justify not in fmt._VALID_JUSTIFY_PARAMETERS):
+ raise ValueError("Invalid value for justify parameter")
+
+ formatter = fmt.DataFrameFormatter(self, buf=buf, columns=columns,
+ col_space=col_space, na_rep=na_rep,
+ formatters=formatters,
+ float_format=float_format,
+ sparsify=sparsify, justify=justify,
+ index_names=index_names,
+ header=header, index=index,
+ bold_rows=bold_rows, escape=escape,
+ max_rows=max_rows,
+ max_cols=max_cols,
+ show_dimensions=show_dimensions,
+ decimal=decimal, table_id=table_id,
+ render_links=render_links)
+ # TODO: a generic formatter would be in DataFrameFormatter
+ formatter.to_html(classes=classes, notebook=notebook, border=border)
+
+ if buf is None:
+ return formatter.buf.getvalue()
+
+ # ----------------------------------------------------------------------
+
+ def info(self, verbose=None, buf=None, max_cols=None, memory_usage=None,
+ null_counts=None):
+ """
+ Print a concise summary of a DataFrame.
+
+ This method prints information about a DataFrame including
+ the index dtype and column dtypes, non-null values and memory usage.
+
+ Parameters
+ ----------
+ verbose : bool, optional
+ Whether to print the full summary. By default, the setting in
+ ``pandas.options.display.max_info_columns`` is followed.
+ buf : writable buffer, defaults to sys.stdout
+ Where to send the output. By default, the output is printed to
+ sys.stdout. Pass a writable buffer if you need to further process
+ the output.
+ max_cols : int, optional
+ When to switch from the verbose to the truncated output. If the
+ DataFrame has more than `max_cols` columns, the truncated output
+ is used. By default, the setting in
+ ``pandas.options.display.max_info_columns`` is used.
+ memory_usage : bool, str, optional
+ Specifies whether total memory usage of the DataFrame
+ elements (including the index) should be displayed. By default,
+ this follows the ``pandas.options.display.memory_usage`` setting.
+
+ True always shows memory usage. False never shows memory usage.
+ A value of 'deep' is equivalent to "True with deep introspection".
+ Memory usage is shown in human-readable units (base-2
+ representation). Without deep introspection, a memory estimation is
+ made based on column dtype and number of rows, assuming values
+ consume the same memory amount for corresponding dtypes. With deep
+ memory introspection, a real memory usage calculation is performed
+ at the cost of computational resources.
+ null_counts : bool, optional
+ Whether to show the non-null counts. By default, this is shown
+ only if the frame is smaller than
+ ``pandas.options.display.max_info_rows`` and
+ ``pandas.options.display.max_info_columns``. A value of True always
+ shows the counts, and False never shows the counts.
+
+ Returns
+ -------
+ None
+ This method prints a summary of a DataFrame and returns None.
+
+ See Also
+ --------
+ DataFrame.describe: Generate descriptive statistics of DataFrame
+ columns.
+ DataFrame.memory_usage: Memory usage of DataFrame columns.
+
+ Examples
+ --------
+ >>> int_values = [1, 2, 3, 4, 5]
+ >>> text_values = ['alpha', 'beta', 'gamma', 'delta', 'epsilon']
+ >>> float_values = [0.0, 0.25, 0.5, 0.75, 1.0]
+ >>> df = pd.DataFrame({"int_col": int_values, "text_col": text_values,
+ ... "float_col": float_values})
+ >>> df
+ int_col text_col float_col
+ 0 1 alpha 0.00
+ 1 2 beta 0.25
+ 2 3 gamma 0.50
+ 3 4 delta 0.75
+ 4 5 epsilon 1.00
+
+ Prints information of all columns:
+
+ >>> df.info(verbose=True)
+ <class 'pandas.core.frame.DataFrame'>
+ RangeIndex: 5 entries, 0 to 4
+ Data columns (total 3 columns):
+ int_col 5 non-null int64
+ text_col 5 non-null object
+ float_col 5 non-null float64
+ dtypes: float64(1), int64(1), object(1)
+ memory usage: 200.0+ bytes
+
+ Prints a summary of the column count and dtypes but not per-column
+ information:
+
+ >>> df.info(verbose=False)
+ <class 'pandas.core.frame.DataFrame'>
+ RangeIndex: 5 entries, 0 to 4
+ Columns: 3 entries, int_col to float_col
+ dtypes: float64(1), int64(1), object(1)
+ memory usage: 200.0+ bytes
+
+ Pipe the output of DataFrame.info to a buffer instead of sys.stdout,
+ get the buffer content, and write it to a text file:
+
+ >>> import io
+ >>> buffer = io.StringIO()
+ >>> df.info(buf=buffer)
+ >>> s = buffer.getvalue()
+ >>> with open("df_info.txt", "w",
+ ... encoding="utf-8") as f: # doctest: +SKIP
+ ... f.write(s)
+ 260
+
+ The `memory_usage` parameter allows deep introspection mode, which is
+ especially useful for big DataFrames and for fine-tuning memory
+ optimization:
+
+ >>> random_strings_array = np.random.choice(['a', 'b', 'c'], 10 ** 6)
+ >>> df = pd.DataFrame({
+ ... 'column_1': np.random.choice(['a', 'b', 'c'], 10 ** 6),
+ ... 'column_2': np.random.choice(['a', 'b', 'c'], 10 ** 6),
+ ... 'column_3': np.random.choice(['a', 'b', 'c'], 10 ** 6)
+ ... })
+ >>> df.info()
+ <class 'pandas.core.frame.DataFrame'>
+ RangeIndex: 1000000 entries, 0 to 999999
+ Data columns (total 3 columns):
+ column_1 1000000 non-null object
+ column_2 1000000 non-null object
+ column_3 1000000 non-null object
+ dtypes: object(3)
+ memory usage: 22.9+ MB
+
+ >>> df.info(memory_usage='deep')
+ <class 'pandas.core.frame.DataFrame'>
+ RangeIndex: 1000000 entries, 0 to 999999
+ Data columns (total 3 columns):
+ column_1 1000000 non-null object
+ column_2 1000000 non-null object
+ column_3 1000000 non-null object
+ dtypes: object(3)
+ memory usage: 188.8 MB
+ """
+
+ if buf is None: # pragma: no cover
+ buf = sys.stdout
+
+ lines = []
+
+ lines.append(str(type(self)))
+ lines.append(self.index._summary())
+
+ if len(self.columns) == 0:
+ lines.append('Empty {name}'.format(name=type(self).__name__))
+ fmt.buffer_put_lines(buf, lines)
+ return
+
+ cols = self.columns
+
+ # hack
+ if max_cols is None:
+ max_cols = get_option('display.max_info_columns',
+ len(self.columns) + 1)
+
+ max_rows = get_option('display.max_info_rows', len(self) + 1)
+
+ if null_counts is None:
+ show_counts = ((len(self.columns) <= max_cols) and
+ (len(self) < max_rows))
+ else:
+ show_counts = null_counts
+ exceeds_info_cols = len(self.columns) > max_cols
+
+ def _verbose_repr():
+ lines.append('Data columns (total %d columns):' %
+ len(self.columns))
+ space = max(len(pprint_thing(k)) for k in self.columns) + 4
+ counts = None
+
+ tmpl = "{count}{dtype}"
+ if show_counts:
+ counts = self.count()
+ if len(cols) != len(counts): # pragma: no cover
+ raise AssertionError(
+ 'Columns must equal counts '
+ '({cols:d} != {counts:d})'.format(
+ cols=len(cols), counts=len(counts)))
+ tmpl = "{count} non-null {dtype}"
+
+ dtypes = self.dtypes
+ for i, col in enumerate(self.columns):
+ dtype = dtypes.iloc[i]
+ col = pprint_thing(col)
+
+ count = ""
+ if show_counts:
+ count = counts.iloc[i]
+
+ lines.append(_put_str(col, space) + tmpl.format(count=count,
+ dtype=dtype))
+
+ def _non_verbose_repr():
+ lines.append(self.columns._summary(name='Columns'))
+
+ def _sizeof_fmt(num, size_qualifier):
+ # returns size in human readable format
+ for x in ['bytes', 'KB', 'MB', 'GB', 'TB']:
+ if num < 1024.0:
+ return ("{num:3.1f}{size_q} "
+ "{x}".format(num=num, size_q=size_qualifier, x=x))
+ num /= 1024.0
+ return "{num:3.1f}{size_q} {pb}".format(num=num,
+ size_q=size_qualifier,
+ pb='PB')
+
+ if verbose:
+ _verbose_repr()
+ elif verbose is False:  # specifically set to False, not necessarily None
+ _non_verbose_repr()
+ else:
+ if exceeds_info_cols:
+ _non_verbose_repr()
+ else:
+ _verbose_repr()
+
+ counts = self.get_dtype_counts()
+ dtypes = ['{k}({kk:d})'.format(k=k[0], kk=k[1]) for k
+ in sorted(compat.iteritems(counts))]
+ lines.append('dtypes: {types}'.format(types=', '.join(dtypes)))
+
+ if memory_usage is None:
+ memory_usage = get_option('display.memory_usage')
+ if memory_usage:
+ # append memory usage of df to display
+ size_qualifier = ''
+ if memory_usage == 'deep':
+ deep = True
+ else:
+ # size_qualifier is just a best effort; not guaranteed to catch
+ # all cases (e.g., it misses categorical data even with object
+ # categories)
+ deep = False
+ if ('object' in counts or
+ self.index._is_memory_usage_qualified()):
+ size_qualifier = '+'
+ mem_usage = self.memory_usage(index=True, deep=deep).sum()
+ lines.append("memory usage: {mem}\n".format(
+ mem=_sizeof_fmt(mem_usage, size_qualifier)))
+
+ fmt.buffer_put_lines(buf, lines)
+
+ def memory_usage(self, index=True, deep=False):
+ """
+ Return the memory usage of each column in bytes.
+
+ The memory usage can optionally include the contribution of
+ the index and elements of `object` dtype.
+
+ This value is displayed in `DataFrame.info` by default. This can be
+ suppressed by setting ``pandas.options.display.memory_usage`` to False.
+
+ Parameters
+ ----------
+ index : bool, default True
+ Specifies whether to include the memory usage of the DataFrame's
+ index in the returned Series. If ``index=True``, the memory usage of
+ the index is the first item in the output.
+ deep : bool, default False
+ If True, introspect the data deeply by interrogating
+ `object` dtypes for system-level memory consumption, and include
+ it in the returned values.
+
+ Returns
+ -------
+ sizes : Series
+ A Series whose index is the original column names and whose values
+ are the memory usage of each column in bytes.
+
+ See Also
+ --------
+ numpy.ndarray.nbytes : Total bytes consumed by the elements of an
+ ndarray.
+ Series.memory_usage : Bytes consumed by a Series.
+ pandas.Categorical : Memory-efficient array for string values with
+ many repeated values.
+ DataFrame.info : Concise summary of a DataFrame.
+
+ Examples
+ --------
+ >>> dtypes = ['int64', 'float64', 'complex128', 'object', 'bool']
+ >>> data = dict([(t, np.ones(shape=5000).astype(t))
+ ... for t in dtypes])
+ >>> df = pd.DataFrame(data)
+ >>> df.head()
+ int64 float64 complex128 object bool
+ 0 1 1.0 (1+0j) 1 True
+ 1 1 1.0 (1+0j) 1 True
+ 2 1 1.0 (1+0j) 1 True
+ 3 1 1.0 (1+0j) 1 True
+ 4 1 1.0 (1+0j) 1 True
+
+ >>> df.memory_usage()
+ Index 80
+ int64 40000
+ float64 40000
+ complex128 80000
+ object 40000
+ bool 5000
+ dtype: int64
+
+ >>> df.memory_usage(index=False)
+ int64 40000
+ float64 40000
+ complex128 80000
+ object 40000
+ bool 5000
+ dtype: int64
+
+ The memory footprint of `object` dtype columns is ignored by default:
+
+ >>> df.memory_usage(deep=True)
+ Index 80
+ int64 40000
+ float64 40000
+ complex128 80000
+ object 160000
+ bool 5000
+ dtype: int64
+
+ Use a Categorical for efficient storage of an object-dtype column with
+ many repeated values.
+
+ >>> df['object'].astype('category').memory_usage(deep=True)
+ 5168
+ """
+ result = Series([c.memory_usage(index=False, deep=deep)
+ for col, c in self.iteritems()], index=self.columns)
+ if index:
+ result = Series(self.index.memory_usage(deep=deep),
+ index=['Index']).append(result)
+ return result
+
+ def transpose(self, *args, **kwargs):
+ """
+ Transpose index and columns.
+
+ Reflect the DataFrame over its main diagonal by writing rows as columns
+ and vice-versa. The property :attr:`.T` is an accessor to the method
+ :meth:`transpose`.
+
+ Parameters
+ ----------
+ copy : bool, default False
+ If True, the underlying data is copied. Otherwise (default), no
+ copy is made if possible.
+ *args, **kwargs
+ Additional keywords have no effect but might be accepted for
+ compatibility with numpy.
+
+ Returns
+ -------
+ DataFrame
+ The transposed DataFrame.
+
+ See Also
+ --------
+ numpy.transpose : Permute the dimensions of a given array.
+
+ Notes
+ -----
+ Transposing a DataFrame with mixed dtypes will result in a homogeneous
+ DataFrame with the `object` dtype. In such a case, a copy of the data
+ is always made.
+
+ Examples
+ --------
+ **Square DataFrame with homogeneous dtype**
+
+ >>> d1 = {'col1': [1, 2], 'col2': [3, 4]}
+ >>> df1 = pd.DataFrame(data=d1)
+ >>> df1
+ col1 col2
+ 0 1 3
+ 1 2 4
+
+ >>> df1_transposed = df1.T # or df1.transpose()
+ >>> df1_transposed
+ 0 1
+ col1 1 2
+ col2 3 4
+
+ When the dtype is homogeneous in the original DataFrame, we get a
+ transposed DataFrame with the same dtype:
+
+ >>> df1.dtypes
+ col1 int64
+ col2 int64
+ dtype: object
+ >>> df1_transposed.dtypes
+ 0 int64
+ 1 int64
+ dtype: object
+
+ **Non-square DataFrame with mixed dtypes**
+
+ >>> d2 = {'name': ['Alice', 'Bob'],
+ ... 'score': [9.5, 8],
+ ... 'employed': [False, True],
+ ... 'kids': [0, 0]}
+ >>> df2 = pd.DataFrame(data=d2)
+ >>> df2
+ name score employed kids
+ 0 Alice 9.5 False 0
+ 1 Bob 8.0 True 0
+
+ >>> df2_transposed = df2.T # or df2.transpose()
+ >>> df2_transposed
+ 0 1
+ name Alice Bob
+ score 9.5 8
+ employed False True
+ kids 0 0
+
+ When the DataFrame has mixed dtypes, we get a transposed DataFrame with
+ the `object` dtype:
+
+ >>> df2.dtypes
+ name object
+ score float64
+ employed bool
+ kids int64
+ dtype: object
+ >>> df2_transposed.dtypes
+ 0 object
+ 1 object
+ dtype: object
+ """
+ nv.validate_transpose(args, dict())
+ return super(DataFrame, self).transpose(1, 0, **kwargs)
+
+ T = property(transpose)
+
+ # ----------------------------------------------------------------------
+ # Picklability
+
+ # legacy pickle formats
+ def _unpickle_frame_compat(self, state): # pragma: no cover
+ if len(state) == 2: # pragma: no cover
+ series, idx = state
+ columns = sorted(series)
+ else:
+ series, cols, idx = state
+ columns = com._unpickle_array(cols)
+
+ index = com._unpickle_array(idx)
+ self._data = self._init_dict(series, index, columns, None)
+
+ def _unpickle_matrix_compat(self, state): # pragma: no cover
+ # old unpickling
+ (vals, idx, cols), object_state = state
+
+ index = com._unpickle_array(idx)
+ dm = DataFrame(vals, index=index, columns=com._unpickle_array(cols),
+ copy=False)
+
+ if object_state is not None:
+ ovals, _, ocols = object_state
+ objects = DataFrame(ovals, index=index,
+ columns=com._unpickle_array(ocols), copy=False)
+
+ dm = dm.join(objects)
+
+ self._data = dm._data
+
+ # ----------------------------------------------------------------------
+ # Getting and setting elements
+
+ def get_value(self, index, col, takeable=False):
+ """
+ Quickly retrieve single value at passed column and index.
+
+ .. deprecated:: 0.21.0
+ Use .at[] or .iat[] accessors instead.
+
+ Parameters
+ ----------
+ index : row label
+ col : column label
+ takeable : interpret the index/col as indexers, default False
+
+ Returns
+ -------
+ value : scalar value
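+
+ Examples
+ --------
+ The recommended replacement, ``.at``, provides the same label-based
+ scalar access (a minimal sketch; the labels are illustrative):
+
+ >>> df = pd.DataFrame({'a': [1, 2]}, index=['x', 'y'])
+ >>> df.at['y', 'a']
+ 2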
+ """
+
+ warnings.warn("get_value is deprecated and will be removed "
+ "in a future release. Please use "
+ ".at[] or .iat[] accessors instead", FutureWarning,
+ stacklevel=2)
+ return self._get_value(index, col, takeable=takeable)
+
+ def _get_value(self, index, col, takeable=False):
+
+ if takeable:
+ series = self._iget_item_cache(col)
+ return com.maybe_box_datetimelike(series._values[index])
+
+ series = self._get_item_cache(col)
+ engine = self.index._engine
+
+ try:
+ return engine.get_value(series._values, index)
+ except (TypeError, ValueError):
+
+ # we cannot handle direct indexing
+ # use positional
+ col = self.columns.get_loc(col)
+ index = self.index.get_loc(index)
+ return self._get_value(index, col, takeable=True)
+ _get_value.__doc__ = get_value.__doc__
+
+ def set_value(self, index, col, value, takeable=False):
+ """
+ Put single value at passed column and index.
+
+ .. deprecated:: 0.21.0
+ Use .at[] or .iat[] accessors instead.
+
+ Parameters
+ ----------
+ index : row label
+ col : column label
+ value : scalar value
+ takeable : interpret the index/col as indexers, default False
+
+ Returns
+ -------
+ frame : DataFrame
+ If label pair is contained, will be a reference to the calling
+ DataFrame, otherwise a new object.
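+
+ Examples
+ --------
+ The recommended replacement, ``.at``, sets a value by label (a
+ minimal sketch; the labels are illustrative):
+
+ >>> df = pd.DataFrame({'a': [1, 2]}, index=['x', 'y'])
+ >>> df.at['y', 'a'] = 3
+ >>> df.at['y', 'a']
+ 3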
+ """
+ warnings.warn("set_value is deprecated and will be removed "
+ "in a future release. Please use "
+ ".at[] or .iat[] accessors instead", FutureWarning,
+ stacklevel=2)
+ return self._set_value(index, col, value, takeable=takeable)
+
+ def _set_value(self, index, col, value, takeable=False):
+ try:
+ if takeable is True:
+ series = self._iget_item_cache(col)
+ return series._set_value(index, value, takeable=True)
+
+ series = self._get_item_cache(col)
+ engine = self.index._engine
+ engine.set_value(series._values, index, value)
+ return self
+ except (KeyError, TypeError):
+
+ # set using a non-recursive method & reset the cache
+ if takeable:
+ self.iloc[index, col] = value
+ else:
+ self.loc[index, col] = value
+ self._item_cache.pop(col, None)
+
+ return self
+ _set_value.__doc__ = set_value.__doc__
+
+ def _ixs(self, i, axis=0):
+ """
+ Parameters
+ ----------
+ i : int, slice, or sequence of integers
+ axis : int
+
+ Notes
+ -----
+ If a slice is passed, the resulting data will be a view.
+ """
+ # irow
+ if axis == 0:
+ if isinstance(i, slice):
+ return self[i]
+ else:
+ label = self.index[i]
+ if isinstance(label, Index):
+ # a location index by definition
+ result = self.take(i, axis=axis)
+ copy = True
+ else:
+ new_values = self._data.fast_xs(i)
+ if is_scalar(new_values):
+ return new_values
+
+ # if we are a copy, mark as such
+ copy = (isinstance(new_values, np.ndarray) and
+ new_values.base is None)
+ result = self._constructor_sliced(new_values,
+ index=self.columns,
+ name=self.index[i],
+ dtype=new_values.dtype)
+ result._set_is_copy(self, copy=copy)
+ return result
+
+ # icol
+ else:
+ label = self.columns[i]
+ if isinstance(i, slice):
+ # need to return view
+ lab_slice = slice(label[0], label[-1])
+ return self.loc[:, lab_slice]
+ else:
+ if isinstance(label, Index):
+ return self._take(i, axis=1)
+
+ index_len = len(self.index)
+
+ # if the values returned are not the same length
+ # as the index (i.e. a value was not found), iget returns
+ # a 0-len ndarray. This is effectively catching
+ # a numpy error (as numpy should really raise)
+ values = self._data.iget(i)
+
+ if index_len and not len(values):
+ values = np.array([np.nan] * index_len, dtype=object)
+ result = self._box_col_values(values, label)
+
+ # this is a cached value, mark it so
+ result._set_as_cached(label, self)
+
+ return result
+
+ def __getitem__(self, key):
+ key = com.apply_if_callable(key, self)
+
+ # shortcut if the key is in columns
+ try:
+ if self.columns.is_unique and key in self.columns:
+ if self.columns.nlevels > 1:
+ return self._getitem_multilevel(key)
+ return self._get_item_cache(key)
+ except (TypeError, ValueError):
+ # The TypeError correctly catches non-hashable "key" (e.g. list)
+ # The ValueError can be removed once GH #21729 is fixed
+ pass
+
+ # Do we have a slicer (on rows)?
+ indexer = convert_to_index_sliceable(self, key)
+ if indexer is not None:
+ return self._slice(indexer, axis=0)
+
+ # Do we have a (boolean) DataFrame?
+ if isinstance(key, DataFrame):
+ return self._getitem_frame(key)
+
+ # Do we have a (boolean) 1d indexer?
+ if com.is_bool_indexer(key):
+ return self._getitem_bool_array(key)
+
+ # We are left with two options: a single key, and a collection of keys,
+ # We interpret tuples as collections only for non-MultiIndex
+ is_single_key = isinstance(key, tuple) or not is_list_like(key)
+
+ if is_single_key:
+ if self.columns.nlevels > 1:
+ return self._getitem_multilevel(key)
+ indexer = self.columns.get_loc(key)
+ if is_integer(indexer):
+ indexer = [indexer]
+ else:
+ if is_iterator(key):
+ key = list(key)
+ indexer = self.loc._convert_to_indexer(key, axis=1,
+ raise_missing=True)
+
+ # take() does not accept boolean indexers
+ if getattr(indexer, "dtype", None) == bool:
+ indexer = np.where(indexer)[0]
+
+ data = self._take(indexer, axis=1)
+
+ if is_single_key:
+ # What does looking for a single key in a non-unique index return?
+ # The behavior is inconsistent. It returns a Series, except when
+ # - the key itself is repeated (test on data.shape, #9519), or
+ # - we have a MultiIndex on columns (test on self.columns, #21309)
+ if data.shape[1] == 1 and not isinstance(self.columns, MultiIndex):
+ data = data[key]
+
+ return data
+
+ def _getitem_bool_array(self, key):
+ # also raises Exception if object array with NA values
+ # warning here just in case -- previously __setitem__ was
+ # reindexing but __getitem__ was not; it seems more reasonable to
+ # go with the __setitem__ behavior since that is more consistent
+ # with all other indexing behavior
+ if isinstance(key, Series) and not key.index.equals(self.index):
+ warnings.warn("Boolean Series key will be reindexed to match "
+ "DataFrame index.", UserWarning, stacklevel=3)
+ elif len(key) != len(self.index):
+ raise ValueError('Item wrong length %d instead of %d.' %
+ (len(key), len(self.index)))
+
+ # check_bool_indexer will throw exception if Series key cannot
+ # be reindexed to match DataFrame rows
+ key = check_bool_indexer(self.index, key)
+ indexer = key.nonzero()[0]
+ return self._take(indexer, axis=0)
+
+ def _getitem_multilevel(self, key):
+ loc = self.columns.get_loc(key)
+ if isinstance(loc, (slice, Series, np.ndarray, Index)):
+ new_columns = self.columns[loc]
+ result_columns = maybe_droplevels(new_columns, key)
+ if self._is_mixed_type:
+ result = self.reindex(columns=new_columns)
+ result.columns = result_columns
+ else:
+ new_values = self.values[:, loc]
+ result = self._constructor(new_values, index=self.index,
+ columns=result_columns)
+ result = result.__finalize__(self)
+
+ # If there is only one column being returned, and its name is
+ # either an empty string, or a tuple with an empty string as its
+ # first element, then treat the empty string as a placeholder
+ # and return the column as if the user had provided that empty
+ # string in the key. If the result is a Series, exclude the
+ # implied empty string from its name.
+ if len(result.columns) == 1:
+ top = result.columns[0]
+ if isinstance(top, tuple):
+ top = top[0]
+ if top == '':
+ result = result['']
+ if isinstance(result, Series):
+ result = self._constructor_sliced(result,
+ index=self.index,
+ name=key)
+
+ result._set_is_copy(self)
+ return result
+ else:
+ return self._get_item_cache(key)
+
+ def _getitem_frame(self, key):
+ if key.values.size and not is_bool_dtype(key.values):
+ raise ValueError('Must pass DataFrame with boolean values only')
+ return self.where(key)
+
+ def query(self, expr, inplace=False, **kwargs):
+ """
+ Query the columns of a DataFrame with a boolean expression.
+
+ Parameters
+ ----------
+ expr : string
+ The query string to evaluate. You can refer to variables
+ in the environment by prefixing them with an '@' character like
+ ``@a + b``.
+ inplace : bool, default False
+ Whether the query should modify the data in place or return
+ a modified copy.
+
+ .. versionadded:: 0.18.0
+
+ kwargs : dict
+ See the documentation for :func:`pandas.eval` for complete details
+ on the keyword arguments accepted by :meth:`DataFrame.query`.
+
+ Returns
+ -------
+ q : DataFrame
+
+ See Also
+ --------
+ pandas.eval
+ DataFrame.eval
+
+ Notes
+ -----
+ The result of the evaluation of this expression is first passed to
+ :attr:`DataFrame.loc` and if that fails because of a
+ multidimensional key (e.g., a DataFrame) then the result will be passed
+ to :meth:`DataFrame.__getitem__`.
+
+ This method uses the top-level :func:`pandas.eval` function to
+ evaluate the passed query.
+
+ The :meth:`~pandas.DataFrame.query` method uses a slightly
+ modified Python syntax by default. For example, the ``&`` and ``|``
+ (bitwise) operators have the precedence of their boolean cousins,
+ :keyword:`and` and :keyword:`or`. This *is* syntactically valid
+ Python; however, the semantics are different.
+
+ You can change the semantics of the expression by passing the keyword
+ argument ``parser='python'``. This enforces the same semantics as
+ evaluation in Python space. Likewise, you can pass ``engine='python'``
+ to evaluate an expression using Python itself as a backend. This is not
+ recommended as it is inefficient compared to using ``numexpr`` as the
+ engine.
+
+ The :attr:`DataFrame.index` and
+ :attr:`DataFrame.columns` attributes of the
+ :class:`~pandas.DataFrame` instance are placed in the query namespace
+ by default, which allows you to treat both the index and columns of the
+ frame as a column in the frame.
+ The identifier ``index`` is used for the frame index; you can also
+ use the name of the index to identify it in a query. Please note that
+ Python keywords may not be used as identifiers.
+
+ For further details and examples see the ``query`` documentation in
+ :ref:`indexing <indexing.query>`.
+
+ Examples
+ --------
+ >>> df = pd.DataFrame(np.random.randn(10, 2), columns=list('ab'))
+ >>> df.query('a > b')
+ >>> df[df.a > df.b] # same result as the previous expression
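+
+ Variables in the enclosing scope can be referenced with the ``@``
+ prefix (a minimal sketch; the variable name is illustrative):
+
+ >>> threshold = 0.5
+ >>> df.query('a > @threshold')  # doctest: +SKIP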
+ """
+ inplace = validate_bool_kwarg(inplace, 'inplace')
+ if not isinstance(expr, compat.string_types):
+ msg = "expr must be a string to be evaluated, {0} given"
+ raise ValueError(msg.format(type(expr)))
+ kwargs['level'] = kwargs.pop('level', 0) + 1
+ kwargs['target'] = None
+ res = self.eval(expr, **kwargs)
+
+ try:
+ new_data = self.loc[res]
+ except ValueError:
+ # when res is multi-dimensional loc raises, but this is sometimes a
+ # valid query
+ new_data = self[res]
+
+ if inplace:
+ self._update_inplace(new_data)
+ else:
+ return new_data
+
+ def eval(self, expr, inplace=False, **kwargs):
+ """
+ Evaluate a string describing operations on DataFrame columns.
+
+ Operates on columns only, not specific rows or elements. This allows
+ `eval` to run arbitrary code, which can make you vulnerable to code
+ injection if you pass user input to this function.
+
+ Parameters
+ ----------
+ expr : str
+ The expression string to evaluate.
+ inplace : bool, default False
+ If the expression contains an assignment, whether to perform the
+ operation inplace and mutate the existing DataFrame. Otherwise,
+ a new DataFrame is returned.
+
+ .. versionadded:: 0.18.0
+
+ kwargs : dict
+ See the documentation for :func:`~pandas.eval` for complete details
+ on the keyword arguments accepted by
+ :meth:`~pandas.DataFrame.query`.
+
+ Returns
+ -------
+ ndarray, scalar, or pandas object
+ The result of the evaluation.
+
+ See Also
+ --------
+ DataFrame.query : Evaluates a boolean expression to query the columns
+ of a frame.
+ DataFrame.assign : Can evaluate an expression or function to create new
+ values for a column.
+ pandas.eval : Evaluate a Python expression as a string using various
+ backends.
+
+ Notes
+ -----
+ For more details see the API documentation for :func:`~pandas.eval`.
+ For detailed examples see :ref:`enhancing performance with eval
+ <enhancingperf.eval>`.
+
+ Examples
+ --------
+ >>> df = pd.DataFrame({'A': range(1, 6), 'B': range(10, 0, -2)})
+ >>> df
+ A B
+ 0 1 10
+ 1 2 8
+ 2 3 6
+ 3 4 4
+ 4 5 2
+ >>> df.eval('A + B')
+ 0 11
+ 1 10
+ 2 9
+ 3 8
+ 4 7
+ dtype: int64
+
+ Assignment is allowed though by default the original DataFrame is not
+ modified.
+
+ >>> df.eval('C = A + B')
+ A B C
+ 0 1 10 11
+ 1 2 8 10
+ 2 3 6 9
+ 3 4 4 8
+ 4 5 2 7
+ >>> df
+ A B
+ 0 1 10
+ 1 2 8
+ 2 3 6
+ 3 4 4
+ 4 5 2
+
+ Use ``inplace=True`` to modify the original DataFrame.
+
+ >>> df.eval('C = A + B', inplace=True)
+ >>> df
+ A B C
+ 0 1 10 11
+ 1 2 8 10
+ 2 3 6 9
+ 3 4 4 8
+ 4 5 2 7
+ """
+ from pandas.core.computation.eval import eval as _eval
+
+ inplace = validate_bool_kwarg(inplace, 'inplace')
+ resolvers = kwargs.pop('resolvers', None)
+ kwargs['level'] = kwargs.pop('level', 0) + 1
+ if resolvers is None:
+ index_resolvers = self._get_index_resolvers()
+ resolvers = dict(self.iteritems()), index_resolvers
+ if 'target' not in kwargs:
+ kwargs['target'] = self
+ kwargs['resolvers'] = kwargs.get('resolvers', ()) + tuple(resolvers)
+ return _eval(expr, inplace=inplace, **kwargs)
+
+ def select_dtypes(self, include=None, exclude=None):
+ """
+ Return a subset of the DataFrame's columns based on the column dtypes.
+
+ Parameters
+ ----------
+ include, exclude : scalar or list-like
+ A selection of dtypes or strings to be included/excluded. At least
+ one of these parameters must be supplied.
+
+ Returns
+ -------
+ subset : DataFrame
+ The subset of the frame including the dtypes in ``include`` and
+ excluding the dtypes in ``exclude``.
+
+ Raises
+ ------
+ ValueError
+ * If both of ``include`` and ``exclude`` are empty
+ * If ``include`` and ``exclude`` have overlapping elements
+ * If any kind of string dtype is passed in.
+
+ Notes
+ -----
+ * To select all *numeric* types, use ``np.number`` or ``'number'``
+ * To select strings you must use the ``object`` dtype, but note that
+ this will return *all* object dtype columns
+ * See the `numpy dtype hierarchy
+ <http://docs.scipy.org/doc/numpy/reference/arrays.scalars.html>`__
+ * To select datetimes, use ``np.datetime64``, ``'datetime'`` or
+ ``'datetime64'``
+ * To select timedeltas, use ``np.timedelta64``, ``'timedelta'`` or
+ ``'timedelta64'``
+ * To select Pandas categorical dtypes, use ``'category'``
+ * To select Pandas datetimetz dtypes, use ``'datetimetz'`` (new in
+ 0.20.0) or ``'datetime64[ns, tz]'``
+
+ Examples
+ --------
+ >>> df = pd.DataFrame({'a': [1, 2] * 3,
+ ... 'b': [True, False] * 3,
+ ... 'c': [1.0, 2.0] * 3})
+ >>> df
+ a b c
+ 0 1 True 1.0
+ 1 2 False 2.0
+ 2 1 True 1.0
+ 3 2 False 2.0
+ 4 1 True 1.0
+ 5 2 False 2.0
+
+ >>> df.select_dtypes(include='bool')
+ b
+ 0 True
+ 1 False
+ 2 True
+ 3 False
+ 4 True
+ 5 False
+
+ >>> df.select_dtypes(include=['float64'])
+ c
+ 0 1.0
+ 1 2.0
+ 2 1.0
+ 3 2.0
+ 4 1.0
+ 5 2.0
+
+ >>> df.select_dtypes(exclude=['int'])
+ b c
+ 0 True 1.0
+ 1 False 2.0
+ 2 True 1.0
+ 3 False 2.0
+ 4 True 1.0
+ 5 False 2.0
+ """
+ def _get_info_slice(obj, indexer):
+ """Slice the info axis of `obj` with `indexer`."""
+ if not hasattr(obj, '_info_axis_number'):
+ msg = 'object of type {typ!r} has no info axis'
+ raise TypeError(msg.format(typ=type(obj).__name__))
+ slices = [slice(None)] * obj.ndim
+ slices[obj._info_axis_number] = indexer
+ return tuple(slices)
+
+ if not is_list_like(include):
+ include = (include,) if include is not None else ()
+ if not is_list_like(exclude):
+ exclude = (exclude,) if exclude is not None else ()
+
+ selection = tuple(map(frozenset, (include, exclude)))
+
+ if not any(selection):
+ raise ValueError('at least one of include or exclude must be '
+ 'nonempty')
+
+ # convert the myriad valid dtypes object to a single representation
+ include, exclude = map(
+ lambda x: frozenset(map(infer_dtype_from_object, x)), selection)
+ for dtypes in (include, exclude):
+ invalidate_string_dtypes(dtypes)
+
+ # can't both include AND exclude!
+ if not include.isdisjoint(exclude):
+ raise ValueError('include and exclude overlap on {inc_ex}'.format(
+ inc_ex=(include & exclude)))
+
+ # empty include/exclude -> defaults to True
+ # three cases (we've already raised if both are empty)
+ # case 1: empty include, nonempty exclude
+ # we have True, True, ... True for include, same for exclude
+ # in the loop below we get the excluded
+ # and when we call '&' below we get only the excluded
+ # case 2: nonempty include, empty exclude
+ # same as case 1, but with include
+ # case 3: both nonempty
+ # the "union" of the logic of case 1 and case 2:
+ # we get the included and excluded, and return their logical and
+ include_these = Series(not bool(include), index=self.columns)
+ exclude_these = Series(not bool(exclude), index=self.columns)
+
+ def is_dtype_instance_mapper(idx, dtype):
+ return idx, functools.partial(issubclass, dtype.type)
+
+ for idx, f in itertools.starmap(is_dtype_instance_mapper,
+ enumerate(self.dtypes)):
+ if include: # checks for the case of empty include or exclude
+ include_these.iloc[idx] = any(map(f, include))
+ if exclude:
+ exclude_these.iloc[idx] = not any(map(f, exclude))
+
+ dtype_indexer = include_these & exclude_these
+ return self.loc[_get_info_slice(self, dtype_indexer)]
+
+ def _box_item_values(self, key, values):
+ items = self.columns[self.columns.get_loc(key)]
+ if values.ndim == 2:
+ return self._constructor(values.T, columns=items, index=self.index)
+ else:
+ return self._box_col_values(values, items)
+
+ def _box_col_values(self, values, items):
+ """
+ Provide boxed values for a column.
+ """
+ klass = self._constructor_sliced
+ return klass(values, index=self.index, name=items, fastpath=True)
+
+ def __setitem__(self, key, value):
+ key = com.apply_if_callable(key, self)
+
+ # see if we can slice the rows
+ indexer = convert_to_index_sliceable(self, key)
+ if indexer is not None:
+ return self._setitem_slice(indexer, value)
+
+ if isinstance(key, DataFrame) or getattr(key, 'ndim', None) == 2:
+ self._setitem_frame(key, value)
+ elif isinstance(key, (Series, np.ndarray, list, Index)):
+ self._setitem_array(key, value)
+ else:
+ # set column
+ self._set_item(key, value)
+
+ def _setitem_slice(self, key, value):
+ self._check_setitem_copy()
+ self.loc._setitem_with_indexer(key, value)
+
+ def _setitem_array(self, key, value):
+ # also raises Exception if object array with NA values
+ if com.is_bool_indexer(key):
+ if len(key) != len(self.index):
+ raise ValueError('Item wrong length %d instead of %d!' %
+ (len(key), len(self.index)))
+ key = check_bool_indexer(self.index, key)
+ indexer = key.nonzero()[0]
+ self._check_setitem_copy()
+ self.loc._setitem_with_indexer(indexer, value)
+ else:
+ if isinstance(value, DataFrame):
+ if len(value.columns) != len(key):
+ raise ValueError('Columns must be same length as key')
+ for k1, k2 in zip(key, value.columns):
+ self[k1] = value[k2]
+ else:
+ indexer = self.loc._convert_to_indexer(key, axis=1)
+ self._check_setitem_copy()
+ self.loc._setitem_with_indexer((slice(None), indexer), value)
+
+ def _setitem_frame(self, key, value):
+ # support boolean setting with DataFrame input, e.g.
+ # df[df > df2] = 0
+ if isinstance(key, np.ndarray):
+ if key.shape != self.shape:
+ raise ValueError(
+ 'Array conditional must be same shape as self'
+ )
+ key = self._constructor(key, **self._construct_axes_dict())
+
+ if key.values.size and not is_bool_dtype(key.values):
+ raise TypeError(
+ 'Must pass DataFrame or 2-d ndarray with boolean values only'
+ )
+
+ self._check_inplace_setting(value)
+ self._check_setitem_copy()
+ self._where(-key, value, inplace=True)
+
+ def _ensure_valid_index(self, value):
+ """
+ Ensure that if we don't have an index, that we can create one from the
+ passed value.
+ """
+ # GH5632, make sure that we are a Series convertible
+ if not len(self.index) and is_list_like(value):
+ try:
+ value = Series(value)
+ except (ValueError, NotImplementedError, TypeError):
+ raise ValueError('Cannot set a frame with no defined index '
+ 'and a value that cannot be converted to a '
+ 'Series')
+
+ self._data = self._data.reindex_axis(value.index.copy(), axis=1,
+ fill_value=np.nan)
+
+ def _set_item(self, key, value):
+ """
+ Add series to DataFrame in specified column.
+
+ If series is a numpy array (not a Series/TimeSeries), it must be the
+ same length as the DataFrame's index or an error will be thrown.
+
+ Series/TimeSeries will be conformed to the DataFrame's index to
+ ensure homogeneity.
+ """
+
+ self._ensure_valid_index(value)
+ value = self._sanitize_column(key, value)
+ NDFrame._set_item(self, key, value)
+
+ # check if we are modifying a copy
+ # try to set first as we want an invalid
+ # value exception to occur first
+ if len(self):
+ self._check_setitem_copy()
+
+ def insert(self, loc, column, value, allow_duplicates=False):
+ """
+ Insert column into DataFrame at specified location.
+
+ Raises a ValueError if `column` is already contained in the DataFrame,
+ unless `allow_duplicates` is set to True.
+
+ Parameters
+ ----------
+ loc : int
+ Insertion index. Must satisfy 0 <= loc <= len(columns).
+ column : string, number, or hashable object
+ Label of the inserted column.
+ value : int, Series, or array-like
+ allow_duplicates : bool, optional
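+
+ Examples
+ --------
+ A minimal sketch; the column names and values are illustrative:
+
+ >>> df = pd.DataFrame({'a': [1, 2], 'c': [5, 6]})
+ >>> df.insert(1, 'b', [3, 4])
+ >>> list(df.columns)
+ ['a', 'b', 'c']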
+ """
+ self._ensure_valid_index(value)
+ value = self._sanitize_column(column, value, broadcast=False)
+ self._data.insert(loc, column, value,
+ allow_duplicates=allow_duplicates)
+
+ def assign(self, **kwargs):
+ r"""
+ Assign new columns to a DataFrame.
+
+ Returns a new object with all original columns in addition to new ones.
+ Existing columns that are re-assigned will be overwritten.
+
+ Parameters
+ ----------
+ **kwargs : dict of {str: callable or Series}
+ The column names are keywords. If the values are
+ callable, they are computed on the DataFrame and
+ assigned to the new columns. The callable must not
+ change the input DataFrame (though pandas doesn't check it).
+ If the values are not callable, (e.g. a Series, scalar, or array),
+ they are simply assigned.
+
+ Returns
+ -------
+ DataFrame
+ A new DataFrame with the new columns in addition to
+ all the existing columns.
+
+ Notes
+ -----
+ Assigning multiple columns within the same ``assign`` is possible.
+ For Python 3.6 and above, later items in '\*\*kwargs' may refer to
+ newly created or modified columns in 'df'; items are computed and
+ assigned into 'df' in order. For Python 3.5 and below, the order of
+ keyword arguments is not specified, so you cannot refer to newly
+ created or modified columns. All items are computed first, and then
+ assigned in alphabetical order.
+
+ .. versionchanged:: 0.23.0
+
+ Keyword argument order is maintained for Python 3.6 and later.
+
+ Examples
+ --------
+ >>> df = pd.DataFrame({'temp_c': [17.0, 25.0]},
+ ... index=['Portland', 'Berkeley'])
+ >>> df
+ temp_c
+ Portland 17.0
+ Berkeley 25.0
+
+ Where the value is a callable, evaluated on `df`:
+
+ >>> df.assign(temp_f=lambda x: x.temp_c * 9 / 5 + 32)
+ temp_c temp_f
+ Portland 17.0 62.6
+ Berkeley 25.0 77.0
+
+ Alternatively, the same behavior can be achieved by directly
+ referencing an existing Series or sequence:
+
+ >>> df.assign(temp_f=df['temp_c'] * 9 / 5 + 32)
+ temp_c temp_f
+ Portland 17.0 62.6
+ Berkeley 25.0 77.0
+
+ In Python 3.6+, you can create multiple columns within the same assign
+ where one of the columns depends on another one defined within the same
+ assign:
+
+ >>> df.assign(temp_f=lambda x: x['temp_c'] * 9 / 5 + 32,
+ ... temp_k=lambda x: (x['temp_f'] + 459.67) * 5 / 9)
+ temp_c temp_f temp_k
+ Portland 17.0 62.6 290.15
+ Berkeley 25.0 77.0 298.15
+ """
+ data = self.copy()
+
+ # >= 3.6 preserve order of kwargs
+ if PY36:
+ for k, v in kwargs.items():
+ data[k] = com.apply_if_callable(v, data)
+ else:
+ # <= 3.5: do all calculations first...
+ results = OrderedDict()
+ for k, v in kwargs.items():
+ results[k] = com.apply_if_callable(v, data)
+
+ # <= 3.5 and earlier
+ results = sorted(results.items())
+ # ... and then assign
+ for k, v in results:
+ data[k] = v
+ return data
+
+ def _sanitize_column(self, key, value, broadcast=True):
+ """
+ Ensures new columns (which go into the BlockManager as new blocks) are
+ always copied and converted into an array.
+
+ Parameters
+ ----------
+ key : object
+ value : scalar, Series, or array-like
+ broadcast : bool, default True
+ If ``key`` matches multiple duplicate column names in the
+ DataFrame, this parameter indicates whether ``value`` should be
+ tiled so that the returned array contains a (duplicated) column for
+ each occurrence of the key. If False, ``value`` will not be tiled.
+
+ Returns
+ -------
+ sanitized_column : numpy-array
+ """
+
+ def reindexer(value):
+ # reindex if necessary
+
+ if value.index.equals(self.index) or not len(self.index):
+ value = value._values.copy()
+ else:
+
+ # GH 4107
+ try:
+ value = value.reindex(self.index)._values
+ except Exception as e:
+
+ # duplicate axis
+ if not value.index.is_unique:
+ raise e
+
+ # other
+ raise TypeError('incompatible index of inserted column '
+ 'with frame index')
+ return value
+
+ if isinstance(value, Series):
+ value = reindexer(value)
+
+ elif isinstance(value, DataFrame):
+ # align right-hand-side columns if self.columns
+ # is multi-index and self[key] is a sub-frame
+ if isinstance(self.columns, MultiIndex) and key in self.columns:
+ loc = self.columns.get_loc(key)
+ if isinstance(loc, (slice, Series, np.ndarray, Index)):
+ cols = maybe_droplevels(self.columns[loc], key)
+ if len(cols) and not cols.equals(value.columns):
+ value = value.reindex(cols, axis=1)
+ # now align rows
+ value = reindexer(value).T
+
+ elif isinstance(value, ExtensionArray):
+ # Explicitly copy here, instead of in sanitize_index,
+ # as sanitize_index won't copy an EA, even with copy=True
+ value = value.copy()
+ value = sanitize_index(value, self.index, copy=False)
+
+ elif isinstance(value, Index) or is_sequence(value):
+
+ # turn me into an ndarray
+ value = sanitize_index(value, self.index, copy=False)
+ if not isinstance(value, (np.ndarray, Index)):
+ if isinstance(value, list) and len(value) > 0:
+ value = maybe_convert_platform(value)
+ else:
+ value = com.asarray_tuplesafe(value)
+ elif value.ndim == 2:
+ value = value.copy().T
+ elif isinstance(value, Index):
+ value = value.copy(deep=True)
+ else:
+ value = value.copy()
+
+ # possibly infer to datetimelike
+ if is_object_dtype(value.dtype):
+ value = maybe_infer_to_datetimelike(value)
+
+ else:
+ # cast ignores pandas dtypes. so save the dtype first
+ infer_dtype, _ = infer_dtype_from_scalar(
+ value, pandas_dtype=True)
+
+ # upcast
+ value = cast_scalar_to_array(len(self.index), value)
+ value = maybe_cast_to_datetime(value, infer_dtype)
+
+ # return internal types directly
+ if is_extension_type(value) or is_extension_array_dtype(value):
+ return value
+
+ # broadcast across multiple columns if necessary
+ if broadcast and key in self.columns and value.ndim == 1:
+ if (not self.columns.is_unique or
+ isinstance(self.columns, MultiIndex)):
+ existing_piece = self[key]
+ if isinstance(existing_piece, DataFrame):
+ value = np.tile(value, (len(existing_piece.columns), 1))
+
+ return np.atleast_2d(np.asarray(value))
+
+ @property
+ def _series(self):
+ return {item: Series(self._data.iget(idx), index=self.index, name=item)
+ for idx, item in enumerate(self.columns)}
+
+ def lookup(self, row_labels, col_labels):
+ """
+ Label-based "fancy indexing" function for DataFrame.
+
+ Given equal-length arrays of row and column labels, return an
+ array of the values corresponding to each (row, col) pair.
+
+ Parameters
+ ----------
+ row_labels : sequence
+ The row labels to use for lookup
+ col_labels : sequence
+ The column labels to use for lookup
+
+ Notes
+ -----
+ Akin to::
+
+ result = [df.get_value(row, col)
+ for row, col in zip(row_labels, col_labels)]
+
+ Returns
+ -------
+ values : ndarray
+ The found values.
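+
+ Examples
+ --------
+ A minimal sketch; the labels are illustrative:
+
+ >>> df = pd.DataFrame({'a': [1, 2], 'b': [3, 4]}, index=['x', 'y'])
+ >>> df.lookup(['x', 'y'], ['a', 'b'])
+ array([1, 4])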
+ """
+ n = len(row_labels)
+ if n != len(col_labels):
+ raise ValueError('Row labels must have same size as column labels')
+
+ thresh = 1000
+ if not self._is_mixed_type or n > thresh:
+ values = self.values
+ ridx = self.index.get_indexer(row_labels)
+ cidx = self.columns.get_indexer(col_labels)
+ if (ridx == -1).any():
+ raise KeyError('One or more row labels was not found')
+ if (cidx == -1).any():
+ raise KeyError('One or more column labels was not found')
+ flat_index = ridx * len(self.columns) + cidx
+ result = values.flat[flat_index]
+ else:
+ result = np.empty(n, dtype='O')
+ for i, (r, c) in enumerate(zip(row_labels, col_labels)):
+ result[i] = self._get_value(r, c)
+
+ if is_object_dtype(result):
+ result = lib.maybe_convert_objects(result)
+
+ return result
+
+ # ----------------------------------------------------------------------
+ # Reindexing and alignment
+
+ def _reindex_axes(self, axes, level, limit, tolerance, method, fill_value,
+ copy):
+ frame = self
+
+ columns = axes['columns']
+ if columns is not None:
+ frame = frame._reindex_columns(columns, method, copy, level,
+ fill_value, limit, tolerance)
+
+ index = axes['index']
+ if index is not None:
+ frame = frame._reindex_index(index, method, copy, level,
+ fill_value, limit, tolerance)
+
+ return frame
+
+ def _reindex_index(self, new_index, method, copy, level, fill_value=np.nan,
+ limit=None, tolerance=None):
+ new_index, indexer = self.index.reindex(new_index, method=method,
+ level=level, limit=limit,
+ tolerance=tolerance)
+ return self._reindex_with_indexers({0: [new_index, indexer]},
+ copy=copy, fill_value=fill_value,
+ allow_dups=False)
+
+ def _reindex_columns(self, new_columns, method, copy, level,
+ fill_value=None, limit=None, tolerance=None):
+ new_columns, indexer = self.columns.reindex(new_columns, method=method,
+ level=level, limit=limit,
+ tolerance=tolerance)
+ return self._reindex_with_indexers({1: [new_columns, indexer]},
+ copy=copy, fill_value=fill_value,
+ allow_dups=False)
+
+ def _reindex_multi(self, axes, copy, fill_value):
+ """
+ We are guaranteed non-Nones in the axes.
+ """
+
+ new_index, row_indexer = self.index.reindex(axes['index'])
+ new_columns, col_indexer = self.columns.reindex(axes['columns'])
+
+ if row_indexer is not None and col_indexer is not None:
+ indexer = row_indexer, col_indexer
+ new_values = algorithms.take_2d_multi(self.values, indexer,
+ fill_value=fill_value)
+ return self._constructor(new_values, index=new_index,
+ columns=new_columns)
+ else:
+ return self._reindex_with_indexers({0: [new_index, row_indexer],
+ 1: [new_columns, col_indexer]},
+ copy=copy,
+ fill_value=fill_value)
+
+ @Appender(_shared_docs['align'] % _shared_doc_kwargs)
+ def align(self, other, join='outer', axis=None, level=None, copy=True,
+ fill_value=None, method=None, limit=None, fill_axis=0,
+ broadcast_axis=None):
+ return super(DataFrame, self).align(other, join=join, axis=axis,
+ level=level, copy=copy,
+ fill_value=fill_value,
+ method=method, limit=limit,
+ fill_axis=fill_axis,
+ broadcast_axis=broadcast_axis)
+
+ @Substitution(**_shared_doc_kwargs)
+ @Appender(NDFrame.reindex.__doc__)
+ @rewrite_axis_style_signature('labels', [('method', None),
+ ('copy', True),
+ ('level', None),
+ ('fill_value', np.nan),
+ ('limit', None),
+ ('tolerance', None)])
+ def reindex(self, *args, **kwargs):
+ axes = validate_axis_style_args(self, args, kwargs, 'labels',
+ 'reindex')
+ kwargs.update(axes)
+ # Pop these, since the values are in `kwargs` under different names
+ kwargs.pop('axis', None)
+ kwargs.pop('labels', None)
+ return super(DataFrame, self).reindex(**kwargs)
+
+ @Appender(_shared_docs['reindex_axis'] % _shared_doc_kwargs)
+ def reindex_axis(self, labels, axis=0, method=None, level=None, copy=True,
+ limit=None, fill_value=np.nan):
+ return super(DataFrame,
+ self).reindex_axis(labels=labels, axis=axis,
+ method=method, level=level, copy=copy,
+ limit=limit, fill_value=fill_value)
+
+ def drop(self, labels=None, axis=0, index=None, columns=None,
+ level=None, inplace=False, errors='raise'):
+ """
+ Drop specified labels from rows or columns.
+
+ Remove rows or columns by specifying label names and corresponding
+ axis, or by specifying directly index or column names. When using a
+ multi-index, labels on different levels can be removed by specifying
+ the level.
+
+ Parameters
+ ----------
+ labels : single label or list-like
+ Index or column labels to drop.
+ axis : {0 or 'index', 1 or 'columns'}, default 0
+ Whether to drop labels from the index (0 or 'index') or
+ columns (1 or 'columns').
+ index, columns : single label or list-like
+ Alternative to specifying axis (``labels, axis=1``
+ is equivalent to ``columns=labels``).
+
+ .. versionadded:: 0.21.0
+ level : int or level name, optional
+ For MultiIndex, level from which the labels will be removed.
+ inplace : bool, default False
+ If True, do operation inplace and return None.
+ errors : {'ignore', 'raise'}, default 'raise'
+ If 'ignore', suppress error and only existing labels are
+ dropped.
+
+ Returns
+ -------
+ dropped : pandas.DataFrame
+
+ Raises
+ ------
+ KeyError
+ If none of the labels are found in the selected axis
+
+ See Also
+ --------
+ DataFrame.loc : Label-location based indexer for selection by label.
+ DataFrame.dropna : Return DataFrame with labels on given axis omitted
+ where (all or any) data are missing.
+ DataFrame.drop_duplicates : Return DataFrame with duplicate rows
+ removed, optionally only considering certain columns.
+ Series.drop : Return Series with specified index labels removed.
+
+ Examples
+ --------
+ >>> df = pd.DataFrame(np.arange(12).reshape(3,4),
+ ... columns=['A', 'B', 'C', 'D'])
+ >>> df
+ A B C D
+ 0 0 1 2 3
+ 1 4 5 6 7
+ 2 8 9 10 11
+
+ Drop columns
+
+ >>> df.drop(['B', 'C'], axis=1)
+ A D
+ 0 0 3
+ 1 4 7
+ 2 8 11
+
+ >>> df.drop(columns=['B', 'C'])
+ A D
+ 0 0 3
+ 1 4 7
+ 2 8 11
+
+ Drop a row by index
+
+ >>> df.drop([0, 1])
+ A B C D
+ 2 8 9 10 11
+
+ Drop columns and/or rows of MultiIndex DataFrame
+
+ >>> midx = pd.MultiIndex(levels=[['lama', 'cow', 'falcon'],
+ ... ['speed', 'weight', 'length']],
+ ... codes=[[0, 0, 0, 1, 1, 1, 2, 2, 2],
+ ... [0, 1, 2, 0, 1, 2, 0, 1, 2]])
+ >>> df = pd.DataFrame(index=midx, columns=['big', 'small'],
+ ... data=[[45, 30], [200, 100], [1.5, 1], [30, 20],
+ ... [250, 150], [1.5, 0.8], [320, 250],
+ ... [1, 0.8], [0.3,0.2]])
+ >>> df
+ big small
+ lama speed 45.0 30.0
+ weight 200.0 100.0
+ length 1.5 1.0
+ cow speed 30.0 20.0
+ weight 250.0 150.0
+ length 1.5 0.8
+ falcon speed 320.0 250.0
+ weight 1.0 0.8
+ length 0.3 0.2
+
+ >>> df.drop(index='cow', columns='small')
+ big
+ lama speed 45.0
+ weight 200.0
+ length 1.5
+ falcon speed 320.0
+ weight 1.0
+ length 0.3
+
+ >>> df.drop(index='length', level=1)
+ big small
+ lama speed 45.0 30.0
+ weight 200.0 100.0
+ cow speed 30.0 20.0
+ weight 250.0 150.0
+ falcon speed 320.0 250.0
+ weight 1.0 0.8
+ """
+ return super(DataFrame, self).drop(labels=labels, axis=axis,
+ index=index, columns=columns,
+ level=level, inplace=inplace,
+ errors=errors)
+
+ @rewrite_axis_style_signature('mapper', [('copy', True),
+ ('inplace', False),
+ ('level', None)])
+ def rename(self, *args, **kwargs):
+ """
+ Alter axes labels.
+
+ Function / dict values must be unique (1-to-1). Labels not contained in
+ a dict / Series will be left as-is. Extra labels listed don't throw an
+ error.
+
+ See the :ref:`user guide <basics.rename>` for more.
+
+ Parameters
+ ----------
+ mapper, index, columns : dict-like or function, optional
+ dict-like or functions transformations to apply to
+ that axis' values. Use either ``mapper`` and ``axis`` to
+ specify the axis to target with ``mapper``, or ``index`` and
+ ``columns``.
+ axis : int or str, optional
+ Axis to target with ``mapper``. Can be either the axis name
+ ('index', 'columns') or number (0, 1). The default is 'index'.
+ copy : boolean, default True
+ Also copy underlying data.
+ inplace : boolean, default False
+ Whether to return a new DataFrame. If True, the value of copy is
+ ignored.
+ level : int or level name, default None
+ In case of a MultiIndex, only rename labels in the specified
+ level.
+
+ Returns
+ -------
+ renamed : DataFrame
+
+ See Also
+ --------
+ pandas.DataFrame.rename_axis
+
+ Examples
+ --------
+
+ ``DataFrame.rename`` supports two calling conventions
+
+ * ``(index=index_mapper, columns=columns_mapper, ...)``
+ * ``(mapper, axis={'index', 'columns'}, ...)``
+
+ We *highly* recommend using keyword arguments to clarify your
+ intent.
+
+ >>> df = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})
+ >>> df.rename(index=str, columns={"A": "a", "B": "c"})
+ a c
+ 0 1 4
+ 1 2 5
+ 2 3 6
+
+ >>> df.rename(index=str, columns={"A": "a", "C": "c"})
+ a B
+ 0 1 4
+ 1 2 5
+ 2 3 6
+
+ Using axis-style parameters
+
+ >>> df.rename(str.lower, axis='columns')
+ a b
+ 0 1 4
+ 1 2 5
+ 2 3 6
+
+ >>> df.rename({1: 2, 2: 4}, axis='index')
+ A B
+ 0 1 4
+ 2 2 5
+ 4 3 6
+ """
+ axes = validate_axis_style_args(self, args, kwargs, 'mapper', 'rename')
+ kwargs.update(axes)
+ # Pop these, since the values are in `kwargs` under different names
+ kwargs.pop('axis', None)
+ kwargs.pop('mapper', None)
+ return super(DataFrame, self).rename(**kwargs)
+
+ @Substitution(**_shared_doc_kwargs)
+ @Appender(NDFrame.fillna.__doc__)
+ def fillna(self, value=None, method=None, axis=None, inplace=False,
+ limit=None, downcast=None, **kwargs):
+ return super(DataFrame,
+ self).fillna(value=value, method=method, axis=axis,
+ inplace=inplace, limit=limit,
+ downcast=downcast, **kwargs)
+
+ @Appender(_shared_docs['replace'] % _shared_doc_kwargs)
+ def replace(self, to_replace=None, value=None, inplace=False, limit=None,
+ regex=False, method='pad'):
+ return super(DataFrame, self).replace(to_replace=to_replace,
+ value=value, inplace=inplace,
+ limit=limit, regex=regex,
+ method=method)
+
+ @Appender(_shared_docs['shift'] % _shared_doc_kwargs)
+ def shift(self, periods=1, freq=None, axis=0, fill_value=None):
+ return super(DataFrame, self).shift(periods=periods, freq=freq,
+ axis=axis, fill_value=fill_value)
+
+ def set_index(self, keys, drop=True, append=False, inplace=False,
+ verify_integrity=False):
+ """
+ Set the DataFrame index using existing columns.
+
+ Set the DataFrame index (row labels) using one or more existing
+ columns or arrays (of the correct length). The index can replace the
+ existing index or expand on it.
+
+ Parameters
+ ----------
+ keys : label or array-like or list of labels/arrays
+ This parameter can be either a single column key, a single array of
+ the same length as the calling DataFrame, or a list containing an
+ arbitrary combination of column keys and arrays. Here, "array"
+ encompasses :class:`Series`, :class:`Index` and ``np.ndarray``.
+ drop : bool, default True
+ Delete columns to be used as the new index.
+ append : bool, default False
+ Whether to append columns to existing index.
+ inplace : bool, default False
+ Modify the DataFrame in place (do not create a new object).
+ verify_integrity : bool, default False
+ Check the new index for duplicates. Otherwise defer the check until
+ necessary. Setting to False will improve the performance of this
+ method.
+
+ Returns
+ -------
+ DataFrame
+ Changed row labels.
+
+ See Also
+ --------
+ DataFrame.reset_index : Opposite of set_index.
+ DataFrame.reindex : Change to new indices or expand indices.
+ DataFrame.reindex_like : Change to same indices as other DataFrame.
+
+ Examples
+ --------
+ >>> df = pd.DataFrame({'month': [1, 4, 7, 10],
+ ... 'year': [2012, 2014, 2013, 2014],
+ ... 'sale': [55, 40, 84, 31]})
+ >>> df
+ month year sale
+ 0 1 2012 55
+ 1 4 2014 40
+ 2 7 2013 84
+ 3 10 2014 31
+
+ Set the index to become the 'month' column:
+
+ >>> df.set_index('month')
+ year sale
+ month
+ 1 2012 55
+ 4 2014 40
+ 7 2013 84
+ 10 2014 31
+
+ Create a MultiIndex using columns 'year' and 'month':
+
+ >>> df.set_index(['year', 'month'])
+ sale
+ year month
+ 2012 1 55
+ 2014 4 40
+ 2013 7 84
+ 2014 10 31
+
+ Create a MultiIndex using an Index and a column:
+
+ >>> df.set_index([pd.Index([1, 2, 3, 4]), 'year'])
+ month sale
+ year
+ 1 2012 1 55
+ 2 2014 4 40
+ 3 2013 7 84
+ 4 2014 10 31
+
+ Create a MultiIndex using two Series:
+
+ >>> s = pd.Series([1, 2, 3, 4])
+ >>> df.set_index([s, s**2])
+ month year sale
+ 1 1 1 2012 55
+ 2 4 4 2014 40
+ 3 9 7 2013 84
+ 4 16 10 2014 31
+ """
+ inplace = validate_bool_kwarg(inplace, 'inplace')
+ if not isinstance(keys, list):
+ keys = [keys]
+
+ if inplace:
+ frame = self
+ else:
+ frame = self.copy()
+
+ arrays = []
+ names = []
+ if append:
+ names = [x for x in self.index.names]
+ if isinstance(self.index, MultiIndex):
+ for i in range(self.index.nlevels):
+ arrays.append(self.index._get_level_values(i))
+ else:
+ arrays.append(self.index)
+
+ to_remove = []
+ for col in keys:
+ if isinstance(col, MultiIndex):
+ # append all but the last column so we don't have to modify
+ # the end of this loop
+ for n in range(col.nlevels - 1):
+ arrays.append(col._get_level_values(n))
+
+ level = col._get_level_values(col.nlevels - 1)
+ names.extend(col.names)
+ elif isinstance(col, Series):
+ level = col._values
+ names.append(col.name)
+ elif isinstance(col, Index):
+ level = col
+ names.append(col.name)
+ elif isinstance(col, (list, np.ndarray, Index)):
+ level = col
+ names.append(None)
+ else:
+ level = frame[col]._values
+ names.append(col)
+ if drop:
+ to_remove.append(col)
+ arrays.append(level)
+
+ index = ensure_index_from_sequences(arrays, names)
+
+ if verify_integrity and not index.is_unique:
+ duplicates = index[index.duplicated()].unique()
+ raise ValueError('Index has duplicate keys: {dup}'.format(
+ dup=duplicates))
+
+ # use set to handle duplicate column names gracefully in case of drop
+ for c in set(to_remove):
+ del frame[c]
+
+ # clear up memory usage
+ index._cleanup()
+
+ frame.index = index
+
+ if not inplace:
+ return frame
+
+ def reset_index(self, level=None, drop=False, inplace=False, col_level=0,
+ col_fill=''):
+ """
+ Reset the index, or a level of it.
+
+ Reset the index of the DataFrame, and use the default one instead.
+ If the DataFrame has a MultiIndex, this method can remove one or more
+ levels.
+
+ Parameters
+ ----------
+ level : int, str, tuple, or list, default None
+ Only remove the given levels from the index. Removes all levels by
+ default.
+ drop : bool, default False
+ Do not try to insert index into dataframe columns. This resets
+ the index to the default integer index.
+ inplace : bool, default False
+ Modify the DataFrame in place (do not create a new object).
+ col_level : int or str, default 0
+ If the columns have multiple levels, determines which level the
+ labels are inserted into. By default it is inserted into the first
+ level.
+ col_fill : object, default ''
+ If the columns have multiple levels, determines how the other
+ levels are named. If None then the index name is repeated.
+
+ Returns
+ -------
+ DataFrame
+ DataFrame with the new index.
+
+ See Also
+ --------
+ DataFrame.set_index : Opposite of reset_index.
+ DataFrame.reindex : Change to new indices or expand indices.
+ DataFrame.reindex_like : Change to same indices as other DataFrame.
+
+ Examples
+ --------
+ >>> df = pd.DataFrame([('bird', 389.0),
+ ... ('bird', 24.0),
+ ... ('mammal', 80.5),
+ ... ('mammal', np.nan)],
+ ... index=['falcon', 'parrot', 'lion', 'monkey'],
+ ... columns=('class', 'max_speed'))
+ >>> df
+ class max_speed
+ falcon bird 389.0
+ parrot bird 24.0
+ lion mammal 80.5
+ monkey mammal NaN
+
+ When we reset the index, the old index is added as a column, and a
+ new sequential index is used:
+
+ >>> df.reset_index()
+ index class max_speed
+ 0 falcon bird 389.0
+ 1 parrot bird 24.0
+ 2 lion mammal 80.5
+ 3 monkey mammal NaN
+
+ We can use the `drop` parameter to avoid the old index being added as
+ a column:
+
+ >>> df.reset_index(drop=True)
+ class max_speed
+ 0 bird 389.0
+ 1 bird 24.0
+ 2 mammal 80.5
+ 3 mammal NaN
+
+ You can also use `reset_index` with `MultiIndex`.
+
+ >>> index = pd.MultiIndex.from_tuples([('bird', 'falcon'),
+ ... ('bird', 'parrot'),
+ ... ('mammal', 'lion'),
+ ... ('mammal', 'monkey')],
+ ... names=['class', 'name'])
+ >>> columns = pd.MultiIndex.from_tuples([('speed', 'max'),
+ ... ('species', 'type')])
+ >>> df = pd.DataFrame([(389.0, 'fly'),
+ ... ( 24.0, 'fly'),
+ ... ( 80.5, 'run'),
+ ... (np.nan, 'jump')],
+ ... index=index,
+ ... columns=columns)
+ >>> df
+ speed species
+ max type
+ class name
+ bird falcon 389.0 fly
+ parrot 24.0 fly
+ mammal lion 80.5 run
+ monkey NaN jump
+
+ If the index has multiple levels, we can reset a subset of them:
+
+ >>> df.reset_index(level='class')
+ class speed species
+ max type
+ name
+ falcon bird 389.0 fly
+ parrot bird 24.0 fly
+ lion mammal 80.5 run
+ monkey mammal NaN jump
+
+ If we are not dropping the index, by default, it is placed in the top
+ level. We can place it in another level:
+
+ >>> df.reset_index(level='class', col_level=1)
+ speed species
+ class max type
+ name
+ falcon bird 389.0 fly
+ parrot bird 24.0 fly
+ lion mammal 80.5 run
+ monkey mammal NaN jump
+
+ When the index is inserted under another level, we can specify under
+ which one with the parameter `col_fill`:
+
+ >>> df.reset_index(level='class', col_level=1, col_fill='species')
+ species speed species
+ class max type
+ name
+ falcon bird 389.0 fly
+ parrot bird 24.0 fly
+ lion mammal 80.5 run
+ monkey mammal NaN jump
+
+ If we specify a nonexistent level for `col_fill`, it is created:
+
+ >>> df.reset_index(level='class', col_level=1, col_fill='genus')
+ genus speed species
+ class max type
+ name
+ falcon bird 389.0 fly
+ parrot bird 24.0 fly
+ lion mammal 80.5 run
+ monkey mammal NaN jump
+ """
+ inplace = validate_bool_kwarg(inplace, 'inplace')
+ if inplace:
+ new_obj = self
+ else:
+ new_obj = self.copy()
+
+ def _maybe_casted_values(index, labels=None):
+ values = index._values
+ if not isinstance(index, (PeriodIndex, DatetimeIndex)):
+ if values.dtype == np.object_:
+ values = lib.maybe_convert_objects(values)
+
+ # if we have the labels, extract the values with a mask
+ if labels is not None:
+ mask = labels == -1
+
+ # we can have situations where the whole mask is -1,
+ # meaning there is nothing found in labels, so make all nan's
+ if mask.all():
+ values = np.empty(len(mask))
+ values.fill(np.nan)
+ else:
+ values = values.take(labels)
+
+ # TODO(https://github.com/pandas-dev/pandas/issues/24206)
+ # Push this into maybe_upcast_putmask?
+ # We can't pass EAs there right now. Looks a bit
+ # complicated.
+ # So we unbox the ndarray_values, op, re-box.
+ values_type = type(values)
+ values_dtype = values.dtype
+
+ if issubclass(values_type, DatetimeLikeArray):
+ values = values._data
+
+ if mask.any():
+ values, changed = maybe_upcast_putmask(
+ values, mask, np.nan)
+
+ if issubclass(values_type, DatetimeLikeArray):
+ values = values_type(values, dtype=values_dtype)
+
+ return values
+
+ new_index = ibase.default_index(len(new_obj))
+ if level is not None:
+ if not isinstance(level, (tuple, list)):
+ level = [level]
+ level = [self.index._get_level_number(lev) for lev in level]
+ if len(level) < self.index.nlevels:
+ new_index = self.index.droplevel(level)
+
+ if not drop:
+ if isinstance(self.index, MultiIndex):
+ names = [n if n is not None else ('level_%d' % i)
+ for (i, n) in enumerate(self.index.names)]
+ to_insert = lzip(self.index.levels, self.index.codes)
+ else:
+ default = 'index' if 'index' not in self else 'level_0'
+ names = ([default] if self.index.name is None
+ else [self.index.name])
+ to_insert = ((self.index, None),)
+
+ multi_col = isinstance(self.columns, MultiIndex)
+ for i, (lev, lab) in reversed(list(enumerate(to_insert))):
+ if not (level is None or i in level):
+ continue
+ name = names[i]
+ if multi_col:
+ col_name = (list(name) if isinstance(name, tuple)
+ else [name])
+ if col_fill is None:
+ if len(col_name) not in (1, self.columns.nlevels):
+ raise ValueError("col_fill=None is incompatible "
+ "with incomplete column name "
+ "{}".format(name))
+ col_fill = col_name[0]
+
+ lev_num = self.columns._get_level_number(col_level)
+ name_lst = [col_fill] * lev_num + col_name
+ missing = self.columns.nlevels - len(name_lst)
+ name_lst += [col_fill] * missing
+ name = tuple(name_lst)
+ # to ndarray and maybe infer different dtype
+ level_values = _maybe_casted_values(lev, lab)
+ new_obj.insert(0, name, level_values)
+
+ new_obj.index = new_index
+ if not inplace:
+ return new_obj
+
+ # ----------------------------------------------------------------------
+ # Reindex-based selection methods
+
+ @Appender(_shared_docs['isna'] % _shared_doc_kwargs)
+ def isna(self):
+ return super(DataFrame, self).isna()
+
+ @Appender(_shared_docs['isna'] % _shared_doc_kwargs)
+ def isnull(self):
+ return super(DataFrame, self).isnull()
+
+ @Appender(_shared_docs['notna'] % _shared_doc_kwargs)
+ def notna(self):
+ return super(DataFrame, self).notna()
+
+ @Appender(_shared_docs['notna'] % _shared_doc_kwargs)
+ def notnull(self):
+ return super(DataFrame, self).notnull()
+
+ def dropna(self, axis=0, how='any', thresh=None, subset=None,
+ inplace=False):
+ """
+ Remove missing values.
+
+ See the :ref:`User Guide <missing_data>` for more on which values are
+ considered missing, and how to work with missing data.
+
+ Parameters
+ ----------
+ axis : {0 or 'index', 1 or 'columns'}, default 0
+ Determine if rows or columns which contain missing values are
+ removed.
+
+ * 0, or 'index' : Drop rows which contain missing values.
+ * 1, or 'columns' : Drop columns which contain missing value.
+
+ .. deprecated:: 0.23.0
+
+ Pass tuple or list to drop on multiple axes.
+ Only a single axis is allowed.
+
+ how : {'any', 'all'}, default 'any'
+ Determine if row or column is removed from DataFrame, when we have
+ at least one NA or all NA.
+
+ * 'any' : If any NA values are present, drop that row or column.
+ * 'all' : If all values are NA, drop that row or column.
+
+ thresh : int, optional
+ Require that many non-NA values.
+ subset : array-like, optional
+ Labels along other axis to consider, e.g. if you are dropping rows
+ these would be a list of columns to include.
+ inplace : bool, default False
+ If True, do operation inplace and return None.
+
+ Returns
+ -------
+ DataFrame
+ DataFrame with NA entries dropped from it.
+
+ See Also
+ --------
+ DataFrame.isna: Indicate missing values.
+ DataFrame.notna : Indicate existing (non-missing) values.
+ DataFrame.fillna : Replace missing values.
+ Series.dropna : Drop missing values.
+ Index.dropna : Drop missing indices.
+
+ Examples
+ --------
+ >>> df = pd.DataFrame({"name": ['Alfred', 'Batman', 'Catwoman'],
+ ... "toy": [np.nan, 'Batmobile', 'Bullwhip'],
+ ... "born": [pd.NaT, pd.Timestamp("1940-04-25"),
+ ... pd.NaT]})
+ >>> df
+ name toy born
+ 0 Alfred NaN NaT
+ 1 Batman Batmobile 1940-04-25
+ 2 Catwoman Bullwhip NaT
+
+ Drop the rows where at least one element is missing.
+
+ >>> df.dropna()
+ name toy born
+ 1 Batman Batmobile 1940-04-25
+
+ Drop the columns where at least one element is missing.
+
+ >>> df.dropna(axis='columns')
+ name
+ 0 Alfred
+ 1 Batman
+ 2 Catwoman
+
+ Drop the rows where all elements are missing.
+
+ >>> df.dropna(how='all')
+ name toy born
+ 0 Alfred NaN NaT
+ 1 Batman Batmobile 1940-04-25
+ 2 Catwoman Bullwhip NaT
+
+ Keep only the rows with at least 2 non-NA values.
+
+ >>> df.dropna(thresh=2)
+ name toy born
+ 1 Batman Batmobile 1940-04-25
+ 2 Catwoman Bullwhip NaT
+
+ Define in which columns to look for missing values.
+
+ >>> df.dropna(subset=['name', 'born'])
+ name toy born
+ 1 Batman Batmobile 1940-04-25
+
+ Keep the DataFrame with valid entries in the same variable.
+
+ >>> df.dropna(inplace=True)
+ >>> df
+ name toy born
+ 1 Batman Batmobile 1940-04-25
+ """
+ inplace = validate_bool_kwarg(inplace, 'inplace')
+ if isinstance(axis, (tuple, list)):
+ # GH20987
+ msg = ("supplying multiple axes to axis is deprecated and "
+ "will be removed in a future version.")
+ warnings.warn(msg, FutureWarning, stacklevel=2)
+
+ result = self
+ for ax in axis:
+ result = result.dropna(how=how, thresh=thresh, subset=subset,
+ axis=ax)
+ else:
+ axis = self._get_axis_number(axis)
+ agg_axis = 1 - axis
+
+ agg_obj = self
+ if subset is not None:
+ ax = self._get_axis(agg_axis)
+ indices = ax.get_indexer_for(subset)
+ check = indices == -1
+ if check.any():
+ raise KeyError(list(np.compress(check, subset)))
+ agg_obj = self.take(indices, axis=agg_axis)
+
+ count = agg_obj.count(axis=agg_axis)
+
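+ # `count` holds the number of non-NA entries along the other axis;
+ # the keep-mask is derived from it according to `thresh` / `how`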
+ if thresh is not None:
+ mask = count >= thresh
+ elif how == 'any':
+ mask = count == len(agg_obj._get_axis(agg_axis))
+ elif how == 'all':
+ mask = count > 0
+ else:
+ if how is not None:
+ raise ValueError('invalid how option: {h}'.format(h=how))
+ else:
+ raise TypeError('must specify how or thresh')
+
+ result = self.loc(axis=axis)[mask]
+
+ if inplace:
+ self._update_inplace(result)
+ else:
+ return result
+
+ def drop_duplicates(self, subset=None, keep='first', inplace=False):
+ """
+ Return DataFrame with duplicate rows removed, optionally only
+ considering certain columns.
+
+ Parameters
+ ----------
+ subset : column label or sequence of labels, optional
+ Only consider certain columns for identifying duplicates, by
+ default use all of the columns
+ keep : {'first', 'last', False}, default 'first'
+ - ``first`` : Drop duplicates except for the first occurrence.
+ - ``last`` : Drop duplicates except for the last occurrence.
+ - False : Drop all duplicates.
+ inplace : boolean, default False
+ Whether to drop duplicates in place or to return a copy
+
+ Returns
+ -------
+ deduplicated : DataFrame
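+
+ Examples
+ --------
+ A small, illustrative frame; by default the first occurrence of
+ each duplicated row is kept, and ``subset`` restricts the
+ comparison to the given columns:
+
+ >>> df = pd.DataFrame({'A': [1, 1, 1, 2], 'B': ['x', 'x', 'y', 'y']})
+ >>> df.drop_duplicates()
+ A B
+ 0 1 x
+ 2 1 y
+ 3 2 y
+ >>> df.drop_duplicates(subset=['A'], keep='last')
+ A B
+ 2 1 y
+ 3 2 y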
+ """
+ if self.empty:
+ return self.copy()
+
+ inplace = validate_bool_kwarg(inplace, 'inplace')
+ duplicated = self.duplicated(subset, keep=keep)
+
+ if inplace:
+ inds, = (-duplicated)._ndarray_values.nonzero()
+ new_data = self._data.take(inds)
+ self._update_inplace(new_data)
+ else:
+ return self[-duplicated]
+
+ def duplicated(self, subset=None, keep='first'):
+ """
+ Return boolean Series denoting duplicate rows, optionally only
+ considering certain columns.
+
+ Parameters
+ ----------
+ subset : column label or sequence of labels, optional
+ Only consider certain columns for identifying duplicates, by
+ default use all of the columns
+ keep : {'first', 'last', False}, default 'first'
+ - ``first`` : Mark duplicates as ``True`` except for the
+ first occurrence.
+ - ``last`` : Mark duplicates as ``True`` except for the
+ last occurrence.
+ - False : Mark all duplicates as ``True``.
+
+ Returns
+ -------
+ duplicated : Series
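+
+ Examples
+ --------
+ A small, illustrative frame; by default only repetitions of an
+ earlier row are marked, while ``keep=False`` marks every member
+ of a duplicated group:
+
+ >>> df = pd.DataFrame({'A': [1, 1, 2], 'B': ['x', 'x', 'y']})
+ >>> df.duplicated()
+ 0 False
+ 1 True
+ 2 False
+ dtype: bool
+ >>> df.duplicated(keep=False)
+ 0 True
+ 1 True
+ 2 False
+ dtype: bool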
+ """
+ from pandas.core.sorting import get_group_index
+ from pandas._libs.hashtable import duplicated_int64, _SIZE_HINT_LIMIT
+
+ if self.empty:
+ return Series(dtype=bool)
+
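+ # factorize each column into integer labels; get_group_index then
+ # combines the per-column labels into a single id per row, so that
+ # duplicated rows share an id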
+ def f(vals):
+ labels, shape = algorithms.factorize(
+ vals, size_hint=min(len(self), _SIZE_HINT_LIMIT))
+ return labels.astype('i8', copy=False), len(shape)
+
+ if subset is None:
+ subset = self.columns
+ elif (not np.iterable(subset) or
+ isinstance(subset, compat.string_types) or
+ isinstance(subset, tuple) and subset in self.columns):
+ subset = subset,
+
+ # Verify all columns in subset exist in the queried dataframe
+ # Otherwise, raise a KeyError, same as if you try to __getitem__ with a
+ # key that doesn't exist.
+ diff = Index(subset).difference(self.columns)
+ if not diff.empty:
+ raise KeyError(diff)
+
+ vals = (col.values for name, col in self.iteritems()
+ if name in subset)
+ labels, shape = map(list, zip(*map(f, vals)))
+
+ ids = get_group_index(labels, shape, sort=False, xnull=False)
+ return Series(duplicated_int64(ids, keep), index=self.index)
+
+ # ----------------------------------------------------------------------
+ # Sorting
+
+ @Substitution(**_shared_doc_kwargs)
+ @Appender(NDFrame.sort_values.__doc__)
+ def sort_values(self, by, axis=0, ascending=True, inplace=False,
+ kind='quicksort', na_position='last'):
+ inplace = validate_bool_kwarg(inplace, 'inplace')
+ axis = self._get_axis_number(axis)
+
+ if not isinstance(by, list):
+ by = [by]
+ if is_sequence(ascending) and len(by) != len(ascending):
+ raise ValueError('Length of ascending (%d) != length of by (%d)' %
+ (len(ascending), len(by)))
+ if len(by) > 1:
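+ # several sort keys: build one lexicographic indexer over all keys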
+ from pandas.core.sorting import lexsort_indexer
+
+ keys = [self._get_label_or_level_values(x, axis=axis)
+ for x in by]
+ indexer = lexsort_indexer(keys, orders=ascending,
+ na_position=na_position)
+ indexer = ensure_platform_int(indexer)
+ else:
+ from pandas.core.sorting import nargsort
+
+ by = by[0]
+ k = self._get_label_or_level_values(by, axis=axis)
+
+ if isinstance(ascending, (tuple, list)):
+ ascending = ascending[0]
+
+ indexer = nargsort(k, kind=kind, ascending=ascending,
+ na_position=na_position)
+
+ new_data = self._data.take(indexer,
+ axis=self._get_block_manager_axis(axis),
+ verify=False)
+
+ if inplace:
+ return self._update_inplace(new_data)
+ else:
+ return self._constructor(new_data).__finalize__(self)
+
+ @Substitution(**_shared_doc_kwargs)
+ @Appender(NDFrame.sort_index.__doc__)
+ def sort_index(self, axis=0, level=None, ascending=True, inplace=False,
+ kind='quicksort', na_position='last', sort_remaining=True,
+ by=None):
+
+ # TODO: this can be combined with Series.sort_index impl as
+ # almost identical
+
+ inplace = validate_bool_kwarg(inplace, 'inplace')
+ # 10726
+ if by is not None:
+ warnings.warn("by argument to sort_index is deprecated, "
+ "please use .sort_values(by=...)",
+ FutureWarning, stacklevel=2)
+ if level is not None:
+ raise ValueError("unable to simultaneously sort by and level")
+ return self.sort_values(by, axis=axis, ascending=ascending,
+ inplace=inplace)
+
+ axis = self._get_axis_number(axis)
+ labels = self._get_axis(axis)
+
+ # make sure that the axis is lexsorted to start
+ # if not we need to reconstruct to get the correct indexer
+ labels = labels._sort_levels_monotonic()
+ if level is not None:
+
+ new_axis, indexer = labels.sortlevel(level, ascending=ascending,
+ sort_remaining=sort_remaining)
+
+ elif isinstance(labels, MultiIndex):
+ from pandas.core.sorting import lexsort_indexer
+
+ indexer = lexsort_indexer(labels._get_codes_for_sorting(),
+ orders=ascending,
+ na_position=na_position)
+ else:
+ from pandas.core.sorting import nargsort
+
+ # Check monotonic-ness before sort an index
+ # GH11080
+ if ((ascending and labels.is_monotonic_increasing) or
+ (not ascending and labels.is_monotonic_decreasing)):
+ if inplace:
+ return
+ else:
+ return self.copy()
+
+ indexer = nargsort(labels, kind=kind, ascending=ascending,
+ na_position=na_position)
+
+ baxis = self._get_block_manager_axis(axis)
+ new_data = self._data.take(indexer,
+ axis=baxis,
+ verify=False)
+
+ # reconstruct axis if needed
+ new_data.axes[baxis] = new_data.axes[baxis]._sort_levels_monotonic()
+
+ if inplace:
+ return self._update_inplace(new_data)
+ else:
+ return self._constructor(new_data).__finalize__(self)
+
+ def nlargest(self, n, columns, keep='first'):
+ """
+ Return the first `n` rows ordered by `columns` in descending order.
+
+ Return the first `n` rows with the largest values in `columns`, in
+ descending order. The columns that are not specified are returned as
+ well, but not used for ordering.
+
+ This method is equivalent to
+ ``df.sort_values(columns, ascending=False).head(n)``, but more
+ performant.
+
+ Parameters
+ ----------
+ n : int
+ Number of rows to return.
+ columns : label or list of labels
+ Column label(s) to order by.
+ keep : {'first', 'last', 'all'}, default 'first'
+ Where there are duplicate values:
+
+ - `first` : prioritize the first occurrence(s)
+ - `last` : prioritize the last occurrence(s)
+ - ``all`` : do not drop any duplicates, even if it means
+ selecting more than `n` items.
+
+ .. versionadded:: 0.24.0
+
+ Returns
+ -------
+ DataFrame
+ The first `n` rows ordered by the given columns in descending
+ order.
+
+ See Also
+ --------
+ DataFrame.nsmallest : Return the first `n` rows ordered by `columns` in
+ ascending order.
+ DataFrame.sort_values : Sort DataFrame by the values.
+ DataFrame.head : Return the first `n` rows without re-ordering.
+
+ Notes
+ -----
+ This function cannot be used with all column types. For example, when
+ specifying columns with `object` or `category` dtypes, ``TypeError`` is
+ raised.
+
+ Examples
+ --------
+ >>> df = pd.DataFrame({'population': [59000000, 65000000, 434000,
+ ... 434000, 434000, 337000, 11300,
+ ... 11300, 11300],
+ ... 'GDP': [1937894, 2583560, 12011, 4520, 12128,
+ ... 17036, 182, 38, 311],
+ ... 'alpha-2': ["IT", "FR", "MT", "MV", "BN",
+ ... "IS", "NR", "TV", "AI"]},
+ ... index=["Italy", "France", "Malta",
+ ... "Maldives", "Brunei", "Iceland",
+ ... "Nauru", "Tuvalu", "Anguilla"])
+ >>> df
+ population GDP alpha-2
+ Italy 59000000 1937894 IT
+ France 65000000 2583560 FR
+ Malta 434000 12011 MT
+ Maldives 434000 4520 MV
+ Brunei 434000 12128 BN
+ Iceland 337000 17036 IS
+ Nauru 11300 182 NR
+ Tuvalu 11300 38 TV
+ Anguilla 11300 311 AI
+
+ In the following example, we will use ``nlargest`` to select the three
+ rows having the largest values in column "population".
+
+ >>> df.nlargest(3, 'population')
+ population GDP alpha-2
+ France 65000000 2583560 FR
+ Italy 59000000 1937894 IT
+ Malta 434000 12011 MT
+
+ When using ``keep='last'``, ties are resolved in reverse order:
+
+ >>> df.nlargest(3, 'population', keep='last')
+ population GDP alpha-2
+ France 65000000 2583560 FR
+ Italy 59000000 1937894 IT
+ Brunei 434000 12128 BN
+
+ When using ``keep='all'``, all duplicate items are maintained:
+
+ >>> df.nlargest(3, 'population', keep='all')
+ population GDP alpha-2
+ France 65000000 2583560 FR
+ Italy 59000000 1937894 IT
+ Malta 434000 12011 MT
+ Maldives 434000 4520 MV
+ Brunei 434000 12128 BN
+
+ To order by the largest values in column "population" and then "GDP",
+ we can specify multiple columns like in the next example.
+
+ >>> df.nlargest(3, ['population', 'GDP'])
+ population GDP alpha-2
+ France 65000000 2583560 FR
+ Italy 59000000 1937894 IT
+ Brunei 434000 12128 BN
+ """
+ return algorithms.SelectNFrame(self,
+ n=n,
+ keep=keep,
+ columns=columns).nlargest()
+
+ def nsmallest(self, n, columns, keep='first'):
+ """
+ Return the first `n` rows ordered by `columns` in ascending order.
+
+ Return the first `n` rows with the smallest values in `columns`, in
+ ascending order. The columns that are not specified are returned as
+ well, but not used for ordering.
+
+ This method is equivalent to
+ ``df.sort_values(columns, ascending=True).head(n)``, but more
+ performant.
+
+ Parameters
+ ----------
+ n : int
+ Number of items to retrieve.
+ columns : list or str
+ Column name or names to order by.
+ keep : {'first', 'last', 'all'}, default 'first'
+ Where there are duplicate values:
+
+ - ``first`` : take the first occurrence.
+ - ``last`` : take the last occurrence.
+ - ``all`` : do not drop any duplicates, even if it means
+ selecting more than `n` items.
+
+ .. versionadded:: 0.24.0
+
+ Returns
+ -------
+ DataFrame
+
+ See Also
+ --------
+ DataFrame.nlargest : Return the first `n` rows ordered by `columns` in
+ descending order.
+ DataFrame.sort_values : Sort DataFrame by the values.
+ DataFrame.head : Return the first `n` rows without re-ordering.
+
+ Examples
+ --------
+ >>> df = pd.DataFrame({'population': [59000000, 65000000, 434000,
+ ... 434000, 434000, 337000, 11300,
+ ... 11300, 11300],
+ ... 'GDP': [1937894, 2583560, 12011, 4520, 12128,
+ ... 17036, 182, 38, 311],
+ ... 'alpha-2': ["IT", "FR", "MT", "MV", "BN",
+ ... "IS", "NR", "TV", "AI"]},
+ ... index=["Italy", "France", "Malta",
+ ... "Maldives", "Brunei", "Iceland",
+ ... "Nauru", "Tuvalu", "Anguilla"])
+ >>> df
+ population GDP alpha-2
+ Italy 59000000 1937894 IT
+ France 65000000 2583560 FR
+ Malta 434000 12011 MT
+ Maldives 434000 4520 MV
+ Brunei 434000 12128 BN
+ Iceland 337000 17036 IS
+ Nauru 11300 182 NR
+ Tuvalu 11300 38 TV
+ Anguilla 11300 311 AI
+
+ In the following example, we will use ``nsmallest`` to select the
+ three rows having the smallest values in column "population".
+
+ >>> df.nsmallest(3, 'population')
+ population GDP alpha-2
+ Nauru 11300 182 NR
+ Tuvalu 11300 38 TV
+ Anguilla 11300 311 AI
+
+ When using ``keep='last'``, ties are resolved in reverse order:
+
+ >>> df.nsmallest(3, 'population', keep='last')
+ population GDP alpha-2
+ Anguilla 11300 311 AI
+ Tuvalu 11300 38 TV
+ Nauru 11300 182 NR
+
+ When using ``keep='all'``, all duplicate items are maintained:
+
+ >>> df.nsmallest(3, 'population', keep='all')
+ population GDP alpha-2
+ Nauru 11300 182 NR
+ Tuvalu 11300 38 TV
+ Anguilla 11300 311 AI
+
+ To order by the smallest values in column "population" and then
+ "GDP", we can specify multiple columns like in the next example.
+
+ >>> df.nsmallest(3, ['population', 'GDP'])
+ population GDP alpha-2
+ Tuvalu 11300 38 TV
+ Nauru 11300 182 NR
+ Anguilla 11300 311 AI
+ """
+ return algorithms.SelectNFrame(self,
+ n=n,
+ keep=keep,
+ columns=columns).nsmallest()
+
+ def swaplevel(self, i=-2, j=-1, axis=0):
+ """
+ Swap levels i and j in a MultiIndex on a particular axis.
+
+ Parameters
+ ----------
+ i, j : int, string (can be mixed)
+ Level of index to be swapped. Can pass level name as string.
+ axis : {0 or 'index', 1 or 'columns'}, default 0
+ The axis to swap levels on.
+
+ Returns
+ -------
+ swapped : same type as caller (new object)
+
+ .. versionchanged:: 0.18.1
+
+ The indexes ``i`` and ``j`` are now optional, and default to
+ the two innermost levels of the index.
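+
+ Examples
+ --------
+ Swap the two levels of a small, illustrative MultiIndex:
+
+ >>> midx = pd.MultiIndex.from_tuples([('a', 1), ('a', 2), ('b', 1)])
+ >>> df = pd.DataFrame({'val': [10, 20, 30]}, index=midx)
+ >>> df
+ val
+ a 1 10
+ 2 20
+ b 1 30
+ >>> df.swaplevel(0, 1)
+ val
+ 1 a 10
+ 2 a 20
+ 1 b 30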
+ """
+ result = self.copy()
+
+ axis = self._get_axis_number(axis)
+ if axis == 0:
+ result.index = result.index.swaplevel(i, j)
+ else:
+ result.columns = result.columns.swaplevel(i, j)
+ return result
+
+ def reorder_levels(self, order, axis=0):
+ """
+ Rearrange index levels using input order. May not drop or
+ duplicate levels.
+
+ Parameters
+ ----------
+ order : list of int or list of str
+ List representing new level order. Reference level by number
+ (position) or by key (label).
+ axis : int
+ Where to reorder levels.
+
+ Returns
+ -------
+ type of caller (new object)
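+
+ Examples
+ --------
+ Move the innermost level of a small, illustrative MultiIndex to
+ the front (for two levels this matches ``swaplevel``):
+
+ >>> midx = pd.MultiIndex.from_tuples([('a', 1), ('a', 2), ('b', 1)])
+ >>> df = pd.DataFrame({'val': [10, 20, 30]}, index=midx)
+ >>> df.reorder_levels([1, 0])
+ val
+ 1 a 10
+ 2 a 20
+ 1 b 30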
+ """
+ axis = self._get_axis_number(axis)
+ if not isinstance(self._get_axis(axis),
+ MultiIndex): # pragma: no cover
+ raise TypeError('Can only reorder levels on a hierarchical axis.')
+
+ result = self.copy()
+
+ if axis == 0:
+ result.index = result.index.reorder_levels(order)
+ else:
+ result.columns = result.columns.reorder_levels(order)
+ return result
+
+ # ----------------------------------------------------------------------
+ # Arithmetic / combination related
+
+ def _combine_frame(self, other, func, fill_value=None, level=None):
+ this, other = self.align(other, join='outer', level=level, copy=False)
+ new_index, new_columns = this.index, this.columns
+
+ def _arith_op(left, right):
+ # for the mixed_type case where we iterate over columns,
+ # _arith_op(left, right) is equivalent to
+ # left._binop(right, func, fill_value=fill_value)
+ left, right = ops.fill_binop(left, right, fill_value)
+ return func(left, right)
+
+ if ops.should_series_dispatch(this, other, func):
+ # iterate over columns
+ return ops.dispatch_to_series(this, other, _arith_op)
+ else:
+ result = _arith_op(this.values, other.values)
+ return self._constructor(result,
+ index=new_index, columns=new_columns,
+ copy=False)
+
+ def _combine_match_index(self, other, func, level=None):
+ left, right = self.align(other, join='outer', axis=0, level=level,
+ copy=False)
+ assert left.index.equals(right.index)
+
+ if left._is_mixed_type or right._is_mixed_type:
+ # operate column-wise; avoid costly object-casting in `.values`
+ return ops.dispatch_to_series(left, right, func)
+ else:
+ # fastpath --> operate directly on values
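+ # transpose so the index-aligned Series values broadcast against
+ # each column, then transpose the result back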
+ with np.errstate(all="ignore"):
+ new_data = func(left.values.T, right.values).T
+ return self._constructor(new_data,
+ index=left.index, columns=self.columns,
+ copy=False)
+
+ def _combine_match_columns(self, other, func, level=None):
+ assert isinstance(other, Series)
+ left, right = self.align(other, join='outer', axis=1, level=level,
+ copy=False)
+ assert left.columns.equals(right.index)
+ return ops.dispatch_to_series(left, right, func, axis="columns")
+
+ def _combine_const(self, other, func):
+ assert lib.is_scalar(other) or np.ndim(other) == 0
+ return ops.dispatch_to_series(self, other, func)
+
+ def combine(self, other, func, fill_value=None, overwrite=True):
+ """
+ Perform column-wise combine with another DataFrame based on a
+ passed function.
+
+ Combines a DataFrame with `other` DataFrame using `func`
+ to element-wise combine columns. The row and column indexes of the
+ resulting DataFrame will be the union of the two.
+
+ Parameters
+ ----------
+ other : DataFrame
+ The DataFrame to merge column-wise.
+ func : function
+ Function that takes two Series as inputs and returns a Series or a
+ scalar. Used to merge the two dataframes column by column.
+ fill_value : scalar value, default None
+ The value to fill NaNs with prior to passing any column to the
+ merge func.
+ overwrite : boolean, default True
+ If True, columns in `self` that do not exist in `other` will be
+ overwritten with NaNs.
+
+ Returns
+ -------
+ result : DataFrame
+
+ See Also
+ --------
+ DataFrame.combine_first : Combine two DataFrame objects and default to
+ non-null values in frame calling the method.
+
+ Examples
+ --------
+ Combine using a simple function that chooses the smaller column.
+
+ >>> df1 = pd.DataFrame({'A': [0, 0], 'B': [4, 4]})
+ >>> df2 = pd.DataFrame({'A': [1, 1], 'B': [3, 3]})
+ >>> take_smaller = lambda s1, s2: s1 if s1.sum() < s2.sum() else s2
+ >>> df1.combine(df2, take_smaller)
+ A B
+ 0 0 3
+ 1 0 3
+
+ Example using a true element-wise combine function.
+
+ >>> df1 = pd.DataFrame({'A': [5, 0], 'B': [2, 4]})
+ >>> df2 = pd.DataFrame({'A': [1, 1], 'B': [3, 3]})
+ >>> df1.combine(df2, np.minimum)
+ A B
+ 0 1 2
+ 1 0 3
+
+ Using `fill_value` fills Nones prior to passing the column to the
+ merge function.
+
+ >>> df1 = pd.DataFrame({'A': [0, 0], 'B': [None, 4]})
+ >>> df2 = pd.DataFrame({'A': [1, 1], 'B': [3, 3]})
+ >>> df1.combine(df2, take_smaller, fill_value=-5)
+ A B
+ 0 0 -5.0
+ 1 0 4.0
+
+ However, if the same element in both dataframes is None, that None
+ is preserved.
+
+ >>> df1 = pd.DataFrame({'A': [0, 0], 'B': [None, 4]})
+ >>> df2 = pd.DataFrame({'A': [1, 1], 'B': [None, 3]})
+ >>> df1.combine(df2, take_smaller, fill_value=-5)
+ A B
+ 0 0 NaN
+ 1 0 3.0
+
+ Example that demonstrates the use of `overwrite` and behavior when
+ the axes differ between the dataframes.
+
+ >>> df1 = pd.DataFrame({'A': [0, 0], 'B': [4, 4]})
+ >>> df2 = pd.DataFrame({'B': [3, 3], 'C': [-10, 1],}, index=[1, 2])
+ >>> df1.combine(df2, take_smaller)
+ A B C
+ 0 NaN NaN NaN
+ 1 NaN 3.0 -10.0
+ 2 NaN 3.0 1.0
+
+ >>> df1.combine(df2, take_smaller, overwrite=False)
+ A B C
+ 0 0.0 NaN NaN
+ 1 0.0 3.0 -10.0
+ 2 NaN 3.0 1.0
+
+ Demonstrating the preference of the passed in dataframe.
+
+ >>> df2 = pd.DataFrame({'B': [3, 3], 'C': [1, 1],}, index=[1, 2])
+ >>> df2.combine(df1, take_smaller)
+ A B C
+ 0 0.0 NaN NaN
+ 1 0.0 3.0 NaN
+ 2 NaN 3.0 NaN
+
+ >>> df2.combine(df1, take_smaller, overwrite=False)
+ A B C
+ 0 0.0 NaN NaN
+ 1 0.0 3.0 1.0
+ 2 NaN 3.0 1.0
+ """
+ other_idxlen = len(other.index) # save for compare
+
+ this, other = self.align(other, copy=False)
+ new_index = this.index
+
+ if other.empty and len(new_index) == len(self.index):
+ return self.copy()
+
+ if self.empty and len(other) == other_idxlen:
+ return other.copy()
+
+ # sorts if possible
+ new_columns = this.columns.union(other.columns)
+ do_fill = fill_value is not None
+ result = {}
+ for col in new_columns:
+ series = this[col]
+ otherSeries = other[col]
+
+ this_dtype = series.dtype
+ other_dtype = otherSeries.dtype
+
+ this_mask = isna(series)
+ other_mask = isna(otherSeries)
+
+ # don't overwrite columns unnecessarily
+ # DO propagate if this column is not in the intersection
+ if not overwrite and other_mask.all():
+ result[col] = this[col].copy()
+ continue
+
+ if do_fill:
+ series = series.copy()
+ otherSeries = otherSeries.copy()
+ series[this_mask] = fill_value
+ otherSeries[other_mask] = fill_value
+
+ if col not in self.columns:
+ # If self DataFrame does not have col in other DataFrame,
+ # try to promote series, which is all NaN, as other_dtype.
+ new_dtype = other_dtype
+ try:
+ series = series.astype(new_dtype, copy=False)
+ except ValueError:
+ # e.g. new_dtype is integer types
+ pass
+ else:
+ # if we have different dtypes, possibly promote
+ new_dtype = find_common_type([this_dtype, other_dtype])
+ if not is_dtype_equal(this_dtype, new_dtype):
+ series = series.astype(new_dtype)
+ if not is_dtype_equal(other_dtype, new_dtype):
+ otherSeries = otherSeries.astype(new_dtype)
+
+ arr = func(series, otherSeries)
+ arr = maybe_downcast_to_dtype(arr, this_dtype)
+
+ result[col] = arr
+
+ # convert_objects just in case
+ return self._constructor(result, index=new_index,
+ columns=new_columns)
+
+ def combine_first(self, other):
+ """
+ Update null elements with value in the same location in `other`.
+
+ Combine two DataFrame objects by filling null values in one DataFrame
+ with non-null values from other DataFrame. The row and column indexes
+ of the resulting DataFrame will be the union of the two.
+
+ Parameters
+ ----------
+ other : DataFrame
+ Provided DataFrame to use to fill null values.
+
+ Returns
+ -------
+ combined : DataFrame
+
+ See Also
+ --------
+ DataFrame.combine : Perform series-wise operation on two DataFrames
+ using a given function.
+
+ Examples
+ --------
+
+ >>> df1 = pd.DataFrame({'A': [None, 0], 'B': [None, 4]})
+ >>> df2 = pd.DataFrame({'A': [1, 1], 'B': [3, 3]})
+ >>> df1.combine_first(df2)
+ A B
+ 0 1.0 3.0
+ 1 0.0 4.0
+
+ Null values still persist if the location of that null value
+ does not exist in `other`.
+
+ >>> df1 = pd.DataFrame({'A': [None, 0], 'B': [4, None]})
+ >>> df2 = pd.DataFrame({'B': [3, 3], 'C': [1, 1]}, index=[1, 2])
+ >>> df1.combine_first(df2)
+ A B C
+ 0 NaN 4.0 NaN
+ 1 0.0 3.0 1.0
+ 2 NaN 3.0 1.0
+ """
+ import pandas.core.computation.expressions as expressions
+
+ def extract_values(arr):
+ # Does two things:
+ # 1. maybe gets the values from the Series / Index
+ # 2. convert datelike to i8
+ if isinstance(arr, (ABCIndexClass, ABCSeries)):
+ arr = arr._values
+
+ if needs_i8_conversion(arr):
+ if is_extension_array_dtype(arr.dtype):
+ arr = arr.asi8
+ else:
+ arr = arr.view('i8')
+ return arr
+
+ def combiner(x, y):
+ mask = isna(x)
+ if isinstance(mask, (ABCIndexClass, ABCSeries)):
+ mask = mask._values
+
+ x_values = extract_values(x)
+ y_values = extract_values(y)
+
+ # If the column y in other DataFrame is not in first DataFrame,
+ # just return y_values.
+ if y.name not in self.columns:
+ return y_values
+
+ return expressions.where(mask, y_values, x_values)
+
+ return self.combine(other, combiner, overwrite=False)
+
+ @deprecate_kwarg(old_arg_name='raise_conflict', new_arg_name='errors',
+ mapping={False: 'ignore', True: 'raise'})
+ def update(self, other, join='left', overwrite=True, filter_func=None,
+ errors='ignore'):
+ """
+ Modify in place using non-NA values from another DataFrame.
+
+ Aligns on indices. There is no return value.
+
+ Parameters
+ ----------
+ other : DataFrame, or object coercible into a DataFrame
+ Should have at least one matching index/column label
+ with the original DataFrame. If a Series is passed,
+ its name attribute must be set, and that will be
+ used as the column name to align with the original DataFrame.
+ join : {'left'}, default 'left'
+ Only left join is implemented, keeping the index and columns of the
+ original object.
+ overwrite : bool, default True
+ How to handle non-NA values for overlapping keys:
+
+ * True: overwrite original DataFrame's values
+ with values from `other`.
+ * False: only update values that are NA in
+ the original DataFrame.
+
+ filter_func : callable(1d-array) -> bool 1d-array, optional
+ Can choose to replace values other than NA. Return True for values
+ that should be updated.
+ errors : {'raise', 'ignore'}, default 'ignore'
+ If 'raise', will raise a ValueError if the DataFrame and `other`
+ both contain non-NA data in the same place.
+
+ .. versionchanged :: 0.24.0
+ Changed from `raise_conflict=False|True`
+ to `errors='ignore'|'raise'`.
+
+ Returns
+ -------
+ None : method directly changes calling object
+
+ Raises
+ ------
+ ValueError
+ * When `errors='raise'` and there's overlapping non-NA data.
+ * When `errors` is not either `'ignore'` or `'raise'`
+ NotImplementedError
+ * If `join != 'left'`
+
+ See Also
+ --------
+ dict.update : Similar method for dictionaries.
+ DataFrame.merge : For column(s)-on-columns(s) operations.
+
+ Examples
+ --------
+ >>> df = pd.DataFrame({'A': [1, 2, 3],
+ ... 'B': [400, 500, 600]})
+ >>> new_df = pd.DataFrame({'B': [4, 5, 6],
+ ... 'C': [7, 8, 9]})
+ >>> df.update(new_df)
+ >>> df
+ A B
+ 0 1 4
+ 1 2 5
+ 2 3 6
+
+ The DataFrame's length does not increase as a result of the update,
+ only values at matching index/column labels are updated.
+
+ >>> df = pd.DataFrame({'A': ['a', 'b', 'c'],
+ ... 'B': ['x', 'y', 'z']})
+ >>> new_df = pd.DataFrame({'B': ['d', 'e', 'f', 'g', 'h', 'i']})
+ >>> df.update(new_df)
+ >>> df
+ A B
+ 0 a d
+ 1 b e
+ 2 c f
+
+ For Series, its name attribute must be set.
+
+ >>> df = pd.DataFrame({'A': ['a', 'b', 'c'],
+ ... 'B': ['x', 'y', 'z']})
+ >>> new_column = pd.Series(['d', 'e'], name='B', index=[0, 2])
+ >>> df.update(new_column)
+ >>> df
+ A B
+ 0 a d
+ 1 b y
+ 2 c e
+ >>> df = pd.DataFrame({'A': ['a', 'b', 'c'],
+ ... 'B': ['x', 'y', 'z']})
+ >>> new_df = pd.DataFrame({'B': ['d', 'e']}, index=[1, 2])
+ >>> df.update(new_df)
+ >>> df
+ A B
+ 0 a x
+ 1 b d
+ 2 c e
+
+ If `other` contains NaNs, the corresponding values are not updated
+ in the original dataframe.
+
+ >>> df = pd.DataFrame({'A': [1, 2, 3],
+ ... 'B': [400, 500, 600]})
+ >>> new_df = pd.DataFrame({'B': [4, np.nan, 6]})
+ >>> df.update(new_df)
+ >>> df
+ A B
+ 0 1 4.0
+ 1 2 500.0
+ 2 3 6.0
+ """
+ import pandas.core.computation.expressions as expressions
+ # TODO: Support other joins
+ if join != 'left': # pragma: no cover
+ raise NotImplementedError("Only left join is supported")
+ if errors not in ['ignore', 'raise']:
+ raise ValueError("The parameter errors must be either "
+ "'ignore' or 'raise'")
+
+ if not isinstance(other, DataFrame):
+ other = DataFrame(other)
+
+ other = other.reindex_like(self)
+
+ for col in self.columns:
+ this = self[col].values
+ that = other[col].values
+ if filter_func is not None:
+ with np.errstate(all='ignore'):
+ mask = ~filter_func(this) | isna(that)
+ else:
+ if errors == 'raise':
+ mask_this = notna(that)
+ mask_that = notna(this)
+ if any(mask_this & mask_that):
+ raise ValueError("Data overlaps.")
+
+ if overwrite:
+ mask = isna(that)
+ else:
+ mask = notna(this)
+
+ # don't overwrite columns unnecessarily
+ if mask.all():
+ continue
+
+ self[col] = expressions.where(mask, this, that)
+
+ # ----------------------------------------------------------------------
+ # Data reshaping
+
+ _shared_docs['pivot'] = """
+ Return reshaped DataFrame organized by given index / column values.
+
+ Reshape data (produce a "pivot" table) based on column values. Uses
+ unique values from specified `index` / `columns` to form axes of the
+ resulting DataFrame. This function does not support data
+ aggregation, multiple values will result in a MultiIndex in the
+ columns. See the :ref:`User Guide <reshaping>` for more on reshaping.
+
+ Parameters
+ ----------%s
+ index : string or object, optional
+ Column to use to make new frame's index. If None, uses
+ existing index.
+ columns : string or object
+ Column to use to make new frame's columns.
+ values : string, object or a list of the previous, optional
+ Column(s) to use for populating new frame's values. If not
+ specified, all remaining columns will be used and the result will
+ have hierarchically indexed columns.
+
+ .. versionchanged :: 0.23.0
+ Also accept list of column names.
+
+ Returns
+ -------
+ DataFrame
+ Returns reshaped DataFrame.
+
+ Raises
+ ------
+ ValueError:
+ When there are any `index`, `columns` combinations with multiple
+ values. Use `DataFrame.pivot_table` when you need to aggregate.
+
+ See Also
+ --------
+ DataFrame.pivot_table : Generalization of pivot that can handle
+ duplicate values for one index/column pair.
+ DataFrame.unstack : Pivot based on the index values instead of a
+ column.
+
+ Notes
+ -----
+ For finer-tuned control, see hierarchical indexing documentation along
+ with the related stack/unstack methods.
+
+ Examples
+ --------
+ >>> df = pd.DataFrame({'foo': ['one', 'one', 'one', 'two', 'two',
+ ... 'two'],
+ ... 'bar': ['A', 'B', 'C', 'A', 'B', 'C'],
+ ... 'baz': [1, 2, 3, 4, 5, 6],
+ ... 'zoo': ['x', 'y', 'z', 'q', 'w', 't']})
+ >>> df
+ foo bar baz zoo
+ 0 one A 1 x
+ 1 one B 2 y
+ 2 one C 3 z
+ 3 two A 4 q
+ 4 two B 5 w
+ 5 two C 6 t
+
+ >>> df.pivot(index='foo', columns='bar', values='baz')
+ bar A B C
+ foo
+ one 1 2 3
+ two 4 5 6
+
+ >>> df.pivot(index='foo', columns='bar')['baz']
+ bar A B C
+ foo
+ one 1 2 3
+ two 4 5 6
+
+ >>> df.pivot(index='foo', columns='bar', values=['baz', 'zoo'])
+ baz zoo
+ bar A B C A B C
+ foo
+ one 1 2 3 x y z
+ two 4 5 6 q w t
+
+ A ValueError is raised if there are any duplicates.
+
+ >>> df = pd.DataFrame({"foo": ['one', 'one', 'two', 'two'],
+ ... "bar": ['A', 'A', 'B', 'C'],
+ ... "baz": [1, 2, 3, 4]})
+ >>> df
+ foo bar baz
+ 0 one A 1
+ 1 one A 2
+ 2 two B 3
+ 3 two C 4
+
+ Notice that the first two rows are the same for our `index`
+ and `columns` arguments.
+
+ >>> df.pivot(index='foo', columns='bar', values='baz')
+ Traceback (most recent call last):
+ ...
+ ValueError: Index contains duplicate entries, cannot reshape
+ """
+
+ @Substitution('')
+ @Appender(_shared_docs['pivot'])
+ def pivot(self, index=None, columns=None, values=None):
+ from pandas.core.reshape.pivot import pivot
+ return pivot(self, index=index, columns=columns, values=values)
+
+ _shared_docs['pivot_table'] = """
+ Create a spreadsheet-style pivot table as a DataFrame. The levels in
+ the pivot table will be stored in MultiIndex objects (hierarchical
+ indexes) on the index and columns of the result DataFrame.
+
+ Parameters
+ ----------%s
+ values : column to aggregate, optional
+ index : column, Grouper, array, or list of the previous
+ If an array is passed, it must be the same length as the data. The
+ list can contain any of the other types (except list).
+ Keys to group by on the pivot table index. If an array is passed,
+ it is used in the same manner as column values.
+ columns : column, Grouper, array, or list of the previous
+ If an array is passed, it must be the same length as the data. The
+ list can contain any of the other types (except list).
+ Keys to group by on the pivot table column. If an array is passed,
+ it is used in the same manner as column values.
+ aggfunc : function, list of functions, dict, default numpy.mean
+ If list of functions passed, the resulting pivot table will have
+ hierarchical columns whose top level are the function names
+ (inferred from the function objects themselves)
+ If dict is passed, the key is column to aggregate and value
+ is function or list of functions
+ fill_value : scalar, default None
+ Value to replace missing values with
+ margins : boolean, default False
+ Add all rows / columns (e.g. for subtotal / grand totals)
+ dropna : boolean, default True
+ Do not include columns whose entries are all NaN
+ margins_name : string, default 'All'
+ Name of the row / column that will contain the totals
+ when margins is True.
+
+ Returns
+ -------
+ table : DataFrame
+
+ See Also
+ --------
+ DataFrame.pivot : Pivot without aggregation that can handle
+ non-numeric data.
+
+ Examples
+ --------
+ >>> df = pd.DataFrame({"A": ["foo", "foo", "foo", "foo", "foo",
+ ... "bar", "bar", "bar", "bar"],
+ ... "B": ["one", "one", "one", "two", "two",
+ ... "one", "one", "two", "two"],
+ ... "C": ["small", "large", "large", "small",
+ ... "small", "large", "small", "small",
+ ... "large"],
+ ... "D": [1, 2, 2, 3, 3, 4, 5, 6, 7],
+ ... "E": [2, 4, 5, 5, 6, 6, 8, 9, 9]})
+ >>> df
+ A B C D E
+ 0 foo one small 1 2
+ 1 foo one large 2 4
+ 2 foo one large 2 5
+ 3 foo two small 3 5
+ 4 foo two small 3 6
+ 5 bar one large 4 6
+ 6 bar one small 5 8
+ 7 bar two small 6 9
+ 8 bar two large 7 9
+
+ This first example aggregates values by taking the sum.
+
+ >>> table = pd.pivot_table(df, values='D', index=['A', 'B'],
+ ... columns=['C'], aggfunc=np.sum)
+ >>> table
+ C large small
+ A B
+ bar one 4 5
+ two 7 6
+ foo one 4 1
+ two NaN 6
+
+ We can also fill missing values using the `fill_value` parameter.
+
+ >>> table = pd.pivot_table(df, values='D', index=['A', 'B'],
+ ... columns=['C'], aggfunc=np.sum, fill_value=0)
+ >>> table
+ C large small
+ A B
+ bar one 4 5
+ two 7 6
+ foo one 4 1
+ two 0 6
+
+ The next example aggregates by taking the mean across multiple columns.
+
+ >>> table = pd.pivot_table(df, values=['D', 'E'], index=['A', 'C'],
+ ... aggfunc={'D': np.mean,
+ ... 'E': np.mean})
+ >>> table
+ D E
+ mean mean
+ A C
+ bar large 5.500000 7.500000
+ small 5.500000 8.500000
+ foo large 2.000000 4.500000
+ small 2.333333 4.333333
+
+ We can also calculate multiple types of aggregations for any given
+ value column.
+
+ >>> table = pd.pivot_table(df, values=['D', 'E'], index=['A', 'C'],
+ ... aggfunc={'D': np.mean,
+ ... 'E': [min, max, np.mean]})
+ >>> table
+ D E
+ mean max mean min
+ A C
+ bar large 5.500000 9 7.500000 6
+ small 5.500000 9 8.500000 8
+ foo large 2.000000 5 4.500000 4
+ small 2.333333 6 4.333333 2
+ """
+
+ @Substitution('')
+ @Appender(_shared_docs['pivot_table'])
+ def pivot_table(self, values=None, index=None, columns=None,
+ aggfunc='mean', fill_value=None, margins=False,
+ dropna=True, margins_name='All'):
+ from pandas.core.reshape.pivot import pivot_table
+ return pivot_table(self, values=values, index=index, columns=columns,
+ aggfunc=aggfunc, fill_value=fill_value,
+ margins=margins, dropna=dropna,
+ margins_name=margins_name)
+
+ def stack(self, level=-1, dropna=True):
+ """
+ Stack the prescribed level(s) from columns to index.
+
+ Return a reshaped DataFrame or Series having a multi-level
+ index with one or more new inner-most levels compared to the current
+ DataFrame. The new inner-most levels are created by pivoting the
+ columns of the current dataframe:
+
+ - if the columns have a single level, the output is a Series;
+ - if the columns have multiple levels, the new index
+ level(s) is (are) taken from the prescribed level(s) and
+ the output is a DataFrame.
+
+ The new index levels are sorted.
+
+ Parameters
+ ----------
+ level : int, str, list, default -1
+ Level(s) to stack from the column axis onto the index
+ axis, defined as one index or label, or a list of indices
+ or labels.
+ dropna : bool, default True
+ Whether to drop rows in the resulting Frame/Series with
+ missing values. Stacking a column level onto the index
+ axis can create combinations of index and column values
+ that are missing from the original dataframe. See Examples
+ section.
+
+ Returns
+ -------
+ DataFrame or Series
+ Stacked dataframe or series.
+
+ See Also
+ --------
+ DataFrame.unstack : Unstack prescribed level(s) from index axis
+ onto column axis.
+ DataFrame.pivot : Reshape dataframe from long format to wide
+ format.
+ DataFrame.pivot_table : Create a spreadsheet-style pivot table
+ as a DataFrame.
+
+ Notes
+ -----
+ The function is named by analogy with a collection of books
+ being re-organised from being side by side on a horizontal
+ position (the columns of the dataframe) to being stacked
+ vertically on top of each other (in the index of the
+ dataframe).
+
+ Examples
+ --------
+ **Single level columns**
+
+ >>> df_single_level_cols = pd.DataFrame([[0, 1], [2, 3]],
+ ... index=['cat', 'dog'],
+ ... columns=['weight', 'height'])
+
+ Stacking a dataframe with a single level column axis returns a Series:
+
+ >>> df_single_level_cols
+ weight height
+ cat 0 1
+ dog 2 3
+ >>> df_single_level_cols.stack()
+ cat weight 0
+ height 1
+ dog weight 2
+ height 3
+ dtype: int64
+
+ **Multi level columns: simple case**
+
+ >>> multicol1 = pd.MultiIndex.from_tuples([('weight', 'kg'),
+ ... ('weight', 'pounds')])
+ >>> df_multi_level_cols1 = pd.DataFrame([[1, 2], [2, 4]],
+ ... index=['cat', 'dog'],
+ ... columns=multicol1)
+
+ Stacking a dataframe with a multi-level column axis:
+
+ >>> df_multi_level_cols1
+ weight
+ kg pounds
+ cat 1 2
+ dog 2 4
+ >>> df_multi_level_cols1.stack()
+ weight
+ cat kg 1
+ pounds 2
+ dog kg 2
+ pounds 4
+
+ **Missing values**
+
+ >>> multicol2 = pd.MultiIndex.from_tuples([('weight', 'kg'),
+ ... ('height', 'm')])
+ >>> df_multi_level_cols2 = pd.DataFrame([[1.0, 2.0], [3.0, 4.0]],
+ ... index=['cat', 'dog'],
+ ... columns=multicol2)
+
+ It is common to have missing values when stacking a dataframe
+ with multi-level columns, as the stacked dataframe typically
+ has more values than the original dataframe. Missing values
+ are filled with NaNs:
+
+ >>> df_multi_level_cols2
+ weight height
+ kg m
+ cat 1.0 2.0
+ dog 3.0 4.0
+ >>> df_multi_level_cols2.stack()
+ height weight
+ cat kg NaN 1.0
+ m 2.0 NaN
+ dog kg NaN 3.0
+ m 4.0 NaN
+
+ **Prescribing the level(s) to be stacked**
+
+ The first parameter controls which level or levels are stacked:
+
+ >>> df_multi_level_cols2.stack(0)
+ kg m
+ cat height NaN 2.0
+ weight 1.0 NaN
+ dog height NaN 4.0
+ weight 3.0 NaN
+ >>> df_multi_level_cols2.stack([0, 1])
+ cat height m 2.0
+ weight kg 1.0
+ dog height m 4.0
+ weight kg 3.0
+ dtype: float64
+
+ **Dropping missing values**
+
+ >>> df_multi_level_cols3 = pd.DataFrame([[None, 1.0], [2.0, 3.0]],
+ ... index=['cat', 'dog'],
+ ... columns=multicol2)
+
+ Note that rows where all values are missing are dropped by
+ default but this behaviour can be controlled via the dropna
+ keyword parameter:
+
+ >>> df_multi_level_cols3
+ weight height
+ kg m
+ cat NaN 1.0
+ dog 2.0 3.0
+ >>> df_multi_level_cols3.stack(dropna=False)
+ height weight
+ cat kg NaN NaN
+ m 1.0 NaN
+ dog kg NaN 2.0
+ m 3.0 NaN
+ >>> df_multi_level_cols3.stack(dropna=True)
+ height weight
+ cat m 1.0 NaN
+ dog kg NaN 2.0
+ m 3.0 NaN
+ """
+ from pandas.core.reshape.reshape import stack, stack_multiple
+
+ if isinstance(level, (tuple, list)):
+ return stack_multiple(self, level, dropna=dropna)
+ else:
+ return stack(self, level, dropna=dropna)
+
+ def unstack(self, level=-1, fill_value=None):
+ """
+ Pivot a level of the (necessarily hierarchical) index labels, returning
+ a DataFrame having a new level of column labels whose inner-most level
+ consists of the pivoted index labels.
+
+ If the index is not a MultiIndex, the output will be a Series
+ (the analogue of stack when the columns are not a MultiIndex).
+
+ The level involved will automatically get sorted.
+
+ Parameters
+ ----------
+ level : int, string, or list of these, default -1 (last level)
+ Level(s) of index to unstack, can pass level name
+ fill_value : replace NaN with this value if the unstack produces
+ missing values
+
+ .. versionadded:: 0.18.0
+
+ Returns
+ -------
+ unstacked : DataFrame or Series
+
+ See Also
+ --------
+ DataFrame.pivot : Pivot a table based on column values.
+ DataFrame.stack : Pivot a level of the column labels (inverse operation
+ from `unstack`).
+
+ Examples
+ --------
+ >>> index = pd.MultiIndex.from_tuples([('one', 'a'), ('one', 'b'),
+ ... ('two', 'a'), ('two', 'b')])
+ >>> s = pd.Series(np.arange(1.0, 5.0), index=index)
+ >>> s
+ one a 1.0
+ b 2.0
+ two a 3.0
+ b 4.0
+ dtype: float64
+
+ >>> s.unstack(level=-1)
+ a b
+ one 1.0 2.0
+ two 3.0 4.0
+
+ >>> s.unstack(level=0)
+ one two
+ a 1.0 3.0
+ b 2.0 4.0
+
+ >>> df = s.unstack(level=0)
+ >>> df.unstack()
+ one a 1.0
+ b 2.0
+ two a 3.0
+ b 4.0
+ dtype: float64
+ """
+ from pandas.core.reshape.reshape import unstack
+ return unstack(self, level, fill_value)
+
+ _shared_docs['melt'] = ("""
+ Unpivot a DataFrame from wide format to long format, optionally
+ leaving identifier variables set.
+
+ This function is useful to massage a DataFrame into a format where one
+ or more columns are identifier variables (`id_vars`), while all other
+ columns, considered measured variables (`value_vars`), are "unpivoted" to
+ the row axis, leaving just two non-identifier columns, 'variable' and
+ 'value'.
+
+ %(versionadded)s
+ Parameters
+ ----------
+ frame : DataFrame
+ id_vars : tuple, list, or ndarray, optional
+ Column(s) to use as identifier variables.
+ value_vars : tuple, list, or ndarray, optional
+ Column(s) to unpivot. If not specified, uses all columns that
+ are not set as `id_vars`.
+ var_name : scalar
+ Name to use for the 'variable' column. If None it uses
+ ``frame.columns.name`` or 'variable'.
+ value_name : scalar, default 'value'
+ Name to use for the 'value' column.
+ col_level : int or string, optional
+ If columns are a MultiIndex then use this level to melt.
+
+ See Also
+ --------
+ %(other)s
+ pivot_table
+ DataFrame.pivot
+
+ Examples
+ --------
+ >>> df = pd.DataFrame({'A': {0: 'a', 1: 'b', 2: 'c'},
+ ... 'B': {0: 1, 1: 3, 2: 5},
+ ... 'C': {0: 2, 1: 4, 2: 6}})
+ >>> df
+ A B C
+ 0 a 1 2
+ 1 b 3 4
+ 2 c 5 6
+
+ >>> %(caller)sid_vars=['A'], value_vars=['B'])
+ A variable value
+ 0 a B 1
+ 1 b B 3
+ 2 c B 5
+
+ >>> %(caller)sid_vars=['A'], value_vars=['B', 'C'])
+ A variable value
+ 0 a B 1
+ 1 b B 3
+ 2 c B 5
+ 3 a C 2
+ 4 b C 4
+ 5 c C 6
+
+ The names of 'variable' and 'value' columns can be customized:
+
+ >>> %(caller)sid_vars=['A'], value_vars=['B'],
+ ... var_name='myVarname', value_name='myValname')
+ A myVarname myValname
+ 0 a B 1
+ 1 b B 3
+ 2 c B 5
+
+ If you have multi-index columns:
+
+ >>> df.columns = [list('ABC'), list('DEF')]
+ >>> df
+ A B C
+ D E F
+ 0 a 1 2
+ 1 b 3 4
+ 2 c 5 6
+
+ >>> %(caller)scol_level=0, id_vars=['A'], value_vars=['B'])
+ A variable value
+ 0 a B 1
+ 1 b B 3
+ 2 c B 5
+
+ >>> %(caller)sid_vars=[('A', 'D')], value_vars=[('B', 'E')])
+ (A, D) variable_0 variable_1 value
+ 0 a B E 1
+ 1 b B E 3
+ 2 c B E 5
+ """)
+
+ @Appender(_shared_docs['melt'] %
+ dict(caller='df.melt(',
+ versionadded='.. versionadded:: 0.20.0\n',
+ other='melt'))
+ def melt(self, id_vars=None, value_vars=None, var_name=None,
+ value_name='value', col_level=None):
+ from pandas.core.reshape.melt import melt
+ return melt(self, id_vars=id_vars, value_vars=value_vars,
+ var_name=var_name, value_name=value_name,
+ col_level=col_level)
+
+ # ----------------------------------------------------------------------
+ # Time series-related
+
+ def diff(self, periods=1, axis=0):
+ """
+ First discrete difference of element.
+
+ Calculates the difference of a DataFrame element compared with another
+ element in the DataFrame (default is the element in the same column
+ of the previous row).
+
+ Parameters
+ ----------
+ periods : int, default 1
+ Periods to shift for calculating difference, accepts negative
+ values.
+ axis : {0 or 'index', 1 or 'columns'}, default 0
+ Take difference over rows (0) or columns (1).
+
+ .. versionadded:: 0.16.1.
+
+ Returns
+ -------
+ diffed : DataFrame
+
+ See Also
+ --------
+ Series.diff: First discrete difference for a Series.
+ DataFrame.pct_change: Percent change over given number of periods.
+ DataFrame.shift: Shift index by desired number of periods with an
+ optional time freq.
+
+ Examples
+ --------
+ Difference with previous row
+
+ >>> df = pd.DataFrame({'a': [1, 2, 3, 4, 5, 6],
+ ... 'b': [1, 1, 2, 3, 5, 8],
+ ... 'c': [1, 4, 9, 16, 25, 36]})
+ >>> df
+ a b c
+ 0 1 1 1
+ 1 2 1 4
+ 2 3 2 9
+ 3 4 3 16
+ 4 5 5 25
+ 5 6 8 36
+
+ >>> df.diff()
+ a b c
+ 0 NaN NaN NaN
+ 1 1.0 0.0 3.0
+ 2 1.0 1.0 5.0
+ 3 1.0 1.0 7.0
+ 4 1.0 2.0 9.0
+ 5 1.0 3.0 11.0
+
+ Difference with previous column
+
+ >>> df.diff(axis=1)
+ a b c
+ 0 NaN 0.0 0.0
+ 1 NaN -1.0 3.0
+ 2 NaN -1.0 7.0
+ 3 NaN -1.0 13.0
+ 4 NaN 0.0 20.0
+ 5 NaN 2.0 28.0
+
+ Difference with 3rd previous row
+
+ >>> df.diff(periods=3)
+ a b c
+ 0 NaN NaN NaN
+ 1 NaN NaN NaN
+ 2 NaN NaN NaN
+ 3 3.0 2.0 15.0
+ 4 3.0 4.0 21.0
+ 5 3.0 6.0 27.0
+
+ Difference with following row
+
+ >>> df.diff(periods=-1)
+ a b c
+ 0 -1.0 0.0 -3.0
+ 1 -1.0 -1.0 -5.0
+ 2 -1.0 -1.0 -7.0
+ 3 -1.0 -2.0 -9.0
+ 4 -1.0 -3.0 -11.0
+ 5 NaN NaN NaN
+ """
+ bm_axis = self._get_block_manager_axis(axis)
+ new_data = self._data.diff(n=periods, axis=bm_axis)
+ return self._constructor(new_data)
+
+ # ----------------------------------------------------------------------
+ # Function application
+
+ def _gotitem(self,
+ key, # type: Union[str, List[str]]
+ ndim, # type: int
+ subset=None # type: Union[Series, DataFrame, None]
+ ):
+ # type: (...) -> Union[Series, DataFrame]
+ """
+ Sub-classes to define. Return a sliced object.
+
+ Parameters
+ ----------
+ key : string / list of selections
+ ndim : 1,2
+ requested ndim of result
+ subset : object, default None
+ subset to act on
+ """
+ if subset is None:
+ subset = self
+ elif subset.ndim == 1: # is Series
+ return subset
+
+ # TODO: _shallow_copy(subset)?
+ return subset[key]
+
+ _agg_summary_and_see_also_doc = dedent("""
+ The aggregation operations are always performed over an axis, either the
+ index (default) or the column axis. This behavior is different from
+ `numpy` aggregation functions (`mean`, `median`, `prod`, `sum`, `std`,
+ `var`), where the default is to compute the aggregation of the flattened
+ array, e.g., ``numpy.mean(arr_2d)`` as opposed to ``numpy.mean(arr_2d,
+ axis=0)``.
+
+ `agg` is an alias for `aggregate`. Use the alias.
+
+ See Also
+ --------
+ DataFrame.apply : Perform any type of operation.
+ DataFrame.transform : Perform transformation type operations.
+ pandas.core.groupby.GroupBy : Perform operations over groups.
+ pandas.core.resample.Resampler : Perform operations over resampled bins.
+ pandas.core.window.Rolling : Perform operations over rolling window.
+ pandas.core.window.Expanding : Perform operations over expanding window.
+ pandas.core.window.EWM : Perform operation over exponential weighted
+ window.
+ """)
+
+ _agg_examples_doc = dedent("""
+ Examples
+ --------
+ >>> df = pd.DataFrame([[1, 2, 3],
+ ... [4, 5, 6],
+ ... [7, 8, 9],
+ ... [np.nan, np.nan, np.nan]],
+ ... columns=['A', 'B', 'C'])
+
+ Aggregate these functions over the rows.
+
+ >>> df.agg(['sum', 'min'])
+ A B C
+ sum 12.0 15.0 18.0
+ min 1.0 2.0 3.0
+
+ Different aggregations per column.
+
+ >>> df.agg({'A' : ['sum', 'min'], 'B' : ['min', 'max']})
+ A B
+ max NaN 8.0
+ min 1.0 2.0
+ sum 12.0 NaN
+
+ Aggregate over the columns.
+
+ >>> df.agg("mean", axis="columns")
+ 0 2.0
+ 1 5.0
+ 2 8.0
+ 3 NaN
+ dtype: float64
+ """)
+
+ @Substitution(see_also=_agg_summary_and_see_also_doc,
+ examples=_agg_examples_doc,
+ versionadded='.. versionadded:: 0.20.0',
+ **_shared_doc_kwargs)
+ @Appender(_shared_docs['aggregate'])
+ def aggregate(self, func, axis=0, *args, **kwargs):
+ axis = self._get_axis_number(axis)
+
+ result = None
+ try:
+ result, how = self._aggregate(func, axis=axis, *args, **kwargs)
+ except TypeError:
+ pass
+ if result is None:
+ return self.apply(func, axis=axis, args=args, **kwargs)
+ return result
+
+ def _aggregate(self, arg, axis=0, *args, **kwargs):
+ if axis == 1:
+ # NDFrame.aggregate returns a tuple, and we need to transpose
+ # only result
+ result, how = (super(DataFrame, self.T)
+ ._aggregate(arg, *args, **kwargs))
+ result = result.T if result is not None else result
+ return result, how
+ return super(DataFrame, self)._aggregate(arg, *args, **kwargs)
+
+ agg = aggregate
+
+ @Appender(_shared_docs['transform'] % _shared_doc_kwargs)
+ def transform(self, func, axis=0, *args, **kwargs):
+ axis = self._get_axis_number(axis)
+ if axis == 1:
+ return super(DataFrame, self.T).transform(func, *args, **kwargs).T
+ return super(DataFrame, self).transform(func, *args, **kwargs)
+
+ def apply(self, func, axis=0, broadcast=None, raw=False, reduce=None,
+ result_type=None, args=(), **kwds):
+ """
+ Apply a function along an axis of the DataFrame.
+
+ Objects passed to the function are Series objects whose index is
+ either the DataFrame's index (``axis=0``) or the DataFrame's columns
+ (``axis=1``). By default (``result_type=None``), the final return type
+ is inferred from the return type of the applied function. Otherwise,
+ it depends on the `result_type` argument.
+
+ Parameters
+ ----------
+ func : function
+ Function to apply to each column or row.
+ axis : {0 or 'index', 1 or 'columns'}, default 0
+ Axis along which the function is applied:
+
+ * 0 or 'index': apply function to each column.
+ * 1 or 'columns': apply function to each row.
+ broadcast : bool, optional
+ Only relevant for aggregation functions:
+
+ * ``False`` or ``None`` : returns a Series whose length is the
+ length of the index or the number of columns (based on the
+ `axis` parameter)
+ * ``True`` : results will be broadcast to the original shape
+ of the frame, the original index and columns will be retained.
+
+ .. deprecated:: 0.23.0
+ This argument will be removed in a future version, replaced
+ by result_type='broadcast'.
+
+ raw : bool, default False
+ * ``False`` : passes each row or column as a Series to the
+ function.
+ * ``True`` : the passed function will receive ndarray objects
+ instead.
+ If you are just applying a NumPy reduction function this will
+ achieve much better performance.
+ reduce : bool or None, default None
+ Try to apply reduction procedures. If the DataFrame is empty,
+ `apply` will use `reduce` to determine whether the result
+ should be a Series or a DataFrame. If ``reduce=None`` (the
+ default), `apply`'s return value will be guessed by calling
+ `func` on an empty Series
+ (note: while guessing, exceptions raised by `func` will be
+ ignored).
+ If ``reduce=True`` a Series will always be returned, and if
+ ``reduce=False`` a DataFrame will always be returned.
+
+ .. deprecated:: 0.23.0
+ This argument will be removed in a future version, replaced
+ by ``result_type='reduce'``.
+
+ result_type : {'expand', 'reduce', 'broadcast', None}, default None
+ These only act when ``axis=1`` (columns):
+
+ * 'expand' : list-like results will be turned into columns.
+ * 'reduce' : returns a Series if possible rather than expanding
+ list-like results. This is the opposite of 'expand'.
+ * 'broadcast' : results will be broadcast to the original shape
+ of the DataFrame, the original index and columns will be
+ retained.
+
+ The default behaviour (None) depends on the return value of the
+ applied function: list-like results will be returned as a Series
+ of those. However if the apply function returns a Series these
+ are expanded to columns.
+
+ .. versionadded:: 0.23.0
+
+ args : tuple
+ Positional arguments to pass to `func` in addition to the
+ array/series.
+ **kwds
+ Additional keyword arguments to pass as keywords arguments to
+ `func`.
+
+ Returns
+ -------
+ applied : Series or DataFrame
+
+ See Also
+ --------
+ DataFrame.applymap: For elementwise operations.
+ DataFrame.aggregate: Only perform aggregating type operations.
+ DataFrame.transform: Only perform transforming type operations.
+
+ Notes
+ -----
+ In the current implementation apply calls `func` twice on the
+ first column/row to decide whether it can take a fast or slow
+ code path. This can lead to unexpected behavior if `func` has
+ side-effects, as they will take effect twice for the first
+ column/row.
+
+ Examples
+ --------
+
+ >>> df = pd.DataFrame([[4, 9],] * 3, columns=['A', 'B'])
+ >>> df
+ A B
+ 0 4 9
+ 1 4 9
+ 2 4 9
+
+ Using a numpy universal function (in this case the same as
+ ``np.sqrt(df)``):
+
+ >>> df.apply(np.sqrt)
+ A B
+ 0 2.0 3.0
+ 1 2.0 3.0
+ 2 2.0 3.0
+
+ Using a reducing function on either axis
+
+ >>> df.apply(np.sum, axis=0)
+ A 12
+ B 27
+ dtype: int64
+
+ >>> df.apply(np.sum, axis=1)
+ 0 13
+ 1 13
+ 2 13
+ dtype: int64
+
+ Returning a list-like will result in a Series
+
+ >>> df.apply(lambda x: [1, 2], axis=1)
+ 0 [1, 2]
+ 1 [1, 2]
+ 2 [1, 2]
+ dtype: object
+
+ Passing ``result_type='expand'`` will expand list-like results
+ to columns of a DataFrame.
+
+ >>> df.apply(lambda x: [1, 2], axis=1, result_type='expand')
+ 0 1
+ 0 1 2
+ 1 1 2
+ 2 1 2
+
+ Returning a Series inside the function is similar to passing
+ ``result_type='expand'``. The resulting column names
+ will be the Series index.
+
+ >>> df.apply(lambda x: pd.Series([1, 2], index=['foo', 'bar']), axis=1)
+ foo bar
+ 0 1 2
+ 1 1 2
+ 2 1 2
+
+ Passing ``result_type='broadcast'`` will ensure the same shape
+ result, whether list-like or scalar is returned by the function,
+ and broadcast it along the axis. The resulting column names will
+ be the originals.
+
+ >>> df.apply(lambda x: [1, 2], axis=1, result_type='broadcast')
+ A B
+ 0 1 2
+ 1 1 2
+ 2 1 2
+ """
+ from pandas.core.apply import frame_apply
+ op = frame_apply(self,
+ func=func,
+ axis=axis,
+ broadcast=broadcast,
+ raw=raw,
+ reduce=reduce,
+ result_type=result_type,
+ args=args,
+ kwds=kwds)
+ return op.get_result()
+
+ def applymap(self, func):
+ """
+ Apply a function to a DataFrame elementwise.
+
+ This method applies a function that accepts and returns a scalar
+ to every element of a DataFrame.
+
+ Parameters
+ ----------
+ func : callable
+ Python function, returns a single value from a single value.
+
+ Returns
+ -------
+ DataFrame
+ Transformed DataFrame.
+
+ See Also
+ --------
+ DataFrame.apply : Apply a function along input axis of DataFrame.
+
+ Notes
+ -----
+ In the current implementation applymap calls `func` twice on the
+ first column/row to decide whether it can take a fast or slow
+ code path. This can lead to unexpected behavior if `func` has
+ side-effects, as they will take effect twice for the first
+ column/row.
+
+ Examples
+ --------
+ >>> df = pd.DataFrame([[1, 2.12], [3.356, 4.567]])
+ >>> df
+ 0 1
+ 0 1.000 2.120
+ 1 3.356 4.567
+
+ >>> df.applymap(lambda x: len(str(x)))
+ 0 1
+ 0 3 4
+ 1 5 5
+
+ Note that a vectorized version of `func` often exists, which will
+ be much faster. You could square each number elementwise.
+
+ >>> df.applymap(lambda x: x**2)
+ 0 1
+ 0 1.000000 4.494400
+ 1 11.262736 20.857489
+
+ But it's better to avoid applymap in that case.
+
+ >>> df ** 2
+ 0 1
+ 0 1.000000 4.494400
+ 1 11.262736 20.857489
+ """
+
+ # if we have a dtype == 'M8[ns]', provide boxed values
+ def infer(x):
+ if x.empty:
+ return lib.map_infer(x, func)
+ return lib.map_infer(x.astype(object).values, func)
+
+ return self.apply(infer)
+
+ # ----------------------------------------------------------------------
+ # Merging / joining methods
+
+ def append(self, other, ignore_index=False,
+ verify_integrity=False, sort=None):
+ """
+ Append rows of `other` to the end of caller, returning a new object.
+
+ Columns in `other` that are not in the caller are added as new columns.
+
+ Parameters
+ ----------
+ other : DataFrame or Series/dict-like object, or list of these
+ The data to append.
+ ignore_index : boolean, default False
+ If True, do not use the index labels.
+ verify_integrity : boolean, default False
+ If True, raise ValueError on creating index with duplicates.
+ sort : boolean, default None
+ Sort columns if the columns of `self` and `other` are not aligned.
+ The default sorting is deprecated and will change to not-sorting
+ in a future version of pandas. Explicitly pass ``sort=True`` to
+ silence the warning and sort. Explicitly pass ``sort=False`` to
+ silence the warning and not sort.
+
+ .. versionadded:: 0.23.0
+
+ Returns
+ -------
+ appended : DataFrame
+
+ See Also
+ --------
+ pandas.concat : General function to concatenate DataFrame, Series
+ or Panel objects.
+
+ Notes
+ -----
+ If a list of dict/series is passed and the keys are all contained in
+ the DataFrame's index, the order of the columns in the resulting
+ DataFrame will be unchanged.
+
+ Iteratively appending rows to a DataFrame can be more computationally
+ intensive than a single concatenate. A better solution is to append
+ those rows to a list and then concatenate the list with the original
+ DataFrame all at once.
+
+ Examples
+ --------
+
+ >>> df = pd.DataFrame([[1, 2], [3, 4]], columns=list('AB'))
+ >>> df
+ A B
+ 0 1 2
+ 1 3 4
+ >>> df2 = pd.DataFrame([[5, 6], [7, 8]], columns=list('AB'))
+ >>> df.append(df2)
+ A B
+ 0 1 2
+ 1 3 4
+ 0 5 6
+ 1 7 8
+
+ With `ignore_index` set to True:
+
+ >>> df.append(df2, ignore_index=True)
+ A B
+ 0 1 2
+ 1 3 4
+ 2 5 6
+ 3 7 8
+
+ The following, while not recommended ways of generating a DataFrame,
+ show two approaches to building a DataFrame from multiple data sources.
+
+ Less efficient:
+
+ >>> df = pd.DataFrame(columns=['A'])
+ >>> for i in range(5):
+ ... df = df.append({'A': i}, ignore_index=True)
+ >>> df
+ A
+ 0 0
+ 1 1
+ 2 2
+ 3 3
+ 4 4
+
+ More efficient:
+
+ >>> pd.concat([pd.DataFrame([i], columns=['A']) for i in range(5)],
+ ... ignore_index=True)
+ A
+ 0 0
+ 1 1
+ 2 2
+ 3 3
+ 4 4
+ """
+ if isinstance(other, (Series, dict)):
+ if isinstance(other, dict):
+ other = Series(other)
+ if other.name is None and not ignore_index:
+ raise TypeError('Can only append a Series if ignore_index=True'
+ ' or if the Series has a name')
+
+ if other.name is None:
+ index = None
+ else:
+ # other must have the same index name as self, otherwise
+ # index name will be reset
+ index = Index([other.name], name=self.index.name)
+
+ idx_diff = other.index.difference(self.columns)
+ try:
+ combined_columns = self.columns.append(idx_diff)
+ except TypeError:
+ combined_columns = self.columns.astype(object).append(idx_diff)
+ other = other.reindex(combined_columns, copy=False)
+ other = DataFrame(other.values.reshape((1, len(other))),
+ index=index,
+ columns=combined_columns)
+ other = other._convert(datetime=True, timedelta=True)
+ if not self.columns.equals(combined_columns):
+ self = self.reindex(columns=combined_columns)
+ elif isinstance(other, list) and not isinstance(other[0], DataFrame):
+ other = DataFrame(other)
+ if (self.columns.get_indexer(other.columns) >= 0).all():
+ other = other.loc[:, self.columns]
+
+ from pandas.core.reshape.concat import concat
+ if isinstance(other, (list, tuple)):
+ to_concat = [self] + other
+ else:
+ to_concat = [self, other]
+ return concat(to_concat, ignore_index=ignore_index,
+ verify_integrity=verify_integrity,
+ sort=sort)
+
+ def join(self, other, on=None, how='left', lsuffix='', rsuffix='',
+ sort=False):
+ """
+ Join columns of another DataFrame.
+
+ Join columns with `other` DataFrame either on index or on a key
+ column. Efficiently join multiple DataFrame objects by index at once by
+ passing a list.
+
+ Parameters
+ ----------
+ other : DataFrame, Series, or list of DataFrame
+ Index should be similar to one of the columns in this one. If a
+ Series is passed, its name attribute must be set, and that will be
+ used as the column name in the resulting joined DataFrame.
+ on : str, list of str, or array-like, optional
+ Column or index level name(s) in the caller to join on the index
+ in `other`, otherwise joins index-on-index. If multiple
+ values given, the `other` DataFrame must have a MultiIndex. Can
+ pass an array as the join key if it is not already contained in
+ the calling DataFrame. Like an Excel VLOOKUP operation.
+ how : {'left', 'right', 'outer', 'inner'}, default 'left'
+ How to handle the operation of the two objects.
+
+ * left: use calling frame's index (or column if on is specified)
+ * right: use `other`'s index.
+ * outer: form union of calling frame's index (or column if on is
+ specified) with `other`'s index, and sort it
+ lexicographically.
+ * inner: form intersection of calling frame's index (or column if
+ on is specified) with `other`'s index, preserving the order
+ of the calling's one.
+ lsuffix : str, default ''
+ Suffix to use from left frame's overlapping columns.
+ rsuffix : str, default ''
+ Suffix to use from right frame's overlapping columns.
+ sort : bool, default False
+ Order result DataFrame lexicographically by the join key. If False,
+ the order of the join key depends on the join type (how keyword).
+
+ Returns
+ -------
+ DataFrame
+ A dataframe containing columns from both the caller and `other`.
+
+ See Also
+ --------
+ DataFrame.merge : For column(s)-on-columns(s) operations.
+
+ Notes
+ -----
+ Parameters `on`, `lsuffix`, and `rsuffix` are not supported when
+ passing a list of `DataFrame` objects.
+
+ Support for specifying index levels as the `on` parameter was added
+ in version 0.23.0.
+
+ Examples
+ --------
+ >>> df = pd.DataFrame({'key': ['K0', 'K1', 'K2', 'K3', 'K4', 'K5'],
+ ... 'A': ['A0', 'A1', 'A2', 'A3', 'A4', 'A5']})
+
+ >>> df
+ key A
+ 0 K0 A0
+ 1 K1 A1
+ 2 K2 A2
+ 3 K3 A3
+ 4 K4 A4
+ 5 K5 A5
+
+ >>> other = pd.DataFrame({'key': ['K0', 'K1', 'K2'],
+ ... 'B': ['B0', 'B1', 'B2']})
+
+ >>> other
+ key B
+ 0 K0 B0
+ 1 K1 B1
+ 2 K2 B2
+
+ Join DataFrames using their indexes.
+
+ >>> df.join(other, lsuffix='_caller', rsuffix='_other')
+ key_caller A key_other B
+ 0 K0 A0 K0 B0
+ 1 K1 A1 K1 B1
+ 2 K2 A2 K2 B2
+ 3 K3 A3 NaN NaN
+ 4 K4 A4 NaN NaN
+ 5 K5 A5 NaN NaN
+
+ If we want to join using the key columns, we need to set key to be
+ the index in both `df` and `other`. The joined DataFrame will have
+ key as its index.
+
+ >>> df.set_index('key').join(other.set_index('key'))
+ A B
+ key
+ K0 A0 B0
+ K1 A1 B1
+ K2 A2 B2
+ K3 A3 NaN
+ K4 A4 NaN
+ K5 A5 NaN
+
+ Another option to join using the key columns is to use the `on`
+ parameter. DataFrame.join always uses `other`'s index but we can use
+ any column in `df`. This method preserves the original DataFrame's
+ index in the result.
+
+ >>> df.join(other.set_index('key'), on='key')
+ key A B
+ 0 K0 A0 B0
+ 1 K1 A1 B1
+ 2 K2 A2 B2
+ 3 K3 A3 NaN
+ 4 K4 A4 NaN
+ 5 K5 A5 NaN
+ """
+ # For SparseDataFrame's benefit
+ return self._join_compat(other, on=on, how=how, lsuffix=lsuffix,
+ rsuffix=rsuffix, sort=sort)
+
+ def _join_compat(self, other, on=None, how='left', lsuffix='', rsuffix='',
+ sort=False):
+ from pandas.core.reshape.merge import merge
+ from pandas.core.reshape.concat import concat
+
+ if isinstance(other, Series):
+ if other.name is None:
+ raise ValueError('Other Series must have a name')
+ other = DataFrame({other.name: other})
+
+ if isinstance(other, DataFrame):
+ return merge(self, other, left_on=on, how=how,
+ left_index=on is None, right_index=True,
+ suffixes=(lsuffix, rsuffix), sort=sort)
+ else:
+ if on is not None:
+ raise ValueError('Joining multiple DataFrames only supported'
+ ' for joining on index')
+
+ frames = [self] + list(other)
+
+ can_concat = all(df.index.is_unique for df in frames)
+
+ # join indexes only using concat
+ if can_concat:
+ if how == 'left':
+ how = 'outer'
+ join_axes = [self.index]
+ else:
+ join_axes = None
+ return concat(frames, axis=1, join=how, join_axes=join_axes,
+ verify_integrity=True)
+
+ joined = frames[0]
+
+ for frame in frames[1:]:
+ joined = merge(joined, frame, how=how, left_index=True,
+ right_index=True)
+
+ return joined
+
+ @Substitution('')
+ @Appender(_merge_doc, indents=2)
+ def merge(self, right, how='inner', on=None, left_on=None, right_on=None,
+ left_index=False, right_index=False, sort=False,
+ suffixes=('_x', '_y'), copy=True, indicator=False,
+ validate=None):
+ from pandas.core.reshape.merge import merge
+ return merge(self, right, how=how, on=on, left_on=left_on,
+ right_on=right_on, left_index=left_index,
+ right_index=right_index, sort=sort, suffixes=suffixes,
+ copy=copy, indicator=indicator, validate=validate)
+
+ def round(self, decimals=0, *args, **kwargs):
+ """
+ Round a DataFrame to a variable number of decimal places.
+
+ Parameters
+ ----------
+ decimals : int, dict, Series
+ Number of decimal places to round each column to. If an int is
+ given, round each column to the same number of places.
+ Otherwise dict and Series round to variable numbers of places.
+ Column names should be in the keys if `decimals` is a
+ dict-like, or in the index if `decimals` is a Series. Any
+ columns not included in `decimals` will be left as is. Elements
+ of `decimals` which are not columns of the input will be
+ ignored.
+
+ Returns
+ -------
+ DataFrame
+
+ See Also
+ --------
+ numpy.around
+ Series.round
+
+ Examples
+ --------
+ >>> df = pd.DataFrame(np.random.random([3, 3]),
+ ... columns=['A', 'B', 'C'], index=['first', 'second', 'third'])
+ >>> df
+ A B C
+ first 0.028208 0.992815 0.173891
+ second 0.038683 0.645646 0.577595
+ third 0.877076 0.149370 0.491027
+ >>> df.round(2)
+ A B C
+ first 0.03 0.99 0.17
+ second 0.04 0.65 0.58
+ third 0.88 0.15 0.49
+ >>> df.round({'A': 1, 'C': 2})
+ A B C
+ first 0.0 0.992815 0.17
+ second 0.0 0.645646 0.58
+ third 0.9 0.149370 0.49
+ >>> decimals = pd.Series([1, 0, 2], index=['A', 'B', 'C'])
+ >>> df.round(decimals)
+ A B C
+ first 0.0 1 0.17
+ second 0.0 1 0.58
+ third 0.9 0 0.49
+ """
+ from pandas.core.reshape.concat import concat
+
+ def _dict_round(df, decimals):
+ for col, vals in df.iteritems():
+ try:
+ yield _series_round(vals, decimals[col])
+ except KeyError:
+ yield vals
+
+ def _series_round(s, decimals):
+ if is_integer_dtype(s) or is_float_dtype(s):
+ return s.round(decimals)
+ return s
+
+ nv.validate_round(args, kwargs)
+
+ if isinstance(decimals, (dict, Series)):
+ if isinstance(decimals, Series):
+ if not decimals.index.is_unique:
+ raise ValueError("Index of decimals must be unique")
+ new_cols = [col for col in _dict_round(self, decimals)]
+ elif is_integer(decimals):
+ # Dispatch to Series.round
+ new_cols = [_series_round(v, decimals)
+ for _, v in self.iteritems()]
+ else:
+ raise TypeError("decimals must be an integer, a dict-like or a "
+ "Series")
+
+ if len(new_cols) > 0:
+ return self._constructor(concat(new_cols, axis=1),
+ index=self.index,
+ columns=self.columns)
+ else:
+ return self
+
+ # ----------------------------------------------------------------------
+ # Statistical methods, etc.
+
+ def corr(self, method='pearson', min_periods=1):
+ """
+ Compute pairwise correlation of columns, excluding NA/null values.
+
+ Parameters
+ ----------
+ method : {'pearson', 'kendall', 'spearman'} or callable
+ * pearson : standard correlation coefficient
+ * kendall : Kendall Tau correlation coefficient
+ * spearman : Spearman rank correlation
+ * callable: callable with input two 1d ndarrays
+ and returning a float
+
+ .. versionadded:: 0.24.0
+
+ min_periods : int, optional
+ Minimum number of observations required per pair of columns
+ to have a valid result. Currently only available for pearson
+ and spearman correlation
+
+ Returns
+ -------
+ y : DataFrame
+
+ See Also
+ --------
+ DataFrame.corrwith
+ Series.corr
+
+ Examples
+ --------
+ >>> histogram_intersection = lambda a, b: np.minimum(a, b
+ ... ).sum().round(decimals=1)
+ >>> df = pd.DataFrame([(.2, .3), (.0, .6), (.6, .0), (.2, .1)],
+ ... columns=['dogs', 'cats'])
+ >>> df.corr(method=histogram_intersection)
+ dogs cats
+ dogs 1.0 0.3
+ cats 0.3 1.0
+ """
+ numeric_df = self._get_numeric_data()
+ cols = numeric_df.columns
+ idx = cols.copy()
+ mat = numeric_df.values
+
+ if method == 'pearson':
+ correl = libalgos.nancorr(ensure_float64(mat), minp=min_periods)
+ elif method == 'spearman':
+ correl = libalgos.nancorr_spearman(ensure_float64(mat),
+ minp=min_periods)
+ elif method == 'kendall' or callable(method):
+ if min_periods is None:
+ min_periods = 1
+ mat = ensure_float64(mat).T
+ corrf = nanops.get_corr_func(method)
+ K = len(cols)
+ correl = np.empty((K, K), dtype=float)
+ mask = np.isfinite(mat)
+ for i, ac in enumerate(mat):
+ for j, bc in enumerate(mat):
+ if i > j:
+ continue
+
+ valid = mask[i] & mask[j]
+ if valid.sum() < min_periods:
+ c = np.nan
+ elif i == j:
+ c = 1.
+ elif not valid.all():
+ c = corrf(ac[valid], bc[valid])
+ else:
+ c = corrf(ac, bc)
+ correl[i, j] = c
+ correl[j, i] = c
+ else:
+ raise ValueError("method must be either 'pearson', "
+ "'spearman', or 'kendall', '{method}' "
+ "was supplied".format(method=method))
+
+ return self._constructor(correl, index=idx, columns=cols)
+
+ def cov(self, min_periods=None):
+ """
+ Compute pairwise covariance of columns, excluding NA/null values.
+
+ Compute the pairwise covariance among the series of a DataFrame.
+ The returned data frame is the `covariance matrix
+ <https://en.wikipedia.org/wiki/Covariance_matrix>`__ of the columns
+ of the DataFrame.
+
+ Both NA and null values are automatically excluded from the
+ calculation. (See the note below about bias from missing values.)
+ A threshold can be set for the minimum number of
+ observations for each value created. Comparisons with observations
+ below this threshold will be returned as ``NaN``.
+
+ This method is generally used for the analysis of time series data to
+ understand the relationship between different measures
+ across time.
+
+ Parameters
+ ----------
+ min_periods : int, optional
+ Minimum number of observations required per pair of columns
+ to have a valid result.
+
+ Returns
+ -------
+ DataFrame
+ The covariance matrix of the series of the DataFrame.
+
+ See Also
+ --------
+ pandas.Series.cov : Compute covariance with another Series.
+ pandas.core.window.EWM.cov: Exponential weighted sample covariance.
+ pandas.core.window.Expanding.cov : Expanding sample covariance.
+ pandas.core.window.Rolling.cov : Rolling sample covariance.
+
+ Notes
+ -----
+ Returns the covariance matrix of the DataFrame's time series.
+ The covariance is normalized by N-1.
+
+ For DataFrames that have Series that are missing data (assuming that
+ data is `missing at random
+ <https://en.wikipedia.org/wiki/Missing_data#Missing_at_random>`__)
+ the returned covariance matrix will be an unbiased estimate
+ of the variance and covariance between the member Series.
+
+ However, for many applications this estimate may not be acceptable
+ because the estimated covariance matrix is not guaranteed to be positive
+ semi-definite. This could lead to estimate correlations having
+ absolute values which are greater than one, and/or a non-invertible
+ covariance matrix. See `Estimation of covariance matrices
+ <http://en.wikipedia.org/w/index.php?title=Estimation_of_covariance_
+ matrices>`__ for more details.
+
+ Examples
+ --------
+ >>> df = pd.DataFrame([(1, 2), (0, 3), (2, 0), (1, 1)],
+ ... columns=['dogs', 'cats'])
+ >>> df.cov()
+ dogs cats
+ dogs 0.666667 -1.000000
+ cats -1.000000 1.666667
+
+ >>> np.random.seed(42)
+ >>> df = pd.DataFrame(np.random.randn(1000, 5),
+ ... columns=['a', 'b', 'c', 'd', 'e'])
+ >>> df.cov()
+ a b c d e
+ a 0.998438 -0.020161 0.059277 -0.008943 0.014144
+ b -0.020161 1.059352 -0.008543 -0.024738 0.009826
+ c 0.059277 -0.008543 1.010670 -0.001486 -0.000271
+ d -0.008943 -0.024738 -0.001486 0.921297 -0.013692
+ e 0.014144 0.009826 -0.000271 -0.013692 0.977795
+
+ **Minimum number of periods**
+
+ This method also supports an optional ``min_periods`` keyword
+ that specifies the required minimum number of non-NA observations for
+ each column pair in order to have a valid result:
+
+ >>> np.random.seed(42)
+ >>> df = pd.DataFrame(np.random.randn(20, 3),
+ ... columns=['a', 'b', 'c'])
+ >>> df.loc[df.index[:5], 'a'] = np.nan
+ >>> df.loc[df.index[5:10], 'b'] = np.nan
+ >>> df.cov(min_periods=12)
+ a b c
+ a 0.316741 NaN -0.150812
+ b NaN 1.248003 0.191417
+ c -0.150812 0.191417 0.895202
+ """
+ numeric_df = self._get_numeric_data()
+ cols = numeric_df.columns
+ idx = cols.copy()
+ mat = numeric_df.values
+
+ if notna(mat).all():
+ if min_periods is not None and min_periods > len(mat):
+ baseCov = np.empty((mat.shape[1], mat.shape[1]))
+ baseCov.fill(np.nan)
+ else:
+ baseCov = np.cov(mat.T)
+ baseCov = baseCov.reshape((len(cols), len(cols)))
+ else:
+ baseCov = libalgos.nancorr(ensure_float64(mat), cov=True,
+ minp=min_periods)
+
+ return self._constructor(baseCov, index=idx, columns=cols)
+
+ def corrwith(self, other, axis=0, drop=False, method='pearson'):
+ """
+ Compute pairwise correlation between rows or columns of DataFrame
+ with rows or columns of Series or DataFrame. DataFrames are first
+ aligned along both axes before computing the correlations.
+
+ Parameters
+ ----------
+ other : DataFrame, Series
+ axis : {0 or 'index', 1 or 'columns'}, default 0
+ 0 or 'index' to compute column-wise, 1 or 'columns' for row-wise
+ drop : boolean, default False
+ Drop missing indices from result
+ method : {'pearson', 'kendall', 'spearman'} or callable
+ * pearson : standard correlation coefficient
+ * kendall : Kendall Tau correlation coefficient
+ * spearman : Spearman rank correlation
+ * callable: callable with input two 1d ndarrays
+ and returning a float
+
+ .. versionadded:: 0.24.0
+
+ Returns
+ -------
+ correls : Series
+
+ See Also
+ --------
+ DataFrame.corr
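+
+ Examples
+ --------
+ A small, illustrative pair of frames (values chosen so the columns are
+ perfectly positively and negatively correlated):
+
+ >>> df1 = pd.DataFrame({'a': [1, 2, 3, 4], 'b': [4, 3, 2, 1]})
+ >>> df2 = pd.DataFrame({'a': [2, 4, 6, 8], 'b': [1, 2, 3, 4]})
+ >>> df1.corrwith(df2)
+ a    1.0
+ b   -1.0
+ dtype: float64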
+ """
+ axis = self._get_axis_number(axis)
+ this = self._get_numeric_data()
+
+ if isinstance(other, Series):
+ return this.apply(lambda x: other.corr(x, method=method),
+ axis=axis)
+
+ other = other._get_numeric_data()
+ left, right = this.align(other, join='inner', copy=False)
+
+ if axis == 1:
+ left = left.T
+ right = right.T
+
+ if method == 'pearson':
+ # mask missing values
+ left = left + right * 0
+ right = right + left * 0
+
+ # demeaned data
+ ldem = left - left.mean()
+ rdem = right - right.mean()
+
+ num = (ldem * rdem).sum()
+ dom = (left.count() - 1) * left.std() * right.std()
+
+ correl = num / dom
+
+ elif method in ['kendall', 'spearman'] or callable(method):
+ def c(x):
+ return nanops.nancorr(x[0], x[1], method=method)
+
+ correl = Series(map(c,
+ zip(left.values.T, right.values.T)),
+ index=left.columns)
+
+ else:
+ raise ValueError("Invalid method {method} was passed, "
+ "valid methods are: 'pearson', 'kendall', "
+ "'spearman', or callable".
+ format(method=method))
+
+ if not drop:
+ # Find non-matching labels along the given axis
+ # and append missing correlations (GH 22375)
+ raxis = 1 if axis == 0 else 0
+ result_index = (this._get_axis(raxis).
+ union(other._get_axis(raxis)))
+ idx_diff = result_index.difference(correl.index)
+
+ if len(idx_diff) > 0:
+ correl = correl.append(Series([np.nan] * len(idx_diff),
+ index=idx_diff))
+
+ return correl
+
+ # ----------------------------------------------------------------------
+ # ndarray-like stats methods
+
+ def count(self, axis=0, level=None, numeric_only=False):
+ """
+ Count non-NA cells for each column or row.
+
+ The values `None`, `NaN`, `NaT`, and optionally `numpy.inf` (depending
+ on `pandas.options.mode.use_inf_as_na`) are considered NA.
+
+ Parameters
+ ----------
+ axis : {0 or 'index', 1 or 'columns'}, default 0
+ If 0 or 'index' counts are generated for each column.
+ If 1 or 'columns' counts are generated for each **row**.
+ level : int or str, optional
+ If the axis is a `MultiIndex` (hierarchical), count along a
+ particular `level`, collapsing into a `DataFrame`.
+ A `str` specifies the level name.
+ numeric_only : boolean, default False
+ Include only `float`, `int` or `boolean` data.
+
+ Returns
+ -------
+ Series or DataFrame
+ For each column/row the number of non-NA/null entries.
+ If `level` is specified returns a `DataFrame`.
+
+ See Also
+ --------
+ Series.count: Number of non-NA elements in a Series.
+ DataFrame.shape: Number of DataFrame rows and columns (including NA
+ elements).
+ DataFrame.isna: Boolean same-sized DataFrame showing places of NA
+ elements.
+
+ Examples
+ --------
+ Constructing DataFrame from a dictionary:
+
+ >>> df = pd.DataFrame({"Person":
+ ... ["John", "Myla", "Lewis", "John", "Myla"],
+ ... "Age": [24., np.nan, 21., 33, 26],
+ ... "Single": [False, True, True, True, False]})
+ >>> df
+ Person Age Single
+ 0 John 24.0 False
+ 1 Myla NaN True
+ 2 Lewis 21.0 True
+ 3 John 33.0 True
+ 4 Myla 26.0 False
+
+ Notice the uncounted NA values:
+
+ >>> df.count()
+ Person 5
+ Age 4
+ Single 5
+ dtype: int64
+
+ Counts for each **row**:
+
+ >>> df.count(axis='columns')
+ 0 3
+ 1 2
+ 2 3
+ 3 3
+ 4 3
+ dtype: int64
+
+ Counts for one level of a `MultiIndex`:
+
+ >>> df.set_index(["Person", "Single"]).count(level="Person")
+ Age
+ Person
+ John 2
+ Lewis 1
+ Myla 1
+ """
+ axis = self._get_axis_number(axis)
+ if level is not None:
+ return self._count_level(level, axis=axis,
+ numeric_only=numeric_only)
+
+ if numeric_only:
+ frame = self._get_numeric_data()
+ else:
+ frame = self
+
+ # GH #423
+ if len(frame._get_axis(axis)) == 0:
+ result = Series(0, index=frame._get_agg_axis(axis))
+ else:
+ if frame._is_mixed_type or frame._data.any_extension_types:
+ # the or any_extension_types is really only hit for single-
+ # column frames with an extension array
+ result = notna(frame).sum(axis=axis)
+ else:
+ # GH13407
+ series_counts = notna(frame).sum(axis=axis)
+ counts = series_counts.values
+ result = Series(counts, index=frame._get_agg_axis(axis))
+
+ return result.astype('int64')
+
+ def _count_level(self, level, axis=0, numeric_only=False):
+ if numeric_only:
+ frame = self._get_numeric_data()
+ else:
+ frame = self
+
+ count_axis = frame._get_axis(axis)
+ agg_axis = frame._get_agg_axis(axis)
+
+ if not isinstance(count_axis, MultiIndex):
+ raise TypeError("Can only count levels on hierarchical "
+ "{ax}.".format(ax=self._get_axis_name(axis)))
+
+ if frame._is_mixed_type:
+ # Since we have mixed types, calling notna(frame.values) might
+ # upcast everything to object
+ mask = notna(frame).values
+ else:
+ # But use the speedup when we have homogeneous dtypes
+ mask = notna(frame.values)
+
+ if axis == 1:
+ # We're transposing the mask rather than frame to avoid potential
+ # upcasts to object, which induces a ~20x slowdown
+ mask = mask.T
+
+ if isinstance(level, compat.string_types):
+ level = count_axis._get_level_number(level)
+
+ level_index = count_axis.levels[level]
+ level_codes = ensure_int64(count_axis.codes[level])
+ counts = lib.count_level_2d(mask, level_codes, len(level_index),
+ axis=0)
+
+ result = DataFrame(counts, index=level_index, columns=agg_axis)
+
+ if axis == 1:
+ # Undo our earlier transpose
+ return result.T
+ else:
+ return result
+
+ def _reduce(self, op, name, axis=0, skipna=True, numeric_only=None,
+ filter_type=None, **kwds):
+ if axis is None and filter_type == 'bool':
+ labels = None
+ constructor = None
+ else:
+ # TODO: Make other agg func handle axis=None properly
+ axis = self._get_axis_number(axis)
+ labels = self._get_agg_axis(axis)
+ constructor = self._constructor
+
+ def f(x):
+ return op(x, axis=axis, skipna=skipna, **kwds)
+
+ # exclude timedelta/datetime unless we are uniform types
+ if (axis == 1 and self._is_datelike_mixed_type
+ and (not self._is_homogeneous_type
+ and not is_datetime64tz_dtype(self.dtypes[0]))):
+ numeric_only = True
+
+ if numeric_only is None:
+ try:
+ values = self.values
+ result = f(values)
+
+ if (filter_type == 'bool' and is_object_dtype(values) and
+ axis is None):
+ # work around https://github.com/numpy/numpy/issues/10489
+ # TODO: combine with hasattr(result, 'dtype') further down
+ # hard since we don't have `values` down there.
+ result = np.bool_(result)
+ except Exception as e:
+
+ # try by-column first
+ if filter_type is None and axis == 0:
+ try:
+
+ # this can end up with a non-reduction
+ # but not always. if the types are mixed
+ # with datelike then need to make sure a series
+
+ # we only end up here if we have not specified
+ # numeric_only and yet we have tried a
+ # column-by-column reduction, where we have mixed type.
+ # So let's just do what we can
+ from pandas.core.apply import frame_apply
+ opa = frame_apply(self,
+ func=f,
+ result_type='expand',
+ ignore_failures=True)
+ result = opa.get_result()
+ if result.ndim == self.ndim:
+ result = result.iloc[0]
+ return result
+ except Exception:
+ pass
+
+ if filter_type is None or filter_type == 'numeric':
+ data = self._get_numeric_data()
+ elif filter_type == 'bool':
+ data = self._get_bool_data()
+ else: # pragma: no cover
+ e = NotImplementedError(
+ "Handling exception with filter_type {f} not"
+ "implemented.".format(f=filter_type))
+ raise_with_traceback(e)
+ with np.errstate(all='ignore'):
+ result = f(data.values)
+ labels = data._get_agg_axis(axis)
+ else:
+ if numeric_only:
+ if filter_type is None or filter_type == 'numeric':
+ data = self._get_numeric_data()
+ elif filter_type == 'bool':
+ # GH 25101, # GH 24434
+ data = self._get_bool_data() if axis == 0 else self
+ else: # pragma: no cover
+ msg = ("Generating numeric_only data with filter_type {f}"
+ "not supported.".format(f=filter_type))
+ raise NotImplementedError(msg)
+ values = data.values
+ labels = data._get_agg_axis(axis)
+ else:
+ values = self.values
+ result = f(values)
+
+ if hasattr(result, 'dtype') and is_object_dtype(result.dtype):
+ try:
+ if filter_type is None or filter_type == 'numeric':
+ result = result.astype(np.float64)
+ elif filter_type == 'bool' and notna(result).all():
+ result = result.astype(np.bool_)
+ except (ValueError, TypeError):
+
+ # try to coerce to the original dtypes item by item if we can
+ if axis == 0:
+ result = coerce_to_dtypes(result, self.dtypes)
+
+ if constructor is not None:
+ result = Series(result, index=labels)
+ return result
+
+ def nunique(self, axis=0, dropna=True):
+ """
+ Count distinct observations over requested axis.
+
+ Return Series with number of distinct observations. Can ignore NaN
+ values.
+
+ .. versionadded:: 0.20.0
+
+ Parameters
+ ----------
+ axis : {0 or 'index', 1 or 'columns'}, default 0
+ The axis to use. 0 or 'index' for row-wise, 1 or 'columns' for
+ column-wise.
+ dropna : bool, default True
+ Don't include NaN in the counts.
+
+ Returns
+ -------
+ nunique : Series
+
+ See Also
+ --------
+ Series.nunique: Method nunique for Series.
+ DataFrame.count: Count non-NA cells for each column or row.
+
+ Examples
+ --------
+ >>> df = pd.DataFrame({'A': [1, 2, 3], 'B': [1, 1, 1]})
+ >>> df.nunique()
+ A 3
+ B 1
+ dtype: int64
+
+ >>> df.nunique(axis=1)
+ 0 1
+ 1 2
+ 2 2
+ dtype: int64
+ """
+ return self.apply(Series.nunique, axis=axis, dropna=dropna)
+
+ def idxmin(self, axis=0, skipna=True):
+ """
+ Return index of first occurrence of minimum over requested axis.
+ NA/null values are excluded.
+
+ Parameters
+ ----------
+ axis : {0 or 'index', 1 or 'columns'}, default 0
+ 0 or 'index' for row-wise, 1 or 'columns' for column-wise
+ skipna : boolean, default True
+ Exclude NA/null values. If an entire row/column is NA, the result
+ will be NA.
+
+ Returns
+ -------
+ idxmin : Series
+
+ Raises
+ ------
+ ValueError
+ * If the row/column is empty
+
+ See Also
+ --------
+ Series.idxmin
+
+ Notes
+ -----
+ This method is the DataFrame version of ``ndarray.argmin``.
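+
+ Examples
+ --------
+ With a small frame of arbitrary values, the label of the first minimum
+ in each column is returned:
+
+ >>> df = pd.DataFrame({'a': [3, 1, 2], 'b': [2, 3, 1]})
+ >>> df.idxmin()
+ a    1
+ b    2
+ dtype: int64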
+ """
+ axis = self._get_axis_number(axis)
+ indices = nanops.nanargmin(self.values, axis=axis, skipna=skipna)
+ index = self._get_axis(axis)
+ result = [index[i] if i >= 0 else np.nan for i in indices]
+ return Series(result, index=self._get_agg_axis(axis))
+
+ def idxmax(self, axis=0, skipna=True):
+ """
+ Return index of first occurrence of maximum over requested axis.
+ NA/null values are excluded.
+
+ Parameters
+ ----------
+ axis : {0 or 'index', 1 or 'columns'}, default 0
+ 0 or 'index' for row-wise, 1 or 'columns' for column-wise
+ skipna : boolean, default True
+ Exclude NA/null values. If an entire row/column is NA, the result
+ will be NA.
+
+ Returns
+ -------
+ idxmax : Series
+
+ Raises
+ ------
+ ValueError
+ * If the row/column is empty
+
+ See Also
+ --------
+ Series.idxmax
+
+ Notes
+ -----
+ This method is the DataFrame version of ``ndarray.argmax``.
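+
+ Examples
+ --------
+ With a small frame of arbitrary values, the label of the first maximum
+ in each column is returned:
+
+ >>> df = pd.DataFrame({'a': [3, 1, 2], 'b': [2, 3, 1]})
+ >>> df.idxmax()
+ a    0
+ b    1
+ dtype: int64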
+ """
+ axis = self._get_axis_number(axis)
+ indices = nanops.nanargmax(self.values, axis=axis, skipna=skipna)
+ index = self._get_axis(axis)
+ result = [index[i] if i >= 0 else np.nan for i in indices]
+ return Series(result, index=self._get_agg_axis(axis))
+
+ def _get_agg_axis(self, axis_num):
+ """
+ Let's be explicit about this.
+ """
+ if axis_num == 0:
+ return self.columns
+ elif axis_num == 1:
+ return self.index
+ else:
+ raise ValueError('Axis must be 0 or 1 (got %r)' % axis_num)
+
+ def mode(self, axis=0, numeric_only=False, dropna=True):
+ """
+ Get the mode(s) of each element along the selected axis.
+
+ The mode of a set of values is the value that appears most often.
+ It can be multiple values.
+
+ Parameters
+ ----------
+ axis : {0 or 'index', 1 or 'columns'}, default 0
+ The axis to iterate over while searching for the mode:
+
+ * 0 or 'index' : get mode of each column
+ * 1 or 'columns' : get mode of each row
+ numeric_only : bool, default False
+ If True, only apply to numeric columns.
+ dropna : bool, default True
+ Don't consider counts of NaN/NaT.
+
+ .. versionadded:: 0.24.0
+
+ Returns
+ -------
+ DataFrame
+ The modes of each column or row.
+
+ See Also
+ --------
+ Series.mode : Return the highest frequency value in a Series.
+ Series.value_counts : Return the counts of values in a Series.
+
+ Examples
+ --------
+ >>> df = pd.DataFrame([('bird', 2, 2),
+ ... ('mammal', 4, np.nan),
+ ... ('arthropod', 8, 0),
+ ... ('bird', 2, np.nan)],
+ ... index=('falcon', 'horse', 'spider', 'ostrich'),
+ ... columns=('species', 'legs', 'wings'))
+ >>> df
+ species legs wings
+ falcon bird 2 2.0
+ horse mammal 4 NaN
+ spider arthropod 8 0.0
+ ostrich bird 2 NaN
+
+ By default, missing values are not considered, and the modes of wings
+ are 0.0 and 2.0. The second row of species and legs contains ``NaN``,
+ because those columns have only one mode, but the DataFrame has two rows.
+
+ >>> df.mode()
+ species legs wings
+ 0 bird 2.0 0.0
+ 1 NaN NaN 2.0
+
+ With ``dropna=False``, ``NaN`` values are considered and can be
+ the mode (as for wings).
+
+ >>> df.mode(dropna=False)
+ species legs wings
+ 0 bird 2 NaN
+
+ Setting ``numeric_only=True``, only the mode of numeric columns is
+ computed, and columns of other types are ignored.
+
+ >>> df.mode(numeric_only=True)
+ legs wings
+ 0 2.0 0.0
+ 1 NaN 2.0
+
+ To compute the mode over columns and not rows, use the axis parameter:
+
+ >>> df.mode(axis='columns', numeric_only=True)
+ 0 1
+ falcon 2.0 NaN
+ horse 4.0 NaN
+ spider 0.0 8.0
+ ostrich 2.0 NaN
+ """
+ data = self if not numeric_only else self._get_numeric_data()
+
+ def f(s):
+ return s.mode(dropna=dropna)
+
+ return data.apply(f, axis=axis)
+
+ def quantile(self, q=0.5, axis=0, numeric_only=True,
+ interpolation='linear'):
+ """
+ Return values at the given quantile over requested axis.
+
+ Parameters
+ ----------
+ q : float or array-like, default 0.5 (50% quantile)
+ Value between 0 <= q <= 1, the quantile(s) to compute.
+ axis : {0, 1, 'index', 'columns'} (default 0)
+ Equals 0 or 'index' for row-wise, 1 or 'columns' for column-wise.
+ numeric_only : bool, default True
+ If False, the quantile of datetime and timedelta data will be
+ computed as well.
+ interpolation : {'linear', 'lower', 'higher', 'midpoint', 'nearest'}
+ This optional parameter specifies the interpolation method to use,
+ when the desired quantile lies between two data points `i` and `j`:
+
+ * linear: `i + (j - i) * fraction`, where `fraction` is the
+ fractional part of the index surrounded by `i` and `j`.
+ * lower: `i`.
+ * higher: `j`.
+ * nearest: `i` or `j` whichever is nearest.
+ * midpoint: (`i` + `j`) / 2.
+
+ .. versionadded:: 0.18.0
+
+ Returns
+ -------
+ quantiles : Series or DataFrame
+
+ - If ``q`` is an array, a DataFrame will be returned where the
+ index is ``q``, the columns are the columns of self, and the
+ values are the quantiles.
+ - If ``q`` is a float, a Series will be returned where the
+ index is the columns of self and the values are the quantiles.
+
+ See Also
+ --------
+ core.window.Rolling.quantile: Rolling quantile.
+ numpy.percentile: Numpy function to compute the percentile.
+
+ Examples
+ --------
+ >>> df = pd.DataFrame(np.array([[1, 1], [2, 10], [3, 100], [4, 100]]),
+ ... columns=['a', 'b'])
+ >>> df.quantile(.1)
+ a 1.3
+ b 3.7
+ Name: 0.1, dtype: float64
+ >>> df.quantile([.1, .5])
+ a b
+ 0.1 1.3 3.7
+ 0.5 2.5 55.0
+
+ Specifying `numeric_only=False` will also compute the quantile of
+ datetime and timedelta data.
+
+ >>> df = pd.DataFrame({'A': [1, 2],
+ ... 'B': [pd.Timestamp('2010'),
+ ... pd.Timestamp('2011')],
+ ... 'C': [pd.Timedelta('1 days'),
+ ... pd.Timedelta('2 days')]})
+ >>> df.quantile(0.5, numeric_only=False)
+ A 1.5
+ B 2010-07-02 12:00:00
+ C 1 days 12:00:00
+ Name: 0.5, dtype: object
+ """
+ self._check_percentile(q)
+
+ data = self._get_numeric_data() if numeric_only else self
+ axis = self._get_axis_number(axis)
+ is_transposed = axis == 1
+
+ if is_transposed:
+ data = data.T
+
+ result = data._data.quantile(qs=q,
+ axis=1,
+ interpolation=interpolation,
+ transposed=is_transposed)
+
+ if result.ndim == 2:
+ result = self._constructor(result)
+ else:
+ result = self._constructor_sliced(result, name=q)
+
+ if is_transposed:
+ result = result.T
+
+ return result
+
+ def to_timestamp(self, freq=None, how='start', axis=0, copy=True):
+ """
+ Cast to DatetimeIndex of timestamps, at *beginning* of period.
+
+ Parameters
+ ----------
+ freq : string, default frequency of PeriodIndex
+ Desired frequency
+ how : {'s', 'e', 'start', 'end'}
+ Convention for converting period to timestamp; start of period
+ vs. end
+ axis : {0 or 'index', 1 or 'columns'}, default 0
+ The axis to convert (the index by default)
+ copy : boolean, default True
+ If false then underlying input data is not copied
+
+ Returns
+ -------
+ df : DataFrame with DatetimeIndex
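+
+ Examples
+ --------
+ A minimal example with monthly periods (the column values are
+ arbitrary):
+
+ >>> df = pd.DataFrame({'x': [1, 2, 3]},
+ ...                   index=pd.period_range('2018-01', periods=3,
+ ...                                         freq='M'))
+ >>> df.to_timestamp()
+             x
+ 2018-01-01  1
+ 2018-02-01  2
+ 2018-03-01  3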
+ """
+ new_data = self._data
+ if copy:
+ new_data = new_data.copy()
+
+ axis = self._get_axis_number(axis)
+ if axis == 0:
+ new_data.set_axis(1, self.index.to_timestamp(freq=freq, how=how))
+ elif axis == 1:
+ new_data.set_axis(0, self.columns.to_timestamp(freq=freq, how=how))
+ else: # pragma: no cover
+ raise AssertionError('Axis must be 0 or 1. Got {ax!s}'.format(
+ ax=axis))
+
+ return self._constructor(new_data)
+
+ def to_period(self, freq=None, axis=0, copy=True):
+ """
+ Convert DataFrame from DatetimeIndex to PeriodIndex with desired
+ frequency (inferred from index if not passed).
+
+ Parameters
+ ----------
+ freq : string, default None
+ Desired frequency; if None, the frequency is inferred from the index.
+ axis : {0 or 'index', 1 or 'columns'}, default 0
+ The axis to convert (the index by default)
+ copy : boolean, default True
+ If False then underlying input data is not copied
+
+ Returns
+ -------
+ df : DataFrame with PeriodIndex
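+
+ Examples
+ --------
+ A minimal example with month-end timestamps (the column values are
+ arbitrary):
+
+ >>> df = pd.DataFrame({'x': [1, 2, 3]},
+ ...                   index=pd.date_range('2018-01-31', periods=3,
+ ...                                       freq='M'))
+ >>> df.to_period()
+          x
+ 2018-01  1
+ 2018-02  2
+ 2018-03  3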
+ """
+ new_data = self._data
+ if copy:
+ new_data = new_data.copy()
+
+ axis = self._get_axis_number(axis)
+ if axis == 0:
+ new_data.set_axis(1, self.index.to_period(freq=freq))
+ elif axis == 1:
+ new_data.set_axis(0, self.columns.to_period(freq=freq))
+ else: # pragma: no cover
+ raise AssertionError('Axis must be 0 or 1. Got {ax!s}'.format(
+ ax=axis))
+
+ return self._constructor(new_data)
+
+ def isin(self, values):
+ """
+ Whether each element in the DataFrame is contained in values.
+
+ Parameters
+ ----------
+ values : iterable, Series, DataFrame or dict
+ The result will only be true at a location if all the
+ labels match. If `values` is a Series, that's the index. If
+ `values` is a dict, the keys must be the column names,
+ which must match. If `values` is a DataFrame,
+ then both the index and column labels must match.
+
+ Returns
+ -------
+ DataFrame
+ DataFrame of booleans showing whether each element in the DataFrame
+ is contained in values.
+
+ See Also
+ --------
+ DataFrame.eq: Equality test for DataFrame.
+ Series.isin: Equivalent method on Series.
+ Series.str.contains: Test if pattern or regex is contained within a
+ string of a Series or Index.
+
+ Examples
+ --------
+
+ >>> df = pd.DataFrame({'num_legs': [2, 4], 'num_wings': [2, 0]},
+ ... index=['falcon', 'dog'])
+ >>> df
+ num_legs num_wings
+ falcon 2 2
+ dog 4 0
+
+ When ``values`` is a list check whether every value in the DataFrame
+ is present in the list (which animals have 0 or 2 legs or wings)
+
+ >>> df.isin([0, 2])
+ num_legs num_wings
+ falcon True True
+ dog False True
+
+ When ``values`` is a dict, we can pass values to check for each
+ column separately:
+
+ >>> df.isin({'num_wings': [0, 3]})
+ num_legs num_wings
+ falcon False False
+ dog False True
+
+ When ``values`` is a Series or DataFrame the index and column must
+ match. Note that 'falcon' does not match based on the number of legs
+ in df2.
+
+ >>> other = pd.DataFrame({'num_legs': [8, 2],'num_wings': [0, 2]},
+ ... index=['spider', 'falcon'])
+ >>> df.isin(other)
+ num_legs num_wings
+ falcon True True
+ dog False False
+ """
+ if isinstance(values, dict):
+ from pandas.core.reshape.concat import concat
+ values = collections.defaultdict(list, values)
+ return concat((self.iloc[:, [i]].isin(values[col])
+ for i, col in enumerate(self.columns)), axis=1)
+ elif isinstance(values, Series):
+ if not values.index.is_unique:
+ raise ValueError("cannot compute isin with "
+ "a duplicate axis.")
+ return self.eq(values.reindex_like(self), axis='index')
+ elif isinstance(values, DataFrame):
+ if not (values.columns.is_unique and values.index.is_unique):
+ raise ValueError("cannot compute isin with "
+ "a duplicate axis.")
+ return self.eq(values.reindex_like(self))
+ else:
+ if not is_list_like(values):
+ raise TypeError("only list-like or dict-like objects are "
+ "allowed to be passed to DataFrame.isin(), "
+ "you passed a "
+ "{0!r}".format(type(values).__name__))
+ return DataFrame(
+ algorithms.isin(self.values.ravel(),
+ values).reshape(self.shape), self.index,
+ self.columns)
+
+ # ----------------------------------------------------------------------
+ # Add plotting methods to DataFrame
+ plot = CachedAccessor("plot", gfx.FramePlotMethods)
+ hist = gfx.hist_frame
+ boxplot = gfx.boxplot_frame
+
+
+DataFrame._setup_axes(['index', 'columns'], info_axis=1, stat_axis=0,
+ axes_are_reversed=True, aliases={'rows': 0},
+ docs={
+ 'index': 'The index (row labels) of the DataFrame.',
+ 'columns': 'The column labels of the DataFrame.'})
+DataFrame._add_numeric_operations()
+DataFrame._add_series_or_dataframe_operations()
+
+ops.add_flex_arithmetic_methods(DataFrame)
+ops.add_special_arithmetic_methods(DataFrame)
+
+
+def _from_nested_dict(data):
+ # TODO: this should be seriously cythonized
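+ # Transpose the nested mapping: {index -> {column -> value}} comes in,
+ # {column -> {index -> value}} goes out, so that each column can be
+ # built as a dict keyed by row label.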
+ new_data = OrderedDict()
+ for index, s in compat.iteritems(data):
+ for col, v in compat.iteritems(s):
+ new_data[col] = new_data.get(col, OrderedDict())
+ new_data[col][index] = v
+ return new_data
+
+
+def _put_str(s, space):
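+ # Render ``s`` as text, truncate it to ``space`` characters and pad with
+ # spaces (left-justified) to exactly that width.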
+ return u'{s}'.format(s=s)[:space].ljust(space)
diff --git a/contrib/python/pandas/py2/pandas/core/generic.py b/contrib/python/pandas/py2/pandas/core/generic.py
new file mode 100644
index 00000000000..6b86695a775
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/core/generic.py
@@ -0,0 +1,11039 @@
+# pylint: disable=W0231,E1101
+import collections
+from datetime import timedelta
+import functools
+import gc
+import json
+import operator
+from textwrap import dedent
+import warnings
+import weakref
+
+import numpy as np
+
+from pandas._libs import Timestamp, iNaT, properties
+import pandas.compat as compat
+from pandas.compat import (
+ cPickle as pkl, isidentifier, lrange, lzip, map, set_function_name,
+ string_types, to_str, zip)
+from pandas.compat.numpy import function as nv
+from pandas.errors import AbstractMethodError
+from pandas.util._decorators import (
+ Appender, Substitution, rewrite_axis_style_signature)
+from pandas.util._validators import validate_bool_kwarg, validate_fillna_kwargs
+
+from pandas.core.dtypes.cast import maybe_promote, maybe_upcast_putmask
+from pandas.core.dtypes.common import (
+ ensure_int64, ensure_object, is_bool, is_bool_dtype,
+ is_datetime64_any_dtype, is_datetime64tz_dtype, is_dict_like,
+ is_extension_array_dtype, is_integer, is_list_like, is_number,
+ is_numeric_dtype, is_object_dtype, is_period_arraylike, is_re_compilable,
+ is_scalar, is_timedelta64_dtype, pandas_dtype)
+from pandas.core.dtypes.generic import ABCDataFrame, ABCPanel, ABCSeries
+from pandas.core.dtypes.inference import is_hashable
+from pandas.core.dtypes.missing import isna, notna
+
+import pandas as pd
+from pandas.core import config, missing, nanops
+import pandas.core.algorithms as algos
+from pandas.core.base import PandasObject, SelectionMixin
+import pandas.core.common as com
+from pandas.core.index import (
+ Index, InvalidIndexError, MultiIndex, RangeIndex, ensure_index)
+from pandas.core.indexes.datetimes import DatetimeIndex
+from pandas.core.indexes.period import Period, PeriodIndex
+import pandas.core.indexing as indexing
+from pandas.core.internals import BlockManager
+from pandas.core.ops import _align_method_FRAME
+
+from pandas.io.formats.format import DataFrameFormatter, format_percentiles
+from pandas.io.formats.printing import pprint_thing
+from pandas.tseries.frequencies import to_offset
+
+# goal is to be able to define the docs close to function, while still being
+# able to share
+_shared_docs = dict()
+_shared_doc_kwargs = dict(
+ axes='keywords for axes', klass='NDFrame',
+ axes_single_arg='int or labels for object',
+ args_transpose='axes to permute (int or label for object)',
+ optional_by="""
+ by : str or list of str
+ Name or list of names to sort by""")
+
+# sentinel value to use as kwarg in place of None when None has special meaning
+# and needs to be distinguished from a user explicitly passing None.
+sentinel = object()
+
+
+def _single_replace(self, to_replace, method, inplace, limit):
+ """
+ Replaces values in a Series using the fill method specified when no
+ replacement value is given in the replace method
+ """
+ if self.ndim != 1:
+ raise TypeError('cannot replace {0} with method {1} on a {2}'
+ .format(to_replace, method, type(self).__name__))
+
+ orig_dtype = self.dtype
+ result = self if inplace else self.copy()
+ fill_f = missing.get_fill_func(method)
+
+ mask = missing.mask_missing(result.values, to_replace)
+ values = fill_f(result.values, limit=limit, mask=mask)
+
+ if values.dtype == orig_dtype and inplace:
+ return
+
+ result = pd.Series(values, index=self.index,
+ dtype=self.dtype).__finalize__(self)
+
+ if inplace:
+ self._update_inplace(result._data)
+ return
+
+ return result
+
+
+class NDFrame(PandasObject, SelectionMixin):
+ """
+ N-dimensional analogue of DataFrame. Store multi-dimensional data in a
+ size-mutable, labeled data structure.
+
+ Parameters
+ ----------
+ data : BlockManager
+ axes : list
+ copy : boolean, default False
+ """
+ _internal_names = ['_data', '_cacher', '_item_cache', '_cache', '_is_copy',
+ '_subtyp', '_name', '_index', '_default_kind',
+ '_default_fill_value', '_metadata', '__array_struct__',
+ '__array_interface__']
+ _internal_names_set = set(_internal_names)
+ _accessors = frozenset()
+ _deprecations = frozenset(['as_blocks', 'blocks',
+ 'convert_objects', 'is_copy'])
+ _metadata = []
+ _is_copy = None
+
+ # dummy attribute so that datetime.__eq__(Series/DataFrame) defers
+ # by returning NotImplemented
+ timetuple = None
+
+ # ----------------------------------------------------------------------
+ # Constructors
+
+ def __init__(self, data, axes=None, copy=False, dtype=None,
+ fastpath=False):
+
+ if not fastpath:
+ if dtype is not None:
+ data = data.astype(dtype)
+ elif copy:
+ data = data.copy()
+
+ if axes is not None:
+ for i, ax in enumerate(axes):
+ data = data.reindex_axis(ax, axis=i)
+
+ object.__setattr__(self, '_is_copy', None)
+ object.__setattr__(self, '_data', data)
+ object.__setattr__(self, '_item_cache', {})
+
+ def _init_mgr(self, mgr, axes=None, dtype=None, copy=False):
+ """ passed a manager and a axes dict """
+ for a, axe in axes.items():
+ if axe is not None:
+ mgr = mgr.reindex_axis(axe,
+ axis=self._get_block_manager_axis(a),
+ copy=False)
+
+ # make a copy if explicitly requested
+ if copy:
+ mgr = mgr.copy()
+ if dtype is not None:
+ # avoid further copies if we can
+ if len(mgr.blocks) > 1 or mgr.blocks[0].values.dtype != dtype:
+ mgr = mgr.astype(dtype=dtype)
+ return mgr
+
+ # ----------------------------------------------------------------------
+
+ @property
+ def is_copy(self):
+ """
+ Return the copy.
+ """
+ warnings.warn("Attribute 'is_copy' is deprecated and will be removed "
+ "in a future version.", FutureWarning, stacklevel=2)
+ return self._is_copy
+
+ @is_copy.setter
+ def is_copy(self, msg):
+ warnings.warn("Attribute 'is_copy' is deprecated and will be removed "
+ "in a future version.", FutureWarning, stacklevel=2)
+ self._is_copy = msg
+
+ def _validate_dtype(self, dtype):
+ """ validate the passed dtype """
+
+ if dtype is not None:
+ dtype = pandas_dtype(dtype)
+
+ # a compound dtype
+ if dtype.kind == 'V':
+ raise NotImplementedError("compound dtypes are not implemented"
+ " in the {0} constructor"
+ .format(self.__class__.__name__))
+
+ return dtype
+
+ # ----------------------------------------------------------------------
+ # Construction
+
+ @property
+ def _constructor(self):
+ """Used when a manipulation result has the same dimensions as the
+ original.
+ """
+ raise AbstractMethodError(self)
+
+ @property
+ def _constructor_sliced(self):
+ """Used when a manipulation result has one lower dimension(s) as the
+ original, such as DataFrame single columns slicing.
+ """
+ raise AbstractMethodError(self)
+
+ @property
+ def _constructor_expanddim(self):
+ """Used when a manipulation result has one higher dimension as the
+ original, such as Series.to_frame() and DataFrame.to_panel()
+ """
+ raise NotImplementedError
+
+ # ----------------------------------------------------------------------
+ # Axis
+
+ @classmethod
+ def _setup_axes(cls, axes, info_axis=None, stat_axis=None, aliases=None,
+ slicers=None, axes_are_reversed=False, build_axes=True,
+ ns=None, docs=None):
+ """Provide axes setup for the major PandasObjects.
+
+ Parameters
+ ----------
+ axes : the names of the axes in order (lowest to highest)
+ info_axis : the axis of the selector dimension (int)
+ stat_axis : the axis number to use for the default stats (int)
+ aliases : other names for a single axis (dict)
+ slicers : how axes slice to others (dict)
+ axes_are_reversed : boolean whether to treat passed axes as
+ reversed (DataFrame)
+ build_axes : setup the axis properties (default True)
+ """
+
+ cls._AXIS_ORDERS = axes
+ cls._AXIS_NUMBERS = {a: i for i, a in enumerate(axes)}
+ cls._AXIS_LEN = len(axes)
+ cls._AXIS_ALIASES = aliases or dict()
+ cls._AXIS_IALIASES = {v: k for k, v in cls._AXIS_ALIASES.items()}
+ cls._AXIS_NAMES = dict(enumerate(axes))
+ cls._AXIS_SLICEMAP = slicers or None
+ cls._AXIS_REVERSED = axes_are_reversed
+
+ # typ
+ setattr(cls, '_typ', cls.__name__.lower())
+
+ # indexing support
+ cls._ix = None
+
+ if info_axis is not None:
+ cls._info_axis_number = info_axis
+ cls._info_axis_name = axes[info_axis]
+
+ if stat_axis is not None:
+ cls._stat_axis_number = stat_axis
+ cls._stat_axis_name = axes[stat_axis]
+
+ # setup the actual axis
+ if build_axes:
+
+ def set_axis(a, i):
+ setattr(cls, a, properties.AxisProperty(i, docs.get(a, a)))
+ cls._internal_names_set.add(a)
+
+ if axes_are_reversed:
+ m = cls._AXIS_LEN - 1
+ for i, a in cls._AXIS_NAMES.items():
+ set_axis(a, m - i)
+ else:
+ for i, a in cls._AXIS_NAMES.items():
+ set_axis(a, i)
+
+ assert not isinstance(ns, dict)
+
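+ # Added note (not upstream pandas code): subclasses wire up their axes by
+ # calling ``_setup_axes`` at class-definition time. A hedged sketch of how
+ # a hypothetical 2-D subclass might do so (all argument values here are
+ # illustrative assumptions, not the real Series/DataFrame setup):
+ #
+ # >>> MyFrame._setup_axes(['index', 'columns'], info_axis=1, stat_axis=0,
+ # ...                     axes_are_reversed=True, aliases={'rows': 0},
+ # ...                     docs={'index': 'row labels',
+ # ...                           'columns': 'column labels'})
+ #
+ # Afterwards ``MyFrame`` exposes ``.index`` / ``.columns`` as AxisProperty
+ # descriptors and records the axis name/number mappings used below.
+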
+ def _construct_axes_dict(self, axes=None, **kwargs):
+ """Return an axes dictionary for myself."""
+ d = {a: self._get_axis(a) for a in (axes or self._AXIS_ORDERS)}
+ d.update(kwargs)
+ return d
+
+ @staticmethod
+ def _construct_axes_dict_from(self, axes, **kwargs):
+ """Return an axes dictionary for the passed axes."""
+ d = {a: ax for a, ax in zip(self._AXIS_ORDERS, axes)}
+ d.update(kwargs)
+ return d
+
+ def _construct_axes_dict_for_slice(self, axes=None, **kwargs):
+ """Return an axes dictionary for myself."""
+ d = {self._AXIS_SLICEMAP[a]: self._get_axis(a)
+ for a in (axes or self._AXIS_ORDERS)}
+ d.update(kwargs)
+ return d
+
+ def _construct_axes_from_arguments(
+ self, args, kwargs, require_all=False, sentinel=None):
+ """Construct and returns axes if supplied in args/kwargs.
+
+ If require_all, raise if all axis arguments are not supplied
+ return a tuple of (axes, kwargs).
+
+ sentinel specifies the default parameter when an axis is not
+ supplied; useful to distinguish when a user explicitly passes None
+ in scenarios where None has special meaning.
+ """
+
+ # construct the args
+ args = list(args)
+ for a in self._AXIS_ORDERS:
+
+ # if we have an alias for this axis
+ alias = self._AXIS_IALIASES.get(a)
+ if alias is not None:
+ if a in kwargs:
+ if alias in kwargs:
+ raise TypeError("arguments are mutually exclusive "
+ "for [%s,%s]" % (a, alias))
+ continue
+ if alias in kwargs:
+ kwargs[a] = kwargs.pop(alias)
+ continue
+
+ # look for an argument by position
+ if a not in kwargs:
+ try:
+ kwargs[a] = args.pop(0)
+ except IndexError:
+ if require_all:
+ raise TypeError("not enough/duplicate arguments "
+ "specified!")
+
+ axes = {a: kwargs.pop(a, sentinel) for a in self._AXIS_ORDERS}
+ return axes, kwargs
+
+ @classmethod
+ def _from_axes(cls, data, axes, **kwargs):
+ # for construction from BlockManager
+ if isinstance(data, BlockManager):
+ return cls(data, **kwargs)
+ else:
+ if cls._AXIS_REVERSED:
+ axes = axes[::-1]
+ d = cls._construct_axes_dict_from(cls, axes, copy=False)
+ d.update(kwargs)
+ return cls(data, **d)
+
+ @classmethod
+ def _get_axis_number(cls, axis):
+ axis = cls._AXIS_ALIASES.get(axis, axis)
+ if is_integer(axis):
+ if axis in cls._AXIS_NAMES:
+ return axis
+ else:
+ try:
+ return cls._AXIS_NUMBERS[axis]
+ except KeyError:
+ pass
+ raise ValueError('No axis named {0} for object type {1}'
+ .format(axis, type(cls)))
+
+ @classmethod
+ def _get_axis_name(cls, axis):
+ axis = cls._AXIS_ALIASES.get(axis, axis)
+ if isinstance(axis, string_types):
+ if axis in cls._AXIS_NUMBERS:
+ return axis
+ else:
+ try:
+ return cls._AXIS_NAMES[axis]
+ except KeyError:
+ pass
+ raise ValueError('No axis named {0} for object type {1}'
+ .format(axis, type(cls)))
+
+ def _get_axis(self, axis):
+ name = self._get_axis_name(axis)
+ return getattr(self, name)
+
+ @classmethod
+ def _get_block_manager_axis(cls, axis):
+ """Map the axis to the block_manager axis."""
+ axis = cls._get_axis_number(axis)
+ if cls._AXIS_REVERSED:
+ m = cls._AXIS_LEN - 1
+ return m - axis
+ return axis
+
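+ # Added illustration (not upstream pandas code): axis resolution accepts
+ # both names and numbers, and the block-manager axis is reversed for 2-D
+ # objects because the BlockManager stores columns as its first axis.
+ # A hedged sketch, assuming a regular DataFrame ``df``:
+ #
+ # >>> import pandas as pd
+ # >>> df = pd.DataFrame({'a': [1, 2]})
+ # >>> df._get_axis_number('columns')     # -> 1
+ # >>> df._get_axis_name(0)               # -> 'index'
+ # >>> df._get_block_manager_axis(0)      # -> 1 (reversed internally)
+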
+ def _get_axis_resolvers(self, axis):
+ # index or columns
+ axis_index = getattr(self, axis)
+ d = dict()
+ prefix = axis[0]
+
+ for i, name in enumerate(axis_index.names):
+ if name is not None:
+ key = level = name
+ else:
+ # prefix with 'i' or 'c' depending on the input axis
+ # e.g., you must do ilevel_0 for the 0th level of an unnamed
+ # multiindex
+ key = '{prefix}level_{i}'.format(prefix=prefix, i=i)
+ level = i
+
+ level_values = axis_index.get_level_values(level)
+ s = level_values.to_series()
+ s.index = axis_index
+ d[key] = s
+
+ # put the index/columns itself in the dict
+ if isinstance(axis_index, MultiIndex):
+ dindex = axis_index
+ else:
+ dindex = axis_index.to_series()
+
+ d[axis] = dindex
+ return d
+
+ def _get_index_resolvers(self):
+ d = {}
+ for axis_name in self._AXIS_ORDERS:
+ d.update(self._get_axis_resolvers(axis_name))
+ return d
+
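+ # Added illustration (not upstream pandas code): these resolvers back
+ # ``query`` / ``eval``, where unnamed index levels are addressed as
+ # ``ilevel_0``, ``ilevel_1`` and so on. A hedged example:
+ #
+ # >>> import pandas as pd
+ # >>> df = pd.DataFrame({'x': [1, 2, 3]}, index=[10, 20, 30])
+ # >>> df.query('ilevel_0 >= 20')   # filter on the unnamed index level
+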
+ @property
+ def _info_axis(self):
+ return getattr(self, self._info_axis_name)
+
+ @property
+ def _stat_axis(self):
+ return getattr(self, self._stat_axis_name)
+
+ @property
+ def shape(self):
+ """
+ Return a tuple of axis dimensions
+ """
+ return tuple(len(self._get_axis(a)) for a in self._AXIS_ORDERS)
+
+ @property
+ def axes(self):
+ """
+ Return index label(s) of the internal NDFrame
+ """
+ # we do it this way because if we have reversed axes, then
+ # the block manager shows them reversed
+ return [self._get_axis(a) for a in self._AXIS_ORDERS]
+
+ @property
+ def ndim(self):
+ """
+ Return an int representing the number of axes / array dimensions.
+
+ Return 1 if Series. Otherwise return 2 if DataFrame.
+
+ See Also
+ --------
+ ndarray.ndim : Number of array dimensions.
+
+ Examples
+ --------
+ >>> s = pd.Series({'a': 1, 'b': 2, 'c': 3})
+ >>> s.ndim
+ 1
+
+ >>> df = pd.DataFrame({'col1': [1, 2], 'col2': [3, 4]})
+ >>> df.ndim
+ 2
+ """
+ return self._data.ndim
+
+ @property
+ def size(self):
+ """
+ Return an int representing the number of elements in this object.
+
+ Return the number of rows if Series. Otherwise return the number of
+ rows times number of columns if DataFrame.
+
+ See Also
+ --------
+ ndarray.size : Number of elements in the array.
+
+ Examples
+ --------
+ >>> s = pd.Series({'a': 1, 'b': 2, 'c': 3})
+ >>> s.size
+ 3
+
+ >>> df = pd.DataFrame({'col1': [1, 2], 'col2': [3, 4]})
+ >>> df.size
+ 4
+ """
+ return np.prod(self.shape)
+
+ @property
+ def _selected_obj(self):
+ """ internal compat with SelectionMixin """
+ return self
+
+ @property
+ def _obj_with_exclusions(self):
+ """ internal compat with SelectionMixin """
+ return self
+
+ def _expand_axes(self, key):
+ new_axes = []
+ for k, ax in zip(key, self.axes):
+ if k not in ax:
+ if type(k) != ax.dtype.type:
+ ax = ax.astype('O')
+ new_axes.append(ax.insert(len(ax), k))
+ else:
+ new_axes.append(ax)
+
+ return new_axes
+
+ def set_axis(self, labels, axis=0, inplace=None):
+ """
+ Assign desired index to given axis.
+
+ Indexes for column or row labels can be changed by assigning
+ a list-like or Index.
+
+ .. versionchanged:: 0.21.0
+
+ The signature is now `labels` and `axis`, consistent with
+ the rest of pandas API. Previously, the `axis` and `labels`
+ arguments were respectively the first and second positional
+ arguments.
+
+ Parameters
+ ----------
+ labels : list-like, Index
+ The values for the new index.
+
+ axis : {0 or 'index', 1 or 'columns'}, default 0
+ The axis to update. The value 0 identifies the rows, and 1
+ identifies the columns.
+
+ inplace : boolean, default None
+ Whether to return a new %(klass)s instance.
+
+ .. warning::
+
+ ``inplace=None`` currently falls back to True, but in a
+ future version, will default to False. Use inplace=True
+ explicitly rather than relying on the default.
+
+ Returns
+ -------
+ renamed : %(klass)s or None
+ An object of same type as caller if inplace=False, None otherwise.
+
+ See Also
+ --------
+ DataFrame.rename_axis : Alter the name of the index or columns.
+
+ Examples
+ --------
+ **Series**
+
+ >>> s = pd.Series([1, 2, 3])
+ >>> s
+ 0 1
+ 1 2
+ 2 3
+ dtype: int64
+
+ >>> s.set_axis(['a', 'b', 'c'], axis=0, inplace=False)
+ a 1
+ b 2
+ c 3
+ dtype: int64
+
+ The original object is not modified.
+
+ >>> s
+ 0 1
+ 1 2
+ 2 3
+ dtype: int64
+
+ **DataFrame**
+
+ >>> df = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})
+
+ Change the row labels.
+
+ >>> df.set_axis(['a', 'b', 'c'], axis='index', inplace=False)
+ A B
+ a 1 4
+ b 2 5
+ c 3 6
+
+ Change the column labels.
+
+ >>> df.set_axis(['I', 'II'], axis='columns', inplace=False)
+ I II
+ 0 1 4
+ 1 2 5
+ 2 3 6
+
+ Now, update the labels inplace.
+
+ >>> df.set_axis(['i', 'ii'], axis='columns', inplace=True)
+ >>> df
+ i ii
+ 0 1 4
+ 1 2 5
+ 2 3 6
+ """
+ if is_scalar(labels):
+ warnings.warn(
+ 'set_axis now takes "labels" as first argument, and '
+ '"axis" as named parameter. The old form, with "axis" as '
+ 'first parameter and \"labels\" as second, is still supported '
+ 'but will be deprecated in a future version of pandas.',
+ FutureWarning, stacklevel=2)
+ labels, axis = axis, labels
+
+ if inplace is None:
+ warnings.warn(
+ 'set_axis currently defaults to operating inplace.\nThis '
+ 'will change in a future version of pandas, use '
+ 'inplace=True to avoid this warning.',
+ FutureWarning, stacklevel=2)
+ inplace = True
+ if inplace:
+ setattr(self, self._get_axis_name(axis), labels)
+ else:
+ obj = self.copy()
+ obj.set_axis(labels, axis=axis, inplace=True)
+ return obj
+
+ def _set_axis(self, axis, labels):
+ self._data.set_axis(axis, labels)
+ self._clear_item_cache()
+
+ def transpose(self, *args, **kwargs):
+ """
+ Permute the dimensions of the %(klass)s
+
+ Parameters
+ ----------
+ args : %(args_transpose)s
+ copy : boolean, default False
+ Make a copy of the underlying data. Mixed-dtype data will
+ always result in a copy
+
+ Returns
+ -------
+ y : same as input
+
+ Examples
+ --------
+ >>> p.transpose(2, 0, 1)
+ >>> p.transpose(2, 0, 1, copy=True)
+ """
+
+ # construct the args
+ axes, kwargs = self._construct_axes_from_arguments(args, kwargs,
+ require_all=True)
+ axes_names = tuple(self._get_axis_name(axes[a])
+ for a in self._AXIS_ORDERS)
+ axes_numbers = tuple(self._get_axis_number(axes[a])
+ for a in self._AXIS_ORDERS)
+
+ # we must have unique axes
+ if len(axes) != len(set(axes)):
+ raise ValueError('Must specify %s unique axes' % self._AXIS_LEN)
+
+ new_axes = self._construct_axes_dict_from(self, [self._get_axis(x)
+ for x in axes_names])
+ new_values = self.values.transpose(axes_numbers)
+ if kwargs.pop('copy', None) or (len(args) and args[-1]):
+ new_values = new_values.copy()
+
+ nv.validate_transpose_for_generic(self, kwargs)
+ return self._constructor(new_values, **new_axes).__finalize__(self)
+
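+ # Added usage note (not upstream pandas code): for a DataFrame the common
+ # entry points are ``df.T`` or ``df.transpose()``, which route through the
+ # machinery above with the two axes swapped. A hedged sketch:
+ #
+ # >>> import pandas as pd
+ # >>> df = pd.DataFrame({'a': [1, 2], 'b': [3, 4]})
+ # >>> df.T.shape   # (2, 2), with rows and columns interchanged
+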
+ def swapaxes(self, axis1, axis2, copy=True):
+ """
+ Interchange two axes and swap the values accordingly.
+
+ Returns
+ -------
+ y : same as input
+ """
+ i = self._get_axis_number(axis1)
+ j = self._get_axis_number(axis2)
+
+ if i == j:
+ if copy:
+ return self.copy()
+ return self
+
+ mapping = {i: j, j: i}
+
+ new_axes = (self._get_axis(mapping.get(k, k))
+ for k in range(self._AXIS_LEN))
+ new_values = self.values.swapaxes(i, j)
+ if copy:
+ new_values = new_values.copy()
+
+ return self._constructor(new_values, *new_axes).__finalize__(self)
+
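+ # Added usage note (not upstream pandas code): the docstring above carries
+ # no example, so here is a hedged one. For a 2-D object, swapping axis 0
+ # and axis 1 is equivalent to a transpose:
+ #
+ # >>> import pandas as pd
+ # >>> df = pd.DataFrame({'a': [1, 2], 'b': [3, 4]})
+ # >>> df.swapaxes(0, 1).shape                        # -> (2, 2)
+ # >>> df.swapaxes('index', 'columns').equals(df.T)   # -> True
+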
+ def droplevel(self, level, axis=0):
+ """
+ Return DataFrame with requested index / column level(s) removed.
+
+ .. versionadded:: 0.24.0
+
+ Parameters
+ ----------
+ level : int, str, or list-like
+ If a string is given, must be the name of a level
+ If list-like, elements must be names or positional indexes
+ of levels.
+
+ axis : {0 or 'index', 1 or 'columns'}, default 0
+
+ Returns
+ -------
+ DataFrame
+ DataFrame with the requested index / column level(s) removed.
+
+ Examples
+ --------
+ >>> df = pd.DataFrame([
+ ... [1, 2, 3, 4],
+ ... [5, 6, 7, 8],
+ ... [9, 10, 11, 12]
+ ... ]).set_index([0, 1]).rename_axis(['a', 'b'])
+
+ >>> df.columns = pd.MultiIndex.from_tuples([
+ ... ('c', 'e'), ('d', 'f')
+ ... ], names=['level_1', 'level_2'])
+
+ >>> df
+ level_1 c d
+ level_2 e f
+ a b
+ 1 2 3 4
+ 5 6 7 8
+ 9 10 11 12
+
+ >>> df.droplevel('a')
+ level_1 c d
+ level_2 e f
+ b
+ 2 3 4
+ 6 7 8
+ 10 11 12
+
+ >>> df.droplevel('level_2', axis=1)
+ level_1 c d
+ a b
+ 1 2 3 4
+ 5 6 7 8
+ 9 10 11 12
+ """
+ labels = self._get_axis(axis)
+ new_labels = labels.droplevel(level)
+ result = self.set_axis(new_labels, axis=axis, inplace=False)
+ return result
+
+ def pop(self, item):
+ """
+ Return item and drop from frame. Raise KeyError if not found.
+
+ Parameters
+ ----------
+ item : str
+ Column label to be popped
+
+ Returns
+ -------
+ popped : Series
+
+ Examples
+ --------
+ >>> df = pd.DataFrame([('falcon', 'bird', 389.0),
+ ... ('parrot', 'bird', 24.0),
+ ... ('lion', 'mammal', 80.5),
+ ... ('monkey', 'mammal', np.nan)],
+ ... columns=('name', 'class', 'max_speed'))
+ >>> df
+ name class max_speed
+ 0 falcon bird 389.0
+ 1 parrot bird 24.0
+ 2 lion mammal 80.5
+ 3 monkey mammal NaN
+
+ >>> df.pop('class')
+ 0 bird
+ 1 bird
+ 2 mammal
+ 3 mammal
+ Name: class, dtype: object
+
+ >>> df
+ name max_speed
+ 0 falcon 389.0
+ 1 parrot 24.0
+ 2 lion 80.5
+ 3 monkey NaN
+ """
+ result = self[item]
+ del self[item]
+ try:
+ result._reset_cacher()
+ except AttributeError:
+ pass
+
+ return result
+
+ def squeeze(self, axis=None):
+ """
+ Squeeze 1 dimensional axis objects into scalars.
+
+ Series or DataFrames with a single element are squeezed to a scalar.
+ DataFrames with a single column or a single row are squeezed to a
+ Series. Otherwise the object is unchanged.
+
+ This method is most useful when you don't know if your
+ object is a Series or DataFrame, but you do know it has just a single
+ column. In that case you can safely call `squeeze` to ensure you have a
+ Series.
+
+ Parameters
+ ----------
+ axis : {0 or 'index', 1 or 'columns', None}, default None
+ A specific axis to squeeze. By default, all length-1 axes are
+ squeezed.
+
+ .. versionadded:: 0.20.0
+
+ Returns
+ -------
+ DataFrame, Series, or scalar
+ The projection after squeezing `axis` or all the axes.
+
+ See Also
+ --------
+ Series.iloc : Integer-location based indexing for selecting scalars.
+ DataFrame.iloc : Integer-location based indexing for selecting Series.
+ Series.to_frame : Inverse of DataFrame.squeeze for a
+ single-column DataFrame.
+
+ Examples
+ --------
+ >>> primes = pd.Series([2, 3, 5, 7])
+
+ Slicing might produce a Series with a single value:
+
+ >>> even_primes = primes[primes % 2 == 0]
+ >>> even_primes
+ 0 2
+ dtype: int64
+
+ >>> even_primes.squeeze()
+ 2
+
+ Squeezing objects with more than one value in every axis does nothing:
+
+ >>> odd_primes = primes[primes % 2 == 1]
+ >>> odd_primes
+ 1 3
+ 2 5
+ 3 7
+ dtype: int64
+
+ >>> odd_primes.squeeze()
+ 1 3
+ 2 5
+ 3 7
+ dtype: int64
+
+ Squeezing is even more effective when used with DataFrames.
+
+ >>> df = pd.DataFrame([[1, 2], [3, 4]], columns=['a', 'b'])
+ >>> df
+ a b
+ 0 1 2
+ 1 3 4
+
+ Slicing a single column will produce a DataFrame with the columns
+ having only one value:
+
+ >>> df_a = df[['a']]
+ >>> df_a
+ a
+ 0 1
+ 1 3
+
+ So the columns can be squeezed down, resulting in a Series:
+
+ >>> df_a.squeeze('columns')
+ 0 1
+ 1 3
+ Name: a, dtype: int64
+
+ Slicing a single row from a single column will produce a single
+ scalar DataFrame:
+
+ >>> df_0a = df.loc[df.index < 1, ['a']]
+ >>> df_0a
+ a
+ 0 1
+
+ Squeezing the rows produces a single scalar Series:
+
+ >>> df_0a.squeeze('rows')
+ a 1
+ Name: 0, dtype: int64
+
+ Squeezing all axes will project directly into a scalar:
+
+ >>> df_0a.squeeze()
+ 1
+ """
+ axis = (self._AXIS_NAMES if axis is None else
+ (self._get_axis_number(axis),))
+ try:
+ return self.iloc[
+ tuple(0 if i in axis and len(a) == 1 else slice(None)
+ for i, a in enumerate(self.axes))]
+ except Exception:
+ return self
+
+ def swaplevel(self, i=-2, j=-1, axis=0):
+ """
+ Swap levels i and j in a MultiIndex on a particular axis
+
+ Parameters
+ ----------
+ i, j : int, string (can be mixed)
+ Level of index to be swapped. Can pass level name as string.
+
+ Returns
+ -------
+ swapped : same type as caller (new object)
+
+ .. versionchanged:: 0.18.1
+
+ The indexes ``i`` and ``j`` are now optional, and default to
+ the two innermost levels of the index.
+
+ """
+ axis = self._get_axis_number(axis)
+ result = self.copy()
+ labels = result._data.axes[axis]
+ result._data.set_axis(axis, labels.swaplevel(i, j))
+ return result
+
+ # ----------------------------------------------------------------------
+ # Rename
+
+ def rename(self, *args, **kwargs):
+ """
+ Alter axes labels using an input function or mapping. Function / dict values must be
+ unique (1-to-1). Labels not contained in a dict / Series will be left
+ as-is. Extra labels listed don't throw an error. Alternatively, change
+ ``Series.name`` with a scalar value (Series only).
+
+ Parameters
+ ----------
+ %(axes)s : scalar, list-like, dict-like or function, optional
+ Scalar or list-like will alter the ``Series.name`` attribute,
+ and raise on DataFrame or Panel.
+ dict-like or functions are transformations to apply to
+ that axis' values
+ copy : boolean, default True
+ Also copy underlying data
+ inplace : boolean, default False
+ Whether to return a new %(klass)s. If True then value of copy is
+ ignored.
+ level : int or level name, default None
+ In case of a MultiIndex, only rename labels in the specified
+ level.
+
+ Returns
+ -------
+ renamed : %(klass)s (new object)
+
+ See Also
+ --------
+ pandas.NDFrame.rename_axis
+
+ Examples
+ --------
+
+ >>> s = pd.Series([1, 2, 3])
+ >>> s
+ 0 1
+ 1 2
+ 2 3
+ dtype: int64
+ >>> s.rename("my_name") # scalar, changes Series.name
+ 0 1
+ 1 2
+ 2 3
+ Name: my_name, dtype: int64
+ >>> s.rename(lambda x: x ** 2) # function, changes labels
+ 0 1
+ 1 2
+ 4 3
+ dtype: int64
+ >>> s.rename({1: 3, 2: 5}) # mapping, changes labels
+ 0 1
+ 3 2
+ 5 3
+ dtype: int64
+
+ Since ``DataFrame`` doesn't have a ``.name`` attribute,
+ only mapping-type arguments are allowed.
+
+ >>> df = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})
+ >>> df.rename(2)
+ Traceback (most recent call last):
+ ...
+ TypeError: 'int' object is not callable
+
+ ``DataFrame.rename`` supports two calling conventions
+
+ * ``(index=index_mapper, columns=columns_mapper, ...)``
+ * ``(mapper, axis={'index', 'columns'}, ...)``
+
+ We *highly* recommend using keyword arguments to clarify your
+ intent.
+
+ >>> df.rename(index=str, columns={"A": "a", "B": "c"})
+ a c
+ 0 1 4
+ 1 2 5
+ 2 3 6
+
+ >>> df.rename(index=str, columns={"A": "a", "C": "c"})
+ a B
+ 0 1 4
+ 1 2 5
+ 2 3 6
+
+ Using axis-style parameters
+
+ >>> df.rename(str.lower, axis='columns')
+ a b
+ 0 1 4
+ 1 2 5
+ 2 3 6
+
+ >>> df.rename({1: 2, 2: 4}, axis='index')
+ A B
+ 0 1 4
+ 2 2 5
+ 4 3 6
+
+ See the :ref:`user guide <basics.rename>` for more.
+ """
+ axes, kwargs = self._construct_axes_from_arguments(args, kwargs)
+ copy = kwargs.pop('copy', True)
+ inplace = kwargs.pop('inplace', False)
+ level = kwargs.pop('level', None)
+ axis = kwargs.pop('axis', None)
+ if axis is not None:
+ # Validate the axis
+ self._get_axis_number(axis)
+
+ if kwargs:
+ raise TypeError('rename() got an unexpected keyword '
+ 'argument "{0}"'.format(list(kwargs.keys())[0]))
+
+ if com.count_not_none(*axes.values()) == 0:
+ raise TypeError('must pass an index to rename')
+
+ self._consolidate_inplace()
+ result = self if inplace else self.copy(deep=copy)
+
+ # start in the axis order to eliminate too many copies
+ for axis in lrange(self._AXIS_LEN):
+ v = axes.get(self._AXIS_NAMES[axis])
+ if v is None:
+ continue
+ f = com._get_rename_function(v)
+
+ baxis = self._get_block_manager_axis(axis)
+ if level is not None:
+ level = self.axes[axis]._get_level_number(level)
+ result._data = result._data.rename_axis(f, axis=baxis, copy=copy,
+ level=level)
+ result._clear_item_cache()
+
+ if inplace:
+ self._update_inplace(result._data)
+ else:
+ return result.__finalize__(self)
+
+ @rewrite_axis_style_signature('mapper', [('copy', True),
+ ('inplace', False)])
+ def rename_axis(self, mapper=sentinel, **kwargs):
+ """
+ Set the name of the axis for the index or columns.
+
+ Parameters
+ ----------
+ mapper : scalar, list-like, optional
+ Value to set the axis name attribute.
+ index, columns : scalar, list-like, dict-like or function, optional
+ A scalar, list-like, dict-like or functions transformations to
+ apply to that axis' values.
+
+ Use either ``mapper`` and ``axis`` to
+ specify the axis to target with ``mapper``, or ``index``
+ and/or ``columns``.
+
+ .. versionchanged:: 0.24.0
+
+ axis : {0 or 'index', 1 or 'columns'}, default 0
+ The axis to rename.
+ copy : bool, default True
+ Also copy underlying data.
+ inplace : bool, default False
+ Modifies the object directly, instead of creating a new Series
+ or DataFrame.
+
+ Returns
+ -------
+ Series, DataFrame, or None
+ The same type as the caller or None if `inplace` is True.
+
+ See Also
+ --------
+ Series.rename : Alter Series index labels or name.
+ DataFrame.rename : Alter DataFrame index labels or name.
+ Index.rename : Set new names on index.
+
+ Notes
+ -----
+ Prior to version 0.21.0, ``rename_axis`` could also be used to change
+ the axis *labels* by passing a mapping or scalar. This behavior is
+ deprecated and will be removed in a future version. Use ``rename``
+ instead.
+
+ ``DataFrame.rename_axis`` supports two calling conventions
+
+ * ``(index=index_mapper, columns=columns_mapper, ...)``
+ * ``(mapper, axis={'index', 'columns'}, ...)``
+
+ The first calling convention will only modify the names of
+ the index and/or the names of the Index object that is the columns.
+ In this case, the parameter ``copy`` is ignored.
+
+ The second calling convention will modify the names of the
+ corresponding index if mapper is a list or a scalar.
+ However, if mapper is dict-like or a function, it will use the
+ deprecated behavior of modifying the axis *labels*.
+
+ We *highly* recommend using keyword arguments to clarify your
+ intent.
+
+ Examples
+ --------
+ **Series**
+
+ >>> s = pd.Series(["dog", "cat", "monkey"])
+ >>> s
+ 0 dog
+ 1 cat
+ 2 monkey
+ dtype: object
+ >>> s.rename_axis("animal")
+ animal
+ 0 dog
+ 1 cat
+ 2 monkey
+ dtype: object
+
+ **DataFrame**
+
+ >>> df = pd.DataFrame({"num_legs": [4, 4, 2],
+ ... "num_arms": [0, 0, 2]},
+ ... ["dog", "cat", "monkey"])
+ >>> df
+ num_legs num_arms
+ dog 4 0
+ cat 4 0
+ monkey 2 2
+ >>> df = df.rename_axis("animal")
+ >>> df
+ num_legs num_arms
+ animal
+ dog 4 0
+ cat 4 0
+ monkey 2 2
+ >>> df = df.rename_axis("limbs", axis="columns")
+ >>> df
+ limbs num_legs num_arms
+ animal
+ dog 4 0
+ cat 4 0
+ monkey 2 2
+
+ **MultiIndex**
+
+ >>> df.index = pd.MultiIndex.from_product([['mammal'],
+ ... ['dog', 'cat', 'monkey']],
+ ... names=['type', 'name'])
+ >>> df
+ limbs num_legs num_arms
+ type name
+ mammal dog 4 0
+ cat 4 0
+ monkey 2 2
+
+ >>> df.rename_axis(index={'type': 'class'})
+ limbs num_legs num_arms
+ class name
+ mammal dog 4 0
+ cat 4 0
+ monkey 2 2
+
+ >>> df.rename_axis(columns=str.upper)
+ LIMBS num_legs num_arms
+ type name
+ mammal dog 4 0
+ cat 4 0
+ monkey 2 2
+ """
+ axes, kwargs = self._construct_axes_from_arguments(
+ (), kwargs, sentinel=sentinel)
+ copy = kwargs.pop('copy', True)
+ inplace = kwargs.pop('inplace', False)
+ axis = kwargs.pop('axis', 0)
+ if axis is not None:
+ axis = self._get_axis_number(axis)
+
+ if kwargs:
+ raise TypeError('rename_axis() got an unexpected keyword '
+ 'argument "{0}"'.format(list(kwargs.keys())[0]))
+
+ inplace = validate_bool_kwarg(inplace, 'inplace')
+
+ if (mapper is not sentinel):
+ # Use v0.23 behavior if a scalar or list
+ non_mapper = is_scalar(mapper) or (is_list_like(mapper) and not
+ is_dict_like(mapper))
+ if non_mapper:
+ return self._set_axis_name(mapper, axis=axis, inplace=inplace)
+ else:
+ # Deprecated (v0.21) behavior is if mapper is specified,
+ # and not a list or scalar, then call rename
+ msg = ("Using 'rename_axis' to alter labels is deprecated. "
+ "Use '.rename' instead")
+ warnings.warn(msg, FutureWarning, stacklevel=3)
+ axis = self._get_axis_name(axis)
+ d = {'copy': copy, 'inplace': inplace}
+ d[axis] = mapper
+ return self.rename(**d)
+ else:
+ # Use new behavior. Means that index and/or columns
+ # is specified
+ result = self if inplace else self.copy(deep=copy)
+
+ for axis in lrange(self._AXIS_LEN):
+ v = axes.get(self._AXIS_NAMES[axis])
+ if v is sentinel:
+ continue
+ non_mapper = is_scalar(v) or (is_list_like(v) and not
+ is_dict_like(v))
+ if non_mapper:
+ newnames = v
+ else:
+ f = com._get_rename_function(v)
+ curnames = self._get_axis(axis).names
+ newnames = [f(name) for name in curnames]
+ result._set_axis_name(newnames, axis=axis,
+ inplace=True)
+ if not inplace:
+ return result
+
+ def _set_axis_name(self, name, axis=0, inplace=False):
+ """
+ Set the name(s) of the axis.
+
+ Parameters
+ ----------
+ name : str or list of str
+ Name(s) to set.
+ axis : {0 or 'index', 1 or 'columns'}, default 0
+ The axis to set the label. The value 0 or 'index' specifies index,
+ and the value 1 or 'columns' specifies columns.
+ inplace : bool, default False
+ If `True`, do operation inplace and return None.
+
+ .. versionadded:: 0.21.0
+
+ Returns
+ -------
+ Series, DataFrame, or None
+ The same type as the caller or `None` if `inplace` is `True`.
+
+ See Also
+ --------
+ DataFrame.rename : Alter the axis labels of :class:`DataFrame`.
+ Series.rename : Alter the index labels or set the index name
+ of :class:`Series`.
+ Index.rename : Set the name of :class:`Index` or :class:`MultiIndex`.
+
+ Examples
+ --------
+ >>> df = pd.DataFrame({"num_legs": [4, 4, 2]},
+ ... ["dog", "cat", "monkey"])
+ >>> df
+ num_legs
+ dog 4
+ cat 4
+ monkey 2
+ >>> df._set_axis_name("animal")
+ num_legs
+ animal
+ dog 4
+ cat 4
+ monkey 2
+ >>> df.index = pd.MultiIndex.from_product(
+ ... [["mammal"], ['dog', 'cat', 'monkey']])
+ >>> df._set_axis_name(["type", "name"])
+ num_legs
+ type name
+ mammal dog 4
+ cat 4
+ monkey 2
+ """
+ axis = self._get_axis_number(axis)
+ idx = self._get_axis(axis).set_names(name)
+
+ inplace = validate_bool_kwarg(inplace, 'inplace')
+ renamed = self if inplace else self.copy()
+ renamed.set_axis(idx, axis=axis, inplace=True)
+ if not inplace:
+ return renamed
+
+ # ----------------------------------------------------------------------
+ # Comparison Methods
+
+ def _indexed_same(self, other):
+ return all(self._get_axis(a).equals(other._get_axis(a))
+ for a in self._AXIS_ORDERS)
+
+ def equals(self, other):
+ """
+ Test whether two objects contain the same elements.
+
+ This function allows two Series or DataFrames to be compared against
+ each other to see if they have the same shape and elements. NaNs in
+ the same location are considered equal. The column headers do not
+ need to have the same type, but the elements within the columns must
+ be the same dtype.
+
+ Parameters
+ ----------
+ other : Series or DataFrame
+ The other Series or DataFrame to be compared with the first.
+
+ Returns
+ -------
+ bool
+ True if all elements are the same in both objects, False
+ otherwise.
+
+ See Also
+ --------
+ Series.eq : Compare two Series objects of the same length
+ and return a Series where each element is True if the element
+ in each Series is equal, False otherwise.
+ DataFrame.eq : Compare two DataFrame objects of the same shape and
+ return a DataFrame where each element is True if the respective
+ element in each DataFrame is equal, False otherwise.
+ assert_series_equal : Return True if left and right Series are equal,
+ False otherwise.
+ assert_frame_equal : Return True if left and right DataFrames are
+ equal, False otherwise.
+ numpy.array_equal : Return True if two arrays have the same shape
+ and elements, False otherwise.
+
+ Notes
+ -----
+ This function requires that the elements have the same dtype as their
+ respective elements in the other Series or DataFrame. However, the
+ column labels do not need to have the same type, as long as they are
+ still considered equal.
+
+ Examples
+ --------
+ >>> df = pd.DataFrame({1: [10], 2: [20]})
+ >>> df
+ 1 2
+ 0 10 20
+
+ DataFrames df and exactly_equal have the same types and values for
+ their elements and column labels, which will return True.
+
+ >>> exactly_equal = pd.DataFrame({1: [10], 2: [20]})
+ >>> exactly_equal
+ 1 2
+ 0 10 20
+ >>> df.equals(exactly_equal)
+ True
+
+ DataFrames df and different_column_type have the same element
+ types and values, but have different types for the column labels,
+ which will still return True.
+
+ >>> different_column_type = pd.DataFrame({1.0: [10], 2.0: [20]})
+ >>> different_column_type
+ 1.0 2.0
+ 0 10 20
+ >>> df.equals(different_column_type)
+ True
+
+ DataFrames df and different_data_type have different types for the
+ same values for their elements, and will return False even though
+ their column labels are the same values and types.
+
+ >>> different_data_type = pd.DataFrame({1: [10.0], 2: [20.0]})
+ >>> different_data_type
+ 1 2
+ 0 10.0 20.0
+ >>> df.equals(different_data_type)
+ False
+ """
+ if not isinstance(other, self._constructor):
+ return False
+ return self._data.equals(other._data)
+
+ # -------------------------------------------------------------------------
+ # Unary Methods
+
+ def __neg__(self):
+ values = com.values_from_object(self)
+ if is_bool_dtype(values):
+ arr = operator.inv(values)
+ elif (is_numeric_dtype(values) or is_timedelta64_dtype(values)
+ or is_object_dtype(values)):
+ arr = operator.neg(values)
+ else:
+ raise TypeError("Unary negative expects numeric dtype, not {}"
+ .format(values.dtype))
+ return self.__array_wrap__(arr)
+
+ def __pos__(self):
+ values = com.values_from_object(self)
+ if (is_bool_dtype(values) or is_period_arraylike(values)):
+ arr = values
+ elif (is_numeric_dtype(values) or is_timedelta64_dtype(values)
+ or is_object_dtype(values)):
+ arr = operator.pos(values)
+ else:
+ raise TypeError("Unary plus expects numeric dtype, not {}"
+ .format(values.dtype))
+ return self.__array_wrap__(arr)
+
+ def __invert__(self):
+ try:
+ arr = operator.inv(com.values_from_object(self))
+ return self.__array_wrap__(arr)
+ except Exception:
+
+ # inv fails with 0 len
+ if not np.prod(self.shape):
+ return self
+
+ raise
+
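+ # Added illustration (not upstream pandas code): the unary hooks above
+ # negate, pass through, or invert the underlying values and re-wrap them
+ # via ``__array_wrap__``. A hedged sketch:
+ #
+ # >>> import pandas as pd
+ # >>> -pd.Series([1, -2, 3])        # elementwise negation
+ # >>> ~pd.Series([True, False])     # boolean inversion
+ # >>> +pd.Series([1.5, -2.5])       # unary plus is a no-op for numeric data
+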
+ def __nonzero__(self):
+ raise ValueError("The truth value of a {0} is ambiguous. "
+ "Use a.empty, a.bool(), a.item(), a.any() or a.all()."
+ .format(self.__class__.__name__))
+
+ __bool__ = __nonzero__
+
+ def bool(self):
+ """
+ Return the bool of a single element PandasObject.
+
+ This must be a boolean scalar value, either True or False. Raise a
+ ValueError if the PandasObject does not have exactly 1 element, or if
+ that element is not boolean.
+ """
+ v = self.squeeze()
+ if isinstance(v, (bool, np.bool_)):
+ return bool(v)
+ elif is_scalar(v):
+ raise ValueError("bool cannot act on a non-boolean single element "
+ "{0}".format(self.__class__.__name__))
+
+ self.__nonzero__()
+
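+ # Added usage note (not upstream pandas code): ``bool()`` only succeeds for
+ # a single boolean element; anything else raises. A hedged sketch:
+ #
+ # >>> import pandas as pd
+ # >>> pd.Series([True]).bool()    # -> True
+ # >>> pd.Series([False]).bool()   # -> False
+ # >>> pd.Series([1]).bool()       # raises ValueError (non-boolean element)
+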
+ def __abs__(self):
+ return self.abs()
+
+ def __round__(self, decimals=0):
+ return self.round(decimals)
+
+ # -------------------------------------------------------------------------
+ # Label or Level Combination Helpers
+ #
+ # A collection of helper methods for DataFrame/Series operations that
+ # accept a combination of column/index labels and levels. All such
+ # operations should utilize/extend these methods when possible so that we
+ # have consistent precedence and validation logic throughout the library.
+
+ def _is_level_reference(self, key, axis=0):
+ """
+ Test whether a key is a level reference for a given axis.
+
+ To be considered a level reference, `key` must be a string that:
+ - (axis=0): Matches the name of an index level and does NOT match
+ a column label.
+ - (axis=1): Matches the name of a column level and does NOT match
+ an index label.
+
+ Parameters
+ ----------
+ key : str
+ Potential level name for the given axis
+ axis : int, default 0
+ Axis that levels are associated with (0 for index, 1 for columns)
+
+ Returns
+ -------
+ is_level : bool
+ """
+ axis = self._get_axis_number(axis)
+
+ if self.ndim > 2:
+ raise NotImplementedError(
+ "_is_level_reference is not implemented for {type}"
+ .format(type=type(self)))
+
+ return (key is not None and
+ is_hashable(key) and
+ key in self.axes[axis].names and
+ not self._is_label_reference(key, axis=axis))
+
+ def _is_label_reference(self, key, axis=0):
+ """
+ Test whether a key is a label reference for a given axis.
+
+ To be considered a label reference, `key` must be a string that:
+ - (axis=0): Matches a column label
+ - (axis=1): Matches an index label
+
+ Parameters
+ ----------
+ key: str
+ Potential label name
+ axis: int, default 0
+ Axis perpendicular to the axis that labels are associated with
+ (0 means search for column labels, 1 means search for index labels)
+
+ Returns
+ -------
+ is_label: bool
+ """
+ axis = self._get_axis_number(axis)
+ other_axes = [ax for ax in range(self._AXIS_LEN) if ax != axis]
+
+ if self.ndim > 2:
+ raise NotImplementedError(
+ "_is_label_reference is not implemented for {type}"
+ .format(type=type(self)))
+
+ return (key is not None and
+ is_hashable(key) and
+ any(key in self.axes[ax] for ax in other_axes))
+
+ def _is_label_or_level_reference(self, key, axis=0):
+ """
+ Test whether a key is a label or level reference for a given axis.
+
+ To be considered either a label or a level reference, `key` must be a
+ string that:
+ - (axis=0): Matches a column label or an index level
+ - (axis=1): Matches an index label or a column level
+
+ Parameters
+ ----------
+ key: str
+ Potential label or level name
+ axis: int, default 0
+ Axis that levels are associated with (0 for index, 1 for columns)
+
+ Returns
+ -------
+ is_label_or_level: bool
+ """
+
+ if self.ndim > 2:
+ raise NotImplementedError(
+ "_is_label_or_level_reference is not implemented for {type}"
+ .format(type=type(self)))
+
+ return (self._is_level_reference(key, axis=axis) or
+ self._is_label_reference(key, axis=axis))
+
+ def _check_label_or_level_ambiguity(self, key, axis=0):
+ """
+ Check whether `key` is ambiguous.
+
+ By ambiguous, we mean that it matches both a level of the input
+ `axis` and a label of the other axis.
+
+ Parameters
+ ----------
+ key: str or object
+ label or level name
+ axis: int, default 0
+ Axis that levels are associated with (0 for index, 1 for columns)
+
+ Raises
+ ------
+ ValueError: `key` is ambiguous
+ """
+
+ axis = self._get_axis_number(axis)
+ other_axes = [ax for ax in range(self._AXIS_LEN) if ax != axis]
+
+ if self.ndim > 2:
+ raise NotImplementedError(
+ "_check_label_or_level_ambiguity is not implemented for {type}"
+ .format(type=type(self)))
+
+ if (key is not None and
+ is_hashable(key) and
+ key in self.axes[axis].names and
+ any(key in self.axes[ax] for ax in other_axes)):
+
+ # Build an informative and grammatical warning
+ level_article, level_type = (('an', 'index')
+ if axis == 0 else
+ ('a', 'column'))
+
+ label_article, label_type = (('a', 'column')
+ if axis == 0 else
+ ('an', 'index'))
+
+ msg = ("'{key}' is both {level_article} {level_type} level and "
+ "{label_article} {label_type} label, which is ambiguous."
+ ).format(key=key,
+ level_article=level_article,
+ level_type=level_type,
+ label_article=label_article,
+ label_type=label_type)
+ raise ValueError(msg)
+
+ def _get_label_or_level_values(self, key, axis=0):
+ """
+ Return a 1-D array of values associated with `key`, a label or level
+ from the given `axis`.
+
+ Retrieval logic:
+ - (axis=0): Return column values if `key` matches a column label.
+ Otherwise return index level values if `key` matches an index
+ level.
+ - (axis=1): Return row values if `key` matches an index label.
+ Otherwise return column level values if 'key' matches a column
+ level
+
+ Parameters
+ ----------
+ key: str
+ Label or level name.
+ axis: int, default 0
+ Axis that levels are associated with (0 for index, 1 for columns)
+
+ Returns
+ -------
+ values: np.ndarray
+
+ Raises
+ ------
+ KeyError
+ if `key` matches neither a label nor a level
+ ValueError
+ if `key` matches multiple labels
+ FutureWarning
+ if `key` is ambiguous. This will become an ambiguity error in a
+ future version
+ """
+
+ axis = self._get_axis_number(axis)
+ other_axes = [ax for ax in range(self._AXIS_LEN) if ax != axis]
+
+ if self.ndim > 2:
+ raise NotImplementedError(
+ "_get_label_or_level_values is not implemented for {type}"
+ .format(type=type(self)))
+
+ if self._is_label_reference(key, axis=axis):
+ self._check_label_or_level_ambiguity(key, axis=axis)
+ values = self.xs(key, axis=other_axes[0])._values
+ elif self._is_level_reference(key, axis=axis):
+ values = self.axes[axis].get_level_values(key)._values
+ else:
+ raise KeyError(key)
+
+ # Check for duplicates
+ if values.ndim > 1:
+
+ if other_axes and isinstance(
+ self._get_axis(other_axes[0]), MultiIndex):
+ multi_message = ('\n'
+ 'For a multi-index, the label must be a '
+ 'tuple with elements corresponding to '
+ 'each level.')
+ else:
+ multi_message = ''
+
+ label_axis_name = 'column' if axis == 0 else 'index'
+ raise ValueError(("The {label_axis_name} label '{key}' "
+ "is not unique.{multi_message}")
+ .format(key=key,
+ label_axis_name=label_axis_name,
+ multi_message=multi_message))
+
+ return values
+
+ def _drop_labels_or_levels(self, keys, axis=0):
+ """
+ Drop labels and/or levels for the given `axis`.
+
+ For each key in `keys`:
+ - (axis=0): If key matches a column label then drop the column.
+ Otherwise if key matches an index level then drop the level.
+ - (axis=1): If key matches an index label then drop the row.
+ Otherwise if key matches a column level then drop the level.
+
+ Parameters
+ ----------
+ keys: str or list of str
+ labels or levels to drop
+ axis: int, default 0
+ Axis that levels are associated with (0 for index, 1 for columns)
+
+ Returns
+ -------
+ dropped: DataFrame
+
+ Raises
+ ------
+ ValueError
+ if any `keys` match neither a label nor a level
+ """
+
+ axis = self._get_axis_number(axis)
+
+ if self.ndim > 2:
+ raise NotImplementedError(
+ "_drop_labels_or_levels is not implemented for {type}"
+ .format(type=type(self)))
+
+ # Validate keys
+ keys = com.maybe_make_list(keys)
+ invalid_keys = [k for k in keys if not
+ self._is_label_or_level_reference(k, axis=axis)]
+
+ if invalid_keys:
+ raise ValueError(("The following keys are not valid labels or "
+ "levels for axis {axis}: {invalid_keys}")
+ .format(axis=axis,
+ invalid_keys=invalid_keys))
+
+ # Compute levels and labels to drop
+ levels_to_drop = [k for k in keys
+ if self._is_level_reference(k, axis=axis)]
+
+ labels_to_drop = [k for k in keys
+ if not self._is_level_reference(k, axis=axis)]
+
+ # Perform copy upfront and then use inplace operations below.
+ # This ensures that we always perform exactly one copy.
+ # ``copy`` and/or ``inplace`` options could be added in the future.
+ dropped = self.copy()
+
+ if axis == 0:
+ # Handle dropping index levels
+ if levels_to_drop:
+ dropped.reset_index(levels_to_drop, drop=True, inplace=True)
+
+ # Handle dropping columns labels
+ if labels_to_drop:
+ dropped.drop(labels_to_drop, axis=1, inplace=True)
+ else:
+ # Handle dropping column levels
+ if levels_to_drop:
+ if isinstance(dropped.columns, MultiIndex):
+ # Drop the specified levels from the MultiIndex
+ dropped.columns = dropped.columns.droplevel(levels_to_drop)
+ else:
+ # Drop the last level of Index by replacing with
+ # a RangeIndex
+ dropped.columns = RangeIndex(dropped.columns.size)
+
+ # Handle dropping index labels
+ if labels_to_drop:
+ dropped.drop(labels_to_drop, axis=0, inplace=True)
+
+ return dropped
+
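+ # Added illustration (not upstream pandas code): a hedged sketch of the
+ # behaviour documented above -- keys matching an index level are dropped as
+ # levels, keys matching a column label are dropped as columns:
+ #
+ # >>> import pandas as pd
+ # >>> df = pd.DataFrame({'a': [1, 2], 'b': [3, 4]}).set_index('a')
+ # >>> df._drop_labels_or_levels('a').columns.tolist()   # ['b']; level gone
+ # >>> df._drop_labels_or_levels('b').columns.tolist()   # []; column gone
+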
+ # ----------------------------------------------------------------------
+ # Iteration
+
+ def __hash__(self):
+ raise TypeError('{0!r} objects are mutable, thus they cannot be'
+ ' hashed'.format(self.__class__.__name__))
+
+ def __iter__(self):
+ """Iterate over infor axis"""
+ return iter(self._info_axis)
+
+ # can we get a better explanation of this?
+ def keys(self):
+ """Get the 'info axis' (see Indexing for more)
+
+ This is index for Series, columns for DataFrame and major_axis for
+ Panel.
+ """
+ return self._info_axis
+
+ def iteritems(self):
+ """Iterate over (label, values) on info axis
+
+ This is index for Series, columns for DataFrame, major_axis for Panel,
+ and so on.
+ """
+ for h in self._info_axis:
+ yield h, self[h]
+
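+ # Added usage note (not upstream pandas code): both helpers walk the info
+ # axis, i.e. the index of a Series and the columns of a DataFrame. A hedged
+ # sketch:
+ #
+ # >>> import pandas as pd
+ # >>> df = pd.DataFrame({'a': [1, 2], 'b': [3, 4]})
+ # >>> list(df.keys())                                # -> ['a', 'b']
+ # >>> [label for label, column in df.iteritems()]    # -> ['a', 'b']
+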
+ def __len__(self):
+ """Returns length of info axis"""
+ return len(self._info_axis)
+
+ def __contains__(self, key):
+ """True if the key is in the info axis"""
+ return key in self._info_axis
+
+ @property
+ def empty(self):
+ """
+ Indicator whether DataFrame is empty.
+
+ True if DataFrame is entirely empty (no items), meaning any of the
+ axes are of length 0.
+
+ Returns
+ -------
+ bool
+ If DataFrame is empty, return True, if not return False.
+
+ See Also
+ --------
+ pandas.Series.dropna
+ pandas.DataFrame.dropna
+
+ Notes
+ -----
+ If DataFrame contains only NaNs, it is still not considered empty. See
+ the example below.
+
+ Examples
+ --------
+ An example of an actual empty DataFrame. Notice the index is empty:
+
+ >>> df_empty = pd.DataFrame({'A' : []})
+ >>> df_empty
+ Empty DataFrame
+ Columns: [A]
+ Index: []
+ >>> df_empty.empty
+ True
+
+ If we only have NaNs in our DataFrame, it is not considered empty! We
+ will need to drop the NaNs to make the DataFrame empty:
+
+ >>> df = pd.DataFrame({'A' : [np.nan]})
+ >>> df
+ A
+ 0 NaN
+ >>> df.empty
+ False
+ >>> df.dropna().empty
+ True
+ """
+ return any(len(self._get_axis(a)) == 0 for a in self._AXIS_ORDERS)
+
+ # ----------------------------------------------------------------------
+ # Array Interface
+
+ # This is also set in IndexOpsMixin
+ # GH#23114 Ensure ndarray.__op__(DataFrame) returns NotImplemented
+ __array_priority__ = 1000
+
+ def __array__(self, dtype=None):
+ return com.values_from_object(self)
+
+ def __array_wrap__(self, result, context=None):
+ d = self._construct_axes_dict(self._AXIS_ORDERS, copy=False)
+ return self._constructor(result, **d).__finalize__(self)
+
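+ # Added illustration (not upstream pandas code): ``__array__`` hands numpy
+ # the raw values, while ``__array_wrap__`` re-attaches the axes to a ufunc
+ # result. A hedged sketch:
+ #
+ # >>> import numpy as np
+ # >>> import pandas as pd
+ # >>> s = pd.Series([1.0, 4.0], index=['x', 'y'])
+ # >>> np.asarray(s)   # plain ndarray via __array__
+ # >>> np.sqrt(s)      # Series with index ['x', 'y'] via __array_wrap__
+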
+ # ideally we would define this to avoid the getattr checks, but
+ # it is slower
+ # @property
+ # def __array_interface__(self):
+ # """ provide numpy array interface method """
+ # values = self.values
+ # return dict(typestr=values.dtype.str,shape=values.shape,data=values)
+
+ def to_dense(self):
+ """
+ Return dense representation of NDFrame (as opposed to sparse).
+ """
+ # compat
+ return self
+
+ # ----------------------------------------------------------------------
+ # Picklability
+
+ def __getstate__(self):
+ meta = {k: getattr(self, k, None) for k in self._metadata}
+ return dict(_data=self._data, _typ=self._typ, _metadata=self._metadata,
+ **meta)
+
+ def __setstate__(self, state):
+
+ if isinstance(state, BlockManager):
+ self._data = state
+ elif isinstance(state, dict):
+ typ = state.get('_typ')
+ if typ is not None:
+
+ # set in the order of internal names
+ # to avoid definitional recursion
+ # e.g. say fill_value needing _data to be
+ # defined
+ meta = set(self._internal_names + self._metadata)
+ for k in list(meta):
+ if k in state:
+ v = state[k]
+ object.__setattr__(self, k, v)
+
+ for k, v in state.items():
+ if k not in meta:
+ object.__setattr__(self, k, v)
+
+ else:
+ self._unpickle_series_compat(state)
+ elif isinstance(state[0], dict):
+ if len(state) == 5:
+ self._unpickle_sparse_frame_compat(state)
+ else:
+ self._unpickle_frame_compat(state)
+ elif len(state) == 4:
+ self._unpickle_panel_compat(state)
+ elif len(state) == 2:
+ self._unpickle_series_compat(state)
+ else: # pragma: no cover
+ # old pickling format, for compatibility
+ self._unpickle_matrix_compat(state)
+
+ self._item_cache = {}
+
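+ # Added usage note (not upstream pandas code): ``__getstate__`` and
+ # ``__setstate__`` make NDFrame objects picklable; the state is essentially
+ # the BlockManager plus any ``_metadata`` attributes. A hedged round-trip
+ # sketch:
+ #
+ # >>> import pickle
+ # >>> import pandas as pd
+ # >>> df = pd.DataFrame({'a': [1, 2]})
+ # >>> pickle.loads(pickle.dumps(df)).equals(df)   # -> True
+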
+ # ----------------------------------------------------------------------
+ # Rendering Methods
+
+ def __unicode__(self):
+ # unicode representation based upon iterating over self
+ # (since, by definition, `PandasContainers` are iterable)
+ prepr = '[%s]' % ','.join(map(pprint_thing, self))
+ return '%s(%s)' % (self.__class__.__name__, prepr)
+
+ def _repr_latex_(self):
+ """
+ Returns a LaTeX representation for a particular object.
+ Mainly for use with nbconvert (jupyter notebook conversion to pdf).
+ """
+ if config.get_option('display.latex.repr'):
+ return self.to_latex()
+ else:
+ return None
+
+ def _repr_data_resource_(self):
+ """
+ Not a real Jupyter special repr method, but we use the same
+ naming convention.
+ """
+ if config.get_option("display.html.table_schema"):
+ data = self.head(config.get_option('display.max_rows'))
+ payload = json.loads(data.to_json(orient='table'),
+ object_pairs_hook=collections.OrderedDict)
+ return payload
+
+ # ----------------------------------------------------------------------
+ # I/O Methods
+
+ _shared_docs['to_excel'] = """
+ Write %(klass)s to an Excel sheet.
+
+ To write a single %(klass)s to an Excel .xlsx file it is only necessary to
+ specify a target file name. To write to multiple sheets it is necessary to
+ create an `ExcelWriter` object with a target file name, and specify a sheet
+ in the file to write to.
+
+ Multiple sheets may be written to by specifying unique `sheet_name`.
+ With all data written to the file it is necessary to save the changes.
+ Note that creating an `ExcelWriter` object with a file name that already
+ exists will result in the contents of the existing file being erased.
+
+ Parameters
+ ----------
+ excel_writer : str or ExcelWriter object
+ File path or existing ExcelWriter.
+ sheet_name : str, default 'Sheet1'
+ Name of sheet which will contain DataFrame.
+ na_rep : str, default ''
+ Missing data representation.
+ float_format : str, optional
+ Format string for floating point numbers. For example
+ ``float_format="%%.2f"`` will format 0.1234 to 0.12.
+ columns : sequence or list of str, optional
+ Columns to write.
+ header : bool or list of str, default True
+ Write out the column names. If a list of string is given it is
+ assumed to be aliases for the column names.
+ index : bool, default True
+ Write row names (index).
+ index_label : str or sequence, optional
+ Column label for index column(s) if desired. If not specified, and
+ `header` and `index` are True, then the index names are used. A
+ sequence should be given if the DataFrame uses MultiIndex.
+ startrow : int, default 0
+ Upper left cell row to dump data frame.
+ startcol : int, default 0
+ Upper left cell column to dump data frame.
+ engine : str, optional
+ Write engine to use, 'openpyxl' or 'xlsxwriter'. You can also set this
+ via the options ``io.excel.xlsx.writer``, ``io.excel.xls.writer``, and
+ ``io.excel.xlsm.writer``.
+ merge_cells : bool, default True
+ Write MultiIndex and Hierarchical Rows as merged cells.
+ encoding : str, optional
+ Encoding of the resulting excel file. Only necessary for xlwt,
+ other writers support unicode natively.
+ inf_rep : str, default 'inf'
+ Representation for infinity (there is no native representation for
+ infinity in Excel).
+ verbose : bool, default True
+ Display more information in the error logs.
+ freeze_panes : tuple of int (length 2), optional
+ Specifies the one-based bottommost row and rightmost column that
+ is to be frozen.
+
+ .. versionadded:: 0.20.0
+
+ See Also
+ --------
+ to_csv : Write DataFrame to a comma-separated values (csv) file.
+ ExcelWriter : Class for writing DataFrame objects into excel sheets.
+ read_excel : Read an Excel file into a pandas DataFrame.
+ read_csv : Read a comma-separated values (csv) file into DataFrame.
+
+ Notes
+ -----
+ For compatibility with :meth:`~DataFrame.to_csv`,
+ to_excel serializes lists and dicts to strings before writing.
+
+ Once a workbook has been saved it is not possible to write further
+ data without rewriting the whole workbook.
+
+ Examples
+ --------
+
+ Create, write to and save a workbook:
+
+ >>> df1 = pd.DataFrame([['a', 'b'], ['c', 'd']],
+ ... index=['row 1', 'row 2'],
+ ... columns=['col 1', 'col 2'])
+ >>> df1.to_excel("output.xlsx") # doctest: +SKIP
+
+ To specify the sheet name:
+
+ >>> df1.to_excel("output.xlsx",
+ ... sheet_name='Sheet_name_1') # doctest: +SKIP
+
+ If you wish to write to more than one sheet in the workbook, it is
+ necessary to specify an ExcelWriter object:
+
+ >>> df2 = df1.copy()
+ >>> with pd.ExcelWriter('output.xlsx') as writer: # doctest: +SKIP
+ ... df1.to_excel(writer, sheet_name='Sheet_name_1')
+ ... df2.to_excel(writer, sheet_name='Sheet_name_2')
+
+ To set the library that is used to write the Excel file,
+ you can pass the `engine` keyword (the default engine is
+ automatically chosen depending on the file extension):
+
+ >>> df1.to_excel('output1.xlsx', engine='xlsxwriter') # doctest: +SKIP
+ """
+
+ @Appender(_shared_docs["to_excel"] % dict(klass="object"))
+ def to_excel(self, excel_writer, sheet_name="Sheet1", na_rep="",
+ float_format=None, columns=None, header=True, index=True,
+ index_label=None, startrow=0, startcol=0, engine=None,
+ merge_cells=True, encoding=None, inf_rep="inf", verbose=True,
+ freeze_panes=None):
+ df = self if isinstance(self, ABCDataFrame) else self.to_frame()
+
+ from pandas.io.formats.excel import ExcelFormatter
+ formatter = ExcelFormatter(df, na_rep=na_rep, cols=columns,
+ header=header,
+ float_format=float_format, index=index,
+ index_label=index_label,
+ merge_cells=merge_cells,
+ inf_rep=inf_rep)
+ formatter.write(excel_writer, sheet_name=sheet_name, startrow=startrow,
+ startcol=startcol, freeze_panes=freeze_panes,
+ engine=engine)
+
+ def to_json(self, path_or_buf=None, orient=None, date_format=None,
+ double_precision=10, force_ascii=True, date_unit='ms',
+ default_handler=None, lines=False, compression='infer',
+ index=True):
+ """
+ Convert the object to a JSON string.
+
+ Note NaN's and None will be converted to null and datetime objects
+ will be converted to UNIX timestamps.
+
+ Parameters
+ ----------
+ path_or_buf : string or file handle, optional
+ File path or object. If not specified, the result is returned as
+ a string.
+ orient : string
+ Indication of expected JSON string format.
+
+ * Series
+
+ - default is 'index'
+ - allowed values are: {'split','records','index','table'}
+
+ * DataFrame
+
+ - default is 'columns'
+ - allowed values are:
+ {'split','records','index','columns','values','table'}
+
+ * The format of the JSON string
+
+ - 'split' : dict like {'index' -> [index],
+ 'columns' -> [columns], 'data' -> [values]}
+ - 'records' : list like
+ [{column -> value}, ... , {column -> value}]
+ - 'index' : dict like {index -> {column -> value}}
+ - 'columns' : dict like {column -> {index -> value}}
+ - 'values' : just the values array
+ - 'table' : dict like {'schema': {schema}, 'data': {data}}
+ describing the data, and the data component is
+ like ``orient='records'``.
+
+ .. versionchanged:: 0.20.0
+
+ date_format : {None, 'epoch', 'iso'}
+ Type of date conversion. 'epoch' = epoch milliseconds,
+ 'iso' = ISO8601. The default depends on the `orient`. For
+ ``orient='table'``, the default is 'iso'. For all other orients,
+ the default is 'epoch'.
+ double_precision : int, default 10
+ The number of decimal places to use when encoding
+ floating point values.
+ force_ascii : bool, default True
+ Force encoded string to be ASCII.
+ date_unit : string, default 'ms' (milliseconds)
+ The time unit to encode to, governs timestamp and ISO8601
+ precision. One of 's', 'ms', 'us', 'ns' for second, millisecond,
+ microsecond, and nanosecond respectively.
+ default_handler : callable, default None
+ Handler to call if object cannot otherwise be converted to a
+ suitable format for JSON. Should receive a single argument which is
+ the object to convert and return a serialisable object.
+ lines : bool, default False
+ If 'orient' is 'records', write out line-delimited JSON format. Will
+ throw a ValueError if 'orient' is incorrect, since the other orients
+ are not list-like.
+
+ .. versionadded:: 0.19.0
+
+ compression : {'infer', 'gzip', 'bz2', 'zip', 'xz', None}
+
+ A string representing the compression to use in the output file,
+ only used when the first argument is a filename. By default, the
+ compression is inferred from the filename.
+
+ .. versionadded:: 0.21.0
+ .. versionchanged:: 0.24.0
+ 'infer' option added and set to default
+ index : bool, default True
+ Whether to include the index values in the JSON string. Not
+ including the index (``index=False``) is only supported when
+ orient is 'split' or 'table'.
+
+ .. versionadded:: 0.23.0
+
+ See Also
+ --------
+ read_json
+
+ Examples
+ --------
+
+ >>> df = pd.DataFrame([['a', 'b'], ['c', 'd']],
+ ... index=['row 1', 'row 2'],
+ ... columns=['col 1', 'col 2'])
+ >>> df.to_json(orient='split')
+ '{"columns":["col 1","col 2"],
+ "index":["row 1","row 2"],
+ "data":[["a","b"],["c","d"]]}'
+
+ Encoding/decoding a Dataframe using ``'records'`` formatted JSON.
+ Note that index labels are not preserved with this encoding.
+
+ >>> df.to_json(orient='records')
+ '[{"col 1":"a","col 2":"b"},{"col 1":"c","col 2":"d"}]'
+
+ Encoding/decoding a Dataframe using ``'index'`` formatted JSON:
+
+ >>> df.to_json(orient='index')
+ '{"row 1":{"col 1":"a","col 2":"b"},"row 2":{"col 1":"c","col 2":"d"}}'
+
+ Encoding/decoding a Dataframe using ``'columns'`` formatted JSON:
+
+ >>> df.to_json(orient='columns')
+ '{"col 1":{"row 1":"a","row 2":"c"},"col 2":{"row 1":"b","row 2":"d"}}'
+
+ Encoding/decoding a Dataframe using ``'values'`` formatted JSON:
+
+ >>> df.to_json(orient='values')
+ '[["a","b"],["c","d"]]'
+
+ Encoding with Table Schema
+
+ >>> df.to_json(orient='table')
+ '{"schema": {"fields": [{"name": "index", "type": "string"},
+ {"name": "col 1", "type": "string"},
+ {"name": "col 2", "type": "string"}],
+ "primaryKey": "index",
+ "pandas_version": "0.20.0"},
+ "data": [{"index": "row 1", "col 1": "a", "col 2": "b"},
+ {"index": "row 2", "col 1": "c", "col 2": "d"}]}'
+ """
+
+ from pandas.io import json
+ if date_format is None and orient == 'table':
+ date_format = 'iso'
+ elif date_format is None:
+ date_format = 'epoch'
+ return json.to_json(path_or_buf=path_or_buf, obj=self, orient=orient,
+ date_format=date_format,
+ double_precision=double_precision,
+ force_ascii=force_ascii, date_unit=date_unit,
+ default_handler=default_handler,
+ lines=lines, compression=compression,
+ index=index)
+
+ def to_hdf(self, path_or_buf, key, **kwargs):
+ """
+ Write the contained data to an HDF5 file using HDFStore.
+
+ Hierarchical Data Format (HDF) is self-describing, allowing an
+ application to interpret the structure and contents of a file with
+ no outside information. One HDF file can hold a mix of related objects
+ which can be accessed as a group or as individual objects.
+
+ In order to add another DataFrame or Series to an existing HDF file
+ please use append mode and a different key.
+
+ For more information see the :ref:`user guide <io.hdf5>`.
+
+ Parameters
+ ----------
+ path_or_buf : str or pandas.HDFStore
+ File path or HDFStore object.
+ key : str
+ Identifier for the group in the store.
+ mode : {'a', 'w', 'r+'}, default 'a'
+ Mode to open file:
+
+ - 'w': write, a new file is created (an existing file with
+ the same name would be deleted).
+ - 'a': append, an existing file is opened for reading and
+ writing, and if the file does not exist it is created.
+ - 'r+': similar to 'a', but the file must already exist.
+ format : {'fixed', 'table'}, default 'fixed'
+ Possible values:
+
+ - 'fixed': Fixed format. Fast writing/reading. Not-appendable,
+ nor searchable.
+ - 'table': Table format. Write as a PyTables Table structure
+ which may perform worse but allow more flexible operations
+ like searching / selecting subsets of the data.
+ append : bool, default False
+ For Table formats, append the input data to the existing.
+ data_columns : list of columns or True, optional
+ List of columns to create as indexed data columns for on-disk
+ queries, or True to use all columns. By default only the axes
+ of the object are indexed. See :ref:`io.hdf5-query-data-columns`.
+ Applicable only to format='table'.
+ complevel : {0-9}, optional
+ Specifies a compression level for data.
+ A value of 0 disables compression.
+ complib : {'zlib', 'lzo', 'bzip2', 'blosc'}, default 'zlib'
+ Specifies the compression library to be used.
+ As of v0.20.2 these additional compressors for Blosc are supported
+ (default if no compressor specified: 'blosc:blosclz'):
+ {'blosc:blosclz', 'blosc:lz4', 'blosc:lz4hc', 'blosc:snappy',
+ 'blosc:zlib', 'blosc:zstd'}.
+ Specifying a compression library which is not available issues
+ a ValueError.
+ fletcher32 : bool, default False
+ If applying compression use the fletcher32 checksum.
+ dropna : bool, default False
+ If True, rows that are entirely NaN will not be written to the store.
+ errors : str, default 'strict'
+ Specifies how encoding and decoding errors are to be handled.
+ See the errors argument for :func:`open` for a full list
+ of options.
+
+ See Also
+ --------
+ DataFrame.read_hdf : Read from HDF file.
+ DataFrame.to_parquet : Write a DataFrame to the binary parquet format.
+ DataFrame.to_sql : Write to a sql table.
+ DataFrame.to_feather : Write out feather-format for DataFrames.
+ DataFrame.to_csv : Write out to a csv file.
+
+ Examples
+ --------
+ >>> df = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]},
+ ... index=['a', 'b', 'c'])
+ >>> df.to_hdf('data.h5', key='df', mode='w')
+
+ We can add another object to the same file:
+
+ >>> s = pd.Series([1, 2, 3, 4])
+ >>> s.to_hdf('data.h5', key='s')
+
+ Reading from HDF file:
+
+ >>> pd.read_hdf('data.h5', 'df')
+ A B
+ a 1 4
+ b 2 5
+ c 3 6
+ >>> pd.read_hdf('data.h5', 's')
+ 0 1
+ 1 2
+ 2 3
+ 3 4
+ dtype: int64
+
+ Deleting file with data:
+
+ >>> import os
+ >>> os.remove('data.h5')
+ """
+ from pandas.io import pytables
+ return pytables.to_hdf(path_or_buf, key, self, **kwargs)
+
+ def to_msgpack(self, path_or_buf=None, encoding='utf-8', **kwargs):
+ """
+ Serialize object to input file path using msgpack format.
+
+ THIS IS AN EXPERIMENTAL LIBRARY and the storage format
+ may not be stable until a future release.
+
+ Parameters
+ ----------
+        path_or_buf : str, buffer-like, or None
+            File path or buffer to write to; if None, the packed bytes are
+            returned.
+        encoding : str, default 'utf-8'
+            Encoding for string data.
+        append : bool, default False
+            Whether to append to an existing msgpack.
+        compress : {'zlib', 'blosc'}, optional
+            Type of compressor; default is None (no compression).
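+
+        Examples
+        --------
+        A minimal round-trip sketch (illustrative; assumes
+        ``pd.read_msgpack`` is available in this version):
+
+        >>> df = pd.DataFrame({'a': [1, 2]})
+        >>> packed = df.to_msgpack()  # bytes are returned when path is None
+        >>> pd.read_msgpack(packed)  # doctest: +SKIP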
+ """
+
+ from pandas.io import packers
+ return packers.to_msgpack(path_or_buf, self, encoding=encoding,
+ **kwargs)
+
+ def to_sql(self, name, con, schema=None, if_exists='fail', index=True,
+ index_label=None, chunksize=None, dtype=None, method=None):
+ """
+ Write records stored in a DataFrame to a SQL database.
+
+ Databases supported by SQLAlchemy [1]_ are supported. Tables can be
+ newly created, appended to, or overwritten.
+
+ Parameters
+ ----------
+ name : string
+ Name of SQL table.
+ con : sqlalchemy.engine.Engine or sqlite3.Connection
+ Using SQLAlchemy makes it possible to use any DB supported by that
+ library. Legacy support is provided for sqlite3.Connection objects.
+ schema : string, optional
+ Specify the schema (if database flavor supports this). If None, use
+ default schema.
+ if_exists : {'fail', 'replace', 'append'}, default 'fail'
+ How to behave if the table already exists.
+
+ * fail: Raise a ValueError.
+ * replace: Drop the table before inserting new values.
+ * append: Insert new values to the existing table.
+
+ index : bool, default True
+ Write DataFrame index as a column. Uses `index_label` as the column
+ name in the table.
+ index_label : string or sequence, default None
+ Column label for index column(s). If None is given (default) and
+ `index` is True, then the index names are used.
+ A sequence should be given if the DataFrame uses MultiIndex.
+ chunksize : int, optional
+ Rows will be written in batches of this size at a time. By default,
+ all rows will be written at once.
+ dtype : dict, optional
+ Specifying the datatype for columns. The keys should be the column
+ names and the values should be the SQLAlchemy types or strings for
+ the sqlite3 legacy mode.
+ method : {None, 'multi', callable}, default None
+ Controls the SQL insertion clause used:
+
+ * None : Uses standard SQL ``INSERT`` clause (one per row).
+ * 'multi': Pass multiple values in a single ``INSERT`` clause.
+ * callable with signature ``(pd_table, conn, keys, data_iter)``.
+
+ Details and a sample callable implementation can be found in the
+ section :ref:`insert method <io.sql.method>`.
+
+ .. versionadded:: 0.24.0
+
+ Raises
+ ------
+ ValueError
+ When the table already exists and `if_exists` is 'fail' (the
+ default).
+
+ See Also
+ --------
+ read_sql : Read a DataFrame from a table.
+
+ Notes
+ -----
+ Timezone aware datetime columns will be written as
+ ``Timestamp with timezone`` type with SQLAlchemy if supported by the
+ database. Otherwise, the datetimes will be stored as timezone unaware
+ timestamps local to the original timezone.
+
+ .. versionadded:: 0.24.0
+
+ References
+ ----------
+ .. [1] http://docs.sqlalchemy.org
+ .. [2] https://www.python.org/dev/peps/pep-0249/
+
+ Examples
+ --------
+
+ Create an in-memory SQLite database.
+
+ >>> from sqlalchemy import create_engine
+ >>> engine = create_engine('sqlite://', echo=False)
+
+ Create a table from scratch with 3 rows.
+
+ >>> df = pd.DataFrame({'name' : ['User 1', 'User 2', 'User 3']})
+ >>> df
+ name
+ 0 User 1
+ 1 User 2
+ 2 User 3
+
+ >>> df.to_sql('users', con=engine)
+ >>> engine.execute("SELECT * FROM users").fetchall()
+ [(0, 'User 1'), (1, 'User 2'), (2, 'User 3')]
+
+ >>> df1 = pd.DataFrame({'name' : ['User 4', 'User 5']})
+ >>> df1.to_sql('users', con=engine, if_exists='append')
+ >>> engine.execute("SELECT * FROM users").fetchall()
+ [(0, 'User 1'), (1, 'User 2'), (2, 'User 3'),
+ (0, 'User 4'), (1, 'User 5')]
+
+ Overwrite the table with just ``df1``.
+
+ >>> df1.to_sql('users', con=engine, if_exists='replace',
+ ... index_label='id')
+ >>> engine.execute("SELECT * FROM users").fetchall()
+ [(0, 'User 4'), (1, 'User 5')]
+
+ Specify the dtype (especially useful for integers with missing values).
+ Notice that while pandas is forced to store the data as floating point,
+ the database supports nullable integers. When fetching the data with
+ Python, we get back integer scalars.
+
+ >>> df = pd.DataFrame({"A": [1, None, 2]})
+ >>> df
+ A
+ 0 1.0
+ 1 NaN
+ 2 2.0
+
+ >>> from sqlalchemy.types import Integer
+ >>> df.to_sql('integers', con=engine, index=False,
+ ... dtype={"A": Integer()})
+
+ >>> engine.execute("SELECT * FROM integers").fetchall()
+ [(1,), (None,), (2,)]
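+
+        A custom insertion callable can be passed via ``method``. A minimal
+        sketch (the function name is arbitrary and the body is only
+        illustrative, not the vetted implementation from the linked section):
+
+        >>> def insert_rows(pd_table, conn, keys, data_iter):
+        ...     rows = [dict(zip(keys, row)) for row in data_iter]
+        ...     conn.execute(pd_table.table.insert(), rows)
+        >>> df.to_sql('users_multi', con=engine,
+        ...           method=insert_rows)  # doctest: +SKIP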
+ """
+ from pandas.io import sql
+ sql.to_sql(self, name, con, schema=schema, if_exists=if_exists,
+ index=index, index_label=index_label, chunksize=chunksize,
+ dtype=dtype, method=method)
+
+ def to_pickle(self, path, compression='infer',
+ protocol=pkl.HIGHEST_PROTOCOL):
+ """
+ Pickle (serialize) object to file.
+
+ Parameters
+ ----------
+ path : str
+ File path where the pickled object will be stored.
+ compression : {'infer', 'gzip', 'bz2', 'zip', 'xz', None}, \
+ default 'infer'
+ A string representing the compression to use in the output file. By
+ default, infers from the file extension in specified path.
+
+ .. versionadded:: 0.20.0
+ protocol : int
+ Int which indicates which protocol should be used by the pickler,
+ default HIGHEST_PROTOCOL (see [1]_ paragraph 12.1.2). The possible
+ values for this parameter depend on the version of Python. For
+ Python 2.x, possible values are 0, 1, 2. For Python>=3.0, 3 is a
+ valid value. For Python >= 3.4, 4 is a valid value. A negative
+ value for the protocol parameter is equivalent to setting its value
+ to HIGHEST_PROTOCOL.
+
+ .. [1] https://docs.python.org/3/library/pickle.html
+ .. versionadded:: 0.21.0
+
+ See Also
+ --------
+ read_pickle : Load pickled pandas object (or any object) from file.
+ DataFrame.to_hdf : Write DataFrame to an HDF5 file.
+ DataFrame.to_sql : Write DataFrame to a SQL database.
+ DataFrame.to_parquet : Write a DataFrame to the binary parquet format.
+
+ Examples
+ --------
+ >>> original_df = pd.DataFrame({"foo": range(5), "bar": range(5, 10)})
+ >>> original_df
+ foo bar
+ 0 0 5
+ 1 1 6
+ 2 2 7
+ 3 3 8
+ 4 4 9
+ >>> original_df.to_pickle("./dummy.pkl")
+
+ >>> unpickled_df = pd.read_pickle("./dummy.pkl")
+ >>> unpickled_df
+ foo bar
+ 0 0 5
+ 1 1 6
+ 2 2 7
+ 3 3 8
+ 4 4 9
+
+ >>> import os
+ >>> os.remove("./dummy.pkl")
+ """
+ from pandas.io.pickle import to_pickle
+ return to_pickle(self, path, compression=compression,
+ protocol=protocol)
+
+ def to_clipboard(self, excel=True, sep=None, **kwargs):
+ r"""
+ Copy object to the system clipboard.
+
+ Write a text representation of object to the system clipboard.
+ This can be pasted into Excel, for example.
+
+ Parameters
+ ----------
+ excel : bool, default True
+ - True, use the provided separator, writing in a csv format for
+ allowing easy pasting into excel.
+ - False, write a string representation of the object to the
+ clipboard.
+
+ sep : str, default ``'\t'``
+ Field delimiter.
+ **kwargs
+ These parameters will be passed to DataFrame.to_csv.
+
+ See Also
+ --------
+ DataFrame.to_csv : Write a DataFrame to a comma-separated values
+ (csv) file.
+ read_clipboard : Read text from clipboard and pass to read_table.
+
+ Notes
+ -----
+ Requirements for your platform.
+
+ - Linux : `xclip`, or `xsel` (with `gtk` or `PyQt4` modules)
+ - Windows : none
+ - OS X : none
+
+ Examples
+ --------
+ Copy the contents of a DataFrame to the clipboard.
+
+ >>> df = pd.DataFrame([[1, 2, 3], [4, 5, 6]], columns=['A', 'B', 'C'])
+ >>> df.to_clipboard(sep=',')
+ ... # Wrote the following to the system clipboard:
+ ... # ,A,B,C
+ ... # 0,1,2,3
+ ... # 1,4,5,6
+
+        We can omit the index by passing the keyword `index` and setting
+        it to False.
+
+ >>> df.to_clipboard(sep=',', index=False)
+ ... # Wrote the following to the system clipboard:
+ ... # A,B,C
+ ... # 1,2,3
+ ... # 4,5,6
+ """
+ from pandas.io import clipboards
+ clipboards.to_clipboard(self, excel=excel, sep=sep, **kwargs)
+
+ def to_xarray(self):
+ """
+ Return an xarray object from the pandas object.
+
+ Returns
+ -------
+ xarray.DataArray or xarray.Dataset
+ Data in the pandas structure converted to Dataset if the object is
+ a DataFrame, or a DataArray if the object is a Series.
+
+ See Also
+ --------
+ DataFrame.to_hdf : Write DataFrame to an HDF5 file.
+ DataFrame.to_parquet : Write a DataFrame to the binary parquet format.
+
+ Notes
+ -----
+ See the `xarray docs <http://xarray.pydata.org/en/stable/>`__
+
+ Examples
+ --------
+ >>> df = pd.DataFrame([('falcon', 'bird', 389.0, 2),
+ ... ('parrot', 'bird', 24.0, 2),
+ ... ('lion', 'mammal', 80.5, 4),
+ ... ('monkey', 'mammal', np.nan, 4)],
+ ... columns=['name', 'class', 'max_speed',
+ ... 'num_legs'])
+ >>> df
+ name class max_speed num_legs
+ 0 falcon bird 389.0 2
+ 1 parrot bird 24.0 2
+ 2 lion mammal 80.5 4
+ 3 monkey mammal NaN 4
+
+ >>> df.to_xarray()
+ <xarray.Dataset>
+ Dimensions: (index: 4)
+ Coordinates:
+ * index (index) int64 0 1 2 3
+ Data variables:
+ name (index) object 'falcon' 'parrot' 'lion' 'monkey'
+ class (index) object 'bird' 'bird' 'mammal' 'mammal'
+ max_speed (index) float64 389.0 24.0 80.5 nan
+ num_legs (index) int64 2 2 4 4
+
+ >>> df['max_speed'].to_xarray()
+ <xarray.DataArray 'max_speed' (index: 4)>
+ array([389. , 24. , 80.5, nan])
+ Coordinates:
+ * index (index) int64 0 1 2 3
+
+ >>> dates = pd.to_datetime(['2018-01-01', '2018-01-01',
+ ... '2018-01-02', '2018-01-02'])
+ >>> df_multiindex = pd.DataFrame({'date': dates,
+ ... 'animal': ['falcon', 'parrot', 'falcon',
+ ... 'parrot'],
+ ... 'speed': [350, 18, 361, 15]}).set_index(['date',
+ ... 'animal'])
+ >>> df_multiindex
+ speed
+ date animal
+ 2018-01-01 falcon 350
+ parrot 18
+ 2018-01-02 falcon 361
+ parrot 15
+
+ >>> df_multiindex.to_xarray()
+ <xarray.Dataset>
+ Dimensions: (animal: 2, date: 2)
+ Coordinates:
+ * date (date) datetime64[ns] 2018-01-01 2018-01-02
+ * animal (animal) object 'falcon' 'parrot'
+ Data variables:
+ speed (date, animal) int64 350 18 361 15
+ """
+
+ try:
+ import xarray
+ except ImportError:
+ # Give a nice error message
+ raise ImportError("the xarray library is not installed\n"
+ "you can install via conda\n"
+ "conda install xarray\n"
+ "or via pip\n"
+ "pip install xarray\n")
+
+ if self.ndim == 1:
+ return xarray.DataArray.from_series(self)
+ elif self.ndim == 2:
+ return xarray.Dataset.from_dataframe(self)
+
+ # > 2 dims
+ coords = [(a, self._get_axis(a)) for a in self._AXIS_ORDERS]
+ return xarray.DataArray(self,
+ coords=coords,
+ )
+
+ def to_latex(self, buf=None, columns=None, col_space=None, header=True,
+ index=True, na_rep='NaN', formatters=None, float_format=None,
+ sparsify=None, index_names=True, bold_rows=False,
+ column_format=None, longtable=None, escape=None,
+ encoding=None, decimal='.', multicolumn=None,
+ multicolumn_format=None, multirow=None):
+ r"""
+ Render an object to a LaTeX tabular environment table.
+
+ Render an object to a tabular environment table. You can splice
+ this into a LaTeX document. Requires \usepackage{booktabs}.
+
+ .. versionchanged:: 0.20.2
+ Added to Series
+
+ Parameters
+ ----------
+ buf : file descriptor or None
+ Buffer to write to. If None, the output is returned as a string.
+ columns : list of label, optional
+ The subset of columns to write. Writes all columns by default.
+ col_space : int, optional
+ The minimum width of each column.
+ header : bool or list of str, default True
+ Write out the column names. If a list of strings is given,
+ it is assumed to be aliases for the column names.
+ index : bool, default True
+ Write row names (index).
+ na_rep : str, default 'NaN'
+ Missing data representation.
+ formatters : list of functions or dict of {str: function}, optional
+ Formatter functions to apply to columns' elements by position or
+ name. The result of each function must be a unicode string.
+ List must be of length equal to the number of columns.
+ float_format : str, optional
+ Format string for floating point numbers.
+ sparsify : bool, optional
+ Set to False for a DataFrame with a hierarchical index to print
+ every multiindex key at each row. By default, the value will be
+ read from the config module.
+ index_names : bool, default True
+ Prints the names of the indexes.
+ bold_rows : bool, default False
+ Make the row labels bold in the output.
+ column_format : str, optional
+ The columns format as specified in `LaTeX table format
+ <https://en.wikibooks.org/wiki/LaTeX/Tables>`__ e.g. 'rcl' for 3
+ columns. By default, 'l' will be used for all columns except
+ columns of numbers, which default to 'r'.
+ longtable : bool, optional
+ By default, the value will be read from the pandas config
+ module. Use a longtable environment instead of tabular. Requires
+ adding a \usepackage{longtable} to your LaTeX preamble.
+ escape : bool, optional
+ By default, the value will be read from the pandas config
+            module. When set to False, prevents escaping of LaTeX special
+            characters in column names.
+ encoding : str, optional
+ A string representing the encoding to use in the output file,
+ defaults to 'ascii' on Python 2 and 'utf-8' on Python 3.
+ decimal : str, default '.'
+            Character recognized as decimal separator, e.g. ',' in Europe.
+
+            .. versionadded:: 0.18.0
+        multicolumn : bool, default True
+            Use \multicolumn to enhance MultiIndex columns.
+            The default will be read from the config module.
+
+            .. versionadded:: 0.20.0
+        multicolumn_format : str, default 'l'
+            The alignment for multicolumns, similar to `column_format`.
+            The default will be read from the config module.
+
+            .. versionadded:: 0.20.0
+        multirow : bool, default False
+            Use \multirow to enhance MultiIndex rows. Requires adding a
+            \usepackage{multirow} to your LaTeX preamble. Will print
+            centered labels (instead of top-aligned) across the contained
+            rows, separating groups via clines. The default will be read
+            from the pandas config module.
+
+            .. versionadded:: 0.20.0
+
+ Returns
+ -------
+ str or None
+            If buf is None, returns the resulting LaTeX format as a
+ string. Otherwise returns None.
+
+ See Also
+ --------
+ DataFrame.to_string : Render a DataFrame to a console-friendly
+ tabular output.
+ DataFrame.to_html : Render a DataFrame as an HTML table.
+
+ Examples
+ --------
+ >>> df = pd.DataFrame({'name': ['Raphael', 'Donatello'],
+ ... 'mask': ['red', 'purple'],
+ ... 'weapon': ['sai', 'bo staff']})
+ >>> df.to_latex(index=False) # doctest: +NORMALIZE_WHITESPACE
+ '\\begin{tabular}{lll}\n\\toprule\n name & mask & weapon
+ \\\\\n\\midrule\n Raphael & red & sai \\\\\n Donatello &
+ purple & bo staff \\\\\n\\bottomrule\n\\end{tabular}\n'
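+
+        To write to a file instead of returning a string, pass an open file
+        handle as ``buf`` (an illustrative sketch; the file name is
+        arbitrary):
+
+        >>> with open('table.tex', 'w') as fh:
+        ...     df.to_latex(buf=fh, index=False)  # doctest: +SKIP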
+ """
+ # Get defaults from the pandas config
+ if self.ndim == 1:
+ self = self.to_frame()
+ if longtable is None:
+ longtable = config.get_option("display.latex.longtable")
+ if escape is None:
+ escape = config.get_option("display.latex.escape")
+ if multicolumn is None:
+ multicolumn = config.get_option("display.latex.multicolumn")
+ if multicolumn_format is None:
+ multicolumn_format = config.get_option(
+ "display.latex.multicolumn_format")
+ if multirow is None:
+ multirow = config.get_option("display.latex.multirow")
+
+ formatter = DataFrameFormatter(self, buf=buf, columns=columns,
+ col_space=col_space, na_rep=na_rep,
+ header=header, index=index,
+ formatters=formatters,
+ float_format=float_format,
+ bold_rows=bold_rows,
+ sparsify=sparsify,
+ index_names=index_names,
+ escape=escape, decimal=decimal)
+ formatter.to_latex(column_format=column_format, longtable=longtable,
+ encoding=encoding, multicolumn=multicolumn,
+ multicolumn_format=multicolumn_format,
+ multirow=multirow)
+
+ if buf is None:
+ return formatter.buf.getvalue()
+
+ def to_csv(self, path_or_buf=None, sep=",", na_rep='', float_format=None,
+ columns=None, header=True, index=True, index_label=None,
+ mode='w', encoding=None, compression='infer', quoting=None,
+ quotechar='"', line_terminator=None, chunksize=None,
+ tupleize_cols=None, date_format=None, doublequote=True,
+ escapechar=None, decimal='.'):
+ r"""
+ Write object to a comma-separated values (csv) file.
+
+ .. versionchanged:: 0.24.0
+ The order of arguments for Series was changed.
+
+ Parameters
+ ----------
+ path_or_buf : str or file handle, default None
+ File path or object, if None is provided the result is returned as
+ a string. If a file object is passed it should be opened with
+ `newline=''`, disabling universal newlines.
+
+ .. versionchanged:: 0.24.0
+
+ Was previously named "path" for Series.
+
+ sep : str, default ','
+ String of length 1. Field delimiter for the output file.
+ na_rep : str, default ''
+ Missing data representation.
+ float_format : str, default None
+ Format string for floating point numbers.
+ columns : sequence, optional
+ Columns to write.
+ header : bool or list of str, default True
+ Write out the column names. If a list of strings is given it is
+ assumed to be aliases for the column names.
+
+ .. versionchanged:: 0.24.0
+
+ Previously defaulted to False for Series.
+
+ index : bool, default True
+ Write row names (index).
+ index_label : str or sequence, or False, default None
+ Column label for index column(s) if desired. If None is given, and
+ `header` and `index` are True, then the index names are used. A
+ sequence should be given if the object uses MultiIndex. If
+ False do not print fields for index names. Use index_label=False
+ for easier importing in R.
+ mode : str
+ Python write mode, default 'w'.
+ encoding : str, optional
+ A string representing the encoding to use in the output file,
+ defaults to 'ascii' on Python 2 and 'utf-8' on Python 3.
+ compression : str, default 'infer'
+ Compression mode among the following possible values: {'infer',
+ 'gzip', 'bz2', 'zip', 'xz', None}. If 'infer' and `path_or_buf`
+ is path-like, then detect compression from the following
+ extensions: '.gz', '.bz2', '.zip' or '.xz'. (otherwise no
+ compression).
+
+ .. versionchanged:: 0.24.0
+
+ 'infer' option added and set to default.
+
+ quoting : optional constant from csv module
+ Defaults to csv.QUOTE_MINIMAL. If you have set a `float_format`
+ then floats are converted to strings and thus csv.QUOTE_NONNUMERIC
+ will treat them as non-numeric.
+ quotechar : str, default '\"'
+ String of length 1. Character used to quote fields.
+ line_terminator : string, optional
+ The newline character or character sequence to use in the output
+ file. Defaults to `os.linesep`, which depends on the OS in which
+            this method is called (e.g. '\n' for Linux, '\r\n' for Windows).
+
+ .. versionchanged:: 0.24.0
+ chunksize : int or None
+ Rows to write at a time.
+ tupleize_cols : bool, default False
+ Write MultiIndex columns as a list of tuples (if True) or in
+ the new, expanded format, where each MultiIndex column is a row
+ in the CSV (if False).
+
+ .. deprecated:: 0.21.0
+ This argument will be removed and will always write each row
+ of the multi-index as a separate row in the CSV file.
+ date_format : str, default None
+ Format string for datetime objects.
+ doublequote : bool, default True
+ Control quoting of `quotechar` inside a field.
+ escapechar : str, default None
+ String of length 1. Character used to escape `sep` and `quotechar`
+ when appropriate.
+ decimal : str, default '.'
+ Character recognized as decimal separator. E.g. use ',' for
+ European data.
+
+ Returns
+ -------
+ None or str
+ If path_or_buf is None, returns the resulting csv format as a
+ string. Otherwise returns None.
+
+ See Also
+ --------
+ read_csv : Load a CSV file into a DataFrame.
+        to_excel : Write object to an Excel sheet.
+
+ Examples
+ --------
+ >>> df = pd.DataFrame({'name': ['Raphael', 'Donatello'],
+ ... 'mask': ['red', 'purple'],
+ ... 'weapon': ['sai', 'bo staff']})
+ >>> df.to_csv(index=False)
+ 'name,mask,weapon\nRaphael,red,sai\nDonatello,purple,bo staff\n'
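+
+        To write to a file instead, pass a path. With the default
+        ``compression='infer'``, a ``.gz`` extension selects gzip compression
+        (an illustrative sketch; the file name is arbitrary):
+
+        >>> df.to_csv('out.csv.gz', index=False)  # doctest: +SKIP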
+ """
+
+ df = self if isinstance(self, ABCDataFrame) else self.to_frame()
+
+ if tupleize_cols is not None:
+ warnings.warn("The 'tupleize_cols' parameter is deprecated and "
+ "will be removed in a future version",
+ FutureWarning, stacklevel=2)
+ else:
+ tupleize_cols = False
+
+ from pandas.io.formats.csvs import CSVFormatter
+ formatter = CSVFormatter(df, path_or_buf,
+ line_terminator=line_terminator, sep=sep,
+ encoding=encoding,
+ compression=compression, quoting=quoting,
+ na_rep=na_rep, float_format=float_format,
+ cols=columns, header=header, index=index,
+ index_label=index_label, mode=mode,
+ chunksize=chunksize, quotechar=quotechar,
+ tupleize_cols=tupleize_cols,
+ date_format=date_format,
+ doublequote=doublequote,
+ escapechar=escapechar, decimal=decimal)
+ formatter.save()
+
+ if path_or_buf is None:
+ return formatter.path_or_buf.getvalue()
+
+ # ----------------------------------------------------------------------
+ # Fancy Indexing
+
+ @classmethod
+ def _create_indexer(cls, name, indexer):
+ """Create an indexer like _name in the class."""
+ if getattr(cls, name, None) is None:
+ _indexer = functools.partial(indexer, name)
+ setattr(cls, name, property(_indexer, doc=indexer.__doc__))
+
+ def get(self, key, default=None):
+ """
+ Get item from object for given key (DataFrame column, Panel slice,
+ etc.). Returns default value if not found.
+
+ Parameters
+ ----------
+ key : object
+
+ Returns
+ -------
+ value : same type as items contained in object
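+
+        Examples
+        --------
+        A minimal illustration; ``get`` behaves like ``dict.get`` for labels
+        along the info axis:
+
+        >>> df = pd.DataFrame({'A': [1, 2]})
+        >>> df.get('A')
+        0    1
+        1    2
+        Name: A, dtype: int64
+        >>> df.get('B', default='not found')
+        'not found'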
+ """
+ try:
+ return self[key]
+ except (KeyError, ValueError, IndexError):
+ return default
+
+ def __getitem__(self, item):
+ return self._get_item_cache(item)
+
+ def _get_item_cache(self, item):
+ """Return the cached item, item represents a label indexer."""
+ cache = self._item_cache
+ res = cache.get(item)
+ if res is None:
+ values = self._data.get(item)
+ res = self._box_item_values(item, values)
+ cache[item] = res
+ res._set_as_cached(item, self)
+
+ # for a chain
+ res._is_copy = self._is_copy
+ return res
+
+ def _set_as_cached(self, item, cacher):
+ """Set the _cacher attribute on the calling object with a weakref to
+ cacher.
+ """
+ self._cacher = (item, weakref.ref(cacher))
+
+ def _reset_cacher(self):
+ """Reset the cacher."""
+ if hasattr(self, '_cacher'):
+ del self._cacher
+
+ def _iget_item_cache(self, item):
+ """Return the cached item, item represents a positional indexer."""
+ ax = self._info_axis
+ if ax.is_unique:
+ lower = self._get_item_cache(ax[item])
+ else:
+ lower = self._take(item, axis=self._info_axis_number)
+ return lower
+
+ def _box_item_values(self, key, values):
+ raise AbstractMethodError(self)
+
+ def _maybe_cache_changed(self, item, value):
+ """The object has called back to us saying maybe it has changed.
+ """
+ self._data.set(item, value)
+
+ @property
+ def _is_cached(self):
+ """Return boolean indicating if self is cached or not."""
+ return getattr(self, '_cacher', None) is not None
+
+ def _get_cacher(self):
+ """return my cacher or None"""
+ cacher = getattr(self, '_cacher', None)
+ if cacher is not None:
+ cacher = cacher[1]()
+ return cacher
+
+ @property
+ def _is_view(self):
+ """Return boolean indicating if self is view of another array """
+ return self._data.is_view
+
+ def _maybe_update_cacher(self, clear=False, verify_is_copy=True):
+ """
+ See if we need to update our parent cacher if clear, then clear our
+ cache.
+
+ Parameters
+ ----------
+ clear : boolean, default False
+ clear the item cache
+ verify_is_copy : boolean, default True
+ provide is_copy checks
+
+ """
+
+ cacher = getattr(self, '_cacher', None)
+ if cacher is not None:
+ ref = cacher[1]()
+
+            # we are trying to reference a dead referent, hence
+ # a copy
+ if ref is None:
+ del self._cacher
+ else:
+ try:
+ ref._maybe_cache_changed(cacher[0], self)
+ except Exception:
+ pass
+
+ if verify_is_copy:
+ self._check_setitem_copy(stacklevel=5, t='referant')
+
+ if clear:
+ self._clear_item_cache()
+
+ def _clear_item_cache(self, i=None):
+ if i is not None:
+ self._item_cache.pop(i, None)
+ else:
+ self._item_cache.clear()
+
+ def _slice(self, slobj, axis=0, kind=None):
+ """
+ Construct a slice of this container.
+
+ kind parameter is maintained for compatibility with Series slicing.
+ """
+ axis = self._get_block_manager_axis(axis)
+ result = self._constructor(self._data.get_slice(slobj, axis=axis))
+ result = result.__finalize__(self)
+
+ # this could be a view
+ # but only in a single-dtyped view slicable case
+ is_copy = axis != 0 or result._is_view
+ result._set_is_copy(self, copy=is_copy)
+ return result
+
+ def _set_item(self, key, value):
+ self._data.set(key, value)
+ self._clear_item_cache()
+
+ def _set_is_copy(self, ref=None, copy=True):
+ if not copy:
+ self._is_copy = None
+ else:
+ if ref is not None:
+ self._is_copy = weakref.ref(ref)
+ else:
+ self._is_copy = None
+
+ def _check_is_chained_assignment_possible(self):
+ """
+ Check if we are a view, have a cacher, and are of mixed type.
+ If so, then force a setitem_copy check.
+
+ Should be called just near setting a value
+
+        Will return True if we are a view and are cached; in the
+        single-dtype case this means the cacher should be updated following
+        the setting.
+ """
+ if self._is_view and self._is_cached:
+ ref = self._get_cacher()
+ if ref is not None and ref._is_mixed_type:
+ self._check_setitem_copy(stacklevel=4, t='referant',
+ force=True)
+ return True
+ elif self._is_copy:
+ self._check_setitem_copy(stacklevel=4, t='referant')
+ return False
+
+ def _check_setitem_copy(self, stacklevel=4, t='setting', force=False):
+ """
+
+ Parameters
+ ----------
+ stacklevel : integer, default 4
+ the level to show of the stack when the error is output
+ t : string, the type of setting error
+ force : boolean, default False
+ if True, then force showing an error
+
+        Validate if we are doing a setitem on a chained copy.
+
+        If you call this function, be sure to set the stacklevel such that the
+        user will see the error *at the level of setting*.
+
+        It is technically possible to figure out that we are setting on
+        a copy even WITH a multi-dtyped pandas object. In other words, some
+        blocks may be views while others are not. Currently _is_view will ALWAYS
+ return False for multi-blocks to avoid having to handle this case.
+
+ df = DataFrame(np.arange(0,9), columns=['count'])
+ df['group'] = 'b'
+
+        # This technically need not raise SettingWithCopy if both are views
+        # (which is not generally guaranteed but is usually True). However,
+        # this is in general not a good practice and we recommend using .loc.
+ df.iloc[0:5]['group'] = 'a'
+
+ """
+
+ if force or self._is_copy:
+
+ value = config.get_option('mode.chained_assignment')
+ if value is None:
+ return
+
+ # see if the copy is not actually referred; if so, then dissolve
+ # the copy weakref
+ try:
+ gc.collect(2)
+ if not gc.get_referents(self._is_copy()):
+ self._is_copy = None
+ return
+ except Exception:
+ pass
+
+ # we might be a false positive
+ try:
+ if self._is_copy().shape == self.shape:
+ self._is_copy = None
+ return
+ except Exception:
+ pass
+
+ # a custom message
+ if isinstance(self._is_copy, string_types):
+ t = self._is_copy
+
+ elif t == 'referant':
+ t = ("\n"
+ "A value is trying to be set on a copy of a slice from a "
+ "DataFrame\n\n"
+ "See the caveats in the documentation: "
+ "http://pandas.pydata.org/pandas-docs/stable/"
+ "indexing.html#indexing-view-versus-copy"
+ )
+
+ else:
+ t = ("\n"
+ "A value is trying to be set on a copy of a slice from a "
+ "DataFrame.\n"
+ "Try using .loc[row_indexer,col_indexer] = value "
+ "instead\n\nSee the caveats in the documentation: "
+ "http://pandas.pydata.org/pandas-docs/stable/"
+ "indexing.html#indexing-view-versus-copy"
+ )
+
+ if value == 'raise':
+ raise com.SettingWithCopyError(t)
+ elif value == 'warn':
+ warnings.warn(t, com.SettingWithCopyWarning,
+ stacklevel=stacklevel)
+
+ def __delitem__(self, key):
+ """
+ Delete item
+ """
+ deleted = False
+
+ maybe_shortcut = False
+ if hasattr(self, 'columns') and isinstance(self.columns, MultiIndex):
+ try:
+ maybe_shortcut = key not in self.columns._engine
+ except TypeError:
+ pass
+
+ if maybe_shortcut:
+ # Allow shorthand to delete all columns whose first len(key)
+ # elements match key:
+ if not isinstance(key, tuple):
+ key = (key, )
+ for col in self.columns:
+ if isinstance(col, tuple) and col[:len(key)] == key:
+ del self[col]
+ deleted = True
+ if not deleted:
+ # If the above loop ran and didn't delete anything because
+ # there was no match, this call should raise the appropriate
+ # exception:
+ self._data.delete(key)
+
+ # delete from the caches
+ try:
+ del self._item_cache[key]
+ except KeyError:
+ pass
+
+ def _take(self, indices, axis=0, is_copy=True):
+ """
+ Return the elements in the given *positional* indices along an axis.
+
+ This means that we are not indexing according to actual values in
+ the index attribute of the object. We are indexing according to the
+ actual position of the element in the object.
+
+ This is the internal version of ``.take()`` and will contain a wider
+ selection of parameters useful for internal use but not as suitable
+ for public usage.
+
+ Parameters
+ ----------
+ indices : array-like
+ An array of ints indicating which positions to take.
+ axis : int, default 0
+ The axis on which to select elements. "0" means that we are
+ selecting rows, "1" means that we are selecting columns, etc.
+ is_copy : bool, default True
+ Whether to return a copy of the original object or not.
+
+ Returns
+ -------
+ taken : same type as caller
+ An array-like containing the elements taken from the object.
+
+ See Also
+ --------
+ numpy.ndarray.take
+ numpy.take
+ """
+ self._consolidate_inplace()
+
+ new_data = self._data.take(indices,
+ axis=self._get_block_manager_axis(axis),
+ verify=True)
+ result = self._constructor(new_data).__finalize__(self)
+
+ # Maybe set copy if we didn't actually change the index.
+ if is_copy:
+ if not result._get_axis(axis).equals(self._get_axis(axis)):
+ result._set_is_copy(self)
+
+ return result
+
+ def take(self, indices, axis=0, convert=None, is_copy=True, **kwargs):
+ """
+ Return the elements in the given *positional* indices along an axis.
+
+ This means that we are not indexing according to actual values in
+ the index attribute of the object. We are indexing according to the
+ actual position of the element in the object.
+
+ Parameters
+ ----------
+ indices : array-like
+ An array of ints indicating which positions to take.
+ axis : {0 or 'index', 1 or 'columns', None}, default 0
+ The axis on which to select elements. ``0`` means that we are
+ selecting rows, ``1`` means that we are selecting columns.
+ convert : bool, default True
+ Whether to convert negative indices into positive ones.
+ For example, ``-1`` would map to the ``len(axis) - 1``.
+ The conversions are similar to the behavior of indexing a
+ regular Python list.
+
+ .. deprecated:: 0.21.0
+ In the future, negative indices will always be converted.
+
+ is_copy : bool, default True
+ Whether to return a copy of the original object or not.
+ **kwargs
+ For compatibility with :meth:`numpy.take`. Has no effect on the
+ output.
+
+ Returns
+ -------
+ taken : same type as caller
+ An array-like containing the elements taken from the object.
+
+ See Also
+ --------
+ DataFrame.loc : Select a subset of a DataFrame by labels.
+ DataFrame.iloc : Select a subset of a DataFrame by positions.
+ numpy.take : Take elements from an array along an axis.
+
+ Examples
+ --------
+ >>> df = pd.DataFrame([('falcon', 'bird', 389.0),
+ ... ('parrot', 'bird', 24.0),
+ ... ('lion', 'mammal', 80.5),
+ ... ('monkey', 'mammal', np.nan)],
+ ... columns=['name', 'class', 'max_speed'],
+ ... index=[0, 2, 3, 1])
+ >>> df
+ name class max_speed
+ 0 falcon bird 389.0
+ 2 parrot bird 24.0
+ 3 lion mammal 80.5
+ 1 monkey mammal NaN
+
+ Take elements at positions 0 and 3 along the axis 0 (default).
+
+ Note how the actual indices selected (0 and 1) do not correspond to
+ our selected indices 0 and 3. That's because we are selecting the 0th
+ and 3rd rows, not rows whose indices equal 0 and 3.
+
+ >>> df.take([0, 3])
+ name class max_speed
+ 0 falcon bird 389.0
+ 1 monkey mammal NaN
+
+ Take elements at indices 1 and 2 along the axis 1 (column selection).
+
+ >>> df.take([1, 2], axis=1)
+ class max_speed
+ 0 bird 389.0
+ 2 bird 24.0
+ 3 mammal 80.5
+ 1 mammal NaN
+
+ We may take elements using negative integers for positive indices,
+ starting from the end of the object, just like with Python lists.
+
+ >>> df.take([-1, -2])
+ name class max_speed
+ 1 monkey mammal NaN
+ 3 lion mammal 80.5
+ """
+ if convert is not None:
+ msg = ("The 'convert' parameter is deprecated "
+ "and will be removed in a future version.")
+ warnings.warn(msg, FutureWarning, stacklevel=2)
+
+ nv.validate_take(tuple(), kwargs)
+ return self._take(indices, axis=axis, is_copy=is_copy)
+
+ def xs(self, key, axis=0, level=None, drop_level=True):
+ """
+ Return cross-section from the Series/DataFrame.
+
+ This method takes a `key` argument to select data at a particular
+ level of a MultiIndex.
+
+ Parameters
+ ----------
+ key : label or tuple of label
+ Label contained in the index, or partially in a MultiIndex.
+ axis : {0 or 'index', 1 or 'columns'}, default 0
+ Axis to retrieve cross-section on.
+ level : object, defaults to first n levels (n=1 or len(key))
+ In case of a key partially contained in a MultiIndex, indicate
+ which levels are used. Levels can be referred by label or position.
+ drop_level : bool, default True
+ If False, returns object with same levels as self.
+
+ Returns
+ -------
+ Series or DataFrame
+ Cross-section from the original Series or DataFrame
+ corresponding to the selected index levels.
+
+ See Also
+ --------
+ DataFrame.loc : Access a group of rows and columns
+ by label(s) or a boolean array.
+ DataFrame.iloc : Purely integer-location based indexing
+ for selection by position.
+
+ Notes
+ -----
+ `xs` can not be used to set values.
+
+ MultiIndex Slicers is a generic way to get/set values on
+ any level or levels.
+ It is a superset of `xs` functionality, see
+ :ref:`MultiIndex Slicers <advanced.mi_slicers>`.
+
+ Examples
+ --------
+ >>> d = {'num_legs': [4, 4, 2, 2],
+ ... 'num_wings': [0, 0, 2, 2],
+ ... 'class': ['mammal', 'mammal', 'mammal', 'bird'],
+ ... 'animal': ['cat', 'dog', 'bat', 'penguin'],
+ ... 'locomotion': ['walks', 'walks', 'flies', 'walks']}
+ >>> df = pd.DataFrame(data=d)
+ >>> df = df.set_index(['class', 'animal', 'locomotion'])
+ >>> df
+ num_legs num_wings
+ class animal locomotion
+ mammal cat walks 4 0
+ dog walks 4 0
+ bat flies 2 2
+ bird penguin walks 2 2
+
+ Get values at specified index
+
+ >>> df.xs('mammal')
+ num_legs num_wings
+ animal locomotion
+ cat walks 4 0
+ dog walks 4 0
+ bat flies 2 2
+
+ Get values at several indexes
+
+ >>> df.xs(('mammal', 'dog'))
+ num_legs num_wings
+ locomotion
+ walks 4 0
+
+ Get values at specified index and level
+
+ >>> df.xs('cat', level=1)
+ num_legs num_wings
+ class locomotion
+ mammal walks 4 0
+
+ Get values at several indexes and levels
+
+ >>> df.xs(('bird', 'walks'),
+ ... level=[0, 'locomotion'])
+ num_legs num_wings
+ animal
+ penguin 2 2
+
+ Get values at specified column and axis
+
+ >>> df.xs('num_wings', axis=1)
+ class animal locomotion
+ mammal cat walks 0
+ dog walks 0
+ bat flies 2
+ bird penguin walks 2
+ Name: num_wings, dtype: int64
+ """
+ axis = self._get_axis_number(axis)
+ labels = self._get_axis(axis)
+ if level is not None:
+ loc, new_ax = labels.get_loc_level(key, level=level,
+ drop_level=drop_level)
+
+ # create the tuple of the indexer
+ indexer = [slice(None)] * self.ndim
+ indexer[axis] = loc
+ indexer = tuple(indexer)
+
+ result = self.iloc[indexer]
+ setattr(result, result._get_axis_name(axis), new_ax)
+ return result
+
+ if axis == 1:
+ return self[key]
+
+ self._consolidate_inplace()
+
+ index = self.index
+ if isinstance(index, MultiIndex):
+ loc, new_index = self.index.get_loc_level(key,
+ drop_level=drop_level)
+ else:
+ loc = self.index.get_loc(key)
+
+ if isinstance(loc, np.ndarray):
+ if loc.dtype == np.bool_:
+ inds, = loc.nonzero()
+ return self._take(inds, axis=axis)
+ else:
+ return self._take(loc, axis=axis)
+
+ if not is_scalar(loc):
+ new_index = self.index[loc]
+
+ if is_scalar(loc):
+ new_values = self._data.fast_xs(loc)
+
+ # may need to box a datelike-scalar
+ #
+ # if we encounter an array-like and we only have 1 dim
+            # that means that there are lists/ndarrays inside the Series!
+ # so just return them (GH 6394)
+ if not is_list_like(new_values) or self.ndim == 1:
+ return com.maybe_box_datetimelike(new_values)
+
+ result = self._constructor_sliced(
+ new_values, index=self.columns,
+ name=self.index[loc], dtype=new_values.dtype)
+
+ else:
+ result = self.iloc[loc]
+ result.index = new_index
+
+ # this could be a view
+ # but only in a single-dtyped view slicable case
+ result._set_is_copy(self, copy=not result._is_view)
+ return result
+
+ _xs = xs
+
+ def select(self, crit, axis=0):
+ """
+ Return data corresponding to axis labels matching criteria.
+
+ .. deprecated:: 0.21.0
+ Use df.loc[df.index.map(crit)] to select via labels
+
+ Parameters
+ ----------
+ crit : function
+ To be called on each index (label). Should return True or False
+ axis : int
+
+ Returns
+ -------
+ selection : same type as caller
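+
+        Examples
+        --------
+        The recommended, non-deprecated equivalent (an illustrative sketch):
+
+        >>> df = pd.DataFrame({'A': [1, 2, 3]}, index=['a', 'b', 'c'])
+        >>> crit = lambda label: label != 'a'
+        >>> df.loc[df.index.map(crit)]  # doctest: +SKIP
+           A
+        b  2
+        c  3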
+ """
+ warnings.warn("'select' is deprecated and will be removed in a "
+ "future release. You can use "
+ ".loc[labels.map(crit)] as a replacement",
+ FutureWarning, stacklevel=2)
+
+ axis = self._get_axis_number(axis)
+ axis_name = self._get_axis_name(axis)
+ axis_values = self._get_axis(axis)
+
+ if len(axis_values) > 0:
+ new_axis = axis_values[
+ np.asarray([bool(crit(label)) for label in axis_values])]
+ else:
+ new_axis = axis_values
+
+ return self.reindex(**{axis_name: new_axis})
+
+ def reindex_like(self, other, method=None, copy=True, limit=None,
+ tolerance=None):
+ """
+ Return an object with matching indices as other object.
+
+ Conform the object to the same index on all axes. Optional
+ filling logic, placing NaN in locations having no value
+ in the previous index. A new object is produced unless the
+ new index is equivalent to the current one and copy=False.
+
+ Parameters
+ ----------
+ other : Object of the same data type
+ Its row and column indices are used to define the new indices
+ of this object.
+ method : {None, 'backfill'/'bfill', 'pad'/'ffill', 'nearest'}
+ Method to use for filling holes in reindexed DataFrame.
+ Please note: this is only applicable to DataFrames/Series with a
+ monotonically increasing/decreasing index.
+
+ * None (default): don't fill gaps
+ * pad / ffill: propagate last valid observation forward to next
+ valid
+ * backfill / bfill: use next valid observation to fill gap
+ * nearest: use nearest valid observations to fill gap
+
+ copy : bool, default True
+ Return a new object, even if the passed indexes are the same.
+ limit : int, default None
+ Maximum number of consecutive labels to fill for inexact matches.
+ tolerance : optional
+ Maximum distance between original and new labels for inexact
+            matches. The values of the index at the matching locations must
+            satisfy the equation ``abs(index[indexer] - target) <= tolerance``.
+
+ Tolerance may be a scalar value, which applies the same tolerance
+ to all values, or list-like, which applies variable tolerance per
+ element. List-like includes list, tuple, array, Series, and must be
+ the same size as the index and its dtype must exactly match the
+ index's type.
+
+ .. versionadded:: 0.21.0 (list-like tolerance)
+
+ Returns
+ -------
+ Series or DataFrame
+ Same type as caller, but with changed indices on each axis.
+
+ See Also
+ --------
+ DataFrame.set_index : Set row labels.
+ DataFrame.reset_index : Remove row labels or move them to new columns.
+ DataFrame.reindex : Change to new indices or expand indices.
+
+ Notes
+ -----
+ Same as calling
+ ``.reindex(index=other.index, columns=other.columns,...)``.
+
+ Examples
+ --------
+ >>> df1 = pd.DataFrame([[24.3, 75.7, 'high'],
+ ... [31, 87.8, 'high'],
+ ... [22, 71.6, 'medium'],
+ ... [35, 95, 'medium']],
+ ... columns=['temp_celsius', 'temp_fahrenheit', 'windspeed'],
+ ... index=pd.date_range(start='2014-02-12',
+ ... end='2014-02-15', freq='D'))
+
+ >>> df1
+ temp_celsius temp_fahrenheit windspeed
+ 2014-02-12 24.3 75.7 high
+ 2014-02-13 31.0 87.8 high
+ 2014-02-14 22.0 71.6 medium
+ 2014-02-15 35.0 95.0 medium
+
+ >>> df2 = pd.DataFrame([[28, 'low'],
+ ... [30, 'low'],
+ ... [35.1, 'medium']],
+ ... columns=['temp_celsius', 'windspeed'],
+ ... index=pd.DatetimeIndex(['2014-02-12', '2014-02-13',
+ ... '2014-02-15']))
+
+ >>> df2
+ temp_celsius windspeed
+ 2014-02-12 28.0 low
+ 2014-02-13 30.0 low
+ 2014-02-15 35.1 medium
+
+ >>> df2.reindex_like(df1)
+ temp_celsius temp_fahrenheit windspeed
+ 2014-02-12 28.0 NaN low
+ 2014-02-13 30.0 NaN low
+ 2014-02-14 NaN NaN NaN
+ 2014-02-15 35.1 NaN medium
+ """
+ d = other._construct_axes_dict(axes=self._AXIS_ORDERS, method=method,
+ copy=copy, limit=limit,
+ tolerance=tolerance)
+
+ return self.reindex(**d)
+
+ def drop(self, labels=None, axis=0, index=None, columns=None, level=None,
+ inplace=False, errors='raise'):
+
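+        # NOTE: the public docstring for ``drop`` lives on the DataFrame and
+        # Series subclasses; this is the shared implementation. Typical calls
+        # (illustrative): df.drop(columns=['col1']) or
+        # df.drop(index=['row1'], errors='ignore').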
+ inplace = validate_bool_kwarg(inplace, 'inplace')
+
+ if labels is not None:
+ if index is not None or columns is not None:
+ raise ValueError("Cannot specify both 'labels' and "
+ "'index'/'columns'")
+ axis_name = self._get_axis_name(axis)
+ axes = {axis_name: labels}
+ elif index is not None or columns is not None:
+ axes, _ = self._construct_axes_from_arguments((index, columns), {})
+ else:
+ raise ValueError("Need to specify at least one of 'labels', "
+ "'index' or 'columns'")
+
+ obj = self
+
+ for axis, labels in axes.items():
+ if labels is not None:
+ obj = obj._drop_axis(labels, axis, level=level, errors=errors)
+
+ if inplace:
+ self._update_inplace(obj)
+ else:
+ return obj
+
+ def _drop_axis(self, labels, axis, level=None, errors='raise'):
+ """
+ Drop labels from specified axis. Used in the ``drop`` method
+ internally.
+
+ Parameters
+ ----------
+ labels : single label or list-like
+ axis : int or axis name
+ level : int or level name, default None
+ For MultiIndex
+ errors : {'ignore', 'raise'}, default 'raise'
+ If 'ignore', suppress error and existing labels are dropped.
+
+ """
+ axis = self._get_axis_number(axis)
+ axis_name = self._get_axis_name(axis)
+ axis = self._get_axis(axis)
+
+ if axis.is_unique:
+ if level is not None:
+ if not isinstance(axis, MultiIndex):
+ raise AssertionError('axis must be a MultiIndex')
+ new_axis = axis.drop(labels, level=level, errors=errors)
+ else:
+ new_axis = axis.drop(labels, errors=errors)
+ result = self.reindex(**{axis_name: new_axis})
+
+ # Case for non-unique axis
+ else:
+ labels = ensure_object(com.index_labels_to_array(labels))
+ if level is not None:
+ if not isinstance(axis, MultiIndex):
+ raise AssertionError('axis must be a MultiIndex')
+ indexer = ~axis.get_level_values(level).isin(labels)
+
+ # GH 18561 MultiIndex.drop should raise if label is absent
+ if errors == 'raise' and indexer.all():
+ raise KeyError('{} not found in axis'.format(labels))
+ else:
+ indexer = ~axis.isin(labels)
+ # Check if label doesn't exist along axis
+ labels_missing = (axis.get_indexer_for(labels) == -1).any()
+ if errors == 'raise' and labels_missing:
+ raise KeyError('{} not found in axis'.format(labels))
+
+ slicer = [slice(None)] * self.ndim
+ slicer[self._get_axis_number(axis_name)] = indexer
+
+ result = self.loc[tuple(slicer)]
+
+ return result
+
+ def _update_inplace(self, result, verify_is_copy=True):
+ """
+ Replace self internals with result.
+
+ Parameters
+ ----------
+ verify_is_copy : boolean, default True
+ provide is_copy checks
+
+ """
+ # NOTE: This does *not* call __finalize__ and that's an explicit
+ # decision that we may revisit in the future.
+
+ self._reset_cache()
+ self._clear_item_cache()
+ self._data = getattr(result, '_data', result)
+ self._maybe_update_cacher(verify_is_copy=verify_is_copy)
+
+ def add_prefix(self, prefix):
+ """
+ Prefix labels with string `prefix`.
+
+ For Series, the row labels are prefixed.
+ For DataFrame, the column labels are prefixed.
+
+ Parameters
+ ----------
+ prefix : str
+ The string to add before each label.
+
+ Returns
+ -------
+ Series or DataFrame
+ New Series or DataFrame with updated labels.
+
+ See Also
+ --------
+ Series.add_suffix: Suffix row labels with string `suffix`.
+ DataFrame.add_suffix: Suffix column labels with string `suffix`.
+
+ Examples
+ --------
+ >>> s = pd.Series([1, 2, 3, 4])
+ >>> s
+ 0 1
+ 1 2
+ 2 3
+ 3 4
+ dtype: int64
+
+ >>> s.add_prefix('item_')
+ item_0 1
+ item_1 2
+ item_2 3
+ item_3 4
+ dtype: int64
+
+ >>> df = pd.DataFrame({'A': [1, 2, 3, 4], 'B': [3, 4, 5, 6]})
+ >>> df
+ A B
+ 0 1 3
+ 1 2 4
+ 2 3 5
+ 3 4 6
+
+ >>> df.add_prefix('col_')
+ col_A col_B
+ 0 1 3
+ 1 2 4
+ 2 3 5
+ 3 4 6
+ """
+ f = functools.partial('{prefix}{}'.format, prefix=prefix)
+
+ mapper = {self._info_axis_name: f}
+ return self.rename(**mapper)
+
+ def add_suffix(self, suffix):
+ """
+ Suffix labels with string `suffix`.
+
+ For Series, the row labels are suffixed.
+ For DataFrame, the column labels are suffixed.
+
+ Parameters
+ ----------
+ suffix : str
+ The string to add after each label.
+
+ Returns
+ -------
+ Series or DataFrame
+ New Series or DataFrame with updated labels.
+
+ See Also
+ --------
+ Series.add_prefix: Prefix row labels with string `prefix`.
+ DataFrame.add_prefix: Prefix column labels with string `prefix`.
+
+ Examples
+ --------
+ >>> s = pd.Series([1, 2, 3, 4])
+ >>> s
+ 0 1
+ 1 2
+ 2 3
+ 3 4
+ dtype: int64
+
+ >>> s.add_suffix('_item')
+ 0_item 1
+ 1_item 2
+ 2_item 3
+ 3_item 4
+ dtype: int64
+
+ >>> df = pd.DataFrame({'A': [1, 2, 3, 4], 'B': [3, 4, 5, 6]})
+ >>> df
+ A B
+ 0 1 3
+ 1 2 4
+ 2 3 5
+ 3 4 6
+
+ >>> df.add_suffix('_col')
+ A_col B_col
+ 0 1 3
+ 1 2 4
+ 2 3 5
+ 3 4 6
+ """
+ f = functools.partial('{}{suffix}'.format, suffix=suffix)
+
+ mapper = {self._info_axis_name: f}
+ return self.rename(**mapper)
+
+ def sort_values(self, by=None, axis=0, ascending=True, inplace=False,
+ kind='quicksort', na_position='last'):
+ """
+ Sort by the values along either axis
+
+ Parameters
+ ----------%(optional_by)s
+ axis : %(axes_single_arg)s, default 0
+ Axis to be sorted
+ ascending : bool or list of bool, default True
+ Sort ascending vs. descending. Specify list for multiple sort
+ orders. If this is a list of bools, must match the length of
+ the by.
+ inplace : bool, default False
+ if True, perform operation in-place
+ kind : {'quicksort', 'mergesort', 'heapsort'}, default 'quicksort'
+            Choice of sorting algorithm. See also :func:`numpy.sort` for more
+ information. `mergesort` is the only stable algorithm. For
+ DataFrames, this option is only applied when sorting on a single
+ column or label.
+ na_position : {'first', 'last'}, default 'last'
+ `first` puts NaNs at the beginning, `last` puts NaNs at the end
+
+ Returns
+ -------
+ sorted_obj : %(klass)s
+
+ Examples
+ --------
+ >>> df = pd.DataFrame({
+ ... 'col1' : ['A', 'A', 'B', np.nan, 'D', 'C'],
+ ... 'col2' : [2, 1, 9, 8, 7, 4],
+ ... 'col3': [0, 1, 9, 4, 2, 3],
+ ... })
+ >>> df
+ col1 col2 col3
+ 0 A 2 0
+ 1 A 1 1
+ 2 B 9 9
+ 3 NaN 8 4
+ 4 D 7 2
+ 5 C 4 3
+
+ Sort by col1
+
+ >>> df.sort_values(by=['col1'])
+ col1 col2 col3
+ 0 A 2 0
+ 1 A 1 1
+ 2 B 9 9
+ 5 C 4 3
+ 4 D 7 2
+ 3 NaN 8 4
+
+ Sort by multiple columns
+
+ >>> df.sort_values(by=['col1', 'col2'])
+ col1 col2 col3
+ 1 A 1 1
+ 0 A 2 0
+ 2 B 9 9
+ 5 C 4 3
+ 4 D 7 2
+ 3 NaN 8 4
+
+ Sort Descending
+
+ >>> df.sort_values(by='col1', ascending=False)
+ col1 col2 col3
+ 4 D 7 2
+ 5 C 4 3
+ 2 B 9 9
+ 0 A 2 0
+ 1 A 1 1
+ 3 NaN 8 4
+
+ Putting NAs first
+
+ >>> df.sort_values(by='col1', ascending=False, na_position='first')
+ col1 col2 col3
+ 3 NaN 8 4
+ 4 D 7 2
+ 5 C 4 3
+ 2 B 9 9
+ 0 A 2 0
+ 1 A 1 1
+ """
+ raise NotImplementedError("sort_values has not been implemented "
+ "on Panel or Panel4D objects.")
+
+ def sort_index(self, axis=0, level=None, ascending=True, inplace=False,
+ kind='quicksort', na_position='last', sort_remaining=True):
+ """
+ Sort object by labels (along an axis)
+
+ Parameters
+ ----------
+ axis : %(axes)s to direct sorting
+ level : int or level name or list of ints or list of level names
+ if not None, sort on values in specified index level(s)
+ ascending : boolean, default True
+ Sort ascending vs. descending
+ inplace : bool, default False
+ if True, perform operation in-place
+ kind : {'quicksort', 'mergesort', 'heapsort'}, default 'quicksort'
+            Choice of sorting algorithm. See also :func:`numpy.sort` for more
+ information. `mergesort` is the only stable algorithm. For
+ DataFrames, this option is only applied when sorting on a single
+ column or label.
+ na_position : {'first', 'last'}, default 'last'
+ `first` puts NaNs at the beginning, `last` puts NaNs at the end.
+ Not implemented for MultiIndex.
+ sort_remaining : bool, default True
+ if true and sorting by level and index is multilevel, sort by other
+ levels too (in order) after sorting by specified level
+
+ Returns
+ -------
+ sorted_obj : %(klass)s
+ """
+ inplace = validate_bool_kwarg(inplace, 'inplace')
+ axis = self._get_axis_number(axis)
+ axis_name = self._get_axis_name(axis)
+ labels = self._get_axis(axis)
+
+ if level is not None:
+ raise NotImplementedError("level is not implemented")
+ if inplace:
+ raise NotImplementedError("inplace is not implemented")
+
+ sort_index = labels.argsort()
+ if not ascending:
+ sort_index = sort_index[::-1]
+
+ new_axis = labels.take(sort_index)
+ return self.reindex(**{axis_name: new_axis})
+
+ def reindex(self, *args, **kwargs):
+ """
+ Conform %(klass)s to new index with optional filling logic, placing
+ NA/NaN in locations having no value in the previous index. A new object
+ is produced unless the new index is equivalent to the current one and
+ ``copy=False``.
+
+ Parameters
+ ----------
+ %(optional_labels)s
+ %(axes)s : array-like, optional
+ New labels / index to conform to, should be specified using
+ keywords. Preferably an Index object to avoid duplicating data
+ %(optional_axis)s
+ method : {None, 'backfill'/'bfill', 'pad'/'ffill', 'nearest'}
+ Method to use for filling holes in reindexed DataFrame.
+ Please note: this is only applicable to DataFrames/Series with a
+ monotonically increasing/decreasing index.
+
+ * None (default): don't fill gaps
+ * pad / ffill: propagate last valid observation forward to next
+ valid
+ * backfill / bfill: use next valid observation to fill gap
+ * nearest: use nearest valid observations to fill gap
+
+ copy : bool, default True
+ Return a new object, even if the passed indexes are the same.
+ level : int or name
+ Broadcast across a level, matching Index values on the
+ passed MultiIndex level.
+ fill_value : scalar, default np.NaN
+ Value to use for missing values. Defaults to NaN, but can be any
+ "compatible" value.
+ limit : int, default None
+ Maximum number of consecutive elements to forward or backward fill.
+ tolerance : optional
+ Maximum distance between original and new labels for inexact
+            matches. The values of the index at the matching locations must
+            satisfy the equation ``abs(index[indexer] - target) <= tolerance``.
+
+ Tolerance may be a scalar value, which applies the same tolerance
+ to all values, or list-like, which applies variable tolerance per
+ element. List-like includes list, tuple, array, Series, and must be
+ the same size as the index and its dtype must exactly match the
+ index's type.
+
+ .. versionadded:: 0.21.0 (list-like tolerance)
+
+ Returns
+ -------
+ %(klass)s with changed index.
+
+ See Also
+ --------
+ DataFrame.set_index : Set row labels.
+ DataFrame.reset_index : Remove row labels or move them to new columns.
+ DataFrame.reindex_like : Change to same indices as other DataFrame.
+
+ Examples
+ --------
+
+ ``DataFrame.reindex`` supports two calling conventions
+
+ * ``(index=index_labels, columns=column_labels, ...)``
+ * ``(labels, axis={'index', 'columns'}, ...)``
+
+ We *highly* recommend using keyword arguments to clarify your
+ intent.
+
+ Create a dataframe with some fictional data.
+
+ >>> index = ['Firefox', 'Chrome', 'Safari', 'IE10', 'Konqueror']
+ >>> df = pd.DataFrame({
+ ... 'http_status': [200,200,404,404,301],
+ ... 'response_time': [0.04, 0.02, 0.07, 0.08, 1.0]},
+ ... index=index)
+ >>> df
+ http_status response_time
+ Firefox 200 0.04
+ Chrome 200 0.02
+ Safari 404 0.07
+ IE10 404 0.08
+ Konqueror 301 1.00
+
+ Create a new index and reindex the dataframe. By default
+ values in the new index that do not have corresponding
+ records in the dataframe are assigned ``NaN``.
+
+ >>> new_index= ['Safari', 'Iceweasel', 'Comodo Dragon', 'IE10',
+ ... 'Chrome']
+ >>> df.reindex(new_index)
+ http_status response_time
+ Safari 404.0 0.07
+ Iceweasel NaN NaN
+ Comodo Dragon NaN NaN
+ IE10 404.0 0.08
+ Chrome 200.0 0.02
+
+ We can fill in the missing values by passing a value to
+ the keyword ``fill_value``. Because the index is not monotonically
+ increasing or decreasing, we cannot use arguments to the keyword
+ ``method`` to fill the ``NaN`` values.
+
+ >>> df.reindex(new_index, fill_value=0)
+ http_status response_time
+ Safari 404 0.07
+ Iceweasel 0 0.00
+ Comodo Dragon 0 0.00
+ IE10 404 0.08
+ Chrome 200 0.02
+
+ >>> df.reindex(new_index, fill_value='missing')
+ http_status response_time
+ Safari 404 0.07
+ Iceweasel missing missing
+ Comodo Dragon missing missing
+ IE10 404 0.08
+ Chrome 200 0.02
+
+ We can also reindex the columns.
+
+ >>> df.reindex(columns=['http_status', 'user_agent'])
+ http_status user_agent
+ Firefox 200 NaN
+ Chrome 200 NaN
+ Safari 404 NaN
+ IE10 404 NaN
+ Konqueror 301 NaN
+
+ Or we can use "axis-style" keyword arguments
+
+ >>> df.reindex(['http_status', 'user_agent'], axis="columns")
+ http_status user_agent
+ Firefox 200 NaN
+ Chrome 200 NaN
+ Safari 404 NaN
+ IE10 404 NaN
+ Konqueror 301 NaN
+
+ To further illustrate the filling functionality in
+ ``reindex``, we will create a dataframe with a
+ monotonically increasing index (for example, a sequence
+ of dates).
+
+ >>> date_index = pd.date_range('1/1/2010', periods=6, freq='D')
+ >>> df2 = pd.DataFrame({"prices": [100, 101, np.nan, 100, 89, 88]},
+ ... index=date_index)
+ >>> df2
+ prices
+ 2010-01-01 100.0
+ 2010-01-02 101.0
+ 2010-01-03 NaN
+ 2010-01-04 100.0
+ 2010-01-05 89.0
+ 2010-01-06 88.0
+
+ Suppose we decide to expand the dataframe to cover a wider
+ date range.
+
+ >>> date_index2 = pd.date_range('12/29/2009', periods=10, freq='D')
+ >>> df2.reindex(date_index2)
+ prices
+ 2009-12-29 NaN
+ 2009-12-30 NaN
+ 2009-12-31 NaN
+ 2010-01-01 100.0
+ 2010-01-02 101.0
+ 2010-01-03 NaN
+ 2010-01-04 100.0
+ 2010-01-05 89.0
+ 2010-01-06 88.0
+ 2010-01-07 NaN
+
+ The index entries that did not have a value in the original data frame
+ (for example, '2009-12-29') are by default filled with ``NaN``.
+ If desired, we can fill in the missing values using one of several
+ options.
+
+ For example, to back-propagate the last valid value to fill the ``NaN``
+ values, pass ``bfill`` as an argument to the ``method`` keyword.
+
+ >>> df2.reindex(date_index2, method='bfill')
+ prices
+ 2009-12-29 100.0
+ 2009-12-30 100.0
+ 2009-12-31 100.0
+ 2010-01-01 100.0
+ 2010-01-02 101.0
+ 2010-01-03 NaN
+ 2010-01-04 100.0
+ 2010-01-05 89.0
+ 2010-01-06 88.0
+ 2010-01-07 NaN
+
+ Please note that the ``NaN`` value present in the original dataframe
+ (at index value 2010-01-03) will not be filled by any of the
+ value propagation schemes. This is because filling while reindexing
+ does not look at dataframe values, but only compares the original and
+ desired indexes. If you do want to fill in the ``NaN`` values present
+ in the original dataframe, use the ``fillna()`` method.
+
+ See the :ref:`user guide <basics.reindexing>` for more.
+ """
+ # TODO: Decide if we care about having different examples for different
+ # kinds
+
+ # construct the args
+ axes, kwargs = self._construct_axes_from_arguments(args, kwargs)
+ method = missing.clean_reindex_fill_method(kwargs.pop('method', None))
+ level = kwargs.pop('level', None)
+ copy = kwargs.pop('copy', True)
+ limit = kwargs.pop('limit', None)
+ tolerance = kwargs.pop('tolerance', None)
+ fill_value = kwargs.pop('fill_value', None)
+
+ # Series.reindex doesn't use / need the axis kwarg
+ # We pop and ignore it here, to make writing Series/Frame generic code
+ # easier
+ kwargs.pop("axis", None)
+
+ if kwargs:
+ raise TypeError('reindex() got an unexpected keyword '
+ 'argument "{0}"'.format(list(kwargs.keys())[0]))
+
+ self._consolidate_inplace()
+
+ # if all requested axes are identical to the existing ones (equal
+ # index values and names), no reindexing is needed; only copy if
+ # indicated
+ if all(self._get_axis(axis).identical(ax)
+ for axis, ax in axes.items() if ax is not None):
+ if copy:
+ return self.copy()
+ return self
+
+ # check if we are a multi reindex
+ if self._needs_reindex_multi(axes, method, level):
+ try:
+ return self._reindex_multi(axes, copy, fill_value)
+ except Exception:
+ pass
+
+ # perform the reindex on the axes
+ return self._reindex_axes(axes, level, limit, tolerance, method,
+ fill_value, copy).__finalize__(self)
+
+ def _reindex_axes(self, axes, level, limit, tolerance, method, fill_value,
+ copy):
+ """Perform the reindex for all the axes."""
+ obj = self
+ for a in self._AXIS_ORDERS:
+ labels = axes[a]
+ if labels is None:
+ continue
+
+ ax = self._get_axis(a)
+ new_index, indexer = ax.reindex(labels, level=level, limit=limit,
+ tolerance=tolerance, method=method)
+
+ axis = self._get_axis_number(a)
+ obj = obj._reindex_with_indexers({axis: [new_index, indexer]},
+ fill_value=fill_value,
+ copy=copy, allow_dups=False)
+
+ return obj
+
+ def _needs_reindex_multi(self, axes, method, level):
+ """Check if we do need a multi reindex."""
+ return ((com.count_not_none(*axes.values()) == self._AXIS_LEN) and
+ method is None and level is None and not self._is_mixed_type)
+
+ def _reindex_multi(self, axes, copy, fill_value):
+ return NotImplemented
+
+ _shared_docs['reindex_axis'] = ("""
+ Conform input object to new index.
+
+ .. deprecated:: 0.21.0
+ Use `reindex` instead.
+
+ By default, places NaN in locations having no value in the
+ previous index. A new object is produced unless the new index
+ is equivalent to the current one and copy=False.
+
+ Parameters
+ ----------
+ labels : array-like
+ New labels / index to conform to. Preferably an Index object to
+ avoid duplicating data.
+ axis : %(axes_single_arg)s
+ Indicate whether to use rows or columns.
+ method : {None, 'backfill'/'bfill', 'pad'/'ffill', 'nearest'}, optional
+ Method to use for filling holes in reindexed DataFrame:
+
+ * default: don't fill gaps.
+ * pad / ffill: propagate last valid observation forward to next
+ valid.
+ * backfill / bfill: use next valid observation to fill gap.
+ * nearest: use nearest valid observations to fill gap.
+
+ level : int or str
+ Broadcast across a level, matching Index values on the
+ passed MultiIndex level.
+ copy : bool, default True
+ Return a new object, even if the passed indexes are the same.
+ limit : int, optional
+ Maximum number of consecutive elements to forward or backward fill.
+ fill_value : float, default NaN
+ Value used to fill in locations having no value in the previous
+ index.
+
+ .. versionadded:: 0.21.0 (list-like tolerance)
+
+ Returns
+ -------
+ %(klass)s
+ Returns a new DataFrame object with new indices, unless the new
+ index is equivalent to the current one and copy=False.
+
+ See Also
+ --------
+ DataFrame.set_index : Set row labels.
+ DataFrame.reset_index : Remove row labels or move them to new columns.
+ DataFrame.reindex : Change to new indices or expand indices.
+ DataFrame.reindex_like : Change to same indices as other DataFrame.
+
+ Examples
+ --------
+ >>> df = pd.DataFrame({'num_legs': [4, 2], 'num_wings': [0, 2]},
+ ... index=['dog', 'hawk'])
+ >>> df
+ num_legs num_wings
+ dog 4 0
+ hawk 2 2
+ >>> df.reindex(['num_wings', 'num_legs', 'num_heads'],
+ ... axis='columns')
+ num_wings num_legs num_heads
+ dog 0 4 NaN
+ hawk 2 2 NaN
+ """)
+
+ @Appender(_shared_docs['reindex_axis'] % _shared_doc_kwargs)
+ def reindex_axis(self, labels, axis=0, method=None, level=None, copy=True,
+ limit=None, fill_value=None):
+ msg = ("'.reindex_axis' is deprecated and will be removed in a future "
+ "version. Use '.reindex' instead.")
+ self._consolidate_inplace()
+
+ axis_name = self._get_axis_name(axis)
+ axis_values = self._get_axis(axis_name)
+ method = missing.clean_reindex_fill_method(method)
+ warnings.warn(msg, FutureWarning, stacklevel=3)
+ new_index, indexer = axis_values.reindex(labels, method, level,
+ limit=limit)
+ return self._reindex_with_indexers({axis: [new_index, indexer]},
+ fill_value=fill_value, copy=copy)
+
+ def _reindex_with_indexers(self, reindexers, fill_value=None, copy=False,
+ allow_dups=False):
+ """allow_dups indicates an internal call here """
+
+ # reindex doing multiple operations on different axes if indicated
+ new_data = self._data
+ for axis in sorted(reindexers.keys()):
+ index, indexer = reindexers[axis]
+ baxis = self._get_block_manager_axis(axis)
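+ # Axis numbers here are user-facing (0=index, 1=columns); the block
+ # manager stores 2-D data transposed, so translate to the manager's
+ # axis numbering before reindexing the blocks.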
+
+ if index is None:
+ continue
+
+ index = ensure_index(index)
+ if indexer is not None:
+ indexer = ensure_int64(indexer)
+
+ # TODO: speed up on homogeneous DataFrame objects
+ new_data = new_data.reindex_indexer(index, indexer, axis=baxis,
+ fill_value=fill_value,
+ allow_dups=allow_dups,
+ copy=copy)
+
+ if copy and new_data is self._data:
+ new_data = new_data.copy()
+
+ return self._constructor(new_data).__finalize__(self)
+
+ def filter(self, items=None, like=None, regex=None, axis=None):
+ """
+ Subset rows or columns of dataframe according to labels in
+ the specified index.
+
+ Note that this routine does not filter a dataframe on its
+ contents. The filter is applied to the labels of the index.
+
+ Parameters
+ ----------
+ items : list-like
+ List of axis labels to restrict to (they need not all be present).
+ like : string
+ Keep axis labels for which ``like in label == True``.
+ regex : string (regular expression)
+ Keep axis labels for which ``re.search(regex, label) == True``.
+ axis : int or string axis name
+ The axis to filter on. By default this is the info axis,
+ 'index' for Series, 'columns' for DataFrame.
+
+ Returns
+ -------
+ same type as input object
+
+ See Also
+ --------
+ DataFrame.loc
+
+ Notes
+ -----
+ The ``items``, ``like``, and ``regex`` parameters are
+ enforced to be mutually exclusive.
+
+ ``axis`` defaults to the info axis that is used when indexing
+ with ``[]``.
+
+ Examples
+ --------
+ >>> df = pd.DataFrame(np.array(([1,2,3], [4,5,6])),
+ ... index=['mouse', 'rabbit'],
+ ... columns=['one', 'two', 'three'])
+
+ >>> # select columns by name
+ >>> df.filter(items=['one', 'three'])
+ one three
+ mouse 1 3
+ rabbit 4 6
+
+ >>> # select columns by regular expression
+ >>> df.filter(regex='e$', axis=1)
+ one three
+ mouse 1 3
+ rabbit 4 6
+
+ >>> # select rows containing 'bbi'
+ >>> df.filter(like='bbi', axis=0)
+ one two three
+ rabbit 4 5 6
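+
+ >>> # when ``axis`` is omitted, the info axis is used (columns for a
+ >>> # DataFrame), so the same ``like`` lookup now matches column names
+ >>> df.filter(like='o')
+ one two
+ mouse 1 2
+ rabbit 4 5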
+ """
+ import re
+
+ nkw = com.count_not_none(items, like, regex)
+ if nkw > 1:
+ raise TypeError('Keyword arguments `items`, `like`, or `regex` '
+ 'are mutually exclusive')
+
+ if axis is None:
+ axis = self._info_axis_name
+ labels = self._get_axis(axis)
+
+ if items is not None:
+ name = self._get_axis_name(axis)
+ return self.reindex(
+ **{name: [r for r in items if r in labels]})
+ elif like:
+ def f(x):
+ return like in to_str(x)
+ values = labels.map(f)
+ return self.loc(axis=axis)[values]
+ elif regex:
+ def f(x):
+ return matcher.search(to_str(x)) is not None
+ matcher = re.compile(regex)
+ values = labels.map(f)
+ return self.loc(axis=axis)[values]
+ else:
+ raise TypeError('Must pass either `items`, `like`, or `regex`')
+
+ def head(self, n=5):
+ """
+ Return the first `n` rows.
+
+ This function returns the first `n` rows for the object based
+ on position. It is useful for quickly testing if your object
+ has the right type of data in it.
+
+ Parameters
+ ----------
+ n : int, default 5
+ Number of rows to select.
+
+ Returns
+ -------
+ obj_head : same type as caller
+ The first `n` rows of the caller object.
+
+ See Also
+ --------
+ DataFrame.tail: Returns the last `n` rows.
+
+ Examples
+ --------
+ >>> df = pd.DataFrame({'animal':['alligator', 'bee', 'falcon', 'lion',
+ ... 'monkey', 'parrot', 'shark', 'whale', 'zebra']})
+ >>> df
+ animal
+ 0 alligator
+ 1 bee
+ 2 falcon
+ 3 lion
+ 4 monkey
+ 5 parrot
+ 6 shark
+ 7 whale
+ 8 zebra
+
+ Viewing the first 5 lines
+
+ >>> df.head()
+ animal
+ 0 alligator
+ 1 bee
+ 2 falcon
+ 3 lion
+ 4 monkey
+
+ Viewing the first `n` lines (three in this case)
+
+ >>> df.head(3)
+ animal
+ 0 alligator
+ 1 bee
+ 2 falcon
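+
+ For negative values of `n`, this returns all rows except the last
+ `n` rows, mirroring the underlying ``iloc[:n]`` slice (shown here as
+ an extra illustration):
+
+ >>> df.head(-3)
+ animal
+ 0 alligator
+ 1 bee
+ 2 falcon
+ 3 lion
+ 4 monkey
+ 5 parrot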
+ """
+
+ return self.iloc[:n]
+
+ def tail(self, n=5):
+ """
+ Return the last `n` rows.
+
+ This function returns last `n` rows from the object based on
+ position. It is useful for quickly verifying data, for example,
+ after sorting or appending rows.
+
+ Parameters
+ ----------
+ n : int, default 5
+ Number of rows to select.
+
+ Returns
+ -------
+ type of caller
+ The last `n` rows of the caller object.
+
+ See Also
+ --------
+ DataFrame.head : The first `n` rows of the caller object.
+
+ Examples
+ --------
+ >>> df = pd.DataFrame({'animal':['alligator', 'bee', 'falcon', 'lion',
+ ... 'monkey', 'parrot', 'shark', 'whale', 'zebra']})
+ >>> df
+ animal
+ 0 alligator
+ 1 bee
+ 2 falcon
+ 3 lion
+ 4 monkey
+ 5 parrot
+ 6 shark
+ 7 whale
+ 8 zebra
+
+ Viewing the last 5 lines
+
+ >>> df.tail()
+ animal
+ 4 monkey
+ 5 parrot
+ 6 shark
+ 7 whale
+ 8 zebra
+
+ Viewing the last `n` lines (three in this case)
+
+ >>> df.tail(3)
+ animal
+ 6 shark
+ 7 whale
+ 8 zebra
+ """
+
+ if n == 0:
+ return self.iloc[0:0]
+ return self.iloc[-n:]
+
+ def sample(self, n=None, frac=None, replace=False, weights=None,
+ random_state=None, axis=None):
+ """
+ Return a random sample of items from an axis of object.
+
+ You can use `random_state` for reproducibility.
+
+ Parameters
+ ----------
+ n : int, optional
+ Number of items from axis to return. Cannot be used with `frac`.
+ Default = 1 if `frac` = None.
+ frac : float, optional
+ Fraction of axis items to return. Cannot be used with `n`.
+ replace : bool, default False
+ Sample with or without replacement.
+ weights : str or ndarray-like, optional
+ Default 'None' results in equal probability weighting.
+ If passed a Series, will align with target object on index. Index
+ values in weights not found in sampled object will be ignored and
+ index values in sampled object not in weights will be assigned
+ weights of zero.
+ If called on a DataFrame, will accept the name of a column
+ when axis = 0.
+ Unless weights are a Series, weights must be the same length as the
+ axis being sampled.
+ If weights do not sum to 1, they will be normalized to sum to 1.
+ Missing values in the weights column will be treated as zero.
+ Infinite values not allowed.
+ random_state : int or numpy.random.RandomState, optional
+ Seed for the random number generator (if int), or numpy RandomState
+ object.
+ axis : int or string, optional
+ Axis to sample. Accepts axis number or name. Default is stat axis
+ for given data type (0 for Series and DataFrames, 1 for Panels).
+
+ Returns
+ -------
+ Series or DataFrame
+ A new object of same type as caller containing `n` items randomly
+ sampled from the caller object.
+
+ See Also
+ --------
+ numpy.random.choice: Generates a random sample from a given 1-D numpy
+ array.
+
+ Examples
+ --------
+ >>> df = pd.DataFrame({'num_legs': [2, 4, 8, 0],
+ ... 'num_wings': [2, 0, 0, 0],
+ ... 'num_specimen_seen': [10, 2, 1, 8]},
+ ... index=['falcon', 'dog', 'spider', 'fish'])
+ >>> df
+ num_legs num_wings num_specimen_seen
+ falcon 2 2 10
+ dog 4 0 2
+ spider 8 0 1
+ fish 0 0 8
+
+ Extract 3 random elements from the ``Series`` ``df['num_legs']``:
+ Note that we use `random_state` to ensure the reproducibility of
+ the examples.
+
+ >>> df['num_legs'].sample(n=3, random_state=1)
+ fish 0
+ spider 8
+ falcon 2
+ Name: num_legs, dtype: int64
+
+ A random 50% sample of the ``DataFrame`` with replacement:
+
+ >>> df.sample(frac=0.5, replace=True, random_state=1)
+ num_legs num_wings num_specimen_seen
+ dog 4 0 2
+ fish 0 0 8
+
+ Using a DataFrame column as weights. Rows with larger value in the
+ `num_specimen_seen` column are more likely to be sampled.
+
+ >>> df.sample(n=2, weights='num_specimen_seen', random_state=1)
+ num_legs num_wings num_specimen_seen
+ falcon 2 2 10
+ fish 0 0 8
+ """
+
+ if axis is None:
+ axis = self._stat_axis_number
+
+ axis = self._get_axis_number(axis)
+ axis_length = self.shape[axis]
+
+ # Process random_state argument
+ rs = com.random_state(random_state)
+
+ # Check weights for compliance
+ if weights is not None:
+
+ # If a series, align with frame
+ if isinstance(weights, pd.Series):
+ weights = weights.reindex(self.axes[axis])
+
+ # Strings acceptable if a dataframe and axis = 0
+ if isinstance(weights, string_types):
+ if isinstance(self, pd.DataFrame):
+ if axis == 0:
+ try:
+ weights = self[weights]
+ except KeyError:
+ raise KeyError("String passed to weights not a "
+ "valid column")
+ else:
+ raise ValueError("Strings can only be passed to "
+ "weights when sampling from rows on "
+ "a DataFrame")
+ else:
+ raise ValueError("Strings cannot be passed as weights "
+ "when sampling from a Series or Panel.")
+
+ weights = pd.Series(weights, dtype='float64')
+
+ if len(weights) != axis_length:
+ raise ValueError("Weights and axis to be sampled must be of "
+ "same length")
+
+ if (weights == np.inf).any() or (weights == -np.inf).any():
+ raise ValueError("weight vector may not include `inf` values")
+
+ if (weights < 0).any():
+ raise ValueError("weight vector many not include negative "
+ "values")
+
+ # If has nan, set to zero.
+ weights = weights.fillna(0)
+
+ # Renormalize if don't sum to 1
+ if weights.sum() != 1:
+ if weights.sum() != 0:
+ weights = weights / weights.sum()
+ else:
+ raise ValueError("Invalid weights: weights sum to zero")
+
+ weights = weights.values
+
+ # If no frac or n, default to n=1.
+ if n is None and frac is None:
+ n = 1
+ elif n is not None and frac is None and n % 1 != 0:
+ raise ValueError("Only integers accepted as `n` values")
+ elif n is None and frac is not None:
+ n = int(round(frac * axis_length))
+ elif n is not None and frac is not None:
+ raise ValueError('Please enter a value for `frac` OR `n`, not '
+ 'both')
+
+ # Check for negative sizes
+ if n < 0:
+ raise ValueError("A negative number of rows requested. Please "
+ "provide positive value.")
+
+ locs = rs.choice(axis_length, size=n, replace=replace, p=weights)
+ return self.take(locs, axis=axis, is_copy=False)
+
+ _shared_docs['pipe'] = (r"""
+ Apply func(self, \*args, \*\*kwargs).
+
+ Parameters
+ ----------
+ func : function
+ function to apply to the %(klass)s.
+ ``args``, and ``kwargs`` are passed into ``func``.
+ Alternatively a ``(callable, data_keyword)`` tuple where
+ ``data_keyword`` is a string indicating the keyword of
+ ``callable`` that expects the %(klass)s.
+ args : iterable, optional
+ positional arguments passed into ``func``.
+ kwargs : mapping, optional
+ a dictionary of keyword arguments passed into ``func``.
+
+ Returns
+ -------
+ object : the return type of ``func``.
+
+ See Also
+ --------
+ DataFrame.apply
+ DataFrame.applymap
+ Series.map
+
+ Notes
+ -----
+
+ Use ``.pipe`` when chaining together functions that expect
+ Series, DataFrames or GroupBy objects. Instead of writing
+
+ >>> f(g(h(df), arg1=a), arg2=b, arg3=c)
+
+ You can write
+
+ >>> (df.pipe(h)
+ ... .pipe(g, arg1=a)
+ ... .pipe(f, arg2=b, arg3=c)
+ ... )
+
+ If you have a function that takes the data as (say) the second
+ argument, pass a tuple indicating which keyword expects the
+ data. For example, suppose ``f`` takes its data as ``arg2``:
+
+ >>> (df.pipe(h)
+ ... .pipe(g, arg1=a)
+ ... .pipe((f, 'arg2'), arg1=a, arg3=c)
+ ... )
+ """)
+
+ @Appender(_shared_docs['pipe'] % _shared_doc_kwargs)
+ def pipe(self, func, *args, **kwargs):
+ return com._pipe(self, func, *args, **kwargs)
+
+ _shared_docs['aggregate'] = dedent("""
+ Aggregate using one or more operations over the specified axis.
+
+ %(versionadded)s
+
+ Parameters
+ ----------
+ func : function, str, list or dict
+ Function to use for aggregating the data. If a function, must either
+ work when passed a %(klass)s or when passed to %(klass)s.apply.
+
+ Accepted combinations are:
+
+ - function
+ - string function name
+ - list of functions and/or function names, e.g. ``[np.sum, 'mean']``
+ - dict of axis labels -> functions, function names or list of such.
+ %(axis)s
+ *args
+ Positional arguments to pass to `func`.
+ **kwargs
+ Keyword arguments to pass to `func`.
+
+ Returns
+ -------
+ DataFrame, Series or scalar
+ if DataFrame.agg is called with a single function, returns a Series
+ if DataFrame.agg is called with several functions, returns a DataFrame
+ if Series.agg is called with single function, returns a scalar
+ if Series.agg is called with several functions, returns a Series
+
+ %(see_also)s
+
+ Notes
+ -----
+ `agg` is an alias for `aggregate`. Use the alias.
+
+ A user-defined function will be passed a Series for evaluation.
+
+ %(examples)s
+ """)
+
+ _shared_docs['transform'] = ("""
+ Call ``func`` on self, producing a %(klass)s with transformed values
+ that has the same axis length as self.
+
+ .. versionadded:: 0.20.0
+
+ Parameters
+ ----------
+ func : function, str, list or dict
+ Function to use for transforming the data. If a function, must either
+ work when passed a %(klass)s or when passed to %(klass)s.apply.
+
+ Accepted combinations are:
+
+ - function
+ - string function name
+ - list of functions and/or function names, e.g. ``[np.exp, 'sqrt']``
+ - dict of axis labels -> functions, function names or list of such.
+ %(axis)s
+ *args
+ Positional arguments to pass to `func`.
+ **kwargs
+ Keyword arguments to pass to `func`.
+
+ Returns
+ -------
+ %(klass)s
+ A %(klass)s that must have the same length as self.
+
+ Raises
+ ------
+ ValueError : If the returned %(klass)s has a different length than self.
+
+ See Also
+ --------
+ %(klass)s.agg : Only perform aggregating type operations.
+ %(klass)s.apply : Invoke function on a %(klass)s.
+
+ Examples
+ --------
+ >>> df = pd.DataFrame({'A': range(3), 'B': range(1, 4)})
+ >>> df
+ A B
+ 0 0 1
+ 1 1 2
+ 2 2 3
+ >>> df.transform(lambda x: x + 1)
+ A B
+ 0 1 2
+ 1 2 3
+ 2 3 4
+
+ Even though the resulting %(klass)s must have the same length as the
+ input %(klass)s, it is possible to provide several input functions:
+
+ >>> s = pd.Series(range(3))
+ >>> s
+ 0 0
+ 1 1
+ 2 2
+ dtype: int64
+ >>> s.transform([np.sqrt, np.exp])
+ sqrt exp
+ 0 0.000000 1.000000
+ 1 1.000000 2.718282
+ 2 1.414214 7.389056
+ """)
+
+ # ----------------------------------------------------------------------
+ # Attribute access
+
+ def __finalize__(self, other, method=None, **kwargs):
+ """
+ Propagate metadata from other to self.
+
+ Parameters
+ ----------
+ other : the object from which to get the attributes that we are going
+ to propagate
+ method : str, optional
+ A passed method name; possibly used to take different types of
+ propagation actions based on this.
+
+ """
+ if isinstance(other, NDFrame):
+ for name in self._metadata:
+ object.__setattr__(self, name, getattr(other, name, None))
+ return self
+
+ def __getattr__(self, name):
+ """After regular attribute access, try looking up the name
+ This allows simpler access to columns for interactive use.
+ """
+
+ # Note: obj.x will always call obj.__getattribute__('x') prior to
+ # calling obj.__getattr__('x').
+
+ if (name in self._internal_names_set or name in self._metadata or
+ name in self._accessors):
+ return object.__getattribute__(self, name)
+ else:
+ if self._info_axis._can_hold_identifiers_and_holds_name(name):
+ return self[name]
+ return object.__getattribute__(self, name)
+
+ def __setattr__(self, name, value):
+ """After regular attribute access, try setting the name
+ This allows simpler access to columns for interactive use.
+ """
+
+ # first try regular attribute access via __getattribute__, so that
+ # e.g. ``obj.x`` and ``obj.x = 4`` will always reference/modify
+ # the same attribute.
+
+ try:
+ object.__getattribute__(self, name)
+ return object.__setattr__(self, name, value)
+ except AttributeError:
+ pass
+
+ # if this fails, go on to more involved attribute setting
+ # (note that this matches __getattr__, above).
+ if name in self._internal_names_set:
+ object.__setattr__(self, name, value)
+ elif name in self._metadata:
+ object.__setattr__(self, name, value)
+ else:
+ try:
+ existing = getattr(self, name)
+ if isinstance(existing, Index):
+ object.__setattr__(self, name, value)
+ elif name in self._info_axis:
+ self[name] = value
+ else:
+ object.__setattr__(self, name, value)
+ except (AttributeError, TypeError):
+ if isinstance(self, ABCDataFrame) and (is_list_like(value)):
+ warnings.warn("Pandas doesn't allow columns to be "
+ "created via a new attribute name - see "
+ "https://pandas.pydata.org/pandas-docs/"
+ "stable/indexing.html#attribute-access",
+ stacklevel=2)
+ object.__setattr__(self, name, value)
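+ # Behaviour sketch (illustrative, not executed here): assigning to a
+ # name that is not an existing column on a DataFrame
+ #
+ #     df.b = [3, 4]       # warns and sets a plain instance attribute
+ #     'b' in df.columns   # -> False
+ #     df['b'] = [3, 4]    # the supported way to create a column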
+
+ def _dir_additions(self):
+ """ add the string-like attributes from the info_axis.
+ If info_axis is a MultiIndex, its first-level values are used.
+ """
+ additions = {c for c in self._info_axis.unique(level=0)[:100]
+ if isinstance(c, string_types) and isidentifier(c)}
+ return super(NDFrame, self)._dir_additions().union(additions)
+
+ # ----------------------------------------------------------------------
+ # Getting and setting elements
+
+ # ----------------------------------------------------------------------
+ # Consolidation of internals
+
+ def _protect_consolidate(self, f):
+ """Consolidate _data -- if the blocks have changed, then clear the
+ cache
+ """
+ blocks_before = len(self._data.blocks)
+ result = f()
+ if len(self._data.blocks) != blocks_before:
+ self._clear_item_cache()
+ return result
+
+ def _consolidate_inplace(self):
+ """Consolidate data in place and return None"""
+
+ def f():
+ self._data = self._data.consolidate()
+
+ self._protect_consolidate(f)
+
+ def _consolidate(self, inplace=False):
+ """
+ Compute NDFrame with "consolidated" internals (data of each dtype
+ grouped together in a single ndarray).
+
+ Parameters
+ ----------
+ inplace : boolean, default False
+ If False return new object, otherwise modify existing object
+
+ Returns
+ -------
+ consolidated : same type as caller
+ """
+ inplace = validate_bool_kwarg(inplace, 'inplace')
+ if inplace:
+ self._consolidate_inplace()
+ else:
+ f = lambda: self._data.consolidate()
+ cons_data = self._protect_consolidate(f)
+ return self._constructor(cons_data).__finalize__(self)
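+
+ # Effect of consolidation (illustrative sketch; block layout is an
+ # internal detail and may differ between versions):
+ #
+ #     df = pd.DataFrame({'a': [1.0]})
+ #     df['b'] = [2.0]                      # appends a second float64 block
+ #     len(df._data.blocks)                 # -> 2, not yet consolidated
+ #     len(df._consolidate()._data.blocks)  # -> 1, single merged block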
+
+ @property
+ def _is_mixed_type(self):
+ f = lambda: self._data.is_mixed_type
+ return self._protect_consolidate(f)
+
+ @property
+ def _is_numeric_mixed_type(self):
+ f = lambda: self._data.is_numeric_mixed_type
+ return self._protect_consolidate(f)
+
+ @property
+ def _is_datelike_mixed_type(self):
+ f = lambda: self._data.is_datelike_mixed_type
+ return self._protect_consolidate(f)
+
+ def _check_inplace_setting(self, value):
+ """ check whether we allow in-place setting with this type of value """
+
+ if self._is_mixed_type:
+ if not self._is_numeric_mixed_type:
+
+ # allow an actual np.nan thru
+ try:
+ if np.isnan(value):
+ return True
+ except Exception:
+ pass
+
+ raise TypeError('Cannot do inplace boolean setting on '
+ 'mixed-types with a non np.nan value')
+
+ return True
+
+ def _get_numeric_data(self):
+ return self._constructor(
+ self._data.get_numeric_data()).__finalize__(self)
+
+ def _get_bool_data(self):
+ return self._constructor(self._data.get_bool_data()).__finalize__(self)
+
+ # ----------------------------------------------------------------------
+ # Internal Interface Methods
+
+ def as_matrix(self, columns=None):
+ """
+ Convert the frame to its Numpy-array representation.
+
+ .. deprecated:: 0.23.0
+ Use :meth:`DataFrame.values` instead.
+
+ Parameters
+ ----------
+ columns : list, optional, default: None
+ If None, return all columns; otherwise, return only the specified
+ columns.
+
+ Returns
+ -------
+ values : ndarray
+ If the caller is heterogeneous and contains booleans or objects,
+ the result will be of dtype=object. See Notes.
+
+ See Also
+ --------
+ DataFrame.values
+
+ Notes
+ -----
+ The return value is NOT a numpy matrix but rather a numpy array.
+
+ The dtype will be a lower-common-denominator dtype (implicit
+ upcasting); that is to say if the dtypes (even of numeric types)
+ are mixed, the one that accommodates all will be chosen. Use this
+ with care if you are not dealing with the blocks.
+
+ e.g. If the dtypes are float16 and float32, dtype will be upcast to
+ float32. If dtypes are int32 and uint8, dtype will be upcast to
+ int32. By numpy.find_common_type convention, mixing int64 and uint64
+ will result in a float64 dtype.
+
+ This method is provided for backwards compatibility. Generally,
+ it is recommended to use '.values'.
+ """
+ warnings.warn("Method .as_matrix will be removed in a future version. "
+ "Use .values instead.", FutureWarning, stacklevel=2)
+ self._consolidate_inplace()
+ return self._data.as_array(transpose=self._AXIS_REVERSED,
+ items=columns)
+
+ @property
+ def values(self):
+ """
+ Return a Numpy representation of the DataFrame.
+
+ .. warning::
+
+ We recommend using :meth:`DataFrame.to_numpy` instead.
+
+ Only the values in the DataFrame will be returned, the axes labels
+ will be removed.
+
+ Returns
+ -------
+ numpy.ndarray
+ The values of the DataFrame.
+
+ See Also
+ --------
+ DataFrame.to_numpy : Recommended alternative to this method.
+ pandas.DataFrame.index : Retrieve the index labels.
+ pandas.DataFrame.columns : Retrieve the column names.
+
+ Notes
+ -----
+ The dtype will be a lower-common-denominator dtype (implicit
+ upcasting); that is to say if the dtypes (even of numeric types)
+ are mixed, the one that accommodates all will be chosen. Use this
+ with care if you are not dealing with the blocks.
+
+ e.g. If the dtypes are float16 and float32, dtype will be upcast to
+ float32. If dtypes are int32 and uint8, dtype will be upcast to
+ int32. By :func:`numpy.find_common_type` convention, mixing int64
+ and uint64 will result in a float64 dtype.
+
+ Examples
+ --------
+ A DataFrame where all columns are the same type (e.g., int64) results
+ in an array of the same type.
+
+ >>> df = pd.DataFrame({'age': [ 3, 29],
+ ... 'height': [94, 170],
+ ... 'weight': [31, 115]})
+ >>> df
+ age height weight
+ 0 3 94 31
+ 1 29 170 115
+ >>> df.dtypes
+ age int64
+ height int64
+ weight int64
+ dtype: object
+ >>> df.values
+ array([[ 3, 94, 31],
+ [ 29, 170, 115]], dtype=int64)
+
+ A DataFrame with mixed type columns (e.g., str/object, int64, float32)
+ results in an ndarray of the broadest type that accommodates these
+ mixed types (e.g., object).
+
+ >>> df2 = pd.DataFrame([('parrot', 24.0, 'second'),
+ ... ('lion', 80.5, 1),
+ ... ('monkey', np.nan, None)],
+ ... columns=('name', 'max_speed', 'rank'))
+ >>> df2.dtypes
+ name object
+ max_speed float64
+ rank object
+ dtype: object
+ >>> df2.values
+ array([['parrot', 24.0, 'second'],
+ ['lion', 80.5, 1],
+ ['monkey', nan, None]], dtype=object)
+ """
+ self._consolidate_inplace()
+ return self._data.as_array(transpose=self._AXIS_REVERSED)
+
+ @property
+ def _values(self):
+ """internal implementation"""
+ return self.values
+
+ @property
+ def _get_values(self):
+ # compat
+ return self.values
+
+ def get_values(self):
+ """
+ Return an ndarray after converting sparse values to dense.
+
+ This is the same as ``.values`` for non-sparse data. For sparse
+ data contained in a `pandas.SparseArray`, the data are first
+ converted to a dense representation.
+
+ Returns
+ -------
+ numpy.ndarray
+ Numpy representation of DataFrame
+
+ See Also
+ --------
+ values : Numpy representation of DataFrame.
+ pandas.SparseArray : Container for sparse data.
+
+ Examples
+ --------
+ >>> df = pd.DataFrame({'a': [1, 2], 'b': [True, False],
+ ... 'c': [1.0, 2.0]})
+ >>> df
+ a b c
+ 0 1 True 1.0
+ 1 2 False 2.0
+
+ >>> df.get_values()
+ array([[1, True, 1.0], [2, False, 2.0]], dtype=object)
+
+ >>> df = pd.DataFrame({"a": pd.SparseArray([1, None, None]),
+ ... "c": [1.0, 2.0, 3.0]})
+ >>> df
+ a c
+ 0 1.0 1.0
+ 1 NaN 2.0
+ 2 NaN 3.0
+
+ >>> df.get_values()
+ array([[ 1., 1.],
+ [nan, 2.],
+ [nan, 3.]])
+ """
+ return self.values
+
+ def get_dtype_counts(self):
+ """
+ Return counts of unique dtypes in this object.
+
+ Returns
+ -------
+ dtype : Series
+ Series with the count of columns with each dtype.
+
+ See Also
+ --------
+ dtypes : Return the dtypes in this object.
+
+ Examples
+ --------
+ >>> a = [['a', 1, 1.0], ['b', 2, 2.0], ['c', 3, 3.0]]
+ >>> df = pd.DataFrame(a, columns=['str', 'int', 'float'])
+ >>> df
+ str int float
+ 0 a 1 1.0
+ 1 b 2 2.0
+ 2 c 3 3.0
+
+ >>> df.get_dtype_counts()
+ float64 1
+ int64 1
+ object 1
+ dtype: int64
+ """
+ from pandas import Series
+ return Series(self._data.get_dtype_counts())
+
+ def get_ftype_counts(self):
+ """
+ Return counts of unique ftypes in this object.
+
+ .. deprecated:: 0.23.0
+
+ This is useful for SparseDataFrame or for DataFrames containing
+ sparse arrays.
+
+ Returns
+ -------
+ dtype : Series
+ Series with the count of columns with each type and
+ sparsity (dense/sparse)
+
+ See Also
+ --------
+ ftypes : Return ftypes (indication of sparse/dense and dtype) in
+ this object.
+
+ Examples
+ --------
+ >>> a = [['a', 1, 1.0], ['b', 2, 2.0], ['c', 3, 3.0]]
+ >>> df = pd.DataFrame(a, columns=['str', 'int', 'float'])
+ >>> df
+ str int float
+ 0 a 1 1.0
+ 1 b 2 2.0
+ 2 c 3 3.0
+
+ >>> df.get_ftype_counts() # doctest: +SKIP
+ float64:dense 1
+ int64:dense 1
+ object:dense 1
+ dtype: int64
+ """
+ warnings.warn("get_ftype_counts is deprecated and will "
+ "be removed in a future version",
+ FutureWarning, stacklevel=2)
+
+ from pandas import Series
+ return Series(self._data.get_ftype_counts())
+
+ @property
+ def dtypes(self):
+ """
+ Return the dtypes in the DataFrame.
+
+ This returns a Series with the data type of each column.
+ The result's index is the original DataFrame's columns. Columns
+ with mixed types are stored with the ``object`` dtype. See
+ :ref:`the User Guide <basics.dtypes>` for more.
+
+ Returns
+ -------
+ pandas.Series
+ The data type of each column.
+
+ See Also
+ --------
+ pandas.DataFrame.ftypes : Dtype and sparsity information.
+
+ Examples
+ --------
+ >>> df = pd.DataFrame({'float': [1.0],
+ ... 'int': [1],
+ ... 'datetime': [pd.Timestamp('20180310')],
+ ... 'string': ['foo']})
+ >>> df.dtypes
+ float float64
+ int int64
+ datetime datetime64[ns]
+ string object
+ dtype: object
+ """
+ from pandas import Series
+ return Series(self._data.get_dtypes(), index=self._info_axis,
+ dtype=np.object_)
+
+ @property
+ def ftypes(self):
+ """
+ Return the ftypes (indication of sparse/dense and dtype) in DataFrame.
+
+ This returns a Series with the data type of each column.
+ The result's index is the original DataFrame's columns. Columns
+ with mixed types are stored with the ``object`` dtype. See
+ :ref:`the User Guide <basics.dtypes>` for more.
+
+ Returns
+ -------
+ pandas.Series
+ The data type and indication of sparse/dense of each column.
+
+ See Also
+ --------
+ pandas.DataFrame.dtypes: Series with just dtype information.
+ pandas.SparseDataFrame : Container for sparse tabular data.
+
+ Notes
+ -----
+ Sparse data should have the same dtypes as its dense representation.
+
+ Examples
+ --------
+ >>> arr = np.random.RandomState(0).randn(100, 4)
+ >>> arr[arr < .8] = np.nan
+ >>> pd.DataFrame(arr).ftypes
+ 0 float64:dense
+ 1 float64:dense
+ 2 float64:dense
+ 3 float64:dense
+ dtype: object
+
+ >>> pd.SparseDataFrame(arr).ftypes
+ 0 float64:sparse
+ 1 float64:sparse
+ 2 float64:sparse
+ 3 float64:sparse
+ dtype: object
+ """
+ from pandas import Series
+ return Series(self._data.get_ftypes(), index=self._info_axis,
+ dtype=np.object_)
+
+ def as_blocks(self, copy=True):
+ """
+ Convert the frame to a dict of dtype -> Constructor Types that each has
+ a homogeneous dtype.
+
+ .. deprecated:: 0.21.0
+
+ NOTE: the dtypes of the blocks WILL BE PRESERVED HERE (unlike in
+ as_matrix)
+
+ Parameters
+ ----------
+ copy : boolean, default True
+
+ Returns
+ -------
+ values : a dict of dtype -> Constructor Types
+ """
+ warnings.warn("as_blocks is deprecated and will "
+ "be removed in a future version",
+ FutureWarning, stacklevel=2)
+ return self._to_dict_of_blocks(copy=copy)
+
+ @property
+ def blocks(self):
+ """
+ Internal property, property synonym for as_blocks().
+
+ .. deprecated:: 0.21.0
+ """
+ return self.as_blocks()
+
+ def _to_dict_of_blocks(self, copy=True):
+ """
+ Return a dict of dtype -> Constructor Types that
+ each is a homogeneous dtype.
+
+ Internal ONLY
+ """
+ return {k: self._constructor(v).__finalize__(self)
+ for k, v in self._data.to_dict(copy=copy).items()}
+
+ def astype(self, dtype, copy=True, errors='raise', **kwargs):
+ """
+ Cast a pandas object to a specified dtype ``dtype``.
+
+ Parameters
+ ----------
+ dtype : data type, or dict of column name -> data type
+ Use a numpy.dtype or Python type to cast entire pandas object to
+ the same type. Alternatively, use {col: dtype, ...}, where col is a
+ column label and dtype is a numpy.dtype or Python type to cast one
+ or more of the DataFrame's columns to column-specific types.
+ copy : bool, default True
+ Return a copy when ``copy=True`` (be very careful setting
+ ``copy=False`` as changes to values then may propagate to other
+ pandas objects).
+ errors : {'raise', 'ignore'}, default 'raise'
+ Control raising of exceptions on invalid data for provided dtype.
+
+ - ``raise`` : allow exceptions to be raised
+ - ``ignore`` : suppress exceptions. On error return original object
+
+ .. versionadded:: 0.20.0
+
+ kwargs : keyword arguments to pass on to the constructor
+
+ Returns
+ -------
+ casted : same type as caller
+
+ See Also
+ --------
+ to_datetime : Convert argument to datetime.
+ to_timedelta : Convert argument to timedelta.
+ to_numeric : Convert argument to a numeric type.
+ numpy.ndarray.astype : Cast a numpy array to a specified type.
+
+ Examples
+ --------
+ >>> ser = pd.Series([1, 2], dtype='int32')
+ >>> ser
+ 0 1
+ 1 2
+ dtype: int32
+ >>> ser.astype('int64')
+ 0 1
+ 1 2
+ dtype: int64
+
+ Convert to categorical type:
+
+ >>> ser.astype('category')
+ 0 1
+ 1 2
+ dtype: category
+ Categories (2, int64): [1, 2]
+
+ Convert to ordered categorical type with custom ordering:
+
+ >>> cat_dtype = pd.api.types.CategoricalDtype(
+ ... categories=[2, 1], ordered=True)
+ >>> ser.astype(cat_dtype)
+ 0 1
+ 1 2
+ dtype: category
+ Categories (2, int64): [2 < 1]
+
+ Note that using ``copy=False`` and changing data on a new
+ pandas object may propagate changes:
+
+ >>> s1 = pd.Series([1,2])
+ >>> s2 = s1.astype('int64', copy=False)
+ >>> s2[0] = 10
+ >>> s1 # note that s1[0] has changed too
+ 0 10
+ 1 2
+ dtype: int64
+ """
+ if is_dict_like(dtype):
+ if self.ndim == 1: # i.e. Series
+ if len(dtype) > 1 or self.name not in dtype:
+ raise KeyError('Only the Series name can be used for '
+ 'the key in Series dtype mappings.')
+ new_type = dtype[self.name]
+ return self.astype(new_type, copy, errors, **kwargs)
+ elif self.ndim > 2:
+ raise NotImplementedError(
+ 'astype() only accepts a dtype arg of type dict when '
+ 'invoked on Series and DataFrames. A single dtype must be '
+ 'specified when invoked on a Panel.'
+ )
+ for col_name in dtype.keys():
+ if col_name not in self:
+ raise KeyError('Only a column name can be used for the '
+ 'key in a dtype mappings argument.')
+ results = []
+ for col_name, col in self.iteritems():
+ if col_name in dtype:
+ results.append(col.astype(dtype[col_name], copy=copy))
+ else:
+ results.append(col.copy() if copy else col)
+
+ elif is_extension_array_dtype(dtype) and self.ndim > 1:
+ # GH 18099/22869: columnwise conversion to extension dtype
+ # GH 24704: use iloc to handle duplicate column names
+ results = (self.iloc[:, i].astype(dtype, copy=copy)
+ for i in range(len(self.columns)))
+
+ else:
+ # else, only a single dtype is given
+ new_data = self._data.astype(dtype=dtype, copy=copy, errors=errors,
+ **kwargs)
+ return self._constructor(new_data).__finalize__(self)
+
+ # GH 19920: retain column metadata after concat
+ result = pd.concat(results, axis=1, copy=False)
+ result.columns = self.columns
+ return result
+
+ def copy(self, deep=True):
+ """
+ Make a copy of this object's indices and data.
+
+ When ``deep=True`` (default), a new object will be created with a
+ copy of the calling object's data and indices. Modifications to
+ the data or indices of the copy will not be reflected in the
+ original object (see notes below).
+
+ When ``deep=False``, a new object will be created without copying
+ the calling object's data or index (only references to the data
+ and index are copied). Any changes to the data of the original
+ will be reflected in the shallow copy (and vice versa).
+
+ Parameters
+ ----------
+ deep : bool, default True
+ Make a deep copy, including a copy of the data and the indices.
+ With ``deep=False`` neither the indices nor the data are copied.
+
+ Returns
+ -------
+ copy : Series, DataFrame or Panel
+ Object type matches caller.
+
+ Notes
+ -----
+ When ``deep=True``, data is copied but actual Python objects
+ will not be copied recursively, only the reference to the object.
+ This is in contrast to `copy.deepcopy` in the Standard Library,
+ which recursively copies object data (see examples below).
+
+ While ``Index`` objects are copied when ``deep=True``, the underlying
+ numpy array is not copied for performance reasons. Since ``Index`` is
+ immutable, the underlying data can be safely shared and a copy
+ is not needed.
+
+ Examples
+ --------
+ >>> s = pd.Series([1, 2], index=["a", "b"])
+ >>> s
+ a 1
+ b 2
+ dtype: int64
+
+ >>> s_copy = s.copy()
+ >>> s_copy
+ a 1
+ b 2
+ dtype: int64
+
+ **Shallow copy versus default (deep) copy:**
+
+ >>> s = pd.Series([1, 2], index=["a", "b"])
+ >>> deep = s.copy()
+ >>> shallow = s.copy(deep=False)
+
+ Shallow copy shares data and index with original.
+
+ >>> s is shallow
+ False
+ >>> s.values is shallow.values and s.index is shallow.index
+ True
+
+ Deep copy has own copy of data and index.
+
+ >>> s is deep
+ False
+ >>> s.values is deep.values or s.index is deep.index
+ False
+
+ Updates to the data shared by shallow copy and original are reflected
+ in both; the deep copy remains unchanged.
+
+ >>> s[0] = 3
+ >>> shallow[1] = 4
+ >>> s
+ a 3
+ b 4
+ dtype: int64
+ >>> shallow
+ a 3
+ b 4
+ dtype: int64
+ >>> deep
+ a 1
+ b 2
+ dtype: int64
+
+ Note that when copying an object containing Python objects, a deep copy
+ will copy the data, but will not do so recursively. Updating a nested
+ data object will be reflected in the deep copy.
+
+ >>> s = pd.Series([[1, 2], [3, 4]])
+ >>> deep = s.copy()
+ >>> s[0][0] = 10
+ >>> s
+ 0 [10, 2]
+ 1 [3, 4]
+ dtype: object
+ >>> deep
+ 0 [10, 2]
+ 1 [3, 4]
+ dtype: object
+ """
+ data = self._data.copy(deep=deep)
+ return self._constructor(data).__finalize__(self)
+
+ def __copy__(self, deep=True):
+ return self.copy(deep=deep)
+
+ def __deepcopy__(self, memo=None):
+ """
+ Parameters
+ ----------
+ memo, default None
+ Standard signature. Unused
+ """
+ if memo is None:
+ memo = {}
+ return self.copy(deep=True)
+
+ def _convert(self, datetime=False, numeric=False, timedelta=False,
+ coerce=False, copy=True):
+ """
+ Attempt to infer better dtype for object columns
+
+ Parameters
+ ----------
+ datetime : boolean, default False
+ If True, convert to date where possible.
+ numeric : boolean, default False
+ If True, attempt to convert to numbers (including strings), with
+ unconvertible values becoming NaN.
+ timedelta : boolean, default False
+ If True, convert to timedelta where possible.
+ coerce : boolean, default False
+ If True, force conversion with unconvertible values converted to
+ nulls (NaN or NaT)
+ copy : boolean, default True
+ If True, return a copy even if no copy is necessary (e.g. no
+ conversion was done). Note: This is meant for internal use, and
+ should not be confused with inplace.
+
+ Returns
+ -------
+ converted : same as input object
+ """
+ return self._constructor(
+ self._data.convert(datetime=datetime, numeric=numeric,
+ timedelta=timedelta, coerce=coerce,
+ copy=copy)).__finalize__(self)
+
+ def convert_objects(self, convert_dates=True, convert_numeric=False,
+ convert_timedeltas=True, copy=True):
+ """
+ Attempt to infer better dtype for object columns.
+
+ .. deprecated:: 0.21.0
+
+ Parameters
+ ----------
+ convert_dates : boolean, default True
+ If True, convert to date where possible. If 'coerce', force
+ conversion, with unconvertible values becoming NaT.
+ convert_numeric : boolean, default False
+ If True, attempt to coerce to numbers (including strings), with
+ unconvertible values becoming NaN.
+ convert_timedeltas : boolean, default True
+ If True, convert to timedelta where possible. If 'coerce', force
+ conversion, with unconvertible values becoming NaT.
+ copy : boolean, default True
+ If True, return a copy even if no copy is necessary (e.g. no
+ conversion was done). Note: This is meant for internal use, and
+ should not be confused with inplace.
+
+ Returns
+ -------
+ converted : same as input object
+
+ See Also
+ --------
+ to_datetime : Convert argument to datetime.
+ to_timedelta : Convert argument to timedelta.
+ to_numeric : Convert argument to numeric type.
+ """
+ msg = ("convert_objects is deprecated. To re-infer data dtypes for "
+ "object columns, use {klass}.infer_objects()\nFor all "
+ "other conversions use the data-type specific converters "
+ "pd.to_datetime, pd.to_timedelta and pd.to_numeric."
+ ).format(klass=self.__class__.__name__)
+ warnings.warn(msg, FutureWarning, stacklevel=2)
+
+ return self._constructor(
+ self._data.convert(convert_dates=convert_dates,
+ convert_numeric=convert_numeric,
+ convert_timedeltas=convert_timedeltas,
+ copy=copy)).__finalize__(self)
+
+ def infer_objects(self):
+ """
+ Attempt to infer better dtypes for object columns.
+
+ Attempts soft conversion of object-dtyped
+ columns, leaving non-object and unconvertible
+ columns unchanged. The inference rules are the
+ same as during normal Series/DataFrame construction.
+
+ .. versionadded:: 0.21.0
+
+ Returns
+ -------
+ converted : same type as input object
+
+ See Also
+ --------
+ to_datetime : Convert argument to datetime.
+ to_timedelta : Convert argument to timedelta.
+ to_numeric : Convert argument to numeric type.
+
+ Examples
+ --------
+ >>> df = pd.DataFrame({"A": ["a", 1, 2, 3]})
+ >>> df = df.iloc[1:]
+ >>> df
+ A
+ 1 1
+ 2 2
+ 3 3
+
+ >>> df.dtypes
+ A object
+ dtype: object
+
+ >>> df.infer_objects().dtypes
+ A int64
+ dtype: object
+ """
+ # numeric=False necessary to only soft convert;
+ # python objects will still be converted to
+ # native numpy numeric types
+ return self._constructor(
+ self._data.convert(datetime=True, numeric=False,
+ timedelta=True, coerce=False,
+ copy=True)).__finalize__(self)
+
+ # ----------------------------------------------------------------------
+ # Filling NA's
+
+ def fillna(self, value=None, method=None, axis=None, inplace=False,
+ limit=None, downcast=None):
+ """
+ Fill NA/NaN values using the specified method.
+
+ Parameters
+ ----------
+ value : scalar, dict, Series, or DataFrame
+ Value to use to fill holes (e.g. 0), alternately a
+ dict/Series/DataFrame of values specifying which value to use for
+ each index (for a Series) or column (for a DataFrame). (values not
+ in the dict/Series/DataFrame will not be filled). This value cannot
+ be a list.
+ method : {'backfill', 'bfill', 'pad', 'ffill', None}, default None
+ Method to use for filling holes in reindexed Series
+ pad / ffill: propagate last valid observation forward to next valid
+ backfill / bfill: use NEXT valid observation to fill gap
+ axis : %(axes_single_arg)s
+ inplace : boolean, default False
+ If True, fill in place. Note: this will modify any
+ other views on this object (e.g. a no-copy slice for a column in a
+ DataFrame).
+ limit : int, default None
+ If method is specified, this is the maximum number of consecutive
+ NaN values to forward/backward fill. In other words, if there is
+ a gap with more than this number of consecutive NaNs, it will only
+ be partially filled. If method is not specified, this is the
+ maximum number of entries along the entire axis where NaNs will be
+ filled. Must be greater than 0 if not None.
+ downcast : dict, default is None
+ a dict of item->dtype of what to downcast if possible,
+ or the string 'infer' which will try to downcast to an appropriate
+ equal type (e.g. float64 to int64 if possible)
+
+ Returns
+ -------
+ filled : %(klass)s
+
+ See Also
+ --------
+ interpolate : Fill NaN values using interpolation.
+ reindex, asfreq
+
+ Examples
+ --------
+ >>> df = pd.DataFrame([[np.nan, 2, np.nan, 0],
+ ... [3, 4, np.nan, 1],
+ ... [np.nan, np.nan, np.nan, 5],
+ ... [np.nan, 3, np.nan, 4]],
+ ... columns=list('ABCD'))
+ >>> df
+ A B C D
+ 0 NaN 2.0 NaN 0
+ 1 3.0 4.0 NaN 1
+ 2 NaN NaN NaN 5
+ 3 NaN 3.0 NaN 4
+
+ Replace all NaN elements with 0s.
+
+ >>> df.fillna(0)
+ A B C D
+ 0 0.0 2.0 0.0 0
+ 1 3.0 4.0 0.0 1
+ 2 0.0 0.0 0.0 5
+ 3 0.0 3.0 0.0 4
+
+ We can also propagate non-null values forward or backward.
+
+ >>> df.fillna(method='ffill')
+ A B C D
+ 0 NaN 2.0 NaN 0
+ 1 3.0 4.0 NaN 1
+ 2 3.0 4.0 NaN 5
+ 3 3.0 3.0 NaN 4
+
+ Replace all NaN elements in column 'A', 'B', 'C', and 'D', with 0, 1,
+ 2, and 3 respectively.
+
+ >>> values = {'A': 0, 'B': 1, 'C': 2, 'D': 3}
+ >>> df.fillna(value=values)
+ A B C D
+ 0 0.0 2.0 2.0 0
+ 1 3.0 4.0 2.0 1
+ 2 0.0 1.0 2.0 5
+ 3 0.0 3.0 2.0 4
+
+ Only replace the first NaN element.
+
+ >>> df.fillna(value=values, limit=1)
+ A B C D
+ 0 0.0 2.0 2.0 0
+ 1 3.0 4.0 NaN 1
+ 2 NaN 1.0 NaN 5
+ 3 NaN 3.0 NaN 4
+ """
+ inplace = validate_bool_kwarg(inplace, 'inplace')
+ value, method = validate_fillna_kwargs(value, method)
+
+ self._consolidate_inplace()
+
+ # set the default here, so functions examining the signature
+ # can detect if something was set (e.g. in groupby) (GH9221)
+ if axis is None:
+ axis = 0
+ axis = self._get_axis_number(axis)
+
+ from pandas import DataFrame
+ if value is None:
+
+ if self._is_mixed_type and axis == 1:
+ if inplace:
+ raise NotImplementedError()
+ result = self.T.fillna(method=method, limit=limit).T
+
+ # need to downcast here because of all of the transposes
+ result._data = result._data.downcast()
+
+ return result
+
+ # > 3d
+ if self.ndim > 3:
+ raise NotImplementedError('Cannot fillna with a method for > '
+ '3dims')
+
+ # 3d
+ elif self.ndim == 3:
+ # fill in 2d chunks
+ result = {col: s.fillna(method=method, value=value)
+ for col, s in self.iteritems()}
+ prelim_obj = self._constructor.from_dict(result)
+ new_obj = prelim_obj.__finalize__(self)
+ new_data = new_obj._data
+
+ else:
+ # 2d or less
+ new_data = self._data.interpolate(method=method, axis=axis,
+ limit=limit, inplace=inplace,
+ coerce=True,
+ downcast=downcast)
+ else:
+ if len(self._get_axis(axis)) == 0:
+ return self
+
+ if self.ndim == 1:
+ if isinstance(value, (dict, ABCSeries)):
+ from pandas import Series
+ value = Series(value)
+ elif not is_list_like(value):
+ pass
+ else:
+ raise TypeError('"value" parameter must be a scalar, dict '
+ 'or Series, but you passed a '
+ '"{0}"'.format(type(value).__name__))
+
+ new_data = self._data.fillna(value=value, limit=limit,
+ inplace=inplace,
+ downcast=downcast)
+
+ elif isinstance(value, (dict, ABCSeries)):
+ if axis == 1:
+ raise NotImplementedError('Currently only can fill '
+ 'with dict/Series column '
+ 'by column')
+
+ result = self if inplace else self.copy()
+ for k, v in compat.iteritems(value):
+ if k not in result:
+ continue
+ obj = result[k]
+ obj.fillna(v, limit=limit, inplace=True, downcast=downcast)
+ return result if not inplace else None
+
+ elif not is_list_like(value):
+ new_data = self._data.fillna(value=value, limit=limit,
+ inplace=inplace,
+ downcast=downcast)
+ elif isinstance(value, DataFrame) and self.ndim == 2:
+ new_data = self.where(self.notna(), value)
+ else:
+ raise ValueError("invalid fill value with a %s" % type(value))
+
+ if inplace:
+ self._update_inplace(new_data)
+ else:
+ return self._constructor(new_data).__finalize__(self)
+
+ def ffill(self, axis=None, inplace=False, limit=None, downcast=None):
+ """
+ Synonym for :meth:`DataFrame.fillna` with ``method='ffill'``.
+ """
+ return self.fillna(method='ffill', axis=axis, inplace=inplace,
+ limit=limit, downcast=downcast)
+
+ def bfill(self, axis=None, inplace=False, limit=None, downcast=None):
+ """
+ Synonym for :meth:`DataFrame.fillna` with ``method='bfill'``.
+ """
+ return self.fillna(method='bfill', axis=axis, inplace=inplace,
+ limit=limit, downcast=downcast)
+
+ _shared_docs['replace'] = ("""
+ Replace values given in `to_replace` with `value`.
+
+ Values of the %(klass)s are replaced with other values dynamically.
+ This differs from updating with ``.loc`` or ``.iloc``, which require
+ you to specify a location to update with some value.
+
+ Parameters
+ ----------
+ to_replace : str, regex, list, dict, Series, int, float, or None
+ How to find the values that will be replaced.
+
+ * numeric, str or regex:
+
+ - numeric: numeric values equal to `to_replace` will be
+ replaced with `value`
+ - str: string exactly matching `to_replace` will be replaced
+ with `value`
+ - regex: regexs matching `to_replace` will be replaced with
+ `value`
+
+ * list of str, regex, or numeric:
+
+ - First, if `to_replace` and `value` are both lists, they
+ **must** be the same length.
+ - Second, if ``regex=True`` then all of the strings in **both**
+ lists will be interpreted as regexs; otherwise they will match
+ directly. This doesn't matter much for `value` since there
+ are only a few possible substitution regexes you can use.
+ - str, regex and numeric rules apply as above.
+
+ * dict:
+
+ - Dicts can be used to specify different replacement values
+ for different existing values. For example,
+ ``{'a': 'b', 'y': 'z'}`` replaces the value 'a' with 'b' and
+ 'y' with 'z'. To use a dict in this way the `value`
+ parameter should be `None`.
+ - For a DataFrame a dict can specify that different values
+ should be replaced in different columns. For example,
+ ``{'a': 1, 'b': 'z'}`` looks for the value 1 in column 'a'
+ and the value 'z' in column 'b' and replaces these values
+ with whatever is specified in `value`. The `value` parameter
+ should not be ``None`` in this case. You can treat this as a
+ special case of passing two lists except that you are
+ specifying the column to search in.
+ - For a DataFrame nested dictionaries, e.g.,
+ ``{'a': {'b': np.nan}}``, are read as follows: look in column
+ 'a' for the value 'b' and replace it with NaN. The `value`
+ parameter should be ``None`` to use a nested dict in this
+ way. You can nest regular expressions as well. Note that
+ column names (the top-level dictionary keys in a nested
+ dictionary) **cannot** be regular expressions.
+
+ * None:
+
+ - This means that the `regex` argument must be a string,
+ compiled regular expression, or list, dict, ndarray or
+ Series of such elements. If `value` is also ``None`` then
+ this **must** be a nested dictionary or Series.
+
+ See the examples section for examples of each of these.
+ value : scalar, dict, list, str, regex, default None
+ Value to replace any values matching `to_replace` with.
+ For a DataFrame a dict of values can be used to specify which
+ value to use for each column (columns not in the dict will not be
+ filled). Regular expressions, strings and lists or dicts of such
+ objects are also allowed.
+ inplace : bool, default False
+ If True, replace in place. Note: this will modify any
+ other views on this object (e.g. a column from a DataFrame).
+ Returns None if this is True.
+ limit : int, default None
+ Maximum size gap to forward or backward fill.
+ regex : bool or same types as `to_replace`, default False
+ Whether to interpret `to_replace` and/or `value` as regular
+ expressions. If this is ``True`` then `to_replace` *must* be a
+ string. Alternatively, this could be a regular expression or a
+ list, dict, or array of regular expressions in which case
+ `to_replace` must be ``None``.
+ method : {'pad', 'ffill', 'bfill', `None`}
+ The method to use for replacement, when `to_replace` is a
+ scalar, list or tuple and `value` is ``None``.
+
+ .. versionchanged:: 0.23.0
+ Added to DataFrame.
+
+ Returns
+ -------
+ %(klass)s
+ Object after replacement.
+
+ Raises
+ ------
+ AssertionError
+ * If `regex` is not a ``bool`` and `to_replace` is not
+ ``None``.
+ TypeError
+ * If `to_replace` is a ``dict`` and `value` is not a ``list``,
+ ``dict``, ``ndarray``, or ``Series``
+ * If `to_replace` is ``None`` and `regex` is not compilable
+ into a regular expression or is a list, dict, ndarray, or
+ Series.
+ * When replacing multiple ``bool`` or ``datetime64`` objects and
+ the arguments to `to_replace` do not match the type of the
+ value being replaced
+ ValueError
+ * If a ``list`` or an ``ndarray`` is passed to `to_replace` and
+ `value` but they are not the same length.
+
+ See Also
+ --------
+ %(klass)s.fillna : Fill NA values.
+ %(klass)s.where : Replace values based on boolean condition.
+ Series.str.replace : Simple string replacement.
+
+ Notes
+ -----
+ * Regex substitution is performed under the hood with ``re.sub``. The
+ rules for substitution for ``re.sub`` are the same.
+ * Regular expressions will only substitute on strings, meaning you
+ cannot provide, for example, a regular expression matching floating
+ point numbers and expect the columns in your frame that have a
+ numeric dtype to be matched. However, if those floating point
+ numbers *are* strings, then you can do this.
+ * This method has *a lot* of options. You are encouraged to experiment
+ and play with this method to gain intuition about how it works.
+ * When a dict is used as the `to_replace` value, the dict's keys
+ act as the `to_replace` part and its values act as the `value`
+ parameter.
+
+ Examples
+ --------
+
+ **Scalar `to_replace` and `value`**
+
+ >>> s = pd.Series([0, 1, 2, 3, 4])
+ >>> s.replace(0, 5)
+ 0 5
+ 1 1
+ 2 2
+ 3 3
+ 4 4
+ dtype: int64
+
+ >>> df = pd.DataFrame({'A': [0, 1, 2, 3, 4],
+ ... 'B': [5, 6, 7, 8, 9],
+ ... 'C': ['a', 'b', 'c', 'd', 'e']})
+ >>> df.replace(0, 5)
+ A B C
+ 0 5 5 a
+ 1 1 6 b
+ 2 2 7 c
+ 3 3 8 d
+ 4 4 9 e
+
+ **List-like `to_replace`**
+
+ >>> df.replace([0, 1, 2, 3], 4)
+ A B C
+ 0 4 5 a
+ 1 4 6 b
+ 2 4 7 c
+ 3 4 8 d
+ 4 4 9 e
+
+ >>> df.replace([0, 1, 2, 3], [4, 3, 2, 1])
+ A B C
+ 0 4 5 a
+ 1 3 6 b
+ 2 2 7 c
+ 3 1 8 d
+ 4 4 9 e
+
+ >>> s.replace([1, 2], method='bfill')
+ 0 0
+ 1 3
+ 2 3
+ 3 3
+ 4 4
+ dtype: int64
+
+ **dict-like `to_replace`**
+
+ >>> df.replace({0: 10, 1: 100})
+ A B C
+ 0 10 5 a
+ 1 100 6 b
+ 2 2 7 c
+ 3 3 8 d
+ 4 4 9 e
+
+ >>> df.replace({'A': 0, 'B': 5}, 100)
+ A B C
+ 0 100 100 a
+ 1 1 6 b
+ 2 2 7 c
+ 3 3 8 d
+ 4 4 9 e
+
+ >>> df.replace({'A': {0: 100, 4: 400}})
+ A B C
+ 0 100 5 a
+ 1 1 6 b
+ 2 2 7 c
+ 3 3 8 d
+ 4 400 9 e
+
+ **Regular expression `to_replace`**
+
+ >>> df = pd.DataFrame({'A': ['bat', 'foo', 'bait'],
+ ... 'B': ['abc', 'bar', 'xyz']})
+ >>> df.replace(to_replace=r'^ba.$', value='new', regex=True)
+ A B
+ 0 new abc
+ 1 foo new
+ 2 bait xyz
+
+ >>> df.replace({'A': r'^ba.$'}, {'A': 'new'}, regex=True)
+ A B
+ 0 new abc
+ 1 foo bar
+ 2 bait xyz
+
+ >>> df.replace(regex=r'^ba.$', value='new')
+ A B
+ 0 new abc
+ 1 foo new
+ 2 bait xyz
+
+ >>> df.replace(regex={r'^ba.$': 'new', 'foo': 'xyz'})
+ A B
+ 0 new abc
+ 1 xyz new
+ 2 bait xyz
+
+ >>> df.replace(regex=[r'^ba.$', 'foo'], value='new')
+ A B
+ 0 new abc
+ 1 new new
+ 2 bait xyz
+
+ Note that when replacing multiple ``bool`` or ``datetime64`` objects,
+ the data types in the `to_replace` parameter must match the data
+ type of the value being replaced:
+
+ >>> df = pd.DataFrame({'A': [True, False, True],
+ ... 'B': [False, True, False]})
+ >>> df.replace({'a string': 'new value', True: False}) # raises
+ Traceback (most recent call last):
+ ...
+ TypeError: Cannot compare types 'ndarray(dtype=bool)' and 'str'
+
+ This raises a ``TypeError`` because one of the ``dict`` keys is not of
+ the correct type for replacement.
+
+ Compare the behavior of ``s.replace({'a': None})`` and
+ ``s.replace('a', None)`` to understand the peculiarities
+ of the `to_replace` parameter:
+
+ >>> s = pd.Series([10, 'a', 'a', 'b', 'a'])
+
+ When one uses a dict as the `to_replace` value, the values in the
+ dict take the role of the `value` parameter.
+ ``s.replace({'a': None})`` is equivalent to
+ ``s.replace(to_replace={'a': None}, value=None, method=None)``:
+
+ >>> s.replace({'a': None})
+ 0 10
+ 1 None
+ 2 None
+ 3 b
+ 4 None
+ dtype: object
+
+ When ``value=None`` and `to_replace` is a scalar, list or
+ tuple, `replace` uses the `method` parameter (default 'pad') to do the
+ replacement. That is why the 'a' values are replaced by 10
+ in rows 1 and 2, and by 'b' in row 4 in this case.
+ The command ``s.replace('a', None)`` is actually equivalent to
+ ``s.replace(to_replace='a', value=None, method='pad')``:
+
+ >>> s.replace('a', None)
+ 0 10
+ 1 10
+ 2 10
+ 3 b
+ 4 b
+ dtype: object
+ """)
+
+ @Appender(_shared_docs['replace'] % _shared_doc_kwargs)
+ def replace(self, to_replace=None, value=None, inplace=False, limit=None,
+ regex=False, method='pad'):
+ inplace = validate_bool_kwarg(inplace, 'inplace')
+ if not is_bool(regex) and to_replace is not None:
+ raise AssertionError("'to_replace' must be 'None' if 'regex' is "
+ "not a bool")
+
+ self._consolidate_inplace()
+
+ if value is None:
+ # passing a single value that is scalar like
+ # when value is None (GH5319), for compat
+ if not is_dict_like(to_replace) and not is_dict_like(regex):
+ to_replace = [to_replace]
+
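+ # with no replacement value, a list-like `to_replace` falls back to
+ # fill-method semantics (`method`, default 'pad'), applied per column
+ # for DataFrames via _single_replace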
+ if isinstance(to_replace, (tuple, list)):
+ if isinstance(self, pd.DataFrame):
+ return self.apply(_single_replace,
+ args=(to_replace, method, inplace,
+ limit))
+ return _single_replace(self, to_replace, method, inplace,
+ limit)
+
+ if not is_dict_like(to_replace):
+ if not is_dict_like(regex):
+ raise TypeError('If "to_replace" and "value" are both None'
+ ' and "to_replace" is not a list, then '
+ 'regex must be a mapping')
+ to_replace = regex
+ regex = True
+
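+ # flatten the dict form: a flat mapping such as {'a': 'b'} becomes
+ # parallel keys/values sequences, while a nested mapping such as
+ # {'col': {'a': 'b'}} is split into per-column to_replace/value dicts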
+ items = list(compat.iteritems(to_replace))
+ keys, values = lzip(*items) or ([], [])
+
+ are_mappings = [is_dict_like(v) for v in values]
+
+ if any(are_mappings):
+ if not all(are_mappings):
+ raise TypeError("If a nested mapping is passed, all values"
+ " of the top level mapping must be "
+ "mappings")
+ # passed a nested dict/Series
+ to_rep_dict = {}
+ value_dict = {}
+
+ for k, v in items:
+ keys, values = lzip(*v.items()) or ([], [])
+ if set(keys) & set(values):
+ raise ValueError("Replacement not allowed with "
+ "overlapping keys and values")
+ to_rep_dict[k] = list(keys)
+ value_dict[k] = list(values)
+
+ to_replace, value = to_rep_dict, value_dict
+ else:
+ to_replace, value = keys, values
+
+ return self.replace(to_replace, value, inplace=inplace,
+ limit=limit, regex=regex)
+ else:
+
+ # need a non-zero len on all axes
+ for a in self._AXIS_ORDERS:
+ if not len(self._get_axis(a)):
+ return self
+
+ new_data = self._data
+ if is_dict_like(to_replace):
+ if is_dict_like(value): # {'A' : NA} -> {'A' : 0}
+ res = self if inplace else self.copy()
+ for c, src in compat.iteritems(to_replace):
+ if c in value and c in self:
+ # object conversion is handled in
+ # series.replace which is called recursively
+ res[c] = res[c].replace(to_replace=src,
+ value=value[c],
+ inplace=False,
+ regex=regex)
+ return None if inplace else res
+
+ # {'A': NA} -> 0
+ elif not is_list_like(value):
+ keys = [(k, src) for k, src in compat.iteritems(to_replace)
+ if k in self]
+ keys_len = len(keys) - 1
+ for i, (k, src) in enumerate(keys):
+ convert = i == keys_len
+ new_data = new_data.replace(to_replace=src,
+ value=value,
+ filter=[k],
+ inplace=inplace,
+ regex=regex,
+ convert=convert)
+ else:
+ raise TypeError('value argument must be scalar, dict, or '
+ 'Series')
+
+ elif is_list_like(to_replace): # [NA, ''] -> [0, 'missing']
+ if is_list_like(value):
+ if len(to_replace) != len(value):
+ raise ValueError('Replacement lists must match '
+ 'in length. Expecting %d got %d ' %
+ (len(to_replace), len(value)))
+
+ new_data = self._data.replace_list(src_list=to_replace,
+ dest_list=value,
+ inplace=inplace,
+ regex=regex)
+
+ else: # [NA, ''] -> 0
+ new_data = self._data.replace(to_replace=to_replace,
+ value=value, inplace=inplace,
+ regex=regex)
+ elif to_replace is None:
+ if not (is_re_compilable(regex) or
+ is_list_like(regex) or is_dict_like(regex)):
+ raise TypeError("'regex' must be a string or a compiled "
+ "regular expression or a list or dict of "
+ "strings or regular expressions, you "
+ "passed a"
+ " {0!r}".format(type(regex).__name__))
+ return self.replace(regex, value, inplace=inplace, limit=limit,
+ regex=True)
+ else:
+
+ # dest iterable dict-like
+ if is_dict_like(value): # NA -> {'A' : 0, 'B' : -1}
+ new_data = self._data
+
+ for k, v in compat.iteritems(value):
+ if k in self:
+ new_data = new_data.replace(to_replace=to_replace,
+ value=v, filter=[k],
+ inplace=inplace,
+ regex=regex)
+
+ elif not is_list_like(value): # NA -> 0
+ new_data = self._data.replace(to_replace=to_replace,
+ value=value, inplace=inplace,
+ regex=regex)
+ else:
+ msg = ('Invalid "to_replace" type: '
+ '{0!r}').format(type(to_replace).__name__)
+ raise TypeError(msg) # pragma: no cover
+
+ if inplace:
+ self._update_inplace(new_data)
+ else:
+ return self._constructor(new_data).__finalize__(self)
+
+ _shared_docs['interpolate'] = """
+ Please note that only ``method='linear'`` is supported for
+ DataFrame/Series with a MultiIndex.
+
+ Parameters
+ ----------
+ method : str, default 'linear'
+ Interpolation technique to use. One of:
+
+ * 'linear': Ignore the index and treat the values as equally
+ spaced. This is the only method supported on MultiIndexes.
+ * 'time': Works on daily and higher resolution data to interpolate
+ given length of interval.
+ * 'index', 'values': use the actual numerical values of the index.
+ * 'pad': Fill in NaNs using existing values.
+ * 'nearest', 'zero', 'slinear', 'quadratic', 'cubic', 'spline',
+ 'barycentric', 'polynomial': Passed to
+ `scipy.interpolate.interp1d`. Both 'polynomial' and 'spline'
+ require that you also specify an `order` (int),
+ e.g. ``df.interpolate(method='polynomial', order=4)``.
+ These use the numerical values of the index.
+ * 'krogh', 'piecewise_polynomial', 'spline', 'pchip', 'akima':
+ Wrappers around the SciPy interpolation methods of similar
+ names. See `Notes`.
+ * 'from_derivatives': Refers to
+ `scipy.interpolate.BPoly.from_derivatives` which
+ replaces 'piecewise_polynomial' interpolation method in
+ scipy 0.18.
+
+ .. versionadded:: 0.18.1
+
+ Added support for the 'akima' method.
+ Added interpolate method 'from_derivatives' which replaces
+ 'piecewise_polynomial' in SciPy 0.18; backwards-compatible with
+ SciPy < 0.18
+
+ axis : {0 or 'index', 1 or 'columns', None}, default None
+ Axis to interpolate along.
+ limit : int, optional
+ Maximum number of consecutive NaNs to fill. Must be greater than
+ 0.
+ inplace : bool, default False
+ Update the data in place if possible.
+ limit_direction : {'forward', 'backward', 'both'}, default 'forward'
+ If limit is specified, consecutive NaNs will be filled in this
+ direction.
+ limit_area : {`None`, 'inside', 'outside'}, default None
+ If limit is specified, consecutive NaNs will be filled with this
+ restriction.
+
+ * ``None``: No fill restriction.
+ * 'inside': Only fill NaNs surrounded by valid values
+ (interpolate).
+ * 'outside': Only fill NaNs outside valid values (extrapolate).
+
+ .. versionadded:: 0.21.0
+
+ downcast : optional, 'infer' or None, defaults to None
+ Downcast dtypes if possible.
+ **kwargs
+ Keyword arguments to pass on to the interpolating function.
+
+ Returns
+ -------
+ Series or DataFrame
+ Returns the same object type as the caller, interpolated at
+ some or all ``NaN`` values
+
+ See Also
+ --------
+ fillna : Fill missing values using different methods.
+ scipy.interpolate.Akima1DInterpolator : Piecewise cubic polynomials
+ (Akima interpolator).
+ scipy.interpolate.BPoly.from_derivatives : Piecewise polynomial in the
+ Bernstein basis.
+ scipy.interpolate.interp1d : Interpolate a 1-D function.
+ scipy.interpolate.KroghInterpolator : Interpolate polynomial (Krogh
+ interpolator).
+ scipy.interpolate.PchipInterpolator : PCHIP 1-d monotonic cubic
+ interpolation.
+ scipy.interpolate.CubicSpline : Cubic spline data interpolator.
+
+ Notes
+ -----
+ The 'krogh', 'piecewise_polynomial', 'spline', 'pchip' and 'akima'
+ methods are wrappers around the respective SciPy implementations of
+ similar names. These use the actual numerical values of the index.
+ For more information on their behavior, see the
+ `SciPy documentation
+ <http://docs.scipy.org/doc/scipy/reference/interpolate.html#univariate-interpolation>`__
+ and `SciPy tutorial
+ <http://docs.scipy.org/doc/scipy/reference/tutorial/interpolate.html>`__.
+
+ Examples
+ --------
+ Filling in ``NaN`` in a :class:`~pandas.Series` via linear
+ interpolation.
+
+ >>> s = pd.Series([0, 1, np.nan, 3])
+ >>> s
+ 0 0.0
+ 1 1.0
+ 2 NaN
+ 3 3.0
+ dtype: float64
+ >>> s.interpolate()
+ 0 0.0
+ 1 1.0
+ 2 2.0
+ 3 3.0
+ dtype: float64
+
+ Filling in ``NaN`` in a Series by padding, but filling at most two
+ consecutive ``NaN`` at a time.
+
+ >>> s = pd.Series([np.nan, "single_one", np.nan,
+ ... "fill_two_more", np.nan, np.nan, np.nan,
+ ... 4.71, np.nan])
+ >>> s
+ 0 NaN
+ 1 single_one
+ 2 NaN
+ 3 fill_two_more
+ 4 NaN
+ 5 NaN
+ 6 NaN
+ 7 4.71
+ 8 NaN
+ dtype: object
+ >>> s.interpolate(method='pad', limit=2)
+ 0 NaN
+ 1 single_one
+ 2 single_one
+ 3 fill_two_more
+ 4 fill_two_more
+ 5 fill_two_more
+ 6 NaN
+ 7 4.71
+ 8 4.71
+ dtype: object
+
+ Filling in ``NaN`` in a Series via polynomial interpolation or splines:
+ Both 'polynomial' and 'spline' methods require that you also specify
+ an ``order`` (int).
+
+ >>> s = pd.Series([0, 2, np.nan, 8])
+ >>> s.interpolate(method='polynomial', order=2)
+ 0 0.000000
+ 1 2.000000
+ 2 4.666667
+ 3 8.000000
+ dtype: float64
+
+ Fill the DataFrame forward (that is, going down) along each column
+ using linear interpolation.
+
+ Note how the last entry in column 'a' is interpolated differently,
+ because there is no entry after it to use for interpolation.
+ Note how the first entry in column 'b' remains ``NaN``, because there
+ is no entry before it to use for interpolation.
+
+ >>> df = pd.DataFrame([(0.0, np.nan, -1.0, 1.0),
+ ... (np.nan, 2.0, np.nan, np.nan),
+ ... (2.0, 3.0, np.nan, 9.0),
+ ... (np.nan, 4.0, -4.0, 16.0)],
+ ... columns=list('abcd'))
+ >>> df
+ a b c d
+ 0 0.0 NaN -1.0 1.0
+ 1 NaN 2.0 NaN NaN
+ 2 2.0 3.0 NaN 9.0
+ 3 NaN 4.0 -4.0 16.0
+ >>> df.interpolate(method='linear', limit_direction='forward', axis=0)
+ a b c d
+ 0 0.0 NaN -1.0 1.0
+ 1 1.0 2.0 -2.0 5.0
+ 2 2.0 3.0 -3.0 9.0
+ 3 2.0 4.0 -4.0 16.0
+
+ Using polynomial interpolation.
+
+ >>> df['d'].interpolate(method='polynomial', order=2)
+ 0 1.0
+ 1 4.0
+ 2 9.0
+ 3 16.0
+ Name: d, dtype: float64
+ """
+
+ @Appender(_shared_docs['interpolate'] % _shared_doc_kwargs)
+ def interpolate(self, method='linear', axis=0, limit=None, inplace=False,
+ limit_direction='forward', limit_area=None,
+ downcast=None, **kwargs):
+ """
+ Interpolate values according to different methods.
+ """
+ inplace = validate_bool_kwarg(inplace, 'inplace')
+
+ if self.ndim > 2:
+ raise NotImplementedError("Interpolate has not been implemented "
+ "on Panel and Panel 4D objects.")
+
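+ # interpolation runs along a fixed internal axis, so for axis=1 we
+ # operate on a transposed view and transpose the result back at the end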
+ if axis == 0:
+ ax = self._info_axis_name
+ _maybe_transposed_self = self
+ elif axis == 1:
+ _maybe_transposed_self = self.T
+ ax = 1
+ else:
+ _maybe_transposed_self = self
+ ax = _maybe_transposed_self._get_axis_number(ax)
+
+ if _maybe_transposed_self.ndim == 2:
+ alt_ax = 1 - ax
+ else:
+ alt_ax = ax
+
+ if (isinstance(_maybe_transposed_self.index, MultiIndex) and
+ method != 'linear'):
+ raise ValueError("Only `method=linear` interpolation is supported "
+ "on MultiIndexes.")
+
+ if _maybe_transposed_self._data.get_dtype_counts().get(
+ 'object') == len(_maybe_transposed_self.T):
+ raise TypeError("Cannot interpolate with all object-dtype columns "
+ "in the DataFrame. Try setting at least one "
+ "column to a numeric dtype.")
+
+ # create/use the index
+ if method == 'linear':
+ # prior default
+ index = np.arange(len(_maybe_transposed_self._get_axis(alt_ax)))
+ else:
+ index = _maybe_transposed_self._get_axis(alt_ax)
+
+ if isna(index).any():
+ raise NotImplementedError("Interpolation with NaNs in the index "
+ "has not been implemented. Try filling "
+ "those NaNs before interpolating.")
+ data = _maybe_transposed_self._data
+ new_data = data.interpolate(method=method, axis=ax, index=index,
+ values=_maybe_transposed_self, limit=limit,
+ limit_direction=limit_direction,
+ limit_area=limit_area,
+ inplace=inplace, downcast=downcast,
+ **kwargs)
+
+ if inplace:
+ if axis == 1:
+ new_data = self._constructor(new_data).T._data
+ self._update_inplace(new_data)
+ else:
+ res = self._constructor(new_data).__finalize__(self)
+ if axis == 1:
+ res = res.T
+ return res
+
+ # ----------------------------------------------------------------------
+ # Timeseries Methods
+
+ def asof(self, where, subset=None):
+ """
+ Return the last row(s) without any NaNs before `where`.
+
+ The last row (for each element in `where`, if list) without any
+ NaN is taken.
+ In the case of a :class:`~pandas.DataFrame`, the last row without any
+ NaN is taken, considering only the subset of columns (if not `None`).
+
+ .. versionadded:: 0.19.0 For DataFrame
+
+ If there is no good value, NaN is returned for a Series or
+ a Series of NaN values for a DataFrame.
+
+ Parameters
+ ----------
+ where : date or array-like of dates
+ Date(s) before which the last row(s) are returned.
+ subset : str or array-like of str, default `None`
+ For DataFrame, if not `None`, only use these columns to
+ check for NaNs.
+
+ Returns
+ -------
+ scalar, Series, or DataFrame
+
+ * scalar : when `self` is a Series and `where` is a scalar
+ * Series: when `self` is a Series and `where` is an array-like,
+ or when `self` is a DataFrame and `where` is a scalar
+ * DataFrame : when `self` is a DataFrame and `where` is an
+ array-like
+
+ See Also
+ --------
+ merge_asof : Perform an asof merge. Similar to left join.
+
+ Notes
+ -----
+ Dates are assumed to be sorted. Raises if this is not the case.
+
+ Examples
+ --------
+ A Series and a scalar `where`.
+
+ >>> s = pd.Series([1, 2, np.nan, 4], index=[10, 20, 30, 40])
+ >>> s
+ 10 1.0
+ 20 2.0
+ 30 NaN
+ 40 4.0
+ dtype: float64
+
+ >>> s.asof(20)
+ 2.0
+
+ For a sequence `where`, a Series is returned. The first value is
+ NaN, because the first element of `where` is before the first
+ index value.
+
+ >>> s.asof([5, 20])
+ 5 NaN
+ 20 2.0
+ dtype: float64
+
+ Missing values are not considered. The following is ``2.0``, not
+ NaN, even though NaN is at the index location for ``30``.
+
+ >>> s.asof(30)
+ 2.0
+
+ Take all columns into consideration
+
+ >>> df = pd.DataFrame({'a': [10, 20, 30, 40, 50],
+ ... 'b': [None, None, None, None, 500]},
+ ... index=pd.DatetimeIndex(['2018-02-27 09:01:00',
+ ... '2018-02-27 09:02:00',
+ ... '2018-02-27 09:03:00',
+ ... '2018-02-27 09:04:00',
+ ... '2018-02-27 09:05:00']))
+ >>> df.asof(pd.DatetimeIndex(['2018-02-27 09:03:30',
+ ... '2018-02-27 09:04:30']))
+ a b
+ 2018-02-27 09:03:30 NaN NaN
+ 2018-02-27 09:04:30 NaN NaN
+
+ Take a single column into consideration
+
+ >>> df.asof(pd.DatetimeIndex(['2018-02-27 09:03:30',
+ ... '2018-02-27 09:04:30']),
+ ... subset=['a'])
+ a b
+ 2018-02-27 09:03:30 30.0 NaN
+ 2018-02-27 09:04:30 40.0 NaN
+ """
+ if isinstance(where, compat.string_types):
+ from pandas import to_datetime
+ where = to_datetime(where)
+
+ if not self.index.is_monotonic:
+ raise ValueError("asof requires a sorted index")
+
+ is_series = isinstance(self, ABCSeries)
+ if is_series:
+ if subset is not None:
+ raise ValueError("subset is not valid for Series")
+ elif self.ndim > 2:
+ raise NotImplementedError("asof is not implemented "
+ "for {type}".format(type=type(self)))
+ else:
+ if subset is None:
+ subset = self.columns
+ if not is_list_like(subset):
+ subset = [subset]
+
+ is_list = is_list_like(where)
+ if not is_list:
+ start = self.index[0]
+ if isinstance(self.index, PeriodIndex):
+ where = Period(where, freq=self.index.freq).ordinal
+ start = start.ordinal
+
+ if where < start:
+ if not is_series:
+ from pandas import Series
+ return Series(index=self.columns, name=where)
+ return np.nan
+
+ # It's always much faster to use a *while* loop here for
+ # Series than pre-computing all the NAs. However a
+ # *while* loop is extremely expensive for DataFrame
+ # so we later pre-compute all the NAs and use the same
+ # code path whether *where* is a scalar or list.
+ # See PR: https://github.com/pandas-dev/pandas/pull/14476
+ if is_series:
+ loc = self.index.searchsorted(where, side='right')
+ if loc > 0:
+ loc -= 1
+
+ values = self._values
+ while loc > 0 and isna(values[loc]):
+ loc -= 1
+ return values[loc]
+
+ if not isinstance(where, Index):
+ where = Index(where) if is_list else Index([where])
+
+ nulls = self.isna() if is_series else self[subset].isna().any(1)
+ if nulls.all():
+ if is_series:
+ return self._constructor(np.nan, index=where, name=self.name)
+ elif is_list:
+ from pandas import DataFrame
+ return DataFrame(np.nan, index=where, columns=self.columns)
+ else:
+ from pandas import Series
+ return Series(np.nan, index=self.columns, name=where[0])
+
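+ # asof_locs returns, for each entry of `where`, the position of the
+ # last non-null row at or before it, or -1 when there is none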
+ locs = self.index.asof_locs(where, ~(nulls.values))
+
+ # mask the missing
+ missing = locs == -1
+ data = self.take(locs, is_copy=False)
+ data.index = where
+ data.loc[missing] = np.nan
+ return data if is_list else data.iloc[-1]
+
+ # ----------------------------------------------------------------------
+ # Action Methods
+
+ _shared_docs['isna'] = """
+ Detect missing values.
+
+ Return a boolean same-sized object indicating if the values are NA.
+ NA values, such as None or :attr:`numpy.NaN`, get mapped to True
+ values.
+ Everything else gets mapped to False values. Characters such as empty
+ strings ``''`` or :attr:`numpy.inf` are not considered NA values
+ (unless you set ``pandas.options.mode.use_inf_as_na = True``).
+
+ Returns
+ -------
+ %(klass)s
+ Mask of bool values for each element in %(klass)s that
+ indicates whether an element is an NA value.
+
+ See Also
+ --------
+ %(klass)s.isnull : Alias of isna.
+ %(klass)s.notna : Boolean inverse of isna.
+ %(klass)s.dropna : Omit axes labels with missing values.
+ isna : Top-level isna.
+
+ Examples
+ --------
+ Show which entries in a DataFrame are NA.
+
+ >>> df = pd.DataFrame({'age': [5, 6, np.NaN],
+ ... 'born': [pd.NaT, pd.Timestamp('1939-05-27'),
+ ... pd.Timestamp('1940-04-25')],
+ ... 'name': ['Alfred', 'Batman', ''],
+ ... 'toy': [None, 'Batmobile', 'Joker']})
+ >>> df
+ age born name toy
+ 0 5.0 NaT Alfred None
+ 1 6.0 1939-05-27 Batman Batmobile
+ 2 NaN 1940-04-25 Joker
+
+ >>> df.isna()
+ age born name toy
+ 0 False True False True
+ 1 False False False False
+ 2 True False False False
+
+ Show which entries in a Series are NA.
+
+ >>> ser = pd.Series([5, 6, np.NaN])
+ >>> ser
+ 0 5.0
+ 1 6.0
+ 2 NaN
+ dtype: float64
+
+ >>> ser.isna()
+ 0 False
+ 1 False
+ 2 True
+ dtype: bool
+ """
+
+ @Appender(_shared_docs['isna'] % _shared_doc_kwargs)
+ def isna(self):
+ return isna(self).__finalize__(self)
+
+ @Appender(_shared_docs['isna'] % _shared_doc_kwargs)
+ def isnull(self):
+ return isna(self).__finalize__(self)
+
+ _shared_docs['notna'] = """
+ Detect existing (non-missing) values.
+
+ Return a boolean same-sized object indicating if the values are not NA.
+ Non-missing values get mapped to True. Characters such as empty
+ strings ``''`` or :attr:`numpy.inf` are not considered NA values
+ (unless you set ``pandas.options.mode.use_inf_as_na = True``).
+ NA values, such as None or :attr:`numpy.NaN`, get mapped to False
+ values.
+
+ Returns
+ -------
+ %(klass)s
+ Mask of bool values for each element in %(klass)s that
+ indicates whether an element is not an NA value.
+
+ See Also
+ --------
+ %(klass)s.notnull : Alias of notna.
+ %(klass)s.isna : Boolean inverse of notna.
+ %(klass)s.dropna : Omit axes labels with missing values.
+ notna : Top-level notna.
+
+ Examples
+ --------
+ Show which entries in a DataFrame are not NA.
+
+ >>> df = pd.DataFrame({'age': [5, 6, np.NaN],
+ ... 'born': [pd.NaT, pd.Timestamp('1939-05-27'),
+ ... pd.Timestamp('1940-04-25')],
+ ... 'name': ['Alfred', 'Batman', ''],
+ ... 'toy': [None, 'Batmobile', 'Joker']})
+ >>> df
+ age born name toy
+ 0 5.0 NaT Alfred None
+ 1 6.0 1939-05-27 Batman Batmobile
+ 2 NaN 1940-04-25 Joker
+
+ >>> df.notna()
+ age born name toy
+ 0 True False True False
+ 1 True True True True
+ 2 False True True True
+
+ Show which entries in a Series are not NA.
+
+ >>> ser = pd.Series([5, 6, np.NaN])
+ >>> ser
+ 0 5.0
+ 1 6.0
+ 2 NaN
+ dtype: float64
+
+ >>> ser.notna()
+ 0 True
+ 1 True
+ 2 False
+ dtype: bool
+ """
+
+ @Appender(_shared_docs['notna'] % _shared_doc_kwargs)
+ def notna(self):
+ return notna(self).__finalize__(self)
+
+ @Appender(_shared_docs['notna'] % _shared_doc_kwargs)
+ def notnull(self):
+ return notna(self).__finalize__(self)
+
+ def _clip_with_scalar(self, lower, upper, inplace=False):
+ if ((lower is not None and np.any(isna(lower))) or
+ (upper is not None and np.any(isna(upper)))):
+ raise ValueError("Cannot use an NA value as a clip threshold")
+
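+ # compare the raw ndarray against each bound and mask with where();
+ # pre-existing NaNs compare False, so they are restored from `mask` below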
+ result = self
+ mask = isna(self.values)
+
+ with np.errstate(all='ignore'):
+ if upper is not None:
+ subset = self.to_numpy() <= upper
+ result = result.where(subset, upper, axis=None, inplace=False)
+ if lower is not None:
+ subset = self.to_numpy() >= lower
+ result = result.where(subset, lower, axis=None, inplace=False)
+
+ if np.any(mask):
+ result[mask] = np.nan
+
+ if inplace:
+ self._update_inplace(result)
+ else:
+ return result
+
+ def _clip_with_one_bound(self, threshold, method, axis, inplace):
+
+ if axis is not None:
+ axis = self._get_axis_number(axis)
+
+ # method is self.le for upper bound and self.ge for lower bound
+ if is_scalar(threshold) and is_number(threshold):
+ if method.__name__ == 'le':
+ return self._clip_with_scalar(None, threshold, inplace=inplace)
+ return self._clip_with_scalar(threshold, None, inplace=inplace)
+
+ subset = method(threshold, axis=axis) | isna(self)
+
+ # GH #15390
+ # In order for where method to work, the threshold must
+ # be transformed to NDFrame from other array like structure.
+ if (not isinstance(threshold, ABCSeries)) and is_list_like(threshold):
+ if isinstance(self, ABCSeries):
+ threshold = pd.Series(threshold, index=self.index)
+ else:
+ threshold = _align_method_FRAME(self, threshold,
+ axis)
+ return self.where(subset, threshold, axis=axis, inplace=inplace)
+
+ def clip(self, lower=None, upper=None, axis=None, inplace=False,
+ *args, **kwargs):
+ """
+ Trim values at input threshold(s).
+
+ Assigns values outside boundary to boundary values. Thresholds
+ can be singular values or array-like, and in the latter case
+ the clipping is performed element-wise in the specified axis.
+
+ Parameters
+ ----------
+ lower : float or array_like, default None
+ Minimum threshold value. All values below this
+ threshold will be set to it.
+ upper : float or array_like, default None
+ Maximum threshold value. All values above this
+ threshold will be set to it.
+ axis : int or string axis name, optional
+ Align object with lower and upper along the given axis.
+ inplace : boolean, default False
+ Whether to perform the operation in place on the data.
+
+ .. versionadded:: 0.21.0
+ *args, **kwargs
+ Additional keywords have no effect but might be accepted
+ for compatibility with numpy.
+
+ Returns
+ -------
+ Series or DataFrame
+ Same type as calling object with the values outside the
+ clip boundaries replaced
+
+ Examples
+ --------
+ >>> data = {'col_0': [9, -3, 0, -1, 5], 'col_1': [-2, -7, 6, 8, -5]}
+ >>> df = pd.DataFrame(data)
+ >>> df
+ col_0 col_1
+ 0 9 -2
+ 1 -3 -7
+ 2 0 6
+ 3 -1 8
+ 4 5 -5
+
+ Clips per column using lower and upper thresholds:
+
+ >>> df.clip(-4, 6)
+ col_0 col_1
+ 0 6 -2
+ 1 -3 -4
+ 2 0 6
+ 3 -1 6
+ 4 5 -4
+
+ Clips using specific lower and upper thresholds per column element:
+
+ >>> t = pd.Series([2, -4, -1, 6, 3])
+ >>> t
+ 0 2
+ 1 -4
+ 2 -1
+ 3 6
+ 4 3
+ dtype: int64
+
+ >>> df.clip(t, t + 4, axis=0)
+ col_0 col_1
+ 0 6 2
+ 1 -3 -4
+ 2 0 3
+ 3 6 8
+ 4 5 3
+ """
+ if isinstance(self, ABCPanel):
+ raise NotImplementedError("clip is not supported yet for panels")
+
+ inplace = validate_bool_kwarg(inplace, 'inplace')
+
+ axis = nv.validate_clip_with_axis(axis, args, kwargs)
+ if axis is not None:
+ axis = self._get_axis_number(axis)
+
+ # GH 17276
+ # numpy doesn't like NaN as a clip value
+ # so ignore
+ # GH 19992
+ # numpy doesn't drop a list-like bound containing NaN
+ if not is_list_like(lower) and np.any(pd.isnull(lower)):
+ lower = None
+ if not is_list_like(upper) and np.any(pd.isnull(upper)):
+ upper = None
+
+ # GH 2747 (arguments were reversed)
+ if lower is not None and upper is not None:
+ if is_scalar(lower) and is_scalar(upper):
+ lower, upper = min(lower, upper), max(lower, upper)
+
+ # fast-path for scalars
+ if ((lower is None or (is_scalar(lower) and is_number(lower))) and
+ (upper is None or (is_scalar(upper) and is_number(upper)))):
+ return self._clip_with_scalar(lower, upper, inplace=inplace)
+
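+ # list-like bounds: apply each bound separately through
+ # _clip_with_one_bound, which aligns the threshold and defers to where()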
+ result = self
+ if lower is not None:
+ result = result._clip_with_one_bound(lower, method=self.ge,
+ axis=axis, inplace=inplace)
+ if upper is not None:
+ if inplace:
+ result = self
+ result = result._clip_with_one_bound(upper, method=self.le,
+ axis=axis, inplace=inplace)
+
+ return result
+
+ def clip_upper(self, threshold, axis=None, inplace=False):
+ """
+ Trim values above a given threshold.
+
+ .. deprecated:: 0.24.0
+ Use clip(upper=threshold) instead.
+
+ Elements above the `threshold` will be changed to match the
+ `threshold` value(s). Threshold can be a single value or an array,
+ in the latter case it performs the truncation element-wise.
+
+ Parameters
+ ----------
+ threshold : numeric or array-like
+ Maximum value allowed. All values above threshold will be set to
+ this value.
+
+ * float : every value is compared to `threshold`.
+ * array-like : The shape of `threshold` should match the object
+ it's compared to. When `self` is a Series, `threshold` should be
+ the same length. When `self` is a DataFrame, `threshold` should be
+ 2-D and the same shape as `self` for ``axis=None``, or 1-D and the
+ same length as the axis being compared.
+
+ axis : {0 or 'index', 1 or 'columns'}, default 0
+ Align object with `threshold` along the given axis.
+ inplace : boolean, default False
+ Whether to perform the operation in place on the data.
+
+ .. versionadded:: 0.21.0
+
+ Returns
+ -------
+ Series or DataFrame
+ Original data with values trimmed.
+
+ See Also
+ --------
+ Series.clip : General purpose method to trim Series values to given
+ threshold(s).
+ DataFrame.clip : General purpose method to trim DataFrame values to
+ given threshold(s).
+
+ Examples
+ --------
+ >>> s = pd.Series([1, 2, 3, 4, 5])
+ >>> s
+ 0 1
+ 1 2
+ 2 3
+ 3 4
+ 4 5
+ dtype: int64
+
+ >>> s.clip(upper=3)
+ 0 1
+ 1 2
+ 2 3
+ 3 3
+ 4 3
+ dtype: int64
+
+ >>> elemwise_thresholds = [5, 4, 3, 2, 1]
+ >>> elemwise_thresholds
+ [5, 4, 3, 2, 1]
+
+ >>> s.clip(upper=elemwise_thresholds)
+ 0 1
+ 1 2
+ 2 3
+ 3 2
+ 4 1
+ dtype: int64
+ """
+ warnings.warn('clip_upper(threshold) is deprecated, '
+ 'use clip(upper=threshold) instead',
+ FutureWarning, stacklevel=2)
+ return self._clip_with_one_bound(threshold, method=self.le,
+ axis=axis, inplace=inplace)
+
+ def clip_lower(self, threshold, axis=None, inplace=False):
+ """
+ Trim values below a given threshold.
+
+ .. deprecated:: 0.24.0
+ Use clip(lower=threshold) instead.
+
+ Elements below the `threshold` will be changed to match the
+ `threshold` value(s). Threshold can be a single value or an array,
+ in the latter case it performs the truncation element-wise.
+
+ Parameters
+ ----------
+ threshold : numeric or array-like
+ Minimum value allowed. All values below threshold will be set to
+ this value.
+
+ * float : every value is compared to `threshold`.
+ * array-like : The shape of `threshold` should match the object
+ it's compared to. When `self` is a Series, `threshold` should be
+ the same length. When `self` is a DataFrame, `threshold` should be
+ 2-D and the same shape as `self` for ``axis=None``, or 1-D and the
+ same length as the axis being compared.
+
+ axis : {0 or 'index', 1 or 'columns'}, default 0
+ Align `self` with `threshold` along the given axis.
+
+ inplace : boolean, default False
+ Whether to perform the operation in place on the data.
+
+ .. versionadded:: 0.21.0
+
+ Returns
+ -------
+ Series or DataFrame
+ Original data with values trimmed.
+
+ See Also
+ --------
+ Series.clip : General purpose method to trim Series values to given
+ threshold(s).
+ DataFrame.clip : General purpose method to trim DataFrame values to
+ given threshold(s).
+
+ Examples
+ --------
+
+ Series single threshold clipping:
+
+ >>> s = pd.Series([5, 6, 7, 8, 9])
+ >>> s.clip(lower=8)
+ 0 8
+ 1 8
+ 2 8
+ 3 8
+ 4 9
+ dtype: int64
+
+ Series clipping element-wise using an array of thresholds. `threshold`
+ should be the same length as the Series.
+
+ >>> elemwise_thresholds = [4, 8, 7, 2, 5]
+ >>> s.clip(lower=elemwise_thresholds)
+ 0 5
+ 1 8
+ 2 7
+ 3 8
+ 4 9
+ dtype: int64
+
+ DataFrames can be compared to a scalar.
+
+ >>> df = pd.DataFrame({"A": [1, 3, 5], "B": [2, 4, 6]})
+ >>> df
+ A B
+ 0 1 2
+ 1 3 4
+ 2 5 6
+
+ >>> df.clip(lower=3)
+ A B
+ 0 3 3
+ 1 3 4
+ 2 5 6
+
+ Or to an array of values. By default, `threshold` should be the same
+ shape as the DataFrame.
+
+ >>> df.clip(lower=np.array([[3, 4], [2, 2], [6, 2]]))
+ A B
+ 0 3 4
+ 1 3 4
+ 2 6 6
+
+ Control how `threshold` is broadcast with `axis`. In this case
+ `threshold` should be the same length as the axis specified by
+ `axis`.
+
+ >>> df.clip(lower=[3, 3, 5], axis='index')
+ A B
+ 0 3 3
+ 1 3 4
+ 2 5 6
+
+ >>> df.clip(lower=[4, 5], axis='columns')
+ A B
+ 0 4 5
+ 1 4 5
+ 2 5 6
+ """
+ warnings.warn('clip_lower(threshold) is deprecated, '
+ 'use clip(lower=threshold) instead',
+ FutureWarning, stacklevel=2)
+ return self._clip_with_one_bound(threshold, method=self.ge,
+ axis=axis, inplace=inplace)
+
+ def groupby(self, by=None, axis=0, level=None, as_index=True, sort=True,
+ group_keys=True, squeeze=False, observed=False, **kwargs):
+ """
+ Group DataFrame or Series using a mapper or by a Series of columns.
+
+ A groupby operation involves some combination of splitting the
+ object, applying a function, and combining the results. This can be
+ used to group large amounts of data and compute operations on these
+ groups.
+
+ Parameters
+ ----------
+ by : mapping, function, label, or list of labels
+ Used to determine the groups for the groupby.
+ If ``by`` is a function, it's called on each value of the object's
+ index. If a dict or Series is passed, the Series or dict VALUES
+ will be used to determine the groups (the Series' values are first
+ aligned; see ``.align()`` method). If an ndarray is passed, the
+ values are used as-is to determine the groups. A label or list of
+ labels may be passed to group by the columns in ``self``. Notice
+ that a tuple is interpreted as a (single) key.
+ axis : {0 or 'index', 1 or 'columns'}, default 0
+ Split along rows (0) or columns (1).
+ level : int, level name, or sequence of such, default None
+ If the axis is a MultiIndex (hierarchical), group by a particular
+ level or levels.
+ as_index : bool, default True
+ For aggregated output, return object with group labels as the
+ index. Only relevant for DataFrame input. as_index=False is
+ effectively "SQL-style" grouped output.
+ sort : bool, default True
+ Sort group keys. Get better performance by turning this off.
+ Note this does not influence the order of observations within each
+ group. Groupby preserves the order of rows within each group.
+ group_keys : bool, default True
+ When calling apply, add group keys to index to identify pieces.
+ squeeze : bool, default False
+ Reduce the dimensionality of the return type if possible,
+ otherwise return a consistent type.
+ observed : bool, default False
+ This only applies if any of the groupers are Categoricals.
+ If True: only show observed values for categorical groupers.
+ If False: show all values for categorical groupers.
+
+ .. versionadded:: 0.23.0
+
+ **kwargs
+ Optional, only accepts keyword argument 'mutated' and is passed
+ to groupby.
+
+ Returns
+ -------
+ DataFrameGroupBy or SeriesGroupBy
+ Depends on the calling object and returns groupby object that
+ contains information about the groups.
+
+ See Also
+ --------
+ resample : Convenience method for frequency conversion and resampling
+ of time series.
+
+ Notes
+ -----
+ See the `user guide
+ <http://pandas.pydata.org/pandas-docs/stable/groupby.html>`_ for more.
+
+ Examples
+ --------
+ >>> df = pd.DataFrame({'Animal' : ['Falcon', 'Falcon',
+ ... 'Parrot', 'Parrot'],
+ ... 'Max Speed' : [380., 370., 24., 26.]})
+ >>> df
+ Animal Max Speed
+ 0 Falcon 380.0
+ 1 Falcon 370.0
+ 2 Parrot 24.0
+ 3 Parrot 26.0
+ >>> df.groupby(['Animal']).mean()
+ Max Speed
+ Animal
+ Falcon 375.0
+ Parrot 25.0
+
+ **Hierarchical Indexes**
+
+ We can groupby different levels of a hierarchical index
+ using the `level` parameter:
+
+ >>> arrays = [['Falcon', 'Falcon', 'Parrot', 'Parrot'],
+ ... ['Captive', 'Wild', 'Captive', 'Wild']]
+ >>> index = pd.MultiIndex.from_arrays(arrays, names=('Animal', 'Type'))
+ >>> df = pd.DataFrame({'Max Speed' : [390., 350., 30., 20.]},
+ ... index=index)
+ >>> df
+ Max Speed
+ Animal Type
+ Falcon Captive 390.0
+ Wild 350.0
+ Parrot Captive 30.0
+ Wild 20.0
+ >>> df.groupby(level=0).mean()
+ Max Speed
+ Animal
+ Falcon 370.0
+ Parrot 25.0
+ >>> df.groupby(level=1).mean()
+ Max Speed
+ Type
+ Captive 210.0
+ Wild 185.0
+ """
+ from pandas.core.groupby.groupby import groupby
+
+ if level is None and by is None:
+ raise TypeError("You have to supply one of 'by' and 'level'")
+ axis = self._get_axis_number(axis)
+ return groupby(self, by=by, axis=axis, level=level, as_index=as_index,
+ sort=sort, group_keys=group_keys, squeeze=squeeze,
+ observed=observed, **kwargs)
+
+ def asfreq(self, freq, method=None, how=None, normalize=False,
+ fill_value=None):
+ """
+ Convert TimeSeries to specified frequency.
+
+ Optionally provide filling method to pad/backfill missing values.
+
+ Returns the original data conformed to a new index with the specified
+ frequency. ``resample`` is more appropriate if an operation, such as
+ summarization, is necessary to represent the data at the new frequency.
+
+ Parameters
+ ----------
+ freq : DateOffset object, or string
+ method : {'backfill'/'bfill', 'pad'/'ffill'}, default None
+ Method to use for filling holes in reindexed Series (note this
+ does not fill NaNs that already were present):
+
+ * 'pad' / 'ffill': propagate last valid observation forward to next
+ valid
+ * 'backfill' / 'bfill': use NEXT valid observation to fill
+ how : {'start', 'end'}, default end
+ For PeriodIndex only, see PeriodIndex.asfreq
+ normalize : bool, default False
+ Whether to reset output index to midnight
+ fill_value : scalar, optional
+ Value to use for missing values, applied during upsampling (note
+ this does not fill NaNs that already were present).
+
+ .. versionadded:: 0.20.0
+
+ Returns
+ -------
+ converted : same type as caller
+
+ See Also
+ --------
+ reindex
+
+ Notes
+ -----
+ To learn more about the frequency strings, please see `this link
+ <http://pandas.pydata.org/pandas-docs/stable/timeseries.html#offset-aliases>`__.
+
+ Examples
+ --------
+
+ Start by creating a series with 4 one minute timestamps.
+
+ >>> index = pd.date_range('1/1/2000', periods=4, freq='T')
+ >>> series = pd.Series([0.0, None, 2.0, 3.0], index=index)
+ >>> df = pd.DataFrame({'s':series})
+ >>> df
+ s
+ 2000-01-01 00:00:00 0.0
+ 2000-01-01 00:01:00 NaN
+ 2000-01-01 00:02:00 2.0
+ 2000-01-01 00:03:00 3.0
+
+ Upsample the series into 30 second bins.
+
+ >>> df.asfreq(freq='30S')
+ s
+ 2000-01-01 00:00:00 0.0
+ 2000-01-01 00:00:30 NaN
+ 2000-01-01 00:01:00 NaN
+ 2000-01-01 00:01:30 NaN
+ 2000-01-01 00:02:00 2.0
+ 2000-01-01 00:02:30 NaN
+ 2000-01-01 00:03:00 3.0
+
+ Upsample again, providing a ``fill_value``.
+
+ >>> df.asfreq(freq='30S', fill_value=9.0)
+ s
+ 2000-01-01 00:00:00 0.0
+ 2000-01-01 00:00:30 9.0
+ 2000-01-01 00:01:00 NaN
+ 2000-01-01 00:01:30 9.0
+ 2000-01-01 00:02:00 2.0
+ 2000-01-01 00:02:30 9.0
+ 2000-01-01 00:03:00 3.0
+
+ Upsample again, providing a ``method``.
+
+ >>> df.asfreq(freq='30S', method='bfill')
+ s
+ 2000-01-01 00:00:00 0.0
+ 2000-01-01 00:00:30 NaN
+ 2000-01-01 00:01:00 NaN
+ 2000-01-01 00:01:30 2.0
+ 2000-01-01 00:02:00 2.0
+ 2000-01-01 00:02:30 3.0
+ 2000-01-01 00:03:00 3.0
+ """
+ from pandas.core.resample import asfreq
+ return asfreq(self, freq, method=method, how=how, normalize=normalize,
+ fill_value=fill_value)
+
+ def at_time(self, time, asof=False, axis=None):
+ """
+ Select values at particular time of day (e.g. 9:30AM).
+
+ Parameters
+ ----------
+ time : datetime.time or string
+ axis : {0 or 'index', 1 or 'columns'}, default 0
+
+ .. versionadded:: 0.24.0
+
+ Returns
+ -------
+ values_at_time : same type as caller
+
+ Raises
+ ------
+ TypeError
+ If the index is not a :class:`DatetimeIndex`
+
+ See Also
+ --------
+ between_time : Select values between particular times of the day.
+ first : Select initial periods of time series based on a date offset.
+ last : Select final periods of time series based on a date offset.
+ DatetimeIndex.indexer_at_time : Get just the index locations for
+ values at particular time of the day.
+
+ Examples
+ --------
+ >>> i = pd.date_range('2018-04-09', periods=4, freq='12H')
+ >>> ts = pd.DataFrame({'A': [1,2,3,4]}, index=i)
+ >>> ts
+ A
+ 2018-04-09 00:00:00 1
+ 2018-04-09 12:00:00 2
+ 2018-04-10 00:00:00 3
+ 2018-04-10 12:00:00 4
+
+ >>> ts.at_time('12:00')
+ A
+ 2018-04-09 12:00:00 2
+ 2018-04-10 12:00:00 4
+ """
+ if axis is None:
+ axis = self._stat_axis_number
+ axis = self._get_axis_number(axis)
+
+ index = self._get_axis(axis)
+ try:
+ indexer = index.indexer_at_time(time, asof=asof)
+ except AttributeError:
+ raise TypeError('Index must be DatetimeIndex')
+
+ return self._take(indexer, axis=axis)
+
+ def between_time(self, start_time, end_time, include_start=True,
+ include_end=True, axis=None):
+ """
+ Select values between particular times of the day (e.g., 9:00-9:30 AM).
+
+ By setting ``start_time`` to be later than ``end_time``,
+ you can get the times that are *not* between the two times.
+
+ Parameters
+ ----------
+ start_time : datetime.time or string
+ end_time : datetime.time or string
+ include_start : boolean, default True
+ include_end : boolean, default True
+ axis : {0 or 'index', 1 or 'columns'}, default 0
+
+ .. versionadded:: 0.24.0
+
+ Returns
+ -------
+ values_between_time : same type as caller
+
+ Raises
+ ------
+ TypeError
+ If the index is not a :class:`DatetimeIndex`
+
+ See Also
+ --------
+ at_time : Select values at a particular time of the day.
+ first : Select initial periods of time series based on a date offset.
+ last : Select final periods of time series based on a date offset.
+ DatetimeIndex.indexer_between_time : Get just the index locations for
+ values between particular times of the day.
+
+ Examples
+ --------
+ >>> i = pd.date_range('2018-04-09', periods=4, freq='1D20min')
+ >>> ts = pd.DataFrame({'A': [1,2,3,4]}, index=i)
+ >>> ts
+ A
+ 2018-04-09 00:00:00 1
+ 2018-04-10 00:20:00 2
+ 2018-04-11 00:40:00 3
+ 2018-04-12 01:00:00 4
+
+ >>> ts.between_time('0:15', '0:45')
+ A
+ 2018-04-10 00:20:00 2
+ 2018-04-11 00:40:00 3
+
+ You get the times that are *not* between two times by setting
+ ``start_time`` later than ``end_time``:
+
+ >>> ts.between_time('0:45', '0:15')
+ A
+ 2018-04-09 00:00:00 1
+ 2018-04-12 01:00:00 4
+ """
+ if axis is None:
+ axis = self._stat_axis_number
+ axis = self._get_axis_number(axis)
+
+ index = self._get_axis(axis)
+ try:
+ indexer = index.indexer_between_time(
+ start_time, end_time, include_start=include_start,
+ include_end=include_end)
+ except AttributeError:
+ raise TypeError('Index must be DatetimeIndex')
+
+ return self._take(indexer, axis=axis)
+
+ def resample(self, rule, how=None, axis=0, fill_method=None, closed=None,
+ label=None, convention='start', kind=None, loffset=None,
+ limit=None, base=0, on=None, level=None):
+ """
+ Resample time-series data.
+
+ Convenience method for frequency conversion and resampling of time
+ series. Object must have a datetime-like index (`DatetimeIndex`,
+ `PeriodIndex`, or `TimedeltaIndex`), or pass datetime-like values
+ to the `on` or `level` keyword.
+
+ Parameters
+ ----------
+ rule : str
+ The offset string or object representing target conversion.
+ how : str
+ Method for down/re-sampling; defaults to 'mean' for downsampling.
+
+ .. deprecated:: 0.18.0
+ The new syntax is ``.resample(...).mean()``, or
+ ``.resample(...).apply(<func>)``
+ axis : {0 or 'index', 1 or 'columns'}, default 0
+ Which axis to use for up- or down-sampling. For `Series` this
+ will default to 0, i.e. along the rows. The axis used must be a
+ `DatetimeIndex`, `TimedeltaIndex` or `PeriodIndex`.
+ fill_method : str, default None
+ Filling method for upsampling.
+
+ .. deprecated:: 0.18.0
+ The new syntax is ``.resample(...).<func>()``,
+ e.g. ``.resample(...).pad()``
+ closed : {'right', 'left'}, default None
+ Which side of bin interval is closed. The default is 'left'
+ for all frequency offsets except for 'M', 'A', 'Q', 'BM',
+ 'BA', 'BQ', and 'W' which all have a default of 'right'.
+ label : {'right', 'left'}, default None
+ Which bin edge label to label bucket with. The default is 'left'
+ for all frequency offsets except for 'M', 'A', 'Q', 'BM',
+ 'BA', 'BQ', and 'W' which all have a default of 'right'.
+ convention : {'start', 'end', 's', 'e'}, default 'start'
+ For `PeriodIndex` only, controls whether to use the start or
+ end of `rule`.
+ kind : {'timestamp', 'period'}, optional, default None
+ Pass 'timestamp' to convert the resulting index to a
+ `DateTimeIndex` or 'period' to convert it to a `PeriodIndex`.
+ By default the input representation is retained.
+ loffset : timedelta, default None
+ Adjust the resampled time labels.
+ limit : int, default None
+ Maximum size gap when reindexing with `fill_method`.
+
+ .. deprecated:: 0.18.0
+ base : int, default 0
+ For frequencies that evenly subdivide 1 day, the "origin" of the
+ aggregated intervals. For example, for '5min' frequency, base could
+ range from 0 through 4. Defaults to 0.
+ on : str, optional
+ For a DataFrame, column to use instead of index for resampling.
+ Column must be datetime-like.
+
+ .. versionadded:: 0.19.0
+
+ level : str or int, optional
+ For a MultiIndex, level (name or number) to use for
+ resampling. `level` must be datetime-like.
+
+ .. versionadded:: 0.19.0
+
+ Returns
+ -------
+ Resampler object
+
+ See Also
+ --------
+ groupby : Group by mapping, function, label, or list of labels.
+ Series.resample : Resample a Series.
+ DataFrame.resample: Resample a DataFrame.
+
+ Notes
+ -----
+ See the `user guide
+ <http://pandas.pydata.org/pandas-docs/stable/timeseries.html#resampling>`_
+ for more.
+
+ To learn more about the offset strings, please see `this link
+ <http://pandas.pydata.org/pandas-docs/stable/timeseries.html#offset-aliases>`__.
+
+ Examples
+ --------
+
+ Start by creating a series with 9 one minute timestamps.
+
+ >>> index = pd.date_range('1/1/2000', periods=9, freq='T')
+ >>> series = pd.Series(range(9), index=index)
+ >>> series
+ 2000-01-01 00:00:00 0
+ 2000-01-01 00:01:00 1
+ 2000-01-01 00:02:00 2
+ 2000-01-01 00:03:00 3
+ 2000-01-01 00:04:00 4
+ 2000-01-01 00:05:00 5
+ 2000-01-01 00:06:00 6
+ 2000-01-01 00:07:00 7
+ 2000-01-01 00:08:00 8
+ Freq: T, dtype: int64
+
+ Downsample the series into 3 minute bins and sum the values
+ of the timestamps falling into a bin.
+
+ >>> series.resample('3T').sum()
+ 2000-01-01 00:00:00 3
+ 2000-01-01 00:03:00 12
+ 2000-01-01 00:06:00 21
+ Freq: 3T, dtype: int64
+
+ Downsample the series into 3 minute bins as above, but label each
+ bin using the right edge instead of the left. Please note that the
+ value in the bucket used as the label is not included in the bucket
+ it labels. For example, in the original series the
+ bucket ``2000-01-01 00:03:00`` contains the value 3, but the summed
+ value in the resampled bucket with the label ``2000-01-01 00:03:00``
+ does not include 3 (if it did, the summed value would be 6, not 3).
+ To include this value close the right side of the bin interval as
+ illustrated in the example below this one.
+
+ >>> series.resample('3T', label='right').sum()
+ 2000-01-01 00:03:00 3
+ 2000-01-01 00:06:00 12
+ 2000-01-01 00:09:00 21
+ Freq: 3T, dtype: int64
+
+ Downsample the series into 3 minute bins as above, but close the right
+ side of the bin interval.
+
+ >>> series.resample('3T', label='right', closed='right').sum()
+ 2000-01-01 00:00:00 0
+ 2000-01-01 00:03:00 6
+ 2000-01-01 00:06:00 15
+ 2000-01-01 00:09:00 15
+ Freq: 3T, dtype: int64
+
+ Upsample the series into 30 second bins.
+
+ >>> series.resample('30S').asfreq()[0:5] # Select first 5 rows
+ 2000-01-01 00:00:00 0.0
+ 2000-01-01 00:00:30 NaN
+ 2000-01-01 00:01:00 1.0
+ 2000-01-01 00:01:30 NaN
+ 2000-01-01 00:02:00 2.0
+ Freq: 30S, dtype: float64
+
+ Upsample the series into 30 second bins and fill the ``NaN``
+ values using the ``pad`` method.
+
+ >>> series.resample('30S').pad()[0:5]
+ 2000-01-01 00:00:00 0
+ 2000-01-01 00:00:30 0
+ 2000-01-01 00:01:00 1
+ 2000-01-01 00:01:30 1
+ 2000-01-01 00:02:00 2
+ Freq: 30S, dtype: int64
+
+ Upsample the series into 30 second bins and fill the
+ ``NaN`` values using the ``bfill`` method.
+
+ >>> series.resample('30S').bfill()[0:5]
+ 2000-01-01 00:00:00 0
+ 2000-01-01 00:00:30 1
+ 2000-01-01 00:01:00 1
+ 2000-01-01 00:01:30 2
+ 2000-01-01 00:02:00 2
+ Freq: 30S, dtype: int64
+
+ Pass a custom function via ``apply``
+
+ >>> def custom_resampler(array_like):
+ ... return np.sum(array_like) + 5
+ ...
+ >>> series.resample('3T').apply(custom_resampler)
+ 2000-01-01 00:00:00 8
+ 2000-01-01 00:03:00 17
+ 2000-01-01 00:06:00 26
+ Freq: 3T, dtype: int64
+
+ For a Series with a PeriodIndex, the keyword `convention` can be
+ used to control whether to use the start or end of `rule`.
+
+ Resample a year by quarter using 'start' `convention`. Values are
+ assigned to the first quarter of the period.
+
+ >>> s = pd.Series([1, 2], index=pd.period_range('2012-01-01',
+ ... freq='A',
+ ... periods=2))
+ >>> s
+ 2012 1
+ 2013 2
+ Freq: A-DEC, dtype: int64
+ >>> s.resample('Q', convention='start').asfreq()
+ 2012Q1 1.0
+ 2012Q2 NaN
+ 2012Q3 NaN
+ 2012Q4 NaN
+ 2013Q1 2.0
+ 2013Q2 NaN
+ 2013Q3 NaN
+ 2013Q4 NaN
+ Freq: Q-DEC, dtype: float64
+
+ Resample quarters by month using 'end' `convention`. Values are
+ assigned to the last month of the period.
+
+ >>> q = pd.Series([1, 2, 3, 4], index=pd.period_range('2018-01-01',
+ ... freq='Q',
+ ... periods=4))
+ >>> q
+ 2018Q1 1
+ 2018Q2 2
+ 2018Q3 3
+ 2018Q4 4
+ Freq: Q-DEC, dtype: int64
+ >>> q.resample('M', convention='end').asfreq()
+ 2018-03 1.0
+ 2018-04 NaN
+ 2018-05 NaN
+ 2018-06 2.0
+ 2018-07 NaN
+ 2018-08 NaN
+ 2018-09 3.0
+ 2018-10 NaN
+ 2018-11 NaN
+ 2018-12 4.0
+ Freq: M, dtype: float64
+
+ For DataFrame objects, the keyword `on` can be used to specify the
+ column instead of the index for resampling.
+
+ >>> d = dict({'price': [10, 11, 9, 13, 14, 18, 17, 19],
+ ... 'volume': [50, 60, 40, 100, 50, 100, 40, 50]})
+ >>> df = pd.DataFrame(d)
+ >>> df['week_starting'] = pd.date_range('01/01/2018',
+ ... periods=8,
+ ... freq='W')
+ >>> df
+ price volume week_starting
+ 0 10 50 2018-01-07
+ 1 11 60 2018-01-14
+ 2 9 40 2018-01-21
+ 3 13 100 2018-01-28
+ 4 14 50 2018-02-04
+ 5 18 100 2018-02-11
+ 6 17 40 2018-02-18
+ 7 19 50 2018-02-25
+ >>> df.resample('M', on='week_starting').mean()
+ price volume
+ week_starting
+ 2018-01-31 10.75 62.5
+ 2018-02-28 17.00 60.0
+
+ For a DataFrame with MultiIndex, the keyword `level` can be used to
+ specify on which level the resampling needs to take place.
+
+ >>> days = pd.date_range('1/1/2000', periods=4, freq='D')
+ >>> d2 = dict({'price': [10, 11, 9, 13, 14, 18, 17, 19],
+ ... 'volume': [50, 60, 40, 100, 50, 100, 40, 50]})
+ >>> df2 = pd.DataFrame(d2,
+ ... index=pd.MultiIndex.from_product([days,
+ ... ['morning',
+ ... 'afternoon']]
+ ... ))
+ >>> df2
+ price volume
+ 2000-01-01 morning 10 50
+ afternoon 11 60
+ 2000-01-02 morning 9 40
+ afternoon 13 100
+ 2000-01-03 morning 14 50
+ afternoon 18 100
+ 2000-01-04 morning 17 40
+ afternoon 19 50
+ >>> df2.resample('D', level=0).sum()
+ price volume
+ 2000-01-01 21 110
+ 2000-01-02 22 140
+ 2000-01-03 32 150
+ 2000-01-04 36 90
+ """
+
+ from pandas.core.resample import (resample,
+ _maybe_process_deprecations)
+ axis = self._get_axis_number(axis)
+ r = resample(self, freq=rule, label=label, closed=closed,
+ axis=axis, kind=kind, loffset=loffset,
+ convention=convention,
+ base=base, key=on, level=level)
+ return _maybe_process_deprecations(r,
+ how=how,
+ fill_method=fill_method,
+ limit=limit)
+
+ def first(self, offset):
+ """
+ Convenience method for subsetting initial periods of time series data
+ based on a date offset.
+
+ Parameters
+ ----------
+ offset : string, DateOffset, dateutil.relativedelta
+
+ Returns
+ -------
+ subset : same type as caller
+
+ Raises
+ ------
+ TypeError
+ If the index is not a :class:`DatetimeIndex`
+
+ See Also
+ --------
+ last : Select final periods of time series based on a date offset.
+ at_time : Select values at a particular time of the day.
+ between_time : Select values between particular times of the day.
+
+ Examples
+ --------
+ >>> i = pd.date_range('2018-04-09', periods=4, freq='2D')
+ >>> ts = pd.DataFrame({'A': [1,2,3,4]}, index=i)
+ >>> ts
+ A
+ 2018-04-09 1
+ 2018-04-11 2
+ 2018-04-13 3
+ 2018-04-15 4
+
+ Get the rows for the first 3 days:
+
+ >>> ts.first('3D')
+ A
+ 2018-04-09 1
+ 2018-04-11 2
+
+ Notice that the data for the first 3 calendar days were returned, not
+ the first 3 days observed in the dataset, and therefore data for
+ 2018-04-13 was not returned.
+ """
+ if not isinstance(self.index, DatetimeIndex):
+ raise TypeError("'first' only supports a DatetimeIndex index")
+
+ if len(self.index) == 0:
+ return self
+
+ offset = to_offset(offset)
+ end_date = end = self.index[0] + offset
+
+ # Tick-like, e.g. 3 weeks
+ if not offset.isAnchored() and hasattr(offset, '_inc'):
+ if end_date in self.index:
+ end = self.index.searchsorted(end_date, side='left')
+ return self.iloc[:end]
+
+ return self.loc[:end]
+
+ def last(self, offset):
+ """
+ Convenience method for subsetting final periods of time series data
+ based on a date offset.
+
+ Parameters
+ ----------
+ offset : string, DateOffset, dateutil.relativedelta
+
+ Returns
+ -------
+ subset : same type as caller
+
+ Raises
+ ------
+ TypeError
+ If the index is not a :class:`DatetimeIndex`
+
+ See Also
+ --------
+ first : Select initial periods of time series based on a date offset.
+ at_time : Select values at a particular time of the day.
+ between_time : Select values between particular times of the day.
+
+ Examples
+ --------
+ >>> i = pd.date_range('2018-04-09', periods=4, freq='2D')
+ >>> ts = pd.DataFrame({'A': [1,2,3,4]}, index=i)
+ >>> ts
+ A
+ 2018-04-09 1
+ 2018-04-11 2
+ 2018-04-13 3
+ 2018-04-15 4
+
+ Get the rows for the last 3 days:
+
+ >>> ts.last('3D')
+ A
+ 2018-04-13 3
+ 2018-04-15 4
+
+ Notice that the data for the last 3 calendar days were returned, not
+ the last 3 observed days in the dataset, and therefore data for
+ 2018-04-11 was not returned.
+ """
+ if not isinstance(self.index, DatetimeIndex):
+ raise TypeError("'last' only supports a DatetimeIndex index")
+
+ if len(self.index) == 0:
+ return self
+
+ offset = to_offset(offset)
+
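+ # searchsorted with side='right' keeps only the rows whose timestamps
+ # are strictly greater than `self.index[-1] - offset`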
+ start_date = self.index[-1] - offset
+ start = self.index.searchsorted(start_date, side='right')
+ return self.iloc[start:]
+
+ def rank(self, axis=0, method='average', numeric_only=None,
+ na_option='keep', ascending=True, pct=False):
+ """
+ Compute numerical data ranks (1 through n) along axis. Equal values are
+ assigned a rank that is the average of the ranks of those values.
+
+ Parameters
+ ----------
+ axis : {0 or 'index', 1 or 'columns'}, default 0
+ Axis along which to compute the ranks.
+ method : {'average', 'min', 'max', 'first', 'dense'}
+ * average: average rank of group
+ * min: lowest rank in group
+ * max: highest rank in group
+ * first: ranks assigned in order they appear in the array
+ * dense: like 'min', but rank always increases by 1 between groups
+ numeric_only : boolean, default None
+ Include only float, int, boolean data. Valid only for DataFrame or
+ Panel objects
+ na_option : {'keep', 'top', 'bottom'}
+ * keep: leave NA values where they are
+ * top: smallest rank if ascending
+ * bottom: smallest rank if descending
+ ascending : boolean, default True
+ False for ranks by high (1) to low (N)
+ pct : boolean, default False
+ Computes percentage rank of data
+
+ Returns
+ -------
+ ranks : same type as caller
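+
+ Examples
+ --------
+ With the default ``method='average'``, tied values share the mean of
+ their positional ranks:
+
+ >>> s = pd.Series([3, 1, 4, 1, 5])
+ >>> s.rank()
+ 0 3.0
+ 1 1.5
+ 2 4.0
+ 3 1.5
+ 4 5.0
+ dtype: float64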
+ """
+ axis = self._get_axis_number(axis)
+
+ if self.ndim > 2:
+ msg = "rank does not make sense when ndim > 2"
+ raise NotImplementedError(msg)
+
+ if na_option not in {'keep', 'top', 'bottom'}:
+ msg = "na_option must be one of 'keep', 'top', or 'bottom'"
+ raise ValueError(msg)
+
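+ # ranker applies the libalgos rank kernel along the requested axis and
+ # rebuilds a result carrying the caller's axes and metadata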
+ def ranker(data):
+ ranks = algos.rank(data.values, axis=axis, method=method,
+ ascending=ascending, na_option=na_option,
+ pct=pct)
+ ranks = self._constructor(ranks, **data._construct_axes_dict())
+ return ranks.__finalize__(self)
+
+ # if numeric_only is None, and we can't get anything, we try with
+ # numeric_only=True
+ if numeric_only is None:
+ try:
+ return ranker(self)
+ except TypeError:
+ numeric_only = True
+
+ if numeric_only:
+ data = self._get_numeric_data()
+ else:
+ data = self
+
+ return ranker(data)
+
+ _shared_docs['align'] = ("""
+ Align two objects on their axes with the
+ specified join method for each axis Index.
+
+ Parameters
+ ----------
+ other : DataFrame or Series
+ join : {'outer', 'inner', 'left', 'right'}, default 'outer'
+ axis : allowed axis of the other object, default None
+ Align on index (0), columns (1), or both (None)
+ level : int or level name, default None
+ Broadcast across a level, matching Index values on the
+ passed MultiIndex level
+ copy : boolean, default True
+ Always returns new objects. If copy=False and no reindexing is
+ required then original objects are returned.
+ fill_value : scalar, default np.NaN
+ Value to use for missing values. Defaults to NaN, but can be any
+ "compatible" value
+ method : {'backfill', 'bfill', 'pad', 'ffill', None}, default None
+ Method to use for filling holes in reindexed Series
+ pad / ffill: propagate last valid observation forward to next valid
+ backfill / bfill: use NEXT valid observation to fill gap
+ limit : int, default None
+ If method is specified, this is the maximum number of consecutive
+ NaN values to forward/backward fill. In other words, if there is
+ a gap with more than this number of consecutive NaNs, it will only
+ be partially filled. If method is not specified, this is the
+ maximum number of entries along the entire axis where NaNs will be
+ filled. Must be greater than 0 if not None.
+ fill_axis : %(axes_single_arg)s, default 0
+ Filling axis, method and limit
+ broadcast_axis : %(axes_single_arg)s, default None
+ Broadcast values along this axis, if aligning two objects of
+ different dimensions
+
+ Returns
+ -------
+ (left, right) : (%(klass)s, type of other)
+ Aligned objects
+ """)
+
+ @Appender(_shared_docs['align'] % _shared_doc_kwargs)
+ def align(self, other, join='outer', axis=None, level=None, copy=True,
+ fill_value=None, method=None, limit=None, fill_axis=0,
+ broadcast_axis=None):
+ from pandas import DataFrame, Series
+ method = missing.clean_fill_method(method)
+
+ if broadcast_axis == 1 and self.ndim != other.ndim:
+ if isinstance(self, Series):
+ # this means other is a DataFrame, and we need to broadcast
+ # self
+ cons = self._constructor_expanddim
+ df = cons({c: self for c in other.columns},
+ **other._construct_axes_dict())
+ return df._align_frame(other, join=join, axis=axis,
+ level=level, copy=copy,
+ fill_value=fill_value, method=method,
+ limit=limit, fill_axis=fill_axis)
+ elif isinstance(other, Series):
+ # this means self is a DataFrame, and we need to broadcast
+ # other
+ cons = other._constructor_expanddim
+ df = cons({c: other for c in self.columns},
+ **self._construct_axes_dict())
+ return self._align_frame(df, join=join, axis=axis, level=level,
+ copy=copy, fill_value=fill_value,
+ method=method, limit=limit,
+ fill_axis=fill_axis)
+
+ if axis is not None:
+ axis = self._get_axis_number(axis)
+ if isinstance(other, DataFrame):
+ return self._align_frame(other, join=join, axis=axis, level=level,
+ copy=copy, fill_value=fill_value,
+ method=method, limit=limit,
+ fill_axis=fill_axis)
+ elif isinstance(other, Series):
+ return self._align_series(other, join=join, axis=axis, level=level,
+ copy=copy, fill_value=fill_value,
+ method=method, limit=limit,
+ fill_axis=fill_axis)
+ else: # pragma: no cover
+ raise TypeError('unsupported type: %s' % type(other))
+
+ def _align_frame(self, other, join='outer', axis=None, level=None,
+ copy=True, fill_value=None, method=None, limit=None,
+ fill_axis=0):
+ # defaults
+ join_index, join_columns = None, None
+ ilidx, iridx = None, None
+ clidx, cridx = None, None
+
+ is_series = isinstance(self, ABCSeries)
+
+ if axis is None or axis == 0:
+ if not self.index.equals(other.index):
+ join_index, ilidx, iridx = self.index.join(
+ other.index, how=join, level=level, return_indexers=True)
+
+ if axis is None or axis == 1:
+ if not is_series and not self.columns.equals(other.columns):
+ join_columns, clidx, cridx = self.columns.join(
+ other.columns, how=join, level=level, return_indexers=True)
+
+ if is_series:
+ reindexers = {0: [join_index, ilidx]}
+ else:
+ reindexers = {0: [join_index, ilidx], 1: [join_columns, clidx]}
+
+ left = self._reindex_with_indexers(reindexers, copy=copy,
+ fill_value=fill_value,
+ allow_dups=True)
+ # other must be always DataFrame
+ right = other._reindex_with_indexers({0: [join_index, iridx],
+ 1: [join_columns, cridx]},
+ copy=copy, fill_value=fill_value,
+ allow_dups=True)
+
+ if method is not None:
+ left = left.fillna(axis=fill_axis, method=method, limit=limit)
+ right = right.fillna(axis=fill_axis, method=method, limit=limit)
+
+ # if DatetimeIndex have different tz, convert to UTC
+ if is_datetime64tz_dtype(left.index):
+ if left.index.tz != right.index.tz:
+ if join_index is not None:
+ left.index = join_index
+ right.index = join_index
+
+ return left.__finalize__(self), right.__finalize__(other)
+
+ def _align_series(self, other, join='outer', axis=None, level=None,
+ copy=True, fill_value=None, method=None, limit=None,
+ fill_axis=0):
+
+ is_series = isinstance(self, ABCSeries)
+
+ # series/series compat, other must always be a Series
+ if is_series:
+ if axis:
+ raise ValueError('cannot align series to a series other than '
+ 'axis 0')
+
+ # equal
+ if self.index.equals(other.index):
+ join_index, lidx, ridx = None, None, None
+ else:
+ join_index, lidx, ridx = self.index.join(other.index, how=join,
+ level=level,
+ return_indexers=True)
+
+ left = self._reindex_indexer(join_index, lidx, copy)
+ right = other._reindex_indexer(join_index, ridx, copy)
+
+ else:
+ # one has > 1 ndim
+ fdata = self._data
+ if axis == 0:
+ join_index = self.index
+ lidx, ridx = None, None
+ if not self.index.equals(other.index):
+ join_index, lidx, ridx = self.index.join(
+ other.index, how=join, level=level,
+ return_indexers=True)
+
+ if lidx is not None:
+ fdata = fdata.reindex_indexer(join_index, lidx, axis=1)
+
+ elif axis == 1:
+ join_index = self.columns
+ lidx, ridx = None, None
+ if not self.columns.equals(other.index):
+ join_index, lidx, ridx = self.columns.join(
+ other.index, how=join, level=level,
+ return_indexers=True)
+
+ if lidx is not None:
+ fdata = fdata.reindex_indexer(join_index, lidx, axis=0)
+ else:
+ raise ValueError('Must specify axis=0 or 1')
+
+ if copy and fdata is self._data:
+ fdata = fdata.copy()
+
+ left = self._constructor(fdata)
+
+ if ridx is None:
+ right = other
+ else:
+ right = other.reindex(join_index, level=level)
+
+ # fill
+ fill_na = notna(fill_value) or (method is not None)
+ if fill_na:
+ left = left.fillna(fill_value, method=method, limit=limit,
+ axis=fill_axis)
+ right = right.fillna(fill_value, method=method, limit=limit)
+
+ # if DatetimeIndex have different tz, convert to UTC
+ if is_series or (not is_series and axis == 0):
+ if is_datetime64tz_dtype(left.index):
+ if left.index.tz != right.index.tz:
+ if join_index is not None:
+ left.index = join_index
+ right.index = join_index
+
+ return left.__finalize__(self), right.__finalize__(other)
+
+ def _where(self, cond, other=np.nan, inplace=False, axis=None, level=None,
+ errors='raise', try_cast=False):
+ """
+ Equivalent to public method `where`, except that `other` is not
+ applied as a function even if callable. Used in __setitem__.
+ """
+ inplace = validate_bool_kwarg(inplace, 'inplace')
+
+ # align the cond to same shape as myself
+ cond = com.apply_if_callable(cond, self)
+ if isinstance(cond, NDFrame):
+ cond, _ = cond.align(self, join='right', broadcast_axis=1)
+ else:
+ if not hasattr(cond, 'shape'):
+ cond = np.asanyarray(cond)
+ if cond.shape != self.shape:
+ raise ValueError('Array conditional must be same shape as '
+ 'self')
+ cond = self._constructor(cond, **self._construct_axes_dict())
+
+ # make sure we are boolean
+ fill_value = True if inplace else False
+ cond = cond.fillna(fill_value)
+
+ msg = "Boolean array expected for the condition, not {dtype}"
+
+ if not isinstance(cond, pd.DataFrame):
+ # This is a single-dimensional object.
+ if not is_bool_dtype(cond):
+ raise ValueError(msg.format(dtype=cond.dtype))
+ elif not cond.empty:
+ for dt in cond.dtypes:
+ if not is_bool_dtype(dt):
+ raise ValueError(msg.format(dtype=dt))
+
+ cond = -cond if inplace else cond
+
+ # try to align with other
+ try_quick = True
+ if hasattr(other, 'align'):
+
+ # align with me
+ if other.ndim <= self.ndim:
+
+ _, other = self.align(other, join='left', axis=axis,
+ level=level, fill_value=np.nan)
+
+ # if we are NOT aligned, raise as we cannot where index
+ if (axis is None and
+ not all(other._get_axis(i).equals(ax)
+ for i, ax in enumerate(self.axes))):
+ raise InvalidIndexError
+
+ # slice me out of the other
+ else:
+ raise NotImplementedError("cannot align with a higher "
+ "dimensional NDFrame")
+
+ if isinstance(other, np.ndarray):
+
+ if other.shape != self.shape:
+
+ if self.ndim == 1:
+
+ icond = cond.values
+
+ # GH 2745 / GH 4192
+ # treat like a scalar
+ if len(other) == 1:
+ other = np.array(other[0])
+
+ # GH 3235
+ # match True cond to other
+ elif len(cond[icond]) == len(other):
+
+ # try to not change dtype at first (if try_quick)
+ if try_quick:
+
+ try:
+ new_other = com.values_from_object(self)
+ new_other = new_other.copy()
+ new_other[icond] = other
+ other = new_other
+ except Exception:
+ try_quick = False
+
+ # let's create a new array (if we failed at the
+ # above, or try_quick was not set)
+ if not try_quick:
+
+ dtype, fill_value = maybe_promote(other.dtype)
+ new_other = np.empty(len(icond), dtype=dtype)
+ new_other.fill(fill_value)
+ maybe_upcast_putmask(new_other, icond, other)
+ other = new_other
+
+ else:
+ raise ValueError('Length of replacements must equal '
+ 'series length')
+
+ else:
+ raise ValueError('other must be the same shape as self '
+ 'when an ndarray')
+
+ # we are the same shape, so create an actual object for alignment
+ else:
+ other = self._constructor(other, **self._construct_axes_dict())
+
+ if axis is None:
+ axis = 0
+
+ if self.ndim == getattr(other, 'ndim', 0):
+ align = True
+ else:
+ align = (self._get_axis_number(axis) == 1)
+
+ block_axis = self._get_block_manager_axis(axis)
+
+ if inplace:
+ # we may have different type blocks come out of putmask, so
+ # reconstruct the block manager
+
+ self._check_inplace_setting(other)
+ new_data = self._data.putmask(mask=cond, new=other, align=align,
+ inplace=True, axis=block_axis,
+ transpose=self._AXIS_REVERSED)
+ self._update_inplace(new_data)
+
+ else:
+ new_data = self._data.where(other=other, cond=cond, align=align,
+ errors=errors,
+ try_cast=try_cast, axis=block_axis,
+ transpose=self._AXIS_REVERSED)
+
+ return self._constructor(new_data).__finalize__(self)
+
+ _shared_docs['where'] = ("""
+ Replace values where the condition is %(cond_rev)s.
+
+ Parameters
+ ----------
+ cond : boolean %(klass)s, array-like, or callable
+ Where `cond` is %(cond)s, keep the original value. Where
+ %(cond_rev)s, replace with corresponding value from `other`.
+ If `cond` is callable, it is computed on the %(klass)s and
+ should return boolean %(klass)s or array. The callable must
+ not change input %(klass)s (though pandas doesn't check it).
+
+ .. versionadded:: 0.18.1
+ A callable can be used as cond.
+
+ other : scalar, %(klass)s, or callable
+ Entries where `cond` is %(cond_rev)s are replaced with
+ corresponding value from `other`.
+ If other is callable, it is computed on the %(klass)s and
+ should return scalar or %(klass)s. The callable must not
+ change input %(klass)s (though pandas doesn't check it).
+
+ .. versionadded:: 0.18.1
+ A callable can be used as other.
+
+ inplace : boolean, default False
+ Whether to perform the operation in place on the data.
+ axis : int, default None
+ Alignment axis if needed.
+ level : int, default None
+ Alignment level if needed.
+ errors : str, {'raise', 'ignore'}, default `raise`
+ Note that currently this parameter won't affect
+ the results and will always coerce to a suitable dtype.
+
+ - `raise` : allow exceptions to be raised.
+ - `ignore` : suppress exceptions. On error return original object.
+
+ try_cast : boolean, default False
+ Try to cast the result back to the input type (if possible).
+ raise_on_error : boolean, default True
+ Whether to raise on invalid data types (e.g. trying to where on
+ strings).
+
+ .. deprecated:: 0.21.0
+
+ Use `errors`.
+
+ Returns
+ -------
+ wh : same type as caller
+
+ See Also
+ --------
+ :func:`DataFrame.%(name_other)s` : Return an object of same shape as
+ self.
+
+ Notes
+ -----
+ The %(name)s method is an application of the if-then idiom. For each
+ element in the calling DataFrame, if ``cond`` is ``%(cond)s`` the
+ element is used; otherwise the corresponding element from the DataFrame
+ ``other`` is used.
+
+ The signature for :func:`DataFrame.where` differs from
+ :func:`numpy.where`. Roughly ``df1.where(m, df2)`` is equivalent to
+ ``np.where(m, df1, df2)``.
+
+ For further details and examples see the ``%(name)s`` documentation in
+ :ref:`indexing <indexing.where_mask>`.
+
+ Examples
+ --------
+ >>> s = pd.Series(range(5))
+ >>> s.where(s > 0)
+ 0 NaN
+ 1 1.0
+ 2 2.0
+ 3 3.0
+ 4 4.0
+ dtype: float64
+
+ >>> s.mask(s > 0)
+ 0 0.0
+ 1 NaN
+ 2 NaN
+ 3 NaN
+ 4 NaN
+ dtype: float64
+
+ >>> s.where(s > 1, 10)
+ 0 10
+ 1 10
+ 2 2
+ 3 3
+ 4 4
+ dtype: int64
+
+ >>> df = pd.DataFrame(np.arange(10).reshape(-1, 2), columns=['A', 'B'])
+ >>> m = df %% 3 == 0
+ >>> df.where(m, -df)
+ A B
+ 0 0 -1
+ 1 -2 3
+ 2 -4 -5
+ 3 6 -7
+ 4 -8 9
+ >>> df.where(m, -df) == np.where(m, df, -df)
+ A B
+ 0 True True
+ 1 True True
+ 2 True True
+ 3 True True
+ 4 True True
+ >>> df.where(m, -df) == df.mask(~m, -df)
+ A B
+ 0 True True
+ 1 True True
+ 2 True True
+ 3 True True
+ 4 True True
+ """)
+
+ @Appender(_shared_docs['where'] % dict(_shared_doc_kwargs, cond="True",
+ cond_rev="False", name='where',
+ name_other='mask'))
+ def where(self, cond, other=np.nan, inplace=False, axis=None, level=None,
+ errors='raise', try_cast=False, raise_on_error=None):
+
+ if raise_on_error is not None:
+ warnings.warn(
+ "raise_on_error is deprecated in "
+ "favor of errors='raise|ignore'",
+ FutureWarning, stacklevel=2)
+
+ if raise_on_error:
+ errors = 'raise'
+ else:
+ errors = 'ignore'
+
+ other = com.apply_if_callable(other, self)
+ return self._where(cond, other, inplace, axis, level,
+ errors=errors, try_cast=try_cast)
+
+ @Appender(_shared_docs['where'] % dict(_shared_doc_kwargs, cond="False",
+ cond_rev="True", name='mask',
+ name_other='where'))
+ def mask(self, cond, other=np.nan, inplace=False, axis=None, level=None,
+ errors='raise', try_cast=False, raise_on_error=None):
+
+ if raise_on_error is not None:
+ warnings.warn(
+ "raise_on_error is deprecated in "
+ "favor of errors='raise|ignore'",
+ FutureWarning, stacklevel=2)
+
+ if raise_on_error:
+ errors = 'raise'
+ else:
+ errors = 'ignore'
+
+ inplace = validate_bool_kwarg(inplace, 'inplace')
+ cond = com.apply_if_callable(cond, self)
+
+ # see gh-21891
+ if not hasattr(cond, "__invert__"):
+ cond = np.array(cond)
+
+ return self.where(~cond, other=other, inplace=inplace, axis=axis,
+ level=level, try_cast=try_cast,
+ errors=errors)
+
+ _shared_docs['shift'] = ("""
+ Shift index by desired number of periods with an optional time `freq`.
+
+ When `freq` is not passed, shift the index without realigning the data.
+ If `freq` is passed (in this case, the index must be date or datetime,
+ or it will raise a `NotImplementedError`), the index will be
+ increased using the periods and the `freq`.
+
+ Parameters
+ ----------
+ periods : int
+ Number of periods to shift. Can be positive or negative.
+ freq : DateOffset, tseries.offsets, timedelta, or str, optional
+ Offset to use from the tseries module or time rule (e.g. 'EOM').
+ If `freq` is specified then the index values are shifted but the
+ data is not realigned. That is, use `freq` if you would like to
+ extend the index when shifting and preserve the original data.
+ axis : {0 or 'index', 1 or 'columns', None}, default None
+ Shift direction.
+ fill_value : object, optional
+ The scalar value to use for newly introduced missing values.
+ The default depends on the dtype of `self`.
+ For numeric data, ``np.nan`` is used.
+ For datetime, timedelta, or period data, etc. :attr:`NaT` is used.
+ For extension dtypes, ``self.dtype.na_value`` is used.
+
+ .. versionchanged:: 0.24.0
+
+ Returns
+ -------
+ %(klass)s
+ Copy of input object, shifted.
+
+ See Also
+ --------
+ Index.shift : Shift values of Index.
+ DatetimeIndex.shift : Shift values of DatetimeIndex.
+ PeriodIndex.shift : Shift values of PeriodIndex.
+ tshift : Shift the time index, using the index's frequency if
+ available.
+
+ Examples
+ --------
+ >>> df = pd.DataFrame({'Col1': [10, 20, 15, 30, 45],
+ ... 'Col2': [13, 23, 18, 33, 48],
+ ... 'Col3': [17, 27, 22, 37, 52]})
+
+ >>> df.shift(periods=3)
+ Col1 Col2 Col3
+ 0 NaN NaN NaN
+ 1 NaN NaN NaN
+ 2 NaN NaN NaN
+ 3 10.0 13.0 17.0
+ 4 20.0 23.0 27.0
+
+ >>> df.shift(periods=1, axis='columns')
+ Col1 Col2 Col3
+ 0 NaN 10.0 13.0
+ 1 NaN 20.0 23.0
+ 2 NaN 15.0 18.0
+ 3 NaN 30.0 33.0
+ 4 NaN 45.0 48.0
+
+ >>> df.shift(periods=3, fill_value=0)
+ Col1 Col2 Col3
+ 0 0 0 0
+ 1 0 0 0
+ 2 0 0 0
+ 3 10 13 17
+ 4 20 23 27
+ """)
+
+ @Appender(_shared_docs['shift'] % _shared_doc_kwargs)
+ def shift(self, periods=1, freq=None, axis=0, fill_value=None):
+ if periods == 0:
+ return self.copy()
+
+ block_axis = self._get_block_manager_axis(axis)
+ if freq is None:
+ new_data = self._data.shift(periods=periods, axis=block_axis,
+ fill_value=fill_value)
+ else:
+ return self.tshift(periods, freq)
+
+ return self._constructor(new_data).__finalize__(self)
+
+ def slice_shift(self, periods=1, axis=0):
+ """
+ Equivalent to `shift` without copying data. The shifted data will
+ not include the dropped periods and the shifted axis will be smaller
+ than the original.
+
+ Parameters
+ ----------
+ periods : int
+ Number of periods to move, can be positive or negative
+
+ Returns
+ -------
+ shifted : same type as caller
+
+ Notes
+ -----
+ While the `slice_shift` is faster than `shift`, you may pay for it
+ later during alignment.
+ """
+ if periods == 0:
+ return self
+
+ if periods > 0:
+ vslicer = slice(None, -periods)
+ islicer = slice(periods, None)
+ else:
+ vslicer = slice(-periods, None)
+ islicer = slice(None, periods)
+
+ new_obj = self._slice(vslicer, axis=axis)
+ shifted_axis = self._get_axis(axis)[islicer]
+ new_obj.set_axis(shifted_axis, axis=axis, inplace=True)
+
+ return new_obj.__finalize__(self)
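+ # Editor's note: hedged worked example for slice_shift (toy data):
+ #   >>> s = pd.Series([1, 2, 3, 4, 5])
+ #   >>> s.slice_shift(2)
+ #   2    1
+ #   3    2
+ #   4    3
+ #   dtype: int64
+ # The two dropped periods shrink the result to length 3 rather than
+ # introducing NaN as `shift` would.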
+
+ def tshift(self, periods=1, freq=None, axis=0):
+ """
+ Shift the time index, using the index's frequency if available.
+
+ Parameters
+ ----------
+ periods : int
+ Number of periods to move, can be positive or negative
+ freq : DateOffset, timedelta, or time rule string, default None
+ Increment to use from the tseries module or time rule (e.g. 'EOM')
+ axis : int or basestring
+ Corresponds to the axis that contains the Index
+
+ Returns
+ -------
+ shifted : NDFrame
+
+ Notes
+ -----
+ If `freq` is not specified then this method tries to use the `freq` or
+ `inferred_freq` attributes of the index. If neither of those attributes
+ exists, a ValueError is raised.
+ """
+
+ index = self._get_axis(axis)
+ if freq is None:
+ freq = getattr(index, 'freq', None)
+
+ if freq is None:
+ freq = getattr(index, 'inferred_freq', None)
+
+ if freq is None:
+ msg = 'Freq was not given and was not set in the index'
+ raise ValueError(msg)
+
+ if periods == 0:
+ return self
+
+ if isinstance(freq, string_types):
+ freq = to_offset(freq)
+
+ block_axis = self._get_block_manager_axis(axis)
+ if isinstance(index, PeriodIndex):
+ orig_freq = to_offset(index.freq)
+ if freq == orig_freq:
+ new_data = self._data.copy()
+ new_data.axes[block_axis] = index.shift(periods)
+ else:
+ msg = ('Given freq %s does not match PeriodIndex freq %s' %
+ (freq.rule_code, orig_freq.rule_code))
+ raise ValueError(msg)
+ else:
+ new_data = self._data.copy()
+ new_data.axes[block_axis] = index.shift(periods, freq)
+
+ return self._constructor(new_data).__finalize__(self)
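+ # Editor's note: hedged example for tshift, assuming a daily DatetimeIndex
+ # (toy data; not part of the library source):
+ #   >>> idx = pd.date_range('2019-01-01', periods=3, freq='D')
+ #   >>> s = pd.Series([1, 2, 3], index=idx)
+ #   >>> s.tshift(2)
+ #   2019-01-03    1
+ #   2019-01-04    2
+ #   2019-01-05    3
+ #   Freq: D, dtype: int64
+ # Only the index moves (by periods * freq); the data is left untouched.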
+
+ def truncate(self, before=None, after=None, axis=None, copy=True):
+ """
+ Truncate a Series or DataFrame before and after some index value.
+
+ This is a useful shorthand for boolean indexing based on index
+ values above or below certain thresholds.
+
+ Parameters
+ ----------
+ before : date, string, int
+ Truncate all rows before this index value.
+ after : date, string, int
+ Truncate all rows after this index value.
+ axis : {0 or 'index', 1 or 'columns'}, optional
+ Axis to truncate. Truncates the index (rows) by default.
+ copy : boolean, default True
+ Return a copy of the truncated section.
+
+ Returns
+ -------
+ type of caller
+ The truncated Series or DataFrame.
+
+ See Also
+ --------
+ DataFrame.loc : Select a subset of a DataFrame by label.
+ DataFrame.iloc : Select a subset of a DataFrame by position.
+
+ Notes
+ -----
+ If the index being truncated contains only datetime values,
+ `before` and `after` may be specified as strings instead of
+ Timestamps.
+
+ Examples
+ --------
+ >>> df = pd.DataFrame({'A': ['a', 'b', 'c', 'd', 'e'],
+ ... 'B': ['f', 'g', 'h', 'i', 'j'],
+ ... 'C': ['k', 'l', 'm', 'n', 'o']},
+ ... index=[1, 2, 3, 4, 5])
+ >>> df
+ A B C
+ 1 a f k
+ 2 b g l
+ 3 c h m
+ 4 d i n
+ 5 e j o
+
+ >>> df.truncate(before=2, after=4)
+ A B C
+ 2 b g l
+ 3 c h m
+ 4 d i n
+
+ The columns of a DataFrame can be truncated.
+
+ >>> df.truncate(before="A", after="B", axis="columns")
+ A B
+ 1 a f
+ 2 b g
+ 3 c h
+ 4 d i
+ 5 e j
+
+ For Series, only rows can be truncated.
+
+ >>> df['A'].truncate(before=2, after=4)
+ 2 b
+ 3 c
+ 4 d
+ Name: A, dtype: object
+
+ The index values in ``truncate`` can be datetimes or string
+ dates.
+
+ >>> dates = pd.date_range('2016-01-01', '2016-02-01', freq='s')
+ >>> df = pd.DataFrame(index=dates, data={'A': 1})
+ >>> df.tail()
+ A
+ 2016-01-31 23:59:56 1
+ 2016-01-31 23:59:57 1
+ 2016-01-31 23:59:58 1
+ 2016-01-31 23:59:59 1
+ 2016-02-01 00:00:00 1
+
+ >>> df.truncate(before=pd.Timestamp('2016-01-05'),
+ ... after=pd.Timestamp('2016-01-10')).tail()
+ A
+ 2016-01-09 23:59:56 1
+ 2016-01-09 23:59:57 1
+ 2016-01-09 23:59:58 1
+ 2016-01-09 23:59:59 1
+ 2016-01-10 00:00:00 1
+
+ Because the index is a DatetimeIndex containing only dates, we can
+ specify `before` and `after` as strings. They will be coerced to
+ Timestamps before truncation.
+
+ >>> df.truncate('2016-01-05', '2016-01-10').tail()
+ A
+ 2016-01-09 23:59:56 1
+ 2016-01-09 23:59:57 1
+ 2016-01-09 23:59:58 1
+ 2016-01-09 23:59:59 1
+ 2016-01-10 00:00:00 1
+
+ Note that ``truncate`` assumes a 0 value for any unspecified time
+ component (midnight). This differs from partial string slicing, which
+ returns any partially matching dates.
+
+ >>> df.loc['2016-01-05':'2016-01-10', :].tail()
+ A
+ 2016-01-10 23:59:55 1
+ 2016-01-10 23:59:56 1
+ 2016-01-10 23:59:57 1
+ 2016-01-10 23:59:58 1
+ 2016-01-10 23:59:59 1
+ """
+
+ if axis is None:
+ axis = self._stat_axis_number
+ axis = self._get_axis_number(axis)
+ ax = self._get_axis(axis)
+
+ # GH 17935
+ # Check that index is sorted
+ if not ax.is_monotonic_increasing and not ax.is_monotonic_decreasing:
+ raise ValueError("truncate requires a sorted index")
+
+ # if we have a date index, convert to dates, otherwise
+ # treat like a slice
+ if ax.is_all_dates:
+ from pandas.core.tools.datetimes import to_datetime
+ before = to_datetime(before)
+ after = to_datetime(after)
+
+ if before is not None and after is not None:
+ if before > after:
+ raise ValueError('Truncate: %s must be after %s' %
+ (after, before))
+
+ slicer = [slice(None, None)] * self._AXIS_LEN
+ slicer[axis] = slice(before, after)
+ result = self.loc[tuple(slicer)]
+
+ if isinstance(ax, MultiIndex):
+ setattr(result, self._get_axis_name(axis),
+ ax.truncate(before, after))
+
+ if copy:
+ result = result.copy()
+
+ return result
+
+ def tz_convert(self, tz, axis=0, level=None, copy=True):
+ """
+ Convert tz-aware axis to target time zone.
+
+ Parameters
+ ----------
+ tz : string or pytz.timezone object
+ axis : the axis to convert
+ level : int, str, default None
+ If axis is a MultiIndex, convert a specific level. Otherwise
+ must be None.
+ copy : boolean, default True
+ Also make a copy of the underlying data
+
+ Returns
+ -------
+ Series or DataFrame
+ Same type as the caller, with the axis converted to the target
+ time zone.
+
+ Raises
+ ------
+ TypeError
+ If the axis is tz-naive.
+ """
+ axis = self._get_axis_number(axis)
+ ax = self._get_axis(axis)
+
+ def _tz_convert(ax, tz):
+ if not hasattr(ax, 'tz_convert'):
+ if len(ax) > 0:
+ ax_name = self._get_axis_name(axis)
+ raise TypeError('%s is not a valid DatetimeIndex or '
+ 'PeriodIndex' % ax_name)
+ else:
+ ax = DatetimeIndex([], tz=tz)
+ else:
+ ax = ax.tz_convert(tz)
+ return ax
+
+ # if a level is given it must be a MultiIndex level or
+ # equivalent to the axis name
+ if isinstance(ax, MultiIndex):
+ level = ax._get_level_number(level)
+ new_level = _tz_convert(ax.levels[level], tz)
+ ax = ax.set_levels(new_level, level=level)
+ else:
+ if level not in (None, 0, ax.name):
+ raise ValueError("The level {0} is not valid".format(level))
+ ax = _tz_convert(ax, tz)
+
+ result = self._constructor(self._data, copy=copy)
+ result = result.set_axis(ax, axis=axis, inplace=False)
+ return result.__finalize__(self)
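+ # Editor's note: hedged example for tz_convert, assuming a tz-aware index
+ # (toy data; not part of the library source):
+ #   >>> idx = pd.date_range('2019-01-01 12:00', periods=2, freq='H',
+ #   ...                     tz='UTC')
+ #   >>> s = pd.Series([1, 2], index=idx)
+ #   >>> s.tz_convert('US/Eastern')
+ # The timestamps become 07:00 and 08:00 US/Eastern (UTC-5 in January);
+ # calling this on a tz-naive axis raises TypeError instead.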
+
+ def tz_localize(self, tz, axis=0, level=None, copy=True,
+ ambiguous='raise', nonexistent='raise'):
+ """
+ Localize tz-naive index of a Series or DataFrame to target time zone.
+
+ This operation localizes the Index. To localize the values in a
+ timezone-naive Series, use :meth:`Series.dt.tz_localize`.
+
+ Parameters
+ ----------
+ tz : string or pytz.timezone object
+ axis : the axis to localize
+ level : int, str, default None
+ If axis is a MultiIndex, localize a specific level. Otherwise
+ must be None.
+ copy : boolean, default True
+ Also make a copy of the underlying data
+ ambiguous : 'infer', bool-ndarray, 'NaT', default 'raise'
+ When clocks moved backward due to DST, ambiguous times may arise.
+ For example in Central European Time (UTC+01), when going from
+ 03:00 DST to 02:00 non-DST, 02:30:00 local time occurs both at
+ 00:30:00 UTC and at 01:30:00 UTC. In such a situation, the
+ `ambiguous` parameter dictates how ambiguous times should be
+ handled.
+
+ - 'infer' will attempt to infer fall dst-transition hours based on
+ order
+ - bool-ndarray where True signifies a DST time, False designates
+ a non-DST time (note that this flag is only applicable for
+ ambiguous times)
+ - 'NaT' will return NaT where there are ambiguous times
+ - 'raise' will raise an AmbiguousTimeError if there are ambiguous
+ times
+ nonexistent : str, default 'raise'
+ A nonexistent time does not exist in a particular timezone
+ where clocks moved forward due to DST. Valid values are:
+
+ - 'shift_forward' will shift the nonexistent time forward to the
+ closest existing time
+ - 'shift_backward' will shift the nonexistent time backward to the
+ closest existing time
+ - 'NaT' will return NaT where there are nonexistent times
+ - timedelta objects will shift nonexistent times by the timedelta
+ - 'raise' will raise a NonExistentTimeError if there are
+ nonexistent times
+
+ .. versionadded:: 0.24.0
+
+ Returns
+ -------
+ Series or DataFrame
+ Same type as the input.
+
+ Raises
+ ------
+ TypeError
+ If the TimeSeries is tz-aware and tz is not None.
+
+ Examples
+ --------
+
+ Localize local times:
+
+ >>> s = pd.Series([1],
+ ... index=pd.DatetimeIndex(['2018-09-15 01:30:00']))
+ >>> s.tz_localize('CET')
+ 2018-09-15 01:30:00+02:00 1
+ dtype: int64
+
+ Be careful with DST changes. When there is sequential data, pandas
+ can infer the DST time:
+
+ >>> s = pd.Series(range(7), index=pd.DatetimeIndex([
+ ... '2018-10-28 01:30:00',
+ ... '2018-10-28 02:00:00',
+ ... '2018-10-28 02:30:00',
+ ... '2018-10-28 02:00:00',
+ ... '2018-10-28 02:30:00',
+ ... '2018-10-28 03:00:00',
+ ... '2018-10-28 03:30:00']))
+ >>> s.tz_localize('CET', ambiguous='infer')
+ 2018-10-28 01:30:00+02:00 0
+ 2018-10-28 02:00:00+02:00 1
+ 2018-10-28 02:30:00+02:00 2
+ 2018-10-28 02:00:00+01:00 3
+ 2018-10-28 02:30:00+01:00 4
+ 2018-10-28 03:00:00+01:00 5
+ 2018-10-28 03:30:00+01:00 6
+ dtype: int64
+
+ In some cases, inferring the DST is impossible. In such cases, you can
+ pass an ndarray to the ambiguous parameter to set the DST explicitly
+
+ >>> s = pd.Series(range(3), index=pd.DatetimeIndex([
+ ... '2018-10-28 01:20:00',
+ ... '2018-10-28 02:36:00',
+ ... '2018-10-28 03:46:00']))
+ >>> s.tz_localize('CET', ambiguous=np.array([True, True, False]))
+ 2018-10-28 01:20:00+02:00 0
+ 2018-10-28 02:36:00+02:00 1
+ 2018-10-28 03:46:00+01:00 2
+ dtype: int64
+
+ If the DST transition causes nonexistent times, you can shift these
+ dates forward or backward with a timedelta object or `'shift_forward'`
+ or `'shift_backward'`.
+
+ >>> s = pd.Series(range(2), index=pd.DatetimeIndex([
+ ... '2015-03-29 02:30:00',
+ ... '2015-03-29 03:30:00']))
+ >>> s.tz_localize('Europe/Warsaw', nonexistent='shift_forward')
+ 2015-03-29 03:00:00+02:00 0
+ 2015-03-29 03:30:00+02:00 1
+ dtype: int64
+ >>> s.tz_localize('Europe/Warsaw', nonexistent='shift_backward')
+ 2015-03-29 01:59:59.999999999+01:00 0
+ 2015-03-29 03:30:00+02:00 1
+ dtype: int64
+ >>> s.tz_localize('Europe/Warsaw', nonexistent=pd.Timedelta('1H'))
+ 2015-03-29 03:30:00+02:00 0
+ 2015-03-29 03:30:00+02:00 1
+ dtype: int64
+ """
+ nonexistent_options = ('raise', 'NaT', 'shift_forward',
+ 'shift_backward')
+ if nonexistent not in nonexistent_options and not isinstance(
+ nonexistent, timedelta):
+ raise ValueError("The nonexistent argument must be one of 'raise',"
+ " 'NaT', 'shift_forward', 'shift_backward' or"
+ " a timedelta object")
+
+ axis = self._get_axis_number(axis)
+ ax = self._get_axis(axis)
+
+ def _tz_localize(ax, tz, ambiguous, nonexistent):
+ if not hasattr(ax, 'tz_localize'):
+ if len(ax) > 0:
+ ax_name = self._get_axis_name(axis)
+ raise TypeError('%s is not a valid DatetimeIndex or '
+ 'PeriodIndex' % ax_name)
+ else:
+ ax = DatetimeIndex([], tz=tz)
+ else:
+ ax = ax.tz_localize(
+ tz, ambiguous=ambiguous, nonexistent=nonexistent
+ )
+ return ax
+
+ # if a level is given it must be a MultiIndex level or
+ # equivalent to the axis name
+ if isinstance(ax, MultiIndex):
+ level = ax._get_level_number(level)
+ new_level = _tz_localize(
+ ax.levels[level], tz, ambiguous, nonexistent
+ )
+ ax = ax.set_levels(new_level, level=level)
+ else:
+ if level not in (None, 0, ax.name):
+ raise ValueError("The level {0} is not valid".format(level))
+ ax = _tz_localize(ax, tz, ambiguous, nonexistent)
+
+ result = self._constructor(self._data, copy=copy)
+ result = result.set_axis(ax, axis=axis, inplace=False)
+ return result.__finalize__(self)
+
+ # ----------------------------------------------------------------------
+ # Numeric Methods
+ def abs(self):
+ """
+ Return a Series/DataFrame with absolute numeric value of each element.
+
+ This function only applies to elements that are all numeric.
+
+ Returns
+ -------
+ abs
+ Series/DataFrame containing the absolute value of each element.
+
+ See Also
+ --------
+ numpy.absolute : Calculate the absolute value element-wise.
+
+ Notes
+ -----
+ For ``complex`` inputs, ``1.2 + 1j``, the absolute value is
+ :math:`\\sqrt{ a^2 + b^2 }`.
+
+ Examples
+ --------
+ Absolute numeric values in a Series.
+
+ >>> s = pd.Series([-1.10, 2, -3.33, 4])
+ >>> s.abs()
+ 0 1.10
+ 1 2.00
+ 2 3.33
+ 3 4.00
+ dtype: float64
+
+ Absolute numeric values in a Series with complex numbers.
+
+ >>> s = pd.Series([1.2 + 1j])
+ >>> s.abs()
+ 0 1.56205
+ dtype: float64
+
+ Absolute numeric values in a Series with a Timedelta element.
+
+ >>> s = pd.Series([pd.Timedelta('1 days')])
+ >>> s.abs()
+ 0 1 days
+ dtype: timedelta64[ns]
+
+ Select rows with data closest to certain value using argsort (from
+ `StackOverflow <https://stackoverflow.com/a/17758115>`__).
+
+ >>> df = pd.DataFrame({
+ ... 'a': [4, 5, 6, 7],
+ ... 'b': [10, 20, 30, 40],
+ ... 'c': [100, 50, -30, -50]
+ ... })
+ >>> df
+ a b c
+ 0 4 10 100
+ 1 5 20 50
+ 2 6 30 -30
+ 3 7 40 -50
+ >>> df.loc[(df.c - 43).abs().argsort()]
+ a b c
+ 1 5 20 50
+ 0 4 10 100
+ 2 6 30 -30
+ 3 7 40 -50
+ """
+ return np.abs(self)
+
+ def describe(self, percentiles=None, include=None, exclude=None):
+ """
+ Generate descriptive statistics that summarize the central tendency,
+ dispersion and shape of a dataset's distribution, excluding
+ ``NaN`` values.
+
+ Analyzes both numeric and object series, as well
+ as ``DataFrame`` column sets of mixed data types. The output
+ will vary depending on what is provided. Refer to the notes
+ below for more detail.
+
+ Parameters
+ ----------
+ percentiles : list-like of numbers, optional
+ The percentiles to include in the output. All should
+ fall between 0 and 1. The default is
+ ``[.25, .5, .75]``, which returns the 25th, 50th, and
+ 75th percentiles.
+ include : 'all', list-like of dtypes or None (default), optional
+ A white list of data types to include in the result. Ignored
+ for ``Series``. Here are the options:
+
+ - 'all' : All columns of the input will be included in the output.
+ - A list-like of dtypes : Limits the results to the
+ provided data types.
+ To limit the result to numeric types submit
+ ``numpy.number``. To limit it instead to object columns submit
+ the ``numpy.object`` data type. Strings
+ can also be used in the style of
+ ``select_dtypes`` (e.g. ``df.describe(include=['O'])``). To
+ select pandas categorical columns, use ``'category'``
+ - None (default) : The result will include all numeric columns.
+ exclude : list-like of dtypes or None (default), optional,
+ A black list of data types to omit from the result. Ignored
+ for ``Series``. Here are the options:
+
+ - A list-like of dtypes : Excludes the provided data types
+ from the result. To exclude numeric types submit
+ ``numpy.number``. To exclude object columns submit the data
+ type ``numpy.object``. Strings can also be used in the style of
+ ``select_dtypes`` (e.g. ``df.describe(include=['O'])``). To
+ exclude pandas categorical columns, use ``'category'``
+ - None (default) : The result will exclude nothing.
+
+ Returns
+ -------
+ Series or DataFrame
+ Summary statistics of the Series or Dataframe provided.
+
+ See Also
+ --------
+ DataFrame.count: Count number of non-NA/null observations.
+ DataFrame.max: Maximum of the values in the object.
+ DataFrame.min: Minimum of the values in the object.
+ DataFrame.mean: Mean of the values.
+ DataFrame.std: Standard deviation of the observations.
+ DataFrame.select_dtypes: Subset of a DataFrame including/excluding
+ columns based on their dtype.
+
+ Notes
+ -----
+ For numeric data, the result's index will include ``count``,
+ ``mean``, ``std``, ``min``, ``max`` as well as lower, ``50`` and
+ upper percentiles. By default the lower percentile is ``25`` and the
+ upper percentile is ``75``. The ``50`` percentile is the
+ same as the median.
+
+ For object data (e.g. strings or timestamps), the result's index
+ will include ``count``, ``unique``, ``top``, and ``freq``. The ``top``
+ is the most common value. The ``freq`` is the most common value's
+ frequency. Timestamps also include the ``first`` and ``last`` items.
+
+ If multiple object values have the highest count, then the
+ ``count`` and ``top`` results will be arbitrarily chosen from
+ among those with the highest count.
+
+ For mixed data types provided via a ``DataFrame``, the default is to
+ return only an analysis of numeric columns. If the dataframe consists
+ only of object and categorical data without any numeric columns, the
+ default is to return an analysis of both the object and categorical
+ columns. If ``include='all'`` is provided as an option, the result
+ will include a union of attributes of each type.
+
+ The `include` and `exclude` parameters can be used to limit
+ which columns in a ``DataFrame`` are analyzed for the output.
+ The parameters are ignored when analyzing a ``Series``.
+
+ Examples
+ --------
+ Describing a numeric ``Series``.
+
+ >>> s = pd.Series([1, 2, 3])
+ >>> s.describe()
+ count 3.0
+ mean 2.0
+ std 1.0
+ min 1.0
+ 25% 1.5
+ 50% 2.0
+ 75% 2.5
+ max 3.0
+ dtype: float64
+
+ Describing a categorical ``Series``.
+
+ >>> s = pd.Series(['a', 'a', 'b', 'c'])
+ >>> s.describe()
+ count 4
+ unique 3
+ top a
+ freq 2
+ dtype: object
+
+ Describing a timestamp ``Series``.
+
+ >>> s = pd.Series([
+ ... np.datetime64("2000-01-01"),
+ ... np.datetime64("2010-01-01"),
+ ... np.datetime64("2010-01-01")
+ ... ])
+ >>> s.describe()
+ count 3
+ unique 2
+ top 2010-01-01 00:00:00
+ freq 2
+ first 2000-01-01 00:00:00
+ last 2010-01-01 00:00:00
+ dtype: object
+
+ Describing a ``DataFrame``. By default only numeric fields
+ are returned.
+
+ >>> df = pd.DataFrame({'categorical': pd.Categorical(['d','e','f']),
+ ... 'numeric': [1, 2, 3],
+ ... 'object': ['a', 'b', 'c']
+ ... })
+ >>> df.describe()
+ numeric
+ count 3.0
+ mean 2.0
+ std 1.0
+ min 1.0
+ 25% 1.5
+ 50% 2.0
+ 75% 2.5
+ max 3.0
+
+ Describing all columns of a ``DataFrame`` regardless of data type.
+
+ >>> df.describe(include='all')
+ categorical numeric object
+ count 3 3.0 3
+ unique 3 NaN 3
+ top f NaN c
+ freq 1 NaN 1
+ mean NaN 2.0 NaN
+ std NaN 1.0 NaN
+ min NaN 1.0 NaN
+ 25% NaN 1.5 NaN
+ 50% NaN 2.0 NaN
+ 75% NaN 2.5 NaN
+ max NaN 3.0 NaN
+
+ Describing a column from a ``DataFrame`` by accessing it as
+ an attribute.
+
+ >>> df.numeric.describe()
+ count 3.0
+ mean 2.0
+ std 1.0
+ min 1.0
+ 25% 1.5
+ 50% 2.0
+ 75% 2.5
+ max 3.0
+ Name: numeric, dtype: float64
+
+ Including only numeric columns in a ``DataFrame`` description.
+
+ >>> df.describe(include=[np.number])
+ numeric
+ count 3.0
+ mean 2.0
+ std 1.0
+ min 1.0
+ 25% 1.5
+ 50% 2.0
+ 75% 2.5
+ max 3.0
+
+ Including only string columns in a ``DataFrame`` description.
+
+ >>> df.describe(include=[np.object])
+ object
+ count 3
+ unique 3
+ top c
+ freq 1
+
+ Including only categorical columns from a ``DataFrame`` description.
+
+ >>> df.describe(include=['category'])
+ categorical
+ count 3
+ unique 3
+ top f
+ freq 1
+
+ Excluding numeric columns from a ``DataFrame`` description.
+
+ >>> df.describe(exclude=[np.number])
+ categorical object
+ count 3 3
+ unique 3 3
+ top f c
+ freq 1 1
+
+ Excluding object columns from a ``DataFrame`` description.
+
+ >>> df.describe(exclude=[np.object])
+ categorical numeric
+ count 3 3.0
+ unique 3 NaN
+ top f NaN
+ freq 1 NaN
+ mean NaN 2.0
+ std NaN 1.0
+ min NaN 1.0
+ 25% NaN 1.5
+ 50% NaN 2.0
+ 75% NaN 2.5
+ max NaN 3.0
+ """
+ if self.ndim >= 3:
+ msg = "describe is not implemented on Panel objects."
+ raise NotImplementedError(msg)
+ elif self.ndim == 2 and self.columns.size == 0:
+ raise ValueError("Cannot describe a DataFrame without columns")
+
+ if percentiles is not None:
+ # explicit conversion of `percentiles` to list
+ percentiles = list(percentiles)
+
+ # get them all to be in [0, 1]
+ self._check_percentile(percentiles)
+
+ # median should always be included
+ if 0.5 not in percentiles:
+ percentiles.append(0.5)
+ percentiles = np.asarray(percentiles)
+ else:
+ percentiles = np.array([0.25, 0.5, 0.75])
+
+ # sort and check for duplicates
+ unique_pcts = np.unique(percentiles)
+ if len(unique_pcts) < len(percentiles):
+ raise ValueError("percentiles cannot contain duplicates")
+ percentiles = unique_pcts
+
+ formatted_percentiles = format_percentiles(percentiles)
+
+ def describe_numeric_1d(series):
+ stat_index = (['count', 'mean', 'std', 'min'] +
+ formatted_percentiles + ['max'])
+ d = ([series.count(), series.mean(), series.std(), series.min()] +
+ series.quantile(percentiles).tolist() + [series.max()])
+ return pd.Series(d, index=stat_index, name=series.name)
+
+ def describe_categorical_1d(data):
+ names = ['count', 'unique']
+ objcounts = data.value_counts()
+ count_unique = len(objcounts[objcounts != 0])
+ result = [data.count(), count_unique]
+ if result[1] > 0:
+ top, freq = objcounts.index[0], objcounts.iloc[0]
+
+ if is_datetime64_any_dtype(data):
+ tz = data.dt.tz
+ asint = data.dropna().values.view('i8')
+ top = Timestamp(top)
+ if top.tzinfo is not None and tz is not None:
+ # Don't tz_localize(None) if key is already tz-aware
+ top = top.tz_convert(tz)
+ else:
+ top = top.tz_localize(tz)
+ names += ['top', 'freq', 'first', 'last']
+ result += [top, freq,
+ Timestamp(asint.min(), tz=tz),
+ Timestamp(asint.max(), tz=tz)]
+ else:
+ names += ['top', 'freq']
+ result += [top, freq]
+
+ return pd.Series(result, index=names, name=data.name)
+
+ def describe_1d(data):
+ if is_bool_dtype(data):
+ return describe_categorical_1d(data)
+ elif is_numeric_dtype(data):
+ return describe_numeric_1d(data)
+ elif is_timedelta64_dtype(data):
+ return describe_numeric_1d(data)
+ else:
+ return describe_categorical_1d(data)
+
+ if self.ndim == 1:
+ return describe_1d(self)
+ elif (include is None) and (exclude is None):
+ # when some numerics are found, keep only numerics
+ data = self.select_dtypes(include=[np.number])
+ if len(data.columns) == 0:
+ data = self
+ elif include == 'all':
+ if exclude is not None:
+ msg = "exclude must be None when include is 'all'"
+ raise ValueError(msg)
+ data = self
+ else:
+ data = self.select_dtypes(include=include, exclude=exclude)
+
+ ldesc = [describe_1d(s) for _, s in data.iteritems()]
+ # set a convenient order for rows
+ names = []
+ ldesc_indexes = sorted((x.index for x in ldesc), key=len)
+ for idxnames in ldesc_indexes:
+ for name in idxnames:
+ if name not in names:
+ names.append(name)
+
+ d = pd.concat(ldesc, join_axes=pd.Index([names]), axis=1)
+ d.columns = data.columns.copy()
+ return d
+
+ def _check_percentile(self, q):
+ """
+ Validate percentiles (used by describe and quantile).
+ """
+
+ msg = ("percentiles should all be in the interval [0, 1]. "
+ "Try {0} instead.")
+ q = np.asarray(q)
+ if q.ndim == 0:
+ if not 0 <= q <= 1:
+ raise ValueError(msg.format(q / 100.0))
+ else:
+ if not all(0 <= qs <= 1 for qs in q):
+ raise ValueError(msg.format(q / 100.0))
+ return q
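+ # Editor's note: hedged illustration of the validation above (toy values):
+ # passing fractions such as [0.25, 0.5] is accepted, whereas a percent-style
+ # value such as 25 raises ValueError("percentiles should all be in the
+ # interval [0, 1]. ..."), e.g. via df.describe(percentiles=[25, 50]).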
+
+ _shared_docs['pct_change'] = """
+ Percentage change between the current and a prior element.
+
+ Computes the percentage change from the immediately previous row by
+ default. This is useful in comparing the percentage of change in a time
+ series of elements.
+
+ Parameters
+ ----------
+ periods : int, default 1
+ Periods to shift for forming percent change.
+ fill_method : str, default 'pad'
+ How to handle NAs before computing percent changes.
+ limit : int, default None
+ The number of consecutive NAs to fill before stopping.
+ freq : DateOffset, timedelta, or offset alias string, optional
+ Increment to use from time series API (e.g. 'M' or BDay()).
+ **kwargs
+ Additional keyword arguments are passed into
+ `DataFrame.shift` or `Series.shift`.
+
+ Returns
+ -------
+ chg : Series or DataFrame
+ The same type as the calling object.
+
+ See Also
+ --------
+ Series.diff : Compute the difference of two elements in a Series.
+ DataFrame.diff : Compute the difference of two elements in a DataFrame.
+ Series.shift : Shift the index by some number of periods.
+ DataFrame.shift : Shift the index by some number of periods.
+
+ Examples
+ --------
+ **Series**
+
+ >>> s = pd.Series([90, 91, 85])
+ >>> s
+ 0 90
+ 1 91
+ 2 85
+ dtype: int64
+
+ >>> s.pct_change()
+ 0 NaN
+ 1 0.011111
+ 2 -0.065934
+ dtype: float64
+
+ >>> s.pct_change(periods=2)
+ 0 NaN
+ 1 NaN
+ 2 -0.055556
+ dtype: float64
+
+ See the percentage change in a Series where filling NAs with last
+ valid observation forward to next valid.
+
+ >>> s = pd.Series([90, 91, None, 85])
+ >>> s
+ 0 90.0
+ 1 91.0
+ 2 NaN
+ 3 85.0
+ dtype: float64
+
+ >>> s.pct_change(fill_method='ffill')
+ 0 NaN
+ 1 0.011111
+ 2 0.000000
+ 3 -0.065934
+ dtype: float64
+
+ **DataFrame**
+
+ Percentage change in French franc, Deutsche Mark, and Italian lira from
+ 1980-01-01 to 1980-03-01.
+
+ >>> df = pd.DataFrame({
+ ... 'FR': [4.0405, 4.0963, 4.3149],
+ ... 'GR': [1.7246, 1.7482, 1.8519],
+ ... 'IT': [804.74, 810.01, 860.13]},
+ ... index=['1980-01-01', '1980-02-01', '1980-03-01'])
+ >>> df
+ FR GR IT
+ 1980-01-01 4.0405 1.7246 804.74
+ 1980-02-01 4.0963 1.7482 810.01
+ 1980-03-01 4.3149 1.8519 860.13
+
+ >>> df.pct_change()
+ FR GR IT
+ 1980-01-01 NaN NaN NaN
+ 1980-02-01 0.013810 0.013684 0.006549
+ 1980-03-01 0.053365 0.059318 0.061876
+
+ Percentage of change in GOOG and APPL stock volume. Shows computing
+ the percentage change between columns.
+
+ >>> df = pd.DataFrame({
+ ... '2016': [1769950, 30586265],
+ ... '2015': [1500923, 40912316],
+ ... '2014': [1371819, 41403351]},
+ ... index=['GOOG', 'APPL'])
+ >>> df
+ 2016 2015 2014
+ GOOG 1769950 1500923 1371819
+ APPL 30586265 40912316 41403351
+
+ >>> df.pct_change(axis='columns')
+ 2016 2015 2014
+ GOOG NaN -0.151997 -0.086016
+ APPL NaN 0.337604 0.012002
+ """
+
+ @Appender(_shared_docs['pct_change'] % _shared_doc_kwargs)
+ def pct_change(self, periods=1, fill_method='pad', limit=None, freq=None,
+ **kwargs):
+ # TODO: Not sure if above is correct - need someone to confirm.
+ axis = self._get_axis_number(kwargs.pop('axis', self._stat_axis_name))
+ if fill_method is None:
+ data = self
+ else:
+ data = self.fillna(method=fill_method, limit=limit, axis=axis)
+
+ rs = (data.div(data.shift(periods=periods, freq=freq, axis=axis,
+ **kwargs)) - 1)
+ rs = rs.reindex_like(data)
+ if freq is None:
+ mask = isna(com.values_from_object(data))
+ np.putmask(rs.values, mask, np.nan)
+ return rs
+
+ def _agg_by_level(self, name, axis=0, level=0, skipna=True, **kwargs):
+ if axis is None:
+ raise ValueError("Must specify 'axis' when aggregating by level.")
+ grouped = self.groupby(level=level, axis=axis, sort=False)
+ if hasattr(grouped, name) and skipna:
+ return getattr(grouped, name)(**kwargs)
+ axis = self._get_axis_number(axis)
+ method = getattr(type(self), name)
+ applyf = lambda x: method(x, axis=axis, skipna=skipna, **kwargs)
+ return grouped.aggregate(applyf)
+
+ @classmethod
+ def _add_numeric_operations(cls):
+ """
+ Add the operations to the cls; evaluate the doc strings again
+ """
+
+ axis_descr, name, name2 = _doc_parms(cls)
+
+ cls.any = _make_logical_function(
+ cls, 'any', name, name2, axis_descr, _any_desc, nanops.nanany,
+ _any_see_also, _any_examples, empty_value=False)
+ cls.all = _make_logical_function(
+ cls, 'all', name, name2, axis_descr, _all_desc, nanops.nanall,
+ _all_see_also, _all_examples, empty_value=True)
+
+ @Substitution(outname='mad',
+ desc="Return the mean absolute deviation of the values "
+ "for the requested axis.",
+ name1=name, name2=name2, axis_descr=axis_descr,
+ min_count='', see_also='', examples='')
+ @Appender(_num_doc)
+ def mad(self, axis=None, skipna=None, level=None):
+ if skipna is None:
+ skipna = True
+ if axis is None:
+ axis = self._stat_axis_number
+ if level is not None:
+ return self._agg_by_level('mad', axis=axis, level=level,
+ skipna=skipna)
+
+ data = self._get_numeric_data()
+ if axis == 0:
+ demeaned = data - data.mean(axis=0)
+ else:
+ demeaned = data.sub(data.mean(axis=1), axis=0)
+ return np.abs(demeaned).mean(axis=axis, skipna=skipna)
+
+ cls.mad = mad
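+ # Editor's note: hedged arithmetic check for mad() (toy data):
+ #   >>> pd.Series([1, 2, 3, 4]).mad()
+ #   1.0
+ # mean = 2.5, absolute deviations = [1.5, 0.5, 0.5, 1.5], whose mean is 1.0.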
+
+ cls.sem = _make_stat_function_ddof(
+ cls, 'sem', name, name2, axis_descr,
+ "Return unbiased standard error of the mean over requested "
+ "axis.\n\nNormalized by N-1 by default. This can be changed "
+ "using the ddof argument",
+ nanops.nansem)
+ cls.var = _make_stat_function_ddof(
+ cls, 'var', name, name2, axis_descr,
+ "Return unbiased variance over requested axis.\n\nNormalized by "
+ "N-1 by default. This can be changed using the ddof argument",
+ nanops.nanvar)
+ cls.std = _make_stat_function_ddof(
+ cls, 'std', name, name2, axis_descr,
+ "Return sample standard deviation over requested axis."
+ "\n\nNormalized by N-1 by default. This can be changed using the "
+ "ddof argument",
+ nanops.nanstd)
+
+ @Substitution(outname='compounded',
+ desc="Return the compound percentage of the values for "
+ "the requested axis.", name1=name, name2=name2,
+ axis_descr=axis_descr,
+ min_count='', see_also='', examples='')
+ @Appender(_num_doc)
+ def compound(self, axis=None, skipna=None, level=None):
+ if skipna is None:
+ skipna = True
+ return (1 + self).prod(axis=axis, skipna=skipna, level=level) - 1
+
+ cls.compound = compound
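+ # Editor's note: hedged arithmetic check for compound() (toy returns):
+ #   >>> pd.Series([0.10, -0.05]).compound()   # ~0.045
+ # i.e. (1 + 0.10) * (1 - 0.05) - 1 = 1.045 - 1 = 0.045, up to
+ # floating-point rounding.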
+
+ cls.cummin = _make_cum_function(
+ cls, 'cummin', name, name2, axis_descr, "minimum",
+ lambda y, axis: np.minimum.accumulate(y, axis), "min",
+ np.inf, np.nan, _cummin_examples)
+ cls.cumsum = _make_cum_function(
+ cls, 'cumsum', name, name2, axis_descr, "sum",
+ lambda y, axis: y.cumsum(axis), "sum", 0.,
+ np.nan, _cumsum_examples)
+ cls.cumprod = _make_cum_function(
+ cls, 'cumprod', name, name2, axis_descr, "product",
+ lambda y, axis: y.cumprod(axis), "prod", 1.,
+ np.nan, _cumprod_examples)
+ cls.cummax = _make_cum_function(
+ cls, 'cummax', name, name2, axis_descr, "maximum",
+ lambda y, axis: np.maximum.accumulate(y, axis), "max",
+ -np.inf, np.nan, _cummax_examples)
+
+ cls.sum = _make_min_count_stat_function(
+ cls, 'sum', name, name2, axis_descr,
+ """Return the sum of the values for the requested axis.\n
+ This is equivalent to the method ``numpy.sum``.""",
+ nanops.nansum, _stat_func_see_also, _sum_examples)
+ cls.mean = _make_stat_function(
+ cls, 'mean', name, name2, axis_descr,
+ 'Return the mean of the values for the requested axis.',
+ nanops.nanmean)
+ cls.skew = _make_stat_function(
+ cls, 'skew', name, name2, axis_descr,
+ 'Return unbiased skew over requested axis\nNormalized by N-1.',
+ nanops.nanskew)
+ cls.kurt = _make_stat_function(
+ cls, 'kurt', name, name2, axis_descr,
+ "Return unbiased kurtosis over requested axis using Fisher's "
+ "definition of\nkurtosis (kurtosis of normal == 0.0). Normalized "
+ "by N-1.",
+ nanops.nankurt)
+ cls.kurtosis = cls.kurt
+ cls.prod = _make_min_count_stat_function(
+ cls, 'prod', name, name2, axis_descr,
+ 'Return the product of the values for the requested axis.',
+ nanops.nanprod, examples=_prod_examples)
+ cls.product = cls.prod
+ cls.median = _make_stat_function(
+ cls, 'median', name, name2, axis_descr,
+ 'Return the median of the values for the requested axis.',
+ nanops.nanmedian)
+ cls.max = _make_stat_function(
+ cls, 'max', name, name2, axis_descr,
+ """Return the maximum of the values for the requested axis.\n
+ If you want the *index* of the maximum, use ``idxmax``. This is
+ the equivalent of the ``numpy.ndarray`` method ``argmax``.""",
+ nanops.nanmax, _stat_func_see_also, _max_examples)
+ cls.min = _make_stat_function(
+ cls, 'min', name, name2, axis_descr,
+ """Return the minimum of the values for the requested axis.\n
+ If you want the *index* of the minimum, use ``idxmin``. This is
+ the equivalent of the ``numpy.ndarray`` method ``argmin``.""",
+ nanops.nanmin, _stat_func_see_also, _min_examples)
+
+ @classmethod
+ def _add_series_only_operations(cls):
+ """
+ Add the series only operations to the cls; evaluate the doc
+ strings again.
+ """
+
+ axis_descr, name, name2 = _doc_parms(cls)
+
+ def nanptp(values, axis=0, skipna=True):
+ nmax = nanops.nanmax(values, axis, skipna)
+ nmin = nanops.nanmin(values, axis, skipna)
+ warnings.warn("Method .ptp is deprecated and will be removed "
+ "in a future version. Use numpy.ptp instead.",
+ FutureWarning, stacklevel=4)
+ return nmax - nmin
+
+ cls.ptp = _make_stat_function(
+ cls, 'ptp', name, name2, axis_descr,
+ """Returns the difference between the maximum value and the
+ minimum value in the object. This is the equivalent of the
+ ``numpy.ndarray`` method ``ptp``.\n\n.. deprecated:: 0.24.0
+ Use numpy.ptp instead""",
+ nanptp)
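+ # Editor's note: hedged illustration -- ptp is simply max minus min:
+ #   >>> pd.Series([3, 8, 5]).ptp()   # emits the FutureWarning above
+ #   5
+ # numpy.ptp(pd.Series([3, 8, 5]).values) gives the same result without the
+ # deprecation warning.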
+
+ @classmethod
+ def _add_series_or_dataframe_operations(cls):
+ """
+ Add the series or dataframe only operations to the cls; evaluate
+ the doc strings again.
+ """
+
+ from pandas.core import window as rwindow
+
+ @Appender(rwindow.rolling.__doc__)
+ def rolling(self, window, min_periods=None, center=False,
+ win_type=None, on=None, axis=0, closed=None):
+ axis = self._get_axis_number(axis)
+ return rwindow.rolling(self, window=window,
+ min_periods=min_periods,
+ center=center, win_type=win_type,
+ on=on, axis=axis, closed=closed)
+
+ cls.rolling = rolling
+
+ @Appender(rwindow.expanding.__doc__)
+ def expanding(self, min_periods=1, center=False, axis=0):
+ axis = self._get_axis_number(axis)
+ return rwindow.expanding(self, min_periods=min_periods,
+ center=center, axis=axis)
+
+ cls.expanding = expanding
+
+ @Appender(rwindow.ewm.__doc__)
+ def ewm(self, com=None, span=None, halflife=None, alpha=None,
+ min_periods=0, adjust=True, ignore_na=False,
+ axis=0):
+ axis = self._get_axis_number(axis)
+ return rwindow.ewm(self, com=com, span=span, halflife=halflife,
+ alpha=alpha, min_periods=min_periods,
+ adjust=adjust, ignore_na=ignore_na, axis=axis)
+
+ cls.ewm = ewm
+
+ @Appender(_shared_docs['transform'] % dict(axis="", **_shared_doc_kwargs))
+ def transform(self, func, *args, **kwargs):
+ result = self.agg(func, *args, **kwargs)
+ if is_scalar(result) or len(result) != len(self):
+ raise ValueError("transforms cannot produce "
+ "aggregated results")
+
+ return result
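+ # Editor's note: hedged illustration of the length check above (toy data):
+ #   >>> s = pd.Series([1, 2, 3])
+ #   >>> s.transform(lambda x: x - x.mean())   # same length, allowed
+ #   >>> s.transform('sum')                    # scalar result, ValueError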
+
+ # ----------------------------------------------------------------------
+ # Misc methods
+
+ _shared_docs['valid_index'] = """
+ Return index for %(position)s non-NA/null value.
+
+ Returns
+ -------
+ scalar : type of index
+
+ Notes
+ -----
+ If all elements are NA/null, returns None.
+ Also returns None for empty %(klass)s.
+ """
+
+ def _find_valid_index(self, how):
+ """
+ Retrieves the index of the first valid value.
+
+ Parameters
+ ----------
+ how : {'first', 'last'}
+ Use this parameter to change between the first or last valid index.
+
+ Returns
+ -------
+ idx_first_valid : type of index
+ """
+ assert how in ['first', 'last']
+
+ if len(self) == 0: # early stop
+ return None
+ is_valid = ~self.isna()
+
+ if self.ndim == 2:
+ is_valid = is_valid.any(1) # reduce axis 1
+
+ if how == 'first':
+ idxpos = is_valid.values[::].argmax()
+
+ if how == 'last':
+ idxpos = len(self) - 1 - is_valid.values[::-1].argmax()
+
+ chk_notna = is_valid.iat[idxpos]
+ idx = self.index[idxpos]
+
+ if not chk_notna:
+ return None
+ return idx
+
+ @Appender(_shared_docs['valid_index'] % {'position': 'first',
+ 'klass': 'NDFrame'})
+ def first_valid_index(self):
+ return self._find_valid_index('first')
+
+ @Appender(_shared_docs['valid_index'] % {'position': 'last',
+ 'klass': 'NDFrame'})
+ def last_valid_index(self):
+ return self._find_valid_index('last')
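+ # Editor's note: hedged illustration for the two methods above (toy data):
+ #   >>> s = pd.Series([np.nan, 2.0, np.nan, 4.0, np.nan])
+ #   >>> s.first_valid_index(), s.last_valid_index()
+ #   (1, 3)
+ # With an all-NA or empty object both methods return None.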
+
+
+def _doc_parms(cls):
+ """Return a tuple of the doc parms."""
+ axis_descr = "{%s}" % ', '.join(["{0} ({1})".format(a, i)
+ for i, a in enumerate(cls._AXIS_ORDERS)])
+ name = (cls._constructor_sliced.__name__
+ if cls._AXIS_LEN > 1 else 'scalar')
+ name2 = cls.__name__
+ return axis_descr, name, name2
+
+
+_num_doc = """
+%(desc)s
+
+Parameters
+----------
+axis : %(axis_descr)s
+ Axis for the function to be applied on.
+skipna : bool, default True
+ Exclude NA/null values when computing the result.
+level : int or level name, default None
+ If the axis is a MultiIndex (hierarchical), count along a
+ particular level, collapsing into a %(name1)s.
+numeric_only : bool, default None
+ Include only float, int, boolean columns. If None, will attempt to use
+ everything, then use only numeric data. Not implemented for Series.
+%(min_count)s\
+**kwargs
+ Additional keyword arguments to be passed to the function.
+
+Returns
+-------
+%(outname)s : %(name1)s or %(name2)s (if level specified)
+%(see_also)s
+%(examples)s\
+"""
+
+_num_ddof_doc = """
+%(desc)s
+
+Parameters
+----------
+axis : %(axis_descr)s
+skipna : boolean, default True
+ Exclude NA/null values. If an entire row/column is NA, the result
+ will be NA
+level : int or level name, default None
+ If the axis is a MultiIndex (hierarchical), count along a
+ particular level, collapsing into a %(name1)s
+ddof : int, default 1
+ Delta Degrees of Freedom. The divisor used in calculations is N - ddof,
+ where N represents the number of elements.
+numeric_only : boolean, default None
+ Include only float, int, boolean columns. If None, will attempt to use
+ everything, then use only numeric data. Not implemented for Series.
+
+Returns
+-------
+%(outname)s : %(name1)s or %(name2)s (if level specified)\n"""
+
+_bool_doc = """
+%(desc)s
+
+Parameters
+----------
+axis : {0 or 'index', 1 or 'columns', None}, default 0
+ Indicate which axis or axes should be reduced.
+
+ * 0 / 'index' : reduce the index, return a Series whose index is the
+ original column labels.
+ * 1 / 'columns' : reduce the columns, return a Series whose index is the
+ original index.
+ * None : reduce all axes, return a scalar.
+
+bool_only : bool, default None
+ Include only boolean columns. If None, will attempt to use everything,
+ then use only boolean data. Not implemented for Series.
+skipna : bool, default True
+ Exclude NA/null values. If the entire row/column is NA and skipna is
+ True, then the result will be %(empty_value)s, as for an empty row/column.
+ If skipna is False, then NA are treated as True, because these are not
+ equal to zero.
+level : int or level name, default None
+ If the axis is a MultiIndex (hierarchical), count along a
+ particular level, collapsing into a %(name1)s.
+**kwargs : any, default None
+ Additional keywords have no effect but might be accepted for
+ compatibility with NumPy.
+
+Returns
+-------
+%(name1)s or %(name2)s
+ If level is specified, then, %(name2)s is returned; otherwise, %(name1)s
+ is returned.
+
+%(see_also)s
+%(examples)s"""
+
+_all_desc = """\
+Return whether all elements are True, potentially over an axis.
+
+Returns True unless there is at least one element within a series or
+along a DataFrame axis that is False or equivalent (e.g. zero or
+empty)."""
+
+_all_examples = """\
+Examples
+--------
+**Series**
+
+>>> pd.Series([True, True]).all()
+True
+>>> pd.Series([True, False]).all()
+False
+>>> pd.Series([]).all()
+True
+>>> pd.Series([np.nan]).all()
+True
+>>> pd.Series([np.nan]).all(skipna=False)
+True
+
+**DataFrames**
+
+Create a dataframe from a dictionary.
+
+>>> df = pd.DataFrame({'col1': [True, True], 'col2': [True, False]})
+>>> df
+ col1 col2
+0 True True
+1 True False
+
+Default behaviour checks if column-wise values all return True.
+
+>>> df.all()
+col1 True
+col2 False
+dtype: bool
+
+Specify ``axis='columns'`` to check if row-wise values all return True.
+
+>>> df.all(axis='columns')
+0 True
+1 False
+dtype: bool
+
+Or ``axis=None`` for whether every value is True.
+
+>>> df.all(axis=None)
+False
+"""
+
+_all_see_also = """\
+See Also
+--------
+Series.all : Return True if all elements are True.
+DataFrame.any : Return True if one (or more) elements are True.
+"""
+
+_cnum_doc = """
+Return cumulative %(desc)s over a DataFrame or Series axis.
+
+Returns a DataFrame or Series of the same size containing the cumulative
+%(desc)s.
+
+Parameters
+----------
+axis : {0 or 'index', 1 or 'columns'}, default 0
+ The index or the name of the axis. 0 is equivalent to None or 'index'.
+skipna : boolean, default True
+ Exclude NA/null values. If an entire row/column is NA, the result
+ will be NA.
+*args, **kwargs :
+ Additional keywords have no effect but might be accepted for
+ compatibility with NumPy.
+
+Returns
+-------
+%(outname)s : %(name1)s or %(name2)s\n
+See Also
+--------
+core.window.Expanding.%(accum_func_name)s : Similar functionality
+ but ignores ``NaN`` values.
+%(name2)s.%(accum_func_name)s : Return the %(desc)s over
+ %(name2)s axis.
+%(name2)s.cummax : Return cumulative maximum over %(name2)s axis.
+%(name2)s.cummin : Return cumulative minimum over %(name2)s axis.
+%(name2)s.cumsum : Return cumulative sum over %(name2)s axis.
+%(name2)s.cumprod : Return cumulative product over %(name2)s axis.
+
+%(examples)s
+"""
+
+_cummin_examples = """\
+Examples
+--------
+**Series**
+
+>>> s = pd.Series([2, np.nan, 5, -1, 0])
+>>> s
+0 2.0
+1 NaN
+2 5.0
+3 -1.0
+4 0.0
+dtype: float64
+
+By default, NA values are ignored.
+
+>>> s.cummin()
+0 2.0
+1 NaN
+2 2.0
+3 -1.0
+4 -1.0
+dtype: float64
+
+To include NA values in the operation, use ``skipna=False``
+
+>>> s.cummin(skipna=False)
+0 2.0
+1 NaN
+2 NaN
+3 NaN
+4 NaN
+dtype: float64
+
+**DataFrame**
+
+>>> df = pd.DataFrame([[2.0, 1.0],
+... [3.0, np.nan],
+... [1.0, 0.0]],
+... columns=list('AB'))
+>>> df
+ A B
+0 2.0 1.0
+1 3.0 NaN
+2 1.0 0.0
+
+By default, iterates over rows and finds the minimum
+in each column. This is equivalent to ``axis=None`` or ``axis='index'``.
+
+>>> df.cummin()
+ A B
+0 2.0 1.0
+1 2.0 NaN
+2 1.0 0.0
+
+To iterate over columns and find the minimum in each row,
+use ``axis=1``
+
+>>> df.cummin(axis=1)
+ A B
+0 2.0 1.0
+1 3.0 NaN
+2 1.0 0.0
+"""
+
+_cumsum_examples = """\
+Examples
+--------
+**Series**
+
+>>> s = pd.Series([2, np.nan, 5, -1, 0])
+>>> s
+0 2.0
+1 NaN
+2 5.0
+3 -1.0
+4 0.0
+dtype: float64
+
+By default, NA values are ignored.
+
+>>> s.cumsum()
+0 2.0
+1 NaN
+2 7.0
+3 6.0
+4 6.0
+dtype: float64
+
+To include NA values in the operation, use ``skipna=False``
+
+>>> s.cumsum(skipna=False)
+0 2.0
+1 NaN
+2 NaN
+3 NaN
+4 NaN
+dtype: float64
+
+**DataFrame**
+
+>>> df = pd.DataFrame([[2.0, 1.0],
+... [3.0, np.nan],
+... [1.0, 0.0]],
+... columns=list('AB'))
+>>> df
+ A B
+0 2.0 1.0
+1 3.0 NaN
+2 1.0 0.0
+
+By default, iterates over rows and finds the sum
+in each column. This is equivalent to ``axis=None`` or ``axis='index'``.
+
+>>> df.cumsum()
+ A B
+0 2.0 1.0
+1 5.0 NaN
+2 6.0 1.0
+
+To iterate over columns and find the sum in each row,
+use ``axis=1``
+
+>>> df.cumsum(axis=1)
+ A B
+0 2.0 3.0
+1 3.0 NaN
+2 1.0 1.0
+"""
+
+_cumprod_examples = """\
+Examples
+--------
+**Series**
+
+>>> s = pd.Series([2, np.nan, 5, -1, 0])
+>>> s
+0 2.0
+1 NaN
+2 5.0
+3 -1.0
+4 0.0
+dtype: float64
+
+By default, NA values are ignored.
+
+>>> s.cumprod()
+0 2.0
+1 NaN
+2 10.0
+3 -10.0
+4 -0.0
+dtype: float64
+
+To include NA values in the operation, use ``skipna=False``
+
+>>> s.cumprod(skipna=False)
+0 2.0
+1 NaN
+2 NaN
+3 NaN
+4 NaN
+dtype: float64
+
+**DataFrame**
+
+>>> df = pd.DataFrame([[2.0, 1.0],
+... [3.0, np.nan],
+... [1.0, 0.0]],
+... columns=list('AB'))
+>>> df
+ A B
+0 2.0 1.0
+1 3.0 NaN
+2 1.0 0.0
+
+By default, iterates over rows and finds the product
+in each column. This is equivalent to ``axis=None`` or ``axis='index'``.
+
+>>> df.cumprod()
+ A B
+0 2.0 1.0
+1 6.0 NaN
+2 6.0 0.0
+
+To iterate over columns and find the product in each row,
+use ``axis=1``
+
+>>> df.cumprod(axis=1)
+ A B
+0 2.0 2.0
+1 3.0 NaN
+2 1.0 0.0
+"""
+
+_cummax_examples = """\
+Examples
+--------
+**Series**
+
+>>> s = pd.Series([2, np.nan, 5, -1, 0])
+>>> s
+0 2.0
+1 NaN
+2 5.0
+3 -1.0
+4 0.0
+dtype: float64
+
+By default, NA values are ignored.
+
+>>> s.cummax()
+0 2.0
+1 NaN
+2 5.0
+3 5.0
+4 5.0
+dtype: float64
+
+To include NA values in the operation, use ``skipna=False``
+
+>>> s.cummax(skipna=False)
+0 2.0
+1 NaN
+2 NaN
+3 NaN
+4 NaN
+dtype: float64
+
+**DataFrame**
+
+>>> df = pd.DataFrame([[2.0, 1.0],
+... [3.0, np.nan],
+... [1.0, 0.0]],
+... columns=list('AB'))
+>>> df
+ A B
+0 2.0 1.0
+1 3.0 NaN
+2 1.0 0.0
+
+By default, iterates over rows and finds the maximum
+in each column. This is equivalent to ``axis=None`` or ``axis='index'``.
+
+>>> df.cummax()
+ A B
+0 2.0 1.0
+1 3.0 NaN
+2 3.0 1.0
+
+To iterate over columns and find the maximum in each row,
+use ``axis=1``
+
+>>> df.cummax(axis=1)
+ A B
+0 2.0 2.0
+1 3.0 NaN
+2 1.0 1.0
+"""
+
+_any_see_also = """\
+See Also
+--------
+numpy.any : Numpy version of this method.
+Series.any : Return whether any element is True.
+Series.all : Return whether all elements are True.
+DataFrame.any : Return whether any element is True over requested axis.
+DataFrame.all : Return whether all elements are True over requested axis.
+"""
+
+_any_desc = """\
+Return whether any element is True, potentially over an axis.
+
+Returns False unless there is at least one element within a series or
+along a DataFrame axis that is True or equivalent (e.g. non-zero or
+non-empty)."""
+
+_any_examples = """\
+Examples
+--------
+**Series**
+
+For Series input, the output is a scalar indicating whether any element
+is True.
+
+>>> pd.Series([False, False]).any()
+False
+>>> pd.Series([True, False]).any()
+True
+>>> pd.Series([]).any()
+False
+>>> pd.Series([np.nan]).any()
+False
+>>> pd.Series([np.nan]).any(skipna=False)
+True
+
+**DataFrame**
+
+Whether each column contains at least one True element (the default).
+
+>>> df = pd.DataFrame({"A": [1, 2], "B": [0, 2], "C": [0, 0]})
+>>> df
+ A B C
+0 1 0 0
+1 2 2 0
+
+>>> df.any()
+A True
+B True
+C False
+dtype: bool
+
+Aggregating over the columns.
+
+>>> df = pd.DataFrame({"A": [True, False], "B": [1, 2]})
+>>> df
+ A B
+0 True 1
+1 False 2
+
+>>> df.any(axis='columns')
+0 True
+1 True
+dtype: bool
+
+>>> df = pd.DataFrame({"A": [True, False], "B": [1, 0]})
+>>> df
+ A B
+0 True 1
+1 False 0
+
+>>> df.any(axis='columns')
+0 True
+1 False
+dtype: bool
+
+Aggregating over the entire DataFrame with ``axis=None``.
+
+>>> df.any(axis=None)
+True
+
+`any` for an empty DataFrame is an empty Series.
+
+>>> pd.DataFrame([]).any()
+Series([], dtype: bool)
+"""
+
+_shared_docs['stat_func_example'] = """\
+Examples
+--------
+
+>>> idx = pd.MultiIndex.from_arrays([
+... ['warm', 'warm', 'cold', 'cold'],
+... ['dog', 'falcon', 'fish', 'spider']],
+... names=['blooded', 'animal'])
+>>> s = pd.Series([4, 2, 0, 8], name='legs', index=idx)
+>>> s
+blooded animal
+warm dog 4
+ falcon 2
+cold fish 0
+ spider 8
+Name: legs, dtype: int64
+
+>>> s.{stat_func}()
+{default_output}
+
+{verb} using level names, as well as indices.
+
+>>> s.{stat_func}(level='blooded')
+blooded
+warm {level_output_0}
+cold {level_output_1}
+Name: legs, dtype: int64
+
+>>> s.{stat_func}(level=0)
+blooded
+warm {level_output_0}
+cold {level_output_1}
+Name: legs, dtype: int64
+"""
+
+_sum_examples = _shared_docs['stat_func_example'].format(
+ stat_func='sum',
+ verb='Sum',
+ default_output=14,
+ level_output_0=6,
+ level_output_1=8)
+
+_sum_examples += """
+By default, the sum of an empty or all-NA Series is ``0``.
+
+>>> pd.Series([]).sum() # min_count=0 is the default
+0.0
+
+This can be controlled with the ``min_count`` parameter. For example, if
+you'd like the sum of an empty series to be NaN, pass ``min_count=1``.
+
+>>> pd.Series([]).sum(min_count=1)
+nan
+
+Thanks to the ``skipna`` parameter, ``min_count`` handles all-NA and
+empty series identically.
+
+>>> pd.Series([np.nan]).sum()
+0.0
+
+>>> pd.Series([np.nan]).sum(min_count=1)
+nan
+"""
+
+_max_examples = _shared_docs['stat_func_example'].format(
+ stat_func='max',
+ verb='Max',
+ default_output=8,
+ level_output_0=4,
+ level_output_1=8)
+
+_min_examples = _shared_docs['stat_func_example'].format(
+ stat_func='min',
+ verb='Min',
+ default_output=0,
+ level_output_0=2,
+ level_output_1=0)
+
+_stat_func_see_also = """
+See Also
+--------
+Series.sum : Return the sum.
+Series.min : Return the minimum.
+Series.max : Return the maximum.
+Series.idxmin : Return the index of the minimum.
+Series.idxmax : Return the index of the maximum.
+DataFrame.sum : Return the sum over the requested axis.
+DataFrame.min : Return the minimum over the requested axis.
+DataFrame.max : Return the maximum over the requested axis.
+DataFrame.idxmin : Return the index of the minimum over the requested axis.
+DataFrame.idxmax : Return the index of the maximum over the requested axis.
+"""
+
+_prod_examples = """\
+Examples
+--------
+By default, the product of an empty or all-NA Series is ``1``
+
+>>> pd.Series([]).prod()
+1.0
+
+This can be controlled with the ``min_count`` parameter
+
+>>> pd.Series([]).prod(min_count=1)
+nan
+
+Thanks to the ``skipna`` parameter, ``min_count`` handles all-NA and
+empty series identically.
+
+>>> pd.Series([np.nan]).prod()
+1.0
+
+>>> pd.Series([np.nan]).prod(min_count=1)
+nan
+"""
+
+_min_count_stub = """\
+min_count : int, default 0
+ The required number of valid values to perform the operation. If fewer than
+    ``min_count`` non-NA values are present, the result will be NA.
+
+    .. versionadded:: 0.22.0
+
+ Added with the default being 0. This means the sum of an all-NA
+ or empty Series is 0, and the product of an all-NA or empty
+ Series is 1.
+"""
+
+
+def _make_min_count_stat_function(cls, name, name1, name2, axis_descr, desc,
+ f, see_also='', examples=''):
+ @Substitution(outname=name, desc=desc, name1=name1, name2=name2,
+ axis_descr=axis_descr, min_count=_min_count_stub,
+ see_also=see_also, examples=examples)
+ @Appender(_num_doc)
+ def stat_func(self, axis=None, skipna=None, level=None, numeric_only=None,
+ min_count=0,
+ **kwargs):
+ if name == 'sum':
+ nv.validate_sum(tuple(), kwargs)
+ elif name == 'prod':
+ nv.validate_prod(tuple(), kwargs)
+ else:
+ nv.validate_stat_func(tuple(), kwargs, fname=name)
+ if skipna is None:
+ skipna = True
+ if axis is None:
+ axis = self._stat_axis_number
+ if level is not None:
+ return self._agg_by_level(name, axis=axis, level=level,
+ skipna=skipna, min_count=min_count)
+ return self._reduce(f, name, axis=axis, skipna=skipna,
+ numeric_only=numeric_only, min_count=min_count)
+
+ return set_function_name(stat_func, name, cls)
+
+
+def _make_stat_function(cls, name, name1, name2, axis_descr, desc, f,
+ see_also='', examples=''):
+ @Substitution(outname=name, desc=desc, name1=name1, name2=name2,
+ axis_descr=axis_descr, min_count='', see_also=see_also,
+ examples=examples)
+ @Appender(_num_doc)
+ def stat_func(self, axis=None, skipna=None, level=None, numeric_only=None,
+ **kwargs):
+ if name == 'median':
+ nv.validate_median(tuple(), kwargs)
+ else:
+ nv.validate_stat_func(tuple(), kwargs, fname=name)
+ if skipna is None:
+ skipna = True
+ if axis is None:
+ axis = self._stat_axis_number
+ if level is not None:
+ return self._agg_by_level(name, axis=axis, level=level,
+ skipna=skipna)
+ return self._reduce(f, name, axis=axis, skipna=skipna,
+ numeric_only=numeric_only)
+
+ return set_function_name(stat_func, name, cls)
+
+
+def _make_stat_function_ddof(cls, name, name1, name2, axis_descr, desc, f):
+ @Substitution(outname=name, desc=desc, name1=name1, name2=name2,
+ axis_descr=axis_descr)
+ @Appender(_num_ddof_doc)
+ def stat_func(self, axis=None, skipna=None, level=None, ddof=1,
+ numeric_only=None, **kwargs):
+ nv.validate_stat_ddof_func(tuple(), kwargs, fname=name)
+ if skipna is None:
+ skipna = True
+ if axis is None:
+ axis = self._stat_axis_number
+ if level is not None:
+ return self._agg_by_level(name, axis=axis, level=level,
+ skipna=skipna, ddof=ddof)
+ return self._reduce(f, name, axis=axis, numeric_only=numeric_only,
+ skipna=skipna, ddof=ddof)
+
+ return set_function_name(stat_func, name, cls)
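For reference, the ``ddof`` knob exposed by the functions this factory generates behaves as in NumPy; a small doctest-style sketch (values computed by hand and rounded, so treat the outputs as illustrative rather than authoritative):

    >>> s = pd.Series([1, 2, 3, 4])
    >>> round(s.std(), 4)        # sample standard deviation, ddof=1 (the default)
    1.291
    >>> round(s.std(ddof=0), 4)  # population standard deviation
    1.118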
+
+
+def _make_cum_function(cls, name, name1, name2, axis_descr, desc,
+ accum_func, accum_func_name, mask_a, mask_b, examples):
+ @Substitution(outname=name, desc=desc, name1=name1, name2=name2,
+ axis_descr=axis_descr, accum_func_name=accum_func_name,
+ examples=examples)
+ @Appender(_cnum_doc)
+ def cum_func(self, axis=None, skipna=True, *args, **kwargs):
+ skipna = nv.validate_cum_func_with_skipna(skipna, args, kwargs, name)
+ if axis is None:
+ axis = self._stat_axis_number
+ else:
+ axis = self._get_axis_number(axis)
+
+ y = com.values_from_object(self).copy()
+
+ if (skipna and
+ issubclass(y.dtype.type, (np.datetime64, np.timedelta64))):
+ result = accum_func(y, axis)
+ mask = isna(self)
+ np.putmask(result, mask, iNaT)
+ elif skipna and not issubclass(y.dtype.type, (np.integer, np.bool_)):
+ mask = isna(self)
+ np.putmask(y, mask, mask_a)
+ result = accum_func(y, axis)
+ np.putmask(result, mask, mask_b)
+ else:
+ result = accum_func(y, axis)
+
+ d = self._construct_axes_dict()
+ d['copy'] = False
+ return self._constructor(result, **d).__finalize__(self)
+
+ return set_function_name(cum_func, name, cls)
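The ``mask_a``/``mask_b`` handling above can be pictured with a small NumPy sketch; the identity value used here (0 for cumsum, with NaN written back afterwards) is an assumption about how the factory is parametrised elsewhere, so read it as an illustration rather than the exact wiring:

    import numpy as np

    y = np.array([2.0, np.nan, 5.0, -1.0, 0.0])
    mask = np.isnan(y)
    np.putmask(y, mask, 0.0)           # mask_a: neutral element for the accumulation
    result = np.cumsum(y)
    np.putmask(result, mask, np.nan)   # mask_b: restore NaN in the output
    print(result)                      # [ 2. nan  7.  6.  6.], matching s.cumsum() above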
+
+
+def _make_logical_function(cls, name, name1, name2, axis_descr, desc, f,
+ see_also, examples, empty_value):
+ @Substitution(outname=name, desc=desc, name1=name1, name2=name2,
+ axis_descr=axis_descr, see_also=see_also, examples=examples,
+ empty_value=empty_value)
+ @Appender(_bool_doc)
+ def logical_func(self, axis=0, bool_only=None, skipna=True, level=None,
+ **kwargs):
+ nv.validate_logical_func(tuple(), kwargs, fname=name)
+ if level is not None:
+ if bool_only is not None:
+ raise NotImplementedError("Option bool_only is not "
+ "implemented with option level.")
+ return self._agg_by_level(name, axis=axis, level=level,
+ skipna=skipna)
+ return self._reduce(f, name, axis=axis, skipna=skipna,
+ numeric_only=bool_only, filter_type='bool')
+
+ return set_function_name(logical_func, name, cls)
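The ``bool_only`` option handled by this factory is not covered by the any/all examples above; a short doctest-style sketch of the expected behaviour (output formatting is approximate):

    >>> df = pd.DataFrame({'x': [True, False], 'y': [1, 2]})
    >>> df.all()
    x    False
    y     True
    dtype: bool
    >>> df.all(bool_only=True)   # only the boolean column 'x' is considered
    x    False
    dtype: bool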
+
+
+# install the indexes
+for _name, _indexer in indexing.get_indexers_list():
+ NDFrame._create_indexer(_name, _indexer)
diff --git a/contrib/python/pandas/py2/pandas/core/groupby/__init__.py b/contrib/python/pandas/py2/pandas/core/groupby/__init__.py
new file mode 100644
index 00000000000..9c15a5ebfe0
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/core/groupby/__init__.py
@@ -0,0 +1,4 @@
+from pandas.core.groupby.groupby import GroupBy # noqa: F401
+from pandas.core.groupby.generic import ( # noqa: F401
+ SeriesGroupBy, DataFrameGroupBy, PanelGroupBy)
+from pandas.core.groupby.grouper import Grouper # noqa: F401
diff --git a/contrib/python/pandas/py2/pandas/core/groupby/base.py b/contrib/python/pandas/py2/pandas/core/groupby/base.py
new file mode 100644
index 00000000000..ebba4a0a939
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/core/groupby/base.py
@@ -0,0 +1,158 @@
+"""
+Provide basic components for groupby. These definitions
+hold the whitelist of methods that are exposed on the
+SeriesGroupBy and the DataFrameGroupBy objects.
+"""
+
+import types
+
+from pandas.util._decorators import make_signature
+
+from pandas.core.dtypes.common import is_list_like, is_scalar
+
+
+class GroupByMixin(object):
+ """
+ Provide the groupby facilities to the mixed object.
+ """
+
+ @staticmethod
+ def _dispatch(name, *args, **kwargs):
+ """
+ Dispatch to apply.
+ """
+
+ def outer(self, *args, **kwargs):
+ def f(x):
+ x = self._shallow_copy(x, groupby=self._groupby)
+ return getattr(x, name)(*args, **kwargs)
+ return self._groupby.apply(f)
+ outer.__name__ = name
+ return outer
+
+ def _gotitem(self, key, ndim, subset=None):
+ """
+ Sub-classes to define. Return a sliced object.
+
+ Parameters
+ ----------
+ key : string / list of selections
+ ndim : 1,2
+ requested ndim of result
+ subset : object, default None
+ subset to act on
+ """
+ # create a new object to prevent aliasing
+ if subset is None:
+ subset = self.obj
+
+ # we need to make a shallow copy of ourselves
+ # with the same groupby
+ kwargs = {attr: getattr(self, attr) for attr in self._attributes}
+
+ # Try to select from a DataFrame, falling back to a Series
+ try:
+ groupby = self._groupby[key]
+ except IndexError:
+ groupby = self._groupby
+
+ self = self.__class__(subset,
+ groupby=groupby,
+ parent=self,
+ **kwargs)
+ self._reset_cache()
+ if subset.ndim == 2:
+ if is_scalar(key) and key in subset or is_list_like(key):
+ self._selection = key
+ return self
+
+
+# special case to prevent duplicate plots when catching exceptions when
+# forwarding methods from NDFrames
+plotting_methods = frozenset(['plot', 'hist'])
+
+common_apply_whitelist = frozenset([
+ 'quantile', 'fillna', 'mad', 'take',
+ 'idxmax', 'idxmin', 'tshift',
+ 'skew', 'corr', 'cov', 'diff'
+]) | plotting_methods
+
+series_apply_whitelist = ((common_apply_whitelist |
+ {'nlargest', 'nsmallest',
+ 'is_monotonic_increasing',
+ 'is_monotonic_decreasing'})
+ ) | frozenset(['dtype', 'unique'])
+
+dataframe_apply_whitelist = ((common_apply_whitelist |
+ frozenset(['dtypes', 'corrwith'])))
+
+cython_transforms = frozenset(['cumprod', 'cumsum', 'shift',
+ 'cummin', 'cummax'])
+
+cython_cast_blacklist = frozenset(['rank', 'count', 'size'])
+
+
+def whitelist_method_generator(base, klass, whitelist):
+ """
+ Yields all GroupBy member defs for DataFrame/Series names in whitelist.
+
+ Parameters
+ ----------
+ base : class
+ base class
+ klass : class
+ class where members are defined.
+ Should be Series or DataFrame
+ whitelist : list
+ list of names of klass methods to be constructed
+
+ Returns
+ -------
+ The generator yields a sequence of strings, each suitable for exec'ing,
+ that define implementations of the named methods for DataFrameGroupBy
+ or SeriesGroupBy.
+
+ Since we don't want to override methods explicitly defined in the
+ base class, any such name is skipped.
+ """
+
+ method_wrapper_template = \
+ """def %(name)s(%(sig)s) :
+ \"""
+ %(doc)s
+ \"""
+ f = %(self)s.__getattr__('%(name)s')
+ return f(%(args)s)"""
+ property_wrapper_template = \
+ """@property
+def %(name)s(self) :
+ \"""
+ %(doc)s
+ \"""
+ return self.__getattr__('%(name)s')"""
+
+ for name in whitelist:
+ # don't override anything that was explicitly defined
+ # in the base class
+ if hasattr(base, name):
+ continue
+ # ugly, but we need the name string itself in the method.
+ f = getattr(klass, name)
+ doc = f.__doc__
+ doc = doc if type(doc) == str else ''
+ if isinstance(f, types.MethodType):
+ wrapper_template = method_wrapper_template
+ decl, args = make_signature(f)
+ # pass args by name to f because otherwise
+ # GroupBy._make_wrapper won't know whether
+ # we passed in an axis parameter.
+ args_by_name = ['{0}={0}'.format(arg) for arg in args[1:]]
+ params = {'name': name,
+ 'doc': doc,
+ 'sig': ','.join(decl),
+ 'self': args[0],
+ 'args': ','.join(args_by_name)}
+ else:
+ wrapper_template = property_wrapper_template
+ params = {'name': name, 'doc': doc}
+ yield wrapper_template % params
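To see what one of these generated definitions looks like, the generator can be driven directly; which wrapper comes out first depends on set iteration order, so no output is reproduced here (this assumes the vendored package is importable as ``pandas``):

    from pandas import Series
    from pandas.core.groupby.groupby import GroupBy
    from pandas.core.groupby import base

    # Each yielded string is a complete def or @property block that
    # SeriesGroupBy later exec's (see generic.py below).
    gen = base.whitelist_method_generator(GroupBy, Series,
                                          base.series_apply_whitelist)
    print(next(iter(gen)))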
diff --git a/contrib/python/pandas/py2/pandas/core/groupby/categorical.py b/contrib/python/pandas/py2/pandas/core/groupby/categorical.py
new file mode 100644
index 00000000000..85f51323a97
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/core/groupby/categorical.py
@@ -0,0 +1,100 @@
+import numpy as np
+
+from pandas.core.algorithms import unique1d
+from pandas.core.arrays.categorical import (
+ Categorical, CategoricalDtype, _recode_for_categories)
+
+
+def recode_for_groupby(c, sort, observed):
+ """
+ Code the categories to ensure we can groupby for categoricals.
+
+ If observed=True, we return a new Categorical with the observed
+ categories only.
+
+ If sort=False, return a copy of self, coded with categories as
+ returned by .unique(), followed by any categories not appearing in
+ the data. If sort=True, return self.
+
+ This method is needed solely to ensure the categorical index of the
+ GroupBy result has categories in the order of appearance in the data
+ (GH-8868).
+
+ Parameters
+ ----------
+ c : Categorical
+ sort : boolean
+ The value of the sort parameter groupby was called with.
+ observed : boolean
+ Account only for the observed values
+
+ Returns
+ -------
+ New Categorical
+ If sort=False, the new categories are set to the order of
+ appearance in codes (unless ordered=True, in which case the
+ original order is preserved), followed by any unrepresented
+ categories in the original order.
+ Categorical or None
+ If we are observed, return the original categorical, otherwise None
+ """
+
+ # we only care about observed values
+ if observed:
+ unique_codes = unique1d(c.codes)
+
+ take_codes = unique_codes[unique_codes != -1]
+ if c.ordered:
+ take_codes = np.sort(take_codes)
+
+ # we recode according to the uniques
+ categories = c.categories.take(take_codes)
+ codes = _recode_for_categories(c.codes,
+ c.categories,
+ categories)
+
+ # return a new categorical that maps our new codes
+ # and categories
+ dtype = CategoricalDtype(categories, ordered=c.ordered)
+ return Categorical(codes, dtype=dtype, fastpath=True), c
+
+ # Already sorted according to c.categories; all is fine
+ if sort:
+ return c, None
+
+ # sort=False should order groups in as-encountered order (GH-8868)
+ cat = c.unique()
+
+ # But for groupby to work, all categories should be present,
+ # including those missing from the data (GH-13179), which .unique()
+ # above dropped
+ cat = cat.add_categories(
+ c.categories[~c.categories.isin(cat.categories)])
+
+ return c.reorder_categories(cat.categories), None
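A minimal sketch of the ``observed=True`` branch, assuming the vendored package is importable as ``pandas`` (the printed values reflect typical behaviour of this version):

    import pandas as pd
    from pandas.core.groupby.categorical import recode_for_groupby

    c = pd.Categorical(['a', 'a', 'c'], categories=['a', 'b', 'c'])
    recoded, original = recode_for_groupby(c, sort=True, observed=True)
    print(list(recoded.categories))   # ['a', 'c'] -- only observed categories remain
    print(original is c)              # True -- the original is handed back for later use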
+
+
+def recode_from_groupby(c, sort, ci):
+ """
+    Reverse the recoding done by recode_for_groupby to account for
+    sort / observed.
+
+ Parameters
+ ----------
+ c : Categorical
+ sort : boolean
+ The value of the sort parameter groupby was called with.
+ ci : CategoricalIndex
+ The codes / categories to recode
+
+ Returns
+ -------
+ CategoricalIndex
+ """
+
+ # we re-order to the original category orderings
+ if sort:
+ return ci.set_categories(c.categories)
+
+ # we are not sorting, so add unobserved to the end
+ return ci.add_categories(
+ c.categories[~c.categories.isin(ci.categories)])
diff --git a/contrib/python/pandas/py2/pandas/core/groupby/generic.py b/contrib/python/pandas/py2/pandas/core/groupby/generic.py
new file mode 100644
index 00000000000..5be98b01338
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/core/groupby/generic.py
@@ -0,0 +1,1673 @@
+"""
+Define the SeriesGroupBy, DataFrameGroupBy, and PanelGroupBy
+classes that hold the groupby interfaces (and some implementations).
+
+These are user facing as the result of the ``df.groupby(...)`` operations,
+which here returns a DataFrameGroupBy object.
+"""
+
+import collections
+import copy
+from functools import partial
+from textwrap import dedent
+import warnings
+
+import numpy as np
+
+from pandas._libs import Timestamp, lib
+import pandas.compat as compat
+from pandas.compat import lzip, map
+from pandas.compat.numpy import _np_version_under1p13
+from pandas.errors import AbstractMethodError
+from pandas.util._decorators import Appender, Substitution
+
+from pandas.core.dtypes.cast import maybe_downcast_to_dtype
+from pandas.core.dtypes.common import (
+ ensure_int64, ensure_platform_int, is_bool, is_datetimelike,
+ is_integer_dtype, is_interval_dtype, is_numeric_dtype, is_scalar)
+from pandas.core.dtypes.missing import isna, notna
+
+import pandas.core.algorithms as algorithms
+from pandas.core.arrays import Categorical
+from pandas.core.base import DataError, SpecificationError
+import pandas.core.common as com
+from pandas.core.frame import DataFrame
+from pandas.core.generic import NDFrame, _shared_docs
+from pandas.core.groupby import base
+from pandas.core.groupby.groupby import (
+ GroupBy, _apply_docs, _transform_template)
+from pandas.core.index import CategoricalIndex, Index, MultiIndex
+import pandas.core.indexes.base as ibase
+from pandas.core.internals import BlockManager, make_block
+from pandas.core.panel import Panel
+from pandas.core.series import Series
+
+from pandas.plotting._core import boxplot_frame_groupby
+
+
+class NDFrameGroupBy(GroupBy):
+
+ def _iterate_slices(self):
+ if self.axis == 0:
+ # kludge
+ if self._selection is None:
+ slice_axis = self.obj.columns
+ else:
+ slice_axis = self._selection_list
+ slicer = lambda x: self.obj[x]
+ else:
+ slice_axis = self.obj.index
+ slicer = self.obj.xs
+
+ for val in slice_axis:
+ if val in self.exclusions:
+ continue
+ yield val, slicer(val)
+
+ def _cython_agg_general(self, how, alt=None, numeric_only=True,
+ min_count=-1):
+ new_items, new_blocks = self._cython_agg_blocks(
+ how, alt=alt, numeric_only=numeric_only, min_count=min_count)
+ return self._wrap_agged_blocks(new_items, new_blocks)
+
+ def _wrap_agged_blocks(self, items, blocks):
+ obj = self._obj_with_exclusions
+
+ new_axes = list(obj._data.axes)
+
+ # more kludge
+ if self.axis == 0:
+ new_axes[0], new_axes[1] = new_axes[1], self.grouper.result_index
+ else:
+ new_axes[self.axis] = self.grouper.result_index
+
+ # Make sure block manager integrity check passes.
+ assert new_axes[0].equals(items)
+ new_axes[0] = items
+
+ mgr = BlockManager(blocks, new_axes)
+
+ new_obj = type(obj)(mgr)
+
+ return self._post_process_cython_aggregate(new_obj)
+
+ _block_agg_axis = 0
+
+ def _cython_agg_blocks(self, how, alt=None, numeric_only=True,
+ min_count=-1):
+ # TODO: the actual managing of mgr_locs is a PITA
+ # here, it should happen via BlockManager.combine
+
+ data, agg_axis = self._get_data_to_aggregate()
+
+ if numeric_only:
+ data = data.get_numeric_data(copy=False)
+
+ new_blocks = []
+ new_items = []
+ deleted_items = []
+ for block in data.blocks:
+
+ locs = block.mgr_locs.as_array
+ try:
+ result, _ = self.grouper.aggregate(
+ block.values, how, axis=agg_axis, min_count=min_count)
+ except NotImplementedError:
+ # generally if we have numeric_only=False
+ # and non-applicable functions
+ # try to python agg
+
+ if alt is None:
+ # we cannot perform the operation
+ # in an alternate way, exclude the block
+ deleted_items.append(locs)
+ continue
+
+ # call our grouper again with only this block
+ from pandas.core.groupby.groupby import groupby
+
+ obj = self.obj[data.items[locs]]
+ s = groupby(obj, self.grouper)
+ result = s.aggregate(lambda x: alt(x, axis=self.axis))
+
+ finally:
+
+ # see if we can cast the block back to the original dtype
+ result = block._try_coerce_and_cast_result(result)
+ newb = block.make_block(result)
+
+ new_items.append(locs)
+ new_blocks.append(newb)
+
+ if len(new_blocks) == 0:
+ raise DataError('No numeric types to aggregate')
+
+ # reset the locs in the blocks to correspond to our
+ # current ordering
+ indexer = np.concatenate(new_items)
+ new_items = data.items.take(np.sort(indexer))
+
+ if len(deleted_items):
+
+ # we need to adjust the indexer to account for the
+ # items we have removed
+ # really should be done in internals :<
+
+ deleted = np.concatenate(deleted_items)
+ ai = np.arange(len(data))
+ mask = np.zeros(len(data))
+ mask[deleted] = 1
+ indexer = (ai - mask.cumsum())[indexer]
+
+ offset = 0
+ for b in new_blocks:
+ loc = len(b.mgr_locs)
+ b.mgr_locs = indexer[offset:(offset + loc)]
+ offset += loc
+
+ return new_items, new_blocks
+
+ def _get_data_to_aggregate(self):
+ obj = self._obj_with_exclusions
+ if self.axis == 0:
+ return obj.swapaxes(0, 1)._data, 1
+ else:
+ return obj._data, self.axis
+
+ def _post_process_cython_aggregate(self, obj):
+ # undoing kludge from below
+ if self.axis == 0:
+ obj = obj.swapaxes(0, 1)
+ return obj
+
+ def aggregate(self, arg, *args, **kwargs):
+
+ _level = kwargs.pop('_level', None)
+ result, how = self._aggregate(arg, _level=_level, *args, **kwargs)
+ if how is None:
+ return result
+
+ if result is None:
+
+ # grouper specific aggregations
+ if self.grouper.nkeys > 1:
+ return self._python_agg_general(arg, *args, **kwargs)
+ else:
+
+ # try to treat as if we are passing a list
+ try:
+ assert not args and not kwargs
+ result = self._aggregate_multiple_funcs(
+ [arg], _level=_level, _axis=self.axis)
+ result.columns = Index(
+ result.columns.levels[0],
+ name=self._selected_obj.columns.name)
+ except Exception:
+ result = self._aggregate_generic(arg, *args, **kwargs)
+
+ if not self.as_index:
+ self._insert_inaxis_grouper_inplace(result)
+ result.index = np.arange(len(result))
+
+ return result._convert(datetime=True)
+
+ agg = aggregate
+
+ def _aggregate_generic(self, func, *args, **kwargs):
+ if self.grouper.nkeys != 1:
+ raise AssertionError('Number of keys must be 1')
+
+ axis = self.axis
+ obj = self._obj_with_exclusions
+
+ result = {}
+ if axis != obj._info_axis_number:
+ try:
+ for name, data in self:
+ result[name] = self._try_cast(func(data, *args, **kwargs),
+ data)
+ except Exception:
+ return self._aggregate_item_by_item(func, *args, **kwargs)
+ else:
+ for name in self.indices:
+ try:
+ data = self.get_group(name, obj=obj)
+ result[name] = self._try_cast(func(data, *args, **kwargs),
+ data)
+ except Exception:
+ wrapper = lambda x: func(x, *args, **kwargs)
+ result[name] = data.apply(wrapper, axis=axis)
+
+ return self._wrap_generic_output(result, obj)
+
+ def _wrap_aggregated_output(self, output, names=None):
+ raise AbstractMethodError(self)
+
+ def _aggregate_item_by_item(self, func, *args, **kwargs):
+ # only for axis==0
+
+ obj = self._obj_with_exclusions
+ result = {}
+ cannot_agg = []
+ errors = None
+ for item in obj:
+ try:
+ data = obj[item]
+ colg = SeriesGroupBy(data, selection=item,
+ grouper=self.grouper)
+ result[item] = self._try_cast(
+ colg.aggregate(func, *args, **kwargs), data)
+ except ValueError:
+ cannot_agg.append(item)
+ continue
+ except TypeError as e:
+ cannot_agg.append(item)
+ errors = e
+ continue
+
+ result_columns = obj.columns
+ if cannot_agg:
+ result_columns = result_columns.drop(cannot_agg)
+
+ # GH6337
+ if not len(result_columns) and errors is not None:
+ raise errors
+
+ return DataFrame(result, columns=result_columns)
+
+ def _decide_output_index(self, output, labels):
+ if len(output) == len(labels):
+ output_keys = labels
+ else:
+ output_keys = sorted(output)
+ try:
+ output_keys.sort()
+ except Exception: # pragma: no cover
+ pass
+
+ if isinstance(labels, MultiIndex):
+ output_keys = MultiIndex.from_tuples(output_keys,
+ names=labels.names)
+
+ return output_keys
+
+ def _wrap_applied_output(self, keys, values, not_indexed_same=False):
+ from pandas.core.index import _all_indexes_same
+ from pandas.core.tools.numeric import to_numeric
+
+ if len(keys) == 0:
+ return DataFrame(index=keys)
+
+ key_names = self.grouper.names
+
+ # GH12824.
+ def first_not_none(values):
+ try:
+ return next(com._not_none(*values))
+ except StopIteration:
+ return None
+
+ v = first_not_none(values)
+
+ if v is None:
+ # GH9684. If all values are None, then this will throw an error.
+ # We'd prefer it return an empty dataframe.
+ return DataFrame()
+ elif isinstance(v, DataFrame):
+ return self._concat_objects(keys, values,
+ not_indexed_same=not_indexed_same)
+ elif self.grouper.groupings is not None:
+ if len(self.grouper.groupings) > 1:
+ key_index = self.grouper.result_index
+
+ else:
+ ping = self.grouper.groupings[0]
+ if len(keys) == ping.ngroups:
+ key_index = ping.group_index
+ key_index.name = key_names[0]
+
+ key_lookup = Index(keys)
+ indexer = key_lookup.get_indexer(key_index)
+
+ # reorder the values
+ values = [values[i] for i in indexer]
+ else:
+
+ key_index = Index(keys, name=key_names[0])
+
+ # don't use the key indexer
+ if not self.as_index:
+ key_index = None
+
+ # make Nones an empty object
+ v = first_not_none(values)
+ if v is None:
+ return DataFrame()
+ elif isinstance(v, NDFrame):
+ values = [
+ x if x is not None else
+ v._constructor(**v._construct_axes_dict())
+ for x in values
+ ]
+
+ v = values[0]
+
+ if isinstance(v, (np.ndarray, Index, Series)):
+ if isinstance(v, Series):
+ applied_index = self._selected_obj._get_axis(self.axis)
+ all_indexed_same = _all_indexes_same([
+ x.index for x in values
+ ])
+ singular_series = (len(values) == 1 and
+ applied_index.nlevels == 1)
+
+ # GH3596
+ # provide a reduction (Frame -> Series) if groups are
+ # unique
+ if self.squeeze:
+
+ # assign the name to this series
+ if singular_series:
+ values[0].name = keys[0]
+
+ # GH2893
+ # we have series in the values array, we want to
+ # produce a series:
+ # if any of the sub-series are not indexed the same
+ # OR we don't have a multi-index and we have only a
+                    # single value
+ return self._concat_objects(
+ keys, values, not_indexed_same=not_indexed_same
+ )
+
+ # still a series
+ # path added as of GH 5545
+ elif all_indexed_same:
+ from pandas.core.reshape.concat import concat
+ return concat(values)
+
+ if not all_indexed_same:
+ # GH 8467
+ return self._concat_objects(
+ keys, values, not_indexed_same=True,
+ )
+
+ try:
+ if self.axis == 0:
+ # GH6124 if the list of Series have a consistent name,
+ # then propagate that name to the result.
+ index = v.index.copy()
+ if index.name is None:
+ # Only propagate the series name to the result
+ # if all series have a consistent name. If the
+ # series do not have a consistent name, do
+ # nothing.
+ names = {v.name for v in values}
+ if len(names) == 1:
+ index.name = list(names)[0]
+
+                    # normally use vstack as it's faster than concat
+ # and if we have mi-columns
+ if (isinstance(v.index, MultiIndex) or
+ key_index is None or
+ isinstance(key_index, MultiIndex)):
+ stacked_values = np.vstack([
+ np.asarray(v) for v in values
+ ])
+ result = DataFrame(stacked_values, index=key_index,
+ columns=index)
+ else:
+ # GH5788 instead of stacking; concat gets the
+ # dtypes correct
+ from pandas.core.reshape.concat import concat
+ result = concat(values, keys=key_index,
+ names=key_index.names,
+ axis=self.axis).unstack()
+ result.columns = index
+ else:
+ stacked_values = np.vstack([np.asarray(v)
+ for v in values])
+ result = DataFrame(stacked_values.T, index=v.index,
+ columns=key_index)
+
+ except (ValueError, AttributeError):
+                # GH1738: values is a list of arrays of unequal lengths; fall
+                # through to the outer else clause
+ return Series(values, index=key_index,
+ name=self._selection_name)
+
+ # if we have date/time like in the original, then coerce dates
+ # as we are stacking can easily have object dtypes here
+ so = self._selected_obj
+ if (so.ndim == 2 and so.dtypes.apply(is_datetimelike).any()):
+ result = result.apply(
+ lambda x: to_numeric(x, errors='ignore'))
+ date_cols = self._selected_obj.select_dtypes(
+ include=['datetime', 'timedelta']).columns
+ date_cols = date_cols.intersection(result.columns)
+ result[date_cols] = (result[date_cols]
+ ._convert(datetime=True,
+ coerce=True))
+ else:
+ result = result._convert(datetime=True)
+
+ return self._reindex_output(result)
+
+ # values are not series or array-like but scalars
+ else:
+ # only coerce dates if we find at least 1 datetime
+ coerce = any(isinstance(x, Timestamp) for x in values)
+ # self._selection_name not passed through to Series as the
+ # result should not take the name of original selection
+ # of columns
+ return (Series(values, index=key_index)
+ ._convert(datetime=True,
+ coerce=coerce))
+
+ else:
+ # Handle cases like BinGrouper
+ return self._concat_objects(keys, values,
+ not_indexed_same=not_indexed_same)
+
+ def _transform_general(self, func, *args, **kwargs):
+ from pandas.core.reshape.concat import concat
+
+ applied = []
+ obj = self._obj_with_exclusions
+ gen = self.grouper.get_iterator(obj, axis=self.axis)
+ fast_path, slow_path = self._define_paths(func, *args, **kwargs)
+
+ path = None
+ for name, group in gen:
+ object.__setattr__(group, 'name', name)
+
+ if path is None:
+ # Try slow path and fast path.
+ try:
+ path, res = self._choose_path(fast_path, slow_path, group)
+ except TypeError:
+ return self._transform_item_by_item(obj, fast_path)
+ except ValueError:
+ msg = 'transform must return a scalar value for each group'
+ raise ValueError(msg)
+ else:
+ res = path(group)
+
+ if isinstance(res, Series):
+
+ # we need to broadcast across the
+ # other dimension; this will preserve dtypes
+ # GH14457
+ if not np.prod(group.shape):
+ continue
+ elif res.index.is_(obj.index):
+ r = concat([res] * len(group.columns), axis=1)
+ r.columns = group.columns
+ r.index = group.index
+ else:
+ r = DataFrame(
+ np.concatenate([res.values] * len(group.index)
+ ).reshape(group.shape),
+ columns=group.columns, index=group.index)
+
+ applied.append(r)
+ else:
+ applied.append(res)
+
+ concat_index = obj.columns if self.axis == 0 else obj.index
+ concatenated = concat(applied, join_axes=[concat_index],
+ axis=self.axis, verify_integrity=False)
+ return self._set_result_index_ordered(concatenated)
+
+ @Substitution(klass='DataFrame', selected='')
+ @Appender(_transform_template)
+ def transform(self, func, *args, **kwargs):
+
+ # optimized transforms
+ func = self._is_cython_func(func) or func
+ if isinstance(func, compat.string_types):
+ if func in base.cython_transforms:
+ # cythonized transform
+ return getattr(self, func)(*args, **kwargs)
+ else:
+ # cythonized aggregation and merge
+ result = getattr(self, func)(*args, **kwargs)
+ else:
+ return self._transform_general(func, *args, **kwargs)
+
+ # a reduction transform
+ if not isinstance(result, DataFrame):
+ return self._transform_general(func, *args, **kwargs)
+
+ obj = self._obj_with_exclusions
+
+        # nuisance columns
+ if not result.columns.equals(obj.columns):
+ return self._transform_general(func, *args, **kwargs)
+
+ return self._transform_fast(result, obj, func)
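End-user view of the two paths taken above; a string name goes through the cythonized branch and ``_transform_fast``, while a callable falls back to ``_transform_general`` (outputs shown are illustrative):

    >>> df = pd.DataFrame({'g': ['a', 'a', 'b'], 'x': [1.0, 3.0, 5.0]})
    >>> df.groupby('g').transform('mean')                    # cythonized, broadcast back
         x
    0  2.0
    1  2.0
    2  5.0
    >>> df.groupby('g').transform(lambda s: s - s.mean())    # general path
         x
    0 -1.0
    1  1.0
    2  0.0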
+
+ def _transform_fast(self, result, obj, func_nm):
+ """
+ Fast transform path for aggregations
+ """
+ # if there were groups with no observations (Categorical only?)
+ # try casting data to original dtype
+ cast = self._transform_should_cast(func_nm)
+
+        # for each col, reshape to the size of the original frame
+        # by a take operation
+ ids, _, ngroup = self.grouper.group_info
+ output = []
+ for i, _ in enumerate(result.columns):
+ res = algorithms.take_1d(result.iloc[:, i].values, ids)
+ if cast:
+ res = self._try_cast(res, obj.iloc[:, i])
+ output.append(res)
+
+ return DataFrame._from_arrays(output, columns=result.columns,
+ index=obj.index)
+
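The broadcast in ``_transform_fast`` is essentially a ``take`` on the group ids; a minimal NumPy sketch with invented arrays:

    import numpy as np

    agg = np.array([10.0, 20.0])      # one aggregated value per group
    ids = np.array([0, 1, 0, 1, 1])   # group id of every original row
    print(agg.take(ids))              # [10. 20. 10. 20. 20.]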
+ def _define_paths(self, func, *args, **kwargs):
+ if isinstance(func, compat.string_types):
+ fast_path = lambda group: getattr(group, func)(*args, **kwargs)
+ slow_path = lambda group: group.apply(
+ lambda x: getattr(x, func)(*args, **kwargs), axis=self.axis)
+ else:
+ fast_path = lambda group: func(group, *args, **kwargs)
+ slow_path = lambda group: group.apply(
+ lambda x: func(x, *args, **kwargs), axis=self.axis)
+ return fast_path, slow_path
+
+ def _choose_path(self, fast_path, slow_path, group):
+ path = slow_path
+ res = slow_path(group)
+
+ # if we make it here, test if we can use the fast path
+ try:
+ res_fast = fast_path(group)
+
+ # verify fast path does not change columns (and names), otherwise
+ # its results cannot be joined with those of the slow path
+ if res_fast.columns != group.columns:
+ return path, res
+ # verify numerical equality with the slow path
+ if res.shape == res_fast.shape:
+ res_r = res.values.ravel()
+ res_fast_r = res_fast.values.ravel()
+ mask = notna(res_r)
+ if (res_r[mask] == res_fast_r[mask]).all():
+ path = fast_path
+ except Exception:
+ pass
+ return path, res
+
+ def _transform_item_by_item(self, obj, wrapper):
+ # iterate through columns
+ output = {}
+ inds = []
+ for i, col in enumerate(obj):
+ try:
+ output[col] = self[col].transform(wrapper)
+ inds.append(i)
+ except Exception:
+ pass
+
+ if len(output) == 0: # pragma: no cover
+ raise TypeError('Transform function invalid for data types')
+
+ columns = obj.columns
+ if len(output) < len(obj.columns):
+ columns = columns.take(inds)
+
+ return DataFrame(output, index=obj.index, columns=columns)
+
+ def filter(self, func, dropna=True, *args, **kwargs): # noqa
+ """
+ Return a copy of a DataFrame excluding elements from groups that
+ do not satisfy the boolean criterion specified by func.
+
+ Parameters
+ ----------
+        func : function
+            Function to apply to each subframe. Should return True or False.
+        dropna : bool, default True
+            Drop groups that do not pass the filter;
+            if False, groups that evaluate False are filled with NaNs.
+
+ Returns
+ -------
+ filtered : DataFrame
+
+ Notes
+ -----
+        Each subframe is endowed with the attribute 'name' in case you need to know
+ which group you are working on.
+
+ Examples
+ --------
+ >>> df = pd.DataFrame({'A' : ['foo', 'bar', 'foo', 'bar',
+ ... 'foo', 'bar'],
+ ... 'B' : [1, 2, 3, 4, 5, 6],
+ ... 'C' : [2.0, 5., 8., 1., 2., 9.]})
+ >>> grouped = df.groupby('A')
+ >>> grouped.filter(lambda x: x['B'].mean() > 3.)
+ A B C
+ 1 bar 2 5.0
+ 3 bar 4 1.0
+ 5 bar 6 9.0
+ """
+
+ indices = []
+
+ obj = self._selected_obj
+ gen = self.grouper.get_iterator(obj, axis=self.axis)
+
+ for name, group in gen:
+ object.__setattr__(group, 'name', name)
+
+ res = func(group, *args, **kwargs)
+
+ try:
+ res = res.squeeze()
+ except AttributeError: # allow e.g., scalars and frames to pass
+ pass
+
+ # interpret the result of the filter
+ if is_bool(res) or (is_scalar(res) and isna(res)):
+ if res and notna(res):
+ indices.append(self._get_index(name))
+ else:
+ # non scalars aren't allowed
+ raise TypeError("filter function returned a %s, "
+ "but expected a scalar bool" %
+ type(res).__name__)
+
+ return self._apply_filter(indices, dropna)
+
+
+class SeriesGroupBy(GroupBy):
+ #
+ # Make class defs of attributes on SeriesGroupBy whitelist
+
+ _apply_whitelist = base.series_apply_whitelist
+ for _def_str in base.whitelist_method_generator(
+ GroupBy, Series, _apply_whitelist):
+ exec(_def_str)
+
+ @property
+ def _selection_name(self):
+ """
+        Since we are a Series, we by definition only have
+        a single name, but it may be the result of a selection or
+        the name of our object.
+ """
+ if self._selection is None:
+ return self.obj.name
+ else:
+ return self._selection
+
+ _agg_see_also_doc = dedent("""
+ See Also
+ --------
+ pandas.Series.groupby.apply
+ pandas.Series.groupby.transform
+ pandas.Series.aggregate
+ """)
+
+ _agg_examples_doc = dedent("""
+ Examples
+ --------
+ >>> s = pd.Series([1, 2, 3, 4])
+
+ >>> s
+ 0 1
+ 1 2
+ 2 3
+ 3 4
+ dtype: int64
+
+ >>> s.groupby([1, 1, 2, 2]).min()
+ 1 1
+ 2 3
+ dtype: int64
+
+ >>> s.groupby([1, 1, 2, 2]).agg('min')
+ 1 1
+ 2 3
+ dtype: int64
+
+ >>> s.groupby([1, 1, 2, 2]).agg(['min', 'max'])
+ min max
+ 1 1 2
+ 2 3 4
+ """)
+
+ @Appender(_apply_docs['template']
+ .format(input='series',
+ examples=_apply_docs['series_examples']))
+ def apply(self, func, *args, **kwargs):
+ return super(SeriesGroupBy, self).apply(func, *args, **kwargs)
+
+ @Substitution(see_also=_agg_see_also_doc,
+ examples=_agg_examples_doc,
+ versionadded='',
+ klass='Series',
+ axis='')
+ @Appender(_shared_docs['aggregate'])
+ def aggregate(self, func_or_funcs, *args, **kwargs):
+ _level = kwargs.pop('_level', None)
+ if isinstance(func_or_funcs, compat.string_types):
+ return getattr(self, func_or_funcs)(*args, **kwargs)
+
+ if isinstance(func_or_funcs, compat.Iterable):
+ # Catch instances of lists / tuples
+ # but not the class list / tuple itself.
+ ret = self._aggregate_multiple_funcs(func_or_funcs,
+ (_level or 0) + 1)
+ else:
+ cyfunc = self._is_cython_func(func_or_funcs)
+ if cyfunc and not args and not kwargs:
+ return getattr(self, cyfunc)()
+
+ if self.grouper.nkeys > 1:
+ return self._python_agg_general(func_or_funcs, *args, **kwargs)
+
+ try:
+ return self._python_agg_general(func_or_funcs, *args, **kwargs)
+ except Exception:
+ result = self._aggregate_named(func_or_funcs, *args, **kwargs)
+
+ index = Index(sorted(result), name=self.grouper.names[0])
+ ret = Series(result, index=index)
+
+ if not self.as_index: # pragma: no cover
+ print('Warning, ignoring as_index=True')
+
+ # _level handled at higher
+ if not _level and isinstance(ret, dict):
+ from pandas import concat
+ ret = concat(ret, axis=1)
+ return ret
+
+ agg = aggregate
+
+ def _aggregate_multiple_funcs(self, arg, _level):
+ if isinstance(arg, dict):
+
+ # show the deprecation, but only if we
+ # have not shown a higher level one
+ # GH 15931
+ if isinstance(self._selected_obj, Series) and _level <= 1:
+ warnings.warn(
+ ("using a dict on a Series for aggregation\n"
+ "is deprecated and will be removed in a future "
+ "version"),
+ FutureWarning, stacklevel=3)
+
+ columns = list(arg.keys())
+ arg = list(arg.items())
+ elif any(isinstance(x, (tuple, list)) for x in arg):
+ arg = [(x, x) if not isinstance(x, (tuple, list)) else x
+ for x in arg]
+
+ # indicated column order
+ columns = lzip(*arg)[0]
+ else:
+ # list of functions / function names
+ columns = []
+ for f in arg:
+ if isinstance(f, compat.string_types):
+ columns.append(f)
+ else:
+ # protect against callables without names
+ columns.append(com.get_callable_name(f))
+ arg = lzip(columns, arg)
+
+ results = {}
+ for name, func in arg:
+ obj = self
+ if name in results:
+ raise SpecificationError(
+ 'Function names must be unique, found multiple named '
+ '{}'.format(name))
+
+ # reset the cache so that we
+ # only include the named selection
+ if name in self._selected_obj:
+ obj = copy.copy(obj)
+ obj._reset_cache()
+ obj._selection = name
+ results[name] = obj.aggregate(func)
+
+ if any(isinstance(x, DataFrame) for x in compat.itervalues(results)):
+ # let higher level handle
+ if _level:
+ return results
+
+ return DataFrame(results, columns=columns)
+
+ def _wrap_output(self, output, index, names=None):
+ """ common agg/transform wrapping logic """
+ output = output[self._selection_name]
+
+ if names is not None:
+ return DataFrame(output, index=index, columns=names)
+ else:
+ name = self._selection_name
+ if name is None:
+ name = self._selected_obj.name
+ return Series(output, index=index, name=name)
+
+ def _wrap_aggregated_output(self, output, names=None):
+ return self._wrap_output(output=output,
+ index=self.grouper.result_index,
+ names=names)
+
+ def _wrap_transformed_output(self, output, names=None):
+ return self._wrap_output(output=output,
+ index=self.obj.index,
+ names=names)
+
+ def _wrap_applied_output(self, keys, values, not_indexed_same=False):
+ if len(keys) == 0:
+ # GH #6265
+ return Series([], name=self._selection_name, index=keys)
+
+ def _get_index():
+ if self.grouper.nkeys > 1:
+ index = MultiIndex.from_tuples(keys, names=self.grouper.names)
+ else:
+ index = Index(keys, name=self.grouper.names[0])
+ return index
+
+ if isinstance(values[0], dict):
+ # GH #823
+ index = _get_index()
+ result = DataFrame(values, index=index).stack()
+ result.name = self._selection_name
+ return result
+
+ if isinstance(values[0], (Series, dict)):
+ return self._concat_objects(keys, values,
+ not_indexed_same=not_indexed_same)
+ elif isinstance(values[0], DataFrame):
+ # possible that Series -> DataFrame by applied function
+ return self._concat_objects(keys, values,
+ not_indexed_same=not_indexed_same)
+ else:
+ # GH #6265
+ return Series(values, index=_get_index(),
+ name=self._selection_name)
+
+ def _aggregate_named(self, func, *args, **kwargs):
+ result = {}
+
+ for name, group in self:
+ group.name = name
+ output = func(group, *args, **kwargs)
+ if isinstance(output, (Series, Index, np.ndarray)):
+ raise Exception('Must produce aggregated value')
+ result[name] = self._try_cast(output, group)
+
+ return result
+
+ @Substitution(klass='Series', selected='A.')
+ @Appender(_transform_template)
+ def transform(self, func, *args, **kwargs):
+ func = self._is_cython_func(func) or func
+
+ # if string function
+ if isinstance(func, compat.string_types):
+ if func in base.cython_transforms:
+ # cythonized transform
+ return getattr(self, func)(*args, **kwargs)
+ else:
+ # cythonized aggregation and merge
+ return self._transform_fast(
+ lambda: getattr(self, func)(*args, **kwargs), func)
+
+ # reg transform
+ klass = self._selected_obj.__class__
+ results = []
+ wrapper = lambda x: func(x, *args, **kwargs)
+ for name, group in self:
+ object.__setattr__(group, 'name', name)
+ res = wrapper(group)
+
+ if hasattr(res, 'values'):
+ res = res.values
+
+ indexer = self._get_index(name)
+ s = klass(res, indexer)
+ results.append(s)
+
+ from pandas.core.reshape.concat import concat
+ result = concat(results).sort_index()
+
+ # we will only try to coerce the result type if
+ # we have a numeric dtype, as these are *always* udfs
+        # the cython functions take a different path (and casting)
+ dtype = self._selected_obj.dtype
+ if is_numeric_dtype(dtype):
+ result = maybe_downcast_to_dtype(result, dtype)
+
+ result.name = self._selected_obj.name
+ result.index = self._selected_obj.index
+ return result
+
+ def _transform_fast(self, func, func_nm):
+ """
+ fast version of transform, only applicable to
+ builtin/cythonizable functions
+ """
+ if isinstance(func, compat.string_types):
+ func = getattr(self, func)
+
+ ids, _, ngroup = self.grouper.group_info
+ cast = self._transform_should_cast(func_nm)
+ out = algorithms.take_1d(func()._values, ids)
+ if cast:
+ out = self._try_cast(out, self.obj)
+ return Series(out, index=self.obj.index, name=self.obj.name)
+
+ def filter(self, func, dropna=True, *args, **kwargs): # noqa
+ """
+ Return a copy of a Series excluding elements from groups that
+ do not satisfy the boolean criterion specified by func.
+
+ Parameters
+ ----------
+        func : function
+            Function to apply to each group. Should return True or False.
+        dropna : bool, default True
+            Drop groups that do not pass the filter;
+            if False, groups that evaluate False are filled with NaNs.
+
+        Returns
+        -------
+        filtered : Series
+
+        Examples
+        --------
+        >>> df = pd.DataFrame({'A' : ['foo', 'bar', 'foo', 'bar',
+        ...                           'foo', 'bar'],
+        ...                    'B' : [1, 2, 3, 4, 5, 6],
+        ...                    'C' : [2.0, 5., 8., 1., 2., 9.]})
+        >>> grouped = df.groupby('A')
+        >>> df.groupby('A').B.filter(lambda x: x.mean() > 3.)
+        1    2
+        3    4
+        5    6
+        Name: B, dtype: int64
+ """
+ if isinstance(func, compat.string_types):
+ wrapper = lambda x: getattr(x, func)(*args, **kwargs)
+ else:
+ wrapper = lambda x: func(x, *args, **kwargs)
+
+ # Interpret np.nan as False.
+ def true_and_notna(x, *args, **kwargs):
+ b = wrapper(x, *args, **kwargs)
+ return b and notna(b)
+
+ try:
+ indices = [self._get_index(name) for name, group in self
+ if true_and_notna(group)]
+ except ValueError:
+ raise TypeError("the filter must return a boolean result")
+ except TypeError:
+ raise TypeError("the filter must return a boolean result")
+
+ filtered = self._apply_filter(indices, dropna)
+ return filtered
+
+ def nunique(self, dropna=True):
+ """ Returns number of unique elements in the group """
+ ids, _, _ = self.grouper.group_info
+
+ val = self.obj.get_values()
+
+ try:
+ sorter = np.lexsort((val, ids))
+ except TypeError: # catches object dtypes
+ msg = 'val.dtype must be object, got {}'.format(val.dtype)
+ assert val.dtype == object, msg
+ val, _ = algorithms.factorize(val, sort=False)
+ sorter = np.lexsort((val, ids))
+ _isna = lambda a: a == -1
+ else:
+ _isna = isna
+
+ ids, val = ids[sorter], val[sorter]
+
+ # group boundaries are where group ids change
+ # unique observations are where sorted values change
+ idx = np.r_[0, 1 + np.nonzero(ids[1:] != ids[:-1])[0]]
+ inc = np.r_[1, val[1:] != val[:-1]]
+
+ # 1st item of each group is a new unique observation
+ mask = _isna(val)
+ if dropna:
+ inc[idx] = 1
+ inc[mask] = 0
+ else:
+ inc[mask & np.r_[False, mask[:-1]]] = 0
+ inc[idx] = 1
+
+ out = np.add.reduceat(inc, idx).astype('int64', copy=False)
+ if len(ids):
+ # NaN/NaT group exists if the head of ids is -1,
+ # so remove it from res and exclude its index from idx
+ if ids[0] == -1:
+ res = out[1:]
+ idx = idx[np.flatnonzero(idx)]
+ else:
+ res = out
+ else:
+ res = out[1:]
+ ri = self.grouper.result_index
+
+ # we might have duplications among the bins
+ if len(res) != len(ri):
+ res, out = np.zeros(len(ri), dtype=out.dtype), res
+ res[ids[idx]] = out
+
+ return Series(res,
+ index=ri,
+ name=self._selection_name)
+
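The counting trick above can be worked by hand on a tiny, already-sorted example (arrays invented for illustration; the NaN handling is omitted):

    import numpy as np

    ids = np.array([0, 0, 0, 1, 1])   # group ids, sorted
    val = np.array([1, 1, 2, 3, 3])   # values, sorted within each group
    idx = np.r_[0, 1 + np.nonzero(ids[1:] != ids[:-1])[0]]   # group boundaries
    inc = np.r_[1, val[1:] != val[:-1]]                      # flags for new values
    inc[idx] = 1                      # first item of every group counts as new
    print(np.add.reduceat(inc, idx))  # [2 1]: group 0 has {1, 2}, group 1 has {3}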
+ @Appender(Series.describe.__doc__)
+ def describe(self, **kwargs):
+ result = self.apply(lambda x: x.describe(**kwargs))
+ if self.axis == 1:
+ return result.T
+ return result.unstack()
+
+ def value_counts(self, normalize=False, sort=True, ascending=False,
+ bins=None, dropna=True):
+
+ from pandas.core.reshape.tile import cut
+ from pandas.core.reshape.merge import _get_join_indexers
+
+ if bins is not None and not np.iterable(bins):
+ # scalar bins cannot be done at top level
+ # in a backward compatible way
+ return self.apply(Series.value_counts,
+ normalize=normalize,
+ sort=sort,
+ ascending=ascending,
+ bins=bins)
+
+ ids, _, _ = self.grouper.group_info
+ val = self.obj.get_values()
+
+ # groupby removes null keys from groupings
+ mask = ids != -1
+ ids, val = ids[mask], val[mask]
+
+ if bins is None:
+ lab, lev = algorithms.factorize(val, sort=True)
+ llab = lambda lab, inc: lab[inc]
+ else:
+
+ # lab is a Categorical with categories an IntervalIndex
+ lab = cut(Series(val), bins, include_lowest=True)
+ lev = lab.cat.categories
+ lab = lev.take(lab.cat.codes)
+ llab = lambda lab, inc: lab[inc]._multiindex.codes[-1]
+
+ if is_interval_dtype(lab):
+ # TODO: should we do this inside II?
+ sorter = np.lexsort((lab.left, lab.right, ids))
+ else:
+ sorter = np.lexsort((lab, ids))
+
+ ids, lab = ids[sorter], lab[sorter]
+
+ # group boundaries are where group ids change
+ idx = np.r_[0, 1 + np.nonzero(ids[1:] != ids[:-1])[0]]
+
+ # new values are where sorted labels change
+ lchanges = llab(lab, slice(1, None)) != llab(lab, slice(None, -1))
+ inc = np.r_[True, lchanges]
+ inc[idx] = True # group boundaries are also new values
+ out = np.diff(np.nonzero(np.r_[inc, True])[0]) # value counts
+
+ # num. of times each group should be repeated
+ rep = partial(np.repeat, repeats=np.add.reduceat(inc, idx))
+
+ # multi-index components
+ labels = list(map(rep, self.grouper.recons_labels)) + [llab(lab, inc)]
+ levels = [ping.group_index for ping in self.grouper.groupings] + [lev]
+ names = self.grouper.names + [self._selection_name]
+
+ if dropna:
+ mask = labels[-1] != -1
+ if mask.all():
+ dropna = False
+ else:
+ out, labels = out[mask], [label[mask] for label in labels]
+
+ if normalize:
+ out = out.astype('float')
+ d = np.diff(np.r_[idx, len(ids)])
+ if dropna:
+ m = ids[lab == -1]
+ np.add.at(d, m, -1)
+ acc = rep(d)[mask]
+ else:
+ acc = rep(d)
+ out /= acc
+
+ if sort and bins is None:
+ cat = ids[inc][mask] if dropna else ids[inc]
+ sorter = np.lexsort((out if ascending else -out, cat))
+ out, labels[-1] = out[sorter], labels[-1][sorter]
+
+ if bins is None:
+ mi = MultiIndex(levels=levels, codes=labels, names=names,
+ verify_integrity=False)
+
+ if is_integer_dtype(out):
+ out = ensure_int64(out)
+ return Series(out, index=mi, name=self._selection_name)
+
+ # for compat. with libgroupby.value_counts need to ensure every
+ # bin is present at every index level, null filled with zeros
+ diff = np.zeros(len(out), dtype='bool')
+ for lab in labels[:-1]:
+ diff |= np.r_[True, lab[1:] != lab[:-1]]
+
+ ncat, nbin = diff.sum(), len(levels[-1])
+
+ left = [np.repeat(np.arange(ncat), nbin),
+ np.tile(np.arange(nbin), ncat)]
+
+ right = [diff.cumsum() - 1, labels[-1]]
+
+ _, idx = _get_join_indexers(left, right, sort=False, how='left')
+ out = np.where(idx != -1, out[idx], 0)
+
+ if sort:
+ sorter = np.lexsort((out if ascending else -out, left[0]))
+ out, left[-1] = out[sorter], left[-1][sorter]
+
+ # build the multi-index w/ full levels
+ codes = list(map(lambda lab: np.repeat(lab[diff], nbin), labels[:-1]))
+ codes.append(left[-1])
+
+ mi = MultiIndex(levels=levels, codes=codes, names=names,
+ verify_integrity=False)
+
+ if is_integer_dtype(out):
+ out = ensure_int64(out)
+ return Series(out, index=mi, name=self._selection_name)
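End-user view of the Series this method assembles (output formatting is approximate):

    >>> df = pd.DataFrame({'key': ['a', 'a', 'a', 'b'], 'val': [1, 1, 2, 3]})
    >>> df.groupby('key')['val'].value_counts()
    key  val
    a    1      2
         2      1
    b    3      1
    Name: val, dtype: int64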
+
+ def count(self):
+ """ Compute count of group, excluding missing values """
+ ids, _, ngroups = self.grouper.group_info
+ val = self.obj.get_values()
+
+ mask = (ids != -1) & ~isna(val)
+ ids = ensure_platform_int(ids)
+ minlength = ngroups or (None if _np_version_under1p13 else 0)
+ out = np.bincount(ids[mask], minlength=minlength)
+
+ return Series(out,
+ index=self.grouper.result_index,
+ name=self._selection_name,
+ dtype='int64')
+
+ def _apply_to_column_groupbys(self, func):
+ """ return a pass thru """
+ return func(self)
+
+ def pct_change(self, periods=1, fill_method='pad', limit=None, freq=None):
+ """Calcuate pct_change of each value to previous entry in group"""
+ # TODO: Remove this conditional when #23918 is fixed
+ if freq:
+ return self.apply(lambda x: x.pct_change(periods=periods,
+ fill_method=fill_method,
+ limit=limit, freq=freq))
+ filled = getattr(self, fill_method)(limit=limit)
+ fill_grp = filled.groupby(self.grouper.labels)
+ shifted = fill_grp.shift(periods=periods, freq=freq)
+
+ return (filled / shifted) - 1
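End-user sketch of the fill/shift trick above; each value is compared with the previous entry in its own group (output illustrative):

    >>> s = pd.Series([1.0, 2.0, 4.0, 10.0, 20.0])
    >>> s.groupby([0, 0, 0, 1, 1]).pct_change()
    0    NaN
    1    1.0
    2    1.0
    3    NaN
    4    1.0
    dtype: float64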
+
+
+class DataFrameGroupBy(NDFrameGroupBy):
+
+ _apply_whitelist = base.dataframe_apply_whitelist
+
+ #
+ # Make class defs of attributes on DataFrameGroupBy whitelist.
+ for _def_str in base.whitelist_method_generator(
+ GroupBy, DataFrame, _apply_whitelist):
+ exec(_def_str)
+
+ _block_agg_axis = 1
+
+ _agg_see_also_doc = dedent("""
+ See Also
+ --------
+ pandas.DataFrame.groupby.apply
+ pandas.DataFrame.groupby.transform
+ pandas.DataFrame.aggregate
+ """)
+
+ _agg_examples_doc = dedent("""
+ Examples
+ --------
+
+ >>> df = pd.DataFrame({'A': [1, 1, 2, 2],
+ ... 'B': [1, 2, 3, 4],
+ ... 'C': np.random.randn(4)})
+
+ >>> df
+ A B C
+ 0 1 1 0.362838
+ 1 1 2 0.227877
+ 2 2 3 1.267767
+ 3 2 4 -0.562860
+
+ The aggregation is for each column.
+
+ >>> df.groupby('A').agg('min')
+ B C
+ A
+ 1 1 0.227877
+ 2 3 -0.562860
+
+ Multiple aggregations
+
+ >>> df.groupby('A').agg(['min', 'max'])
+ B C
+ min max min max
+ A
+ 1 1 2 0.227877 0.362838
+ 2 3 4 -0.562860 1.267767
+
+ Select a column for aggregation
+
+ >>> df.groupby('A').B.agg(['min', 'max'])
+ min max
+ A
+ 1 1 2
+ 2 3 4
+
+ Different aggregations per column
+
+ >>> df.groupby('A').agg({'B': ['min', 'max'], 'C': 'sum'})
+ B C
+ min max sum
+ A
+ 1 1 2 0.590716
+ 2 3 4 0.704907
+ """)
+
+ @Substitution(see_also=_agg_see_also_doc,
+ examples=_agg_examples_doc,
+ versionadded='',
+ klass='DataFrame',
+ axis='')
+ @Appender(_shared_docs['aggregate'])
+ def aggregate(self, arg, *args, **kwargs):
+ return super(DataFrameGroupBy, self).aggregate(arg, *args, **kwargs)
+
+ agg = aggregate
+
+ def _gotitem(self, key, ndim, subset=None):
+ """
+ sub-classes to define
+ return a sliced object
+
+ Parameters
+ ----------
+ key : string / list of selections
+ ndim : 1,2
+ requested ndim of result
+ subset : object, default None
+ subset to act on
+ """
+
+ if ndim == 2:
+ if subset is None:
+ subset = self.obj
+ return DataFrameGroupBy(subset, self.grouper, selection=key,
+ grouper=self.grouper,
+ exclusions=self.exclusions,
+ as_index=self.as_index,
+ observed=self.observed)
+ elif ndim == 1:
+ if subset is None:
+ subset = self.obj[key]
+ return SeriesGroupBy(subset, selection=key,
+ grouper=self.grouper)
+
+ raise AssertionError("invalid ndim for _gotitem")
+
+ def _wrap_generic_output(self, result, obj):
+ result_index = self.grouper.levels[0]
+
+ if self.axis == 0:
+ return DataFrame(result, index=obj.columns,
+ columns=result_index).T
+ else:
+ return DataFrame(result, index=obj.index,
+ columns=result_index)
+
+ def _get_data_to_aggregate(self):
+ obj = self._obj_with_exclusions
+ if self.axis == 1:
+ return obj.T._data, 1
+ else:
+ return obj._data, 1
+
+ def _insert_inaxis_grouper_inplace(self, result):
+ # zip in reverse so we can always insert at loc 0
+ izip = zip(* map(reversed, (
+ self.grouper.names,
+ self.grouper.get_group_levels(),
+ [grp.in_axis for grp in self.grouper.groupings])))
+
+ for name, lev, in_axis in izip:
+ if in_axis:
+ result.insert(0, name, lev)
+
+ def _wrap_aggregated_output(self, output, names=None):
+ agg_axis = 0 if self.axis == 1 else 1
+ agg_labels = self._obj_with_exclusions._get_axis(agg_axis)
+
+ output_keys = self._decide_output_index(output, agg_labels)
+
+ if not self.as_index:
+ result = DataFrame(output, columns=output_keys)
+ self._insert_inaxis_grouper_inplace(result)
+ result = result._consolidate()
+ else:
+ index = self.grouper.result_index
+ result = DataFrame(output, index=index, columns=output_keys)
+
+ if self.axis == 1:
+ result = result.T
+
+ return self._reindex_output(result)._convert(datetime=True)
+
+ def _wrap_transformed_output(self, output, names=None):
+ return DataFrame(output, index=self.obj.index)
+
+ def _wrap_agged_blocks(self, items, blocks):
+ if not self.as_index:
+ index = np.arange(blocks[0].values.shape[-1])
+ mgr = BlockManager(blocks, [items, index])
+ result = DataFrame(mgr)
+
+ self._insert_inaxis_grouper_inplace(result)
+ result = result._consolidate()
+ else:
+ index = self.grouper.result_index
+ mgr = BlockManager(blocks, [items, index])
+ result = DataFrame(mgr)
+
+ if self.axis == 1:
+ result = result.T
+
+ return self._reindex_output(result)._convert(datetime=True)
+
+ def _reindex_output(self, result):
+ """
+ If we have categorical groupers, then we want to make sure that
+ the output is fully reindexed to the levels. Some levels may not have
+ participated in the groupings (e.g. they may have all been
+ nan groups).
+
+ This can re-expand the output space.
+ """
+
+ # we need to re-expand the output space to accommodate all values
+ # whether observed or not in the cartesian product of our groupers
+ groupings = self.grouper.groupings
+ if groupings is None:
+ return result
+ elif len(groupings) == 1:
+ return result
+
+ # if we only care about the observed values
+ # we are done
+ elif self.observed:
+ return result
+
+ # reindexing only applies to a Categorical grouper
+ elif not any(isinstance(ping.grouper, (Categorical, CategoricalIndex))
+ for ping in groupings):
+ return result
+
+ levels_list = [ping.group_index for ping in groupings]
+ index, _ = MultiIndex.from_product(
+ levels_list, names=self.grouper.names).sortlevel()
+
+ if self.as_index:
+ d = {self.obj._get_axis_name(self.axis): index, 'copy': False}
+ return result.reindex(**d)
+
+ # GH 13204
+ # Here, the categorical in-axis groupers, which need to be fully
+ # expanded, are columns in `result`. An idea is to do:
+ # result = result.set_index(self.grouper.names)
+ # .reindex(index).reset_index()
+ # but special care has to be taken because of possible not-in-axis
+ # groupers.
+ # So, we manually select and drop the in-axis grouper columns,
+ # reindex `result`, and then reset the in-axis grouper columns.
+
+ # Select in-axis groupers
+ in_axis_grps = [(i, ping.name) for (i, ping)
+ in enumerate(groupings) if ping.in_axis]
+ g_nums, g_names = zip(*in_axis_grps)
+
+ result = result.drop(labels=list(g_names), axis=1)
+
+ # Set a temp index and reindex (possibly expanding)
+ result = result.set_index(self.grouper.result_index
+ ).reindex(index, copy=False)
+
+ # Reset in-axis grouper columns
+ # (using level numbers `g_nums` because level names may not be unique)
+ result = result.reset_index(level=g_nums)
+
+ return result.reset_index(drop=True)
+
+ def _iterate_column_groupbys(self):
+ for i, colname in enumerate(self._selected_obj.columns):
+ yield colname, SeriesGroupBy(self._selected_obj.iloc[:, i],
+ selection=colname,
+ grouper=self.grouper,
+ exclusions=self.exclusions)
+
+ def _apply_to_column_groupbys(self, func):
+ from pandas.core.reshape.concat import concat
+ return concat(
+ (func(col_groupby) for _, col_groupby
+ in self._iterate_column_groupbys()),
+ keys=self._selected_obj.columns, axis=1)
+
+ def _fill(self, direction, limit=None):
+ """Overridden method to join grouped columns in output"""
+ res = super(DataFrameGroupBy, self)._fill(direction, limit=limit)
+ output = collections.OrderedDict(
+ (grp.name, grp.grouper) for grp in self.grouper.groupings)
+
+ from pandas import concat
+ return concat((self._wrap_transformed_output(output), res), axis=1)
+
+ def count(self):
+ """ Compute count of group, excluding missing values """
+ from pandas.core.dtypes.missing import _isna_ndarraylike as _isna
+
+ data, _ = self._get_data_to_aggregate()
+ ids, _, ngroups = self.grouper.group_info
+ mask = ids != -1
+
+ val = ((mask & ~_isna(np.atleast_2d(blk.get_values())))
+ for blk in data.blocks)
+ loc = (blk.mgr_locs for blk in data.blocks)
+
+ counter = partial(
+ lib.count_level_2d, labels=ids, max_bin=ngroups, axis=1)
+ blk = map(make_block, map(counter, val), loc)
+
+ return self._wrap_agged_blocks(data.items, list(blk))
+
+ def nunique(self, dropna=True):
+ """
+ Return DataFrame with number of distinct observations per group for
+ each column.
+
+ .. versionadded:: 0.20.0
+
+ Parameters
+ ----------
+ dropna : boolean, default True
+ Don't include NaN in the counts.
+
+ Returns
+ -------
+ nunique: DataFrame
+
+ Examples
+ --------
+ >>> df = pd.DataFrame({'id': ['spam', 'egg', 'egg', 'spam',
+ ... 'ham', 'ham'],
+ ... 'value1': [1, 5, 5, 2, 5, 5],
+ ... 'value2': list('abbaxy')})
+ >>> df
+ id value1 value2
+ 0 spam 1 a
+ 1 egg 5 b
+ 2 egg 5 b
+ 3 spam 2 a
+ 4 ham 5 x
+ 5 ham 5 y
+
+ >>> df.groupby('id').nunique()
+ id value1 value2
+ id
+ egg 1 1 1
+ ham 1 1 2
+ spam 1 2 1
+
+ # check for rows with the same id but conflicting values
+ >>> df.groupby('id').filter(lambda g: (g.nunique() > 1).any())
+ id value1 value2
+ 0 spam 1 a
+ 3 spam 2 a
+ 4 ham 5 x
+ 5 ham 5 y
+ """
+
+ obj = self._selected_obj
+
+ def groupby_series(obj, col=None):
+ return SeriesGroupBy(obj,
+ selection=col,
+ grouper=self.grouper).nunique(dropna=dropna)
+
+ if isinstance(obj, Series):
+ results = groupby_series(obj)
+ else:
+ from pandas.core.reshape.concat import concat
+ results = [groupby_series(obj[col], col) for col in obj.columns]
+ results = concat(results, axis=1)
+
+ if not self.as_index:
+ results.index = ibase.default_index(len(results))
+ return results
+
+ boxplot = boxplot_frame_groupby
+
+
+class PanelGroupBy(NDFrameGroupBy):
+
+ def aggregate(self, arg, *args, **kwargs):
+ return super(PanelGroupBy, self).aggregate(arg, *args, **kwargs)
+
+ agg = aggregate
+
+ def _iterate_slices(self):
+ if self.axis == 0:
+ # kludge
+ if self._selection is None:
+ slice_axis = self._selected_obj.items
+ else:
+ slice_axis = self._selection_list
+ slicer = lambda x: self._selected_obj[x]
+ else:
+ raise NotImplementedError("axis other than 0 is not supported")
+
+ for val in slice_axis:
+ if val in self.exclusions:
+ continue
+
+ yield val, slicer(val)
+
+ def aggregate(self, arg, *args, **kwargs):
+ """
+ Aggregate using input function or dict of {column -> function}
+
+ Parameters
+ ----------
+ arg : function or dict
+ Function to use for aggregating groups. If a function, must either
+ work when passed a Panel or when passed to Panel.apply. If
+ passed a dict, the keys must be DataFrame column names
+
+ Returns
+ -------
+ aggregated : Panel
+ """
+ if isinstance(arg, compat.string_types):
+ return getattr(self, arg)(*args, **kwargs)
+
+ return self._aggregate_generic(arg, *args, **kwargs)
+
+ def _wrap_generic_output(self, result, obj):
+ if self.axis == 0:
+ new_axes = list(obj.axes)
+ new_axes[0] = self.grouper.result_index
+ elif self.axis == 1:
+ x, y, z = obj.axes
+ new_axes = [self.grouper.result_index, z, x]
+ else:
+ x, y, z = obj.axes
+ new_axes = [self.grouper.result_index, y, x]
+
+ result = Panel._from_axes(result, new_axes)
+
+ if self.axis == 1:
+ result = result.swapaxes(0, 1).swapaxes(0, 2)
+ elif self.axis == 2:
+ result = result.swapaxes(0, 2)
+
+ return result
+
+ def _aggregate_item_by_item(self, func, *args, **kwargs):
+ obj = self._obj_with_exclusions
+ result = {}
+
+ if self.axis > 0:
+ for item in obj:
+ try:
+ itemg = DataFrameGroupBy(obj[item],
+ axis=self.axis - 1,
+ grouper=self.grouper)
+ result[item] = itemg.aggregate(func, *args, **kwargs)
+ except (ValueError, TypeError):
+ raise
+ new_axes = list(obj.axes)
+ new_axes[self.axis] = self.grouper.result_index
+ return Panel._from_axes(result, new_axes)
+ else:
+ raise ValueError("axis value must be greater than 0")
+
+ def _wrap_aggregated_output(self, output, names=None):
+ raise AbstractMethodError(self)
diff --git a/contrib/python/pandas/py2/pandas/core/groupby/groupby.py b/contrib/python/pandas/py2/pandas/core/groupby/groupby.py
new file mode 100644
index 00000000000..8766fdbc297
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/core/groupby/groupby.py
@@ -0,0 +1,2110 @@
+"""
+Provide the groupby split-apply-combine paradigm. Define the GroupBy
+class providing the base-class of operations.
+
+The SeriesGroupBy and DataFrameGroupBy sub-classes
+(defined in pandas.core.groupby.generic)
+expose these user-facing objects to provide specific functionality.
+"""
+
+import collections
+from contextlib import contextmanager
+import datetime
+from functools import partial, wraps
+import types
+import warnings
+
+import numpy as np
+
+from pandas._libs import Timestamp, groupby as libgroupby
+import pandas.compat as compat
+from pandas.compat import callable, range, set_function_name, zip
+from pandas.compat.numpy import function as nv
+from pandas.errors import AbstractMethodError
+from pandas.util._decorators import Appender, Substitution, cache_readonly
+from pandas.util._validators import validate_kwargs
+
+from pandas.core.dtypes.cast import maybe_downcast_to_dtype
+from pandas.core.dtypes.common import (
+ ensure_float, is_extension_array_dtype, is_numeric_dtype, is_scalar)
+from pandas.core.dtypes.missing import isna, notna
+
+import pandas.core.algorithms as algorithms
+from pandas.core.base import (
+ DataError, GroupByError, PandasObject, SelectionMixin, SpecificationError)
+import pandas.core.common as com
+from pandas.core.config import option_context
+from pandas.core.frame import DataFrame
+from pandas.core.generic import NDFrame
+from pandas.core.groupby import base
+from pandas.core.index import Index, MultiIndex
+from pandas.core.series import Series
+from pandas.core.sorting import get_group_index_sorter
+
+_common_see_also = """
+ See Also
+ --------
+ pandas.Series.%(name)s
+ pandas.DataFrame.%(name)s
+ pandas.Panel.%(name)s
+"""
+
+_apply_docs = dict(
+ template="""
+ Apply function `func` group-wise and combine the results together.
+
+ The function passed to `apply` must take a {input} as its first
+ argument and return a DataFrame, Series or scalar. `apply` will
+ then take care of combining the results back together into a single
+ dataframe or series. `apply` is therefore a highly flexible
+ grouping method.
+
+ While `apply` is a very flexible method, its downside is that
+ using it can be quite a bit slower than using more specific methods
+ like `agg` or `transform`. Pandas offers a wide range of methods that will
+ be much faster than using `apply` for their specific purposes, so try to
+ use them before reaching for `apply`.
+
+ Parameters
+ ----------
+ func : callable
+ A callable that takes a {input} as its first argument, and
+ returns a dataframe, a series or a scalar. In addition the
+ callable may take positional and keyword arguments.
+ args, kwargs : tuple and dict
+ Optional positional and keyword arguments to pass to `func`.
+
+ Returns
+ -------
+ applied : Series or DataFrame
+
+ See Also
+ --------
+ pipe : Apply function to the full GroupBy object instead of to each
+ group.
+ aggregate : Apply aggregate function to the GroupBy object.
+ transform : Apply function column-by-column to the GroupBy object.
+ Series.apply : Apply a function to a Series.
+ DataFrame.apply : Apply a function to each row or column of a DataFrame.
+ """,
+ dataframe_examples="""
+ >>> df = pd.DataFrame({'A': 'a a b'.split(),
+ 'B': [1,2,3],
+ 'C': [4,6, 5]})
+ >>> g = df.groupby('A')
+
+ Notice that ``g`` has two groups, ``a`` and ``b``.
+ Calling `apply` in various ways, we can get different grouping results:
+
+ Example 1: below the function passed to `apply` takes a DataFrame as
+ its argument and returns a DataFrame. `apply` combines the result for
+ each group together into a new DataFrame:
+
+ >>> g[['B', 'C']].apply(lambda x: x / x.sum())
+ B C
+ 0 0.333333 0.4
+ 1 0.666667 0.6
+ 2 1.000000 1.0
+
+ Example 2: The function passed to `apply` takes a DataFrame as
+ its argument and returns a Series. `apply` combines the result for
+ each group together into a new DataFrame:
+
+ >>> g[['B', 'C']].apply(lambda x: x.max() - x.min())
+ B C
+ A
+ a 1 2
+ b 0 0
+
+ Example 3: The function passed to `apply` takes a DataFrame as
+ its argument and returns a scalar. `apply` combines the result for
+ each group together into a Series, including setting the index as
+ appropriate:
+
+ >>> g.apply(lambda x: x.C.max() - x.B.min())
+ A
+ a 5
+ b 2
+ dtype: int64
+ """,
+ series_examples="""
+ >>> s = pd.Series([0, 1, 2], index='a a b'.split())
+ >>> g = s.groupby(s.index)
+
+ From ``s`` above we can see that ``g`` has two groups, ``a`` and ``b``.
+ Calling `apply` in various ways, we can get different grouping results:
+
+ Example 1: The function passed to `apply` takes a Series as
+ its argument and returns a Series. `apply` combines the result for
+ each group together into a new Series:
+
+ >>> g.apply(lambda x: x*2 if x.name == 'b' else x/2)
+ 0 0.0
+ 1 0.5
+ 2 4.0
+ dtype: float64
+
+ Example 2: The function passed to `apply` takes a Series as
+ its argument and returns a scalar. `apply` combines the result for
+ each group together into a Series, including setting the index as
+ appropriate:
+
+ >>> g.apply(lambda x: x.max() - x.min())
+ a 1
+ b 0
+ dtype: int64
+
+ Notes
+ -----
+ In the current implementation `apply` calls `func` twice on the
+ first group to decide whether it can take a fast or slow code
+ path. This can lead to unexpected behavior if `func` has
+ side-effects, as they will take effect twice for the first
+ group.
+
+ Examples
+ --------
+ {examples}
+ """)
+
+_pipe_template = """\
+Apply a function `func` with arguments to this %(klass)s object and return
+the function's result.
+
+%(versionadded)s
+
+Use `.pipe` when you want to improve readability by chaining together
+functions that expect Series, DataFrames, GroupBy or Resampler objects.
+Instead of writing
+
+>>> h(g(f(df.groupby('group')), arg1=a), arg2=b, arg3=c)
+
+You can write
+
+>>> (df.groupby('group')
+... .pipe(f)
+... .pipe(g, arg1=a)
+... .pipe(h, arg2=b, arg3=c))
+
+which is much more readable.
+
+Parameters
+----------
+func : callable or tuple of (callable, string)
+ Function to apply to this %(klass)s object or, alternatively,
+ a `(callable, data_keyword)` tuple where `data_keyword` is a
+ string indicating the keyword of `callable` that expects the
+ %(klass)s object.
+args : iterable, optional
+ positional arguments passed into `func`.
+kwargs : dict, optional
+ a dictionary of keyword arguments passed into `func`.
+
+Returns
+-------
+object : the return type of `func`.
+
+See Also
+--------
+pandas.Series.pipe : Apply a function with arguments to a series.
+pandas.DataFrame.pipe: Apply a function with arguments to a dataframe.
+apply : Apply function to each group instead of to the
+ full %(klass)s object.
+
+Notes
+-----
+See more `here
+<http://pandas.pydata.org/pandas-docs/stable/groupby.html#piping-function-calls>`_
+
+Examples
+--------
+%(examples)s
+"""
+
+_transform_template = """
+Call function producing a like-indexed %(klass)s on each group and
+return a %(klass)s having the same indexes as the original object
+filled with the transformed values.
+
+Parameters
+----------
+f : function
+ Function to apply to each group
+
+Returns
+-------
+%(klass)s
+
+See Also
+--------
+aggregate, transform
+
+Notes
+-----
+Each group is endowed the attribute 'name' in case you need to know
+which group you are working on.
+
+The current implementation imposes three requirements on f:
+
+* f must return a value that either has the same shape as the input
+ subframe or can be broadcast to the shape of the input subframe.
+ For example, if f returns a scalar it will be broadcast to have the
+ same shape as the input subframe.
+* if this is a DataFrame, f must support application column-by-column
+ in the subframe. If f also supports application to the entire subframe,
+ then a fast path is used starting from the second chunk.
+* f must not mutate groups. Mutation is not supported and may
+ produce unexpected results.
+
+Examples
+--------
+
+# Same shape
+>>> df = pd.DataFrame({'A' : ['foo', 'bar', 'foo', 'bar',
+... 'foo', 'bar'],
+... 'B' : ['one', 'one', 'two', 'three',
+... 'two', 'two'],
+... 'C' : [1, 5, 5, 2, 5, 5],
+... 'D' : [2.0, 5., 8., 1., 2., 9.]})
+>>> grouped = df.groupby('A')
+>>> grouped.transform(lambda x: (x - x.mean()) / x.std())
+ C D
+0 -1.154701 -0.577350
+1 0.577350 0.000000
+2 0.577350 1.154701
+3 -1.154701 -1.000000
+4 0.577350 -0.577350
+5 0.577350 1.000000
+
+# Broadcastable
+>>> grouped.transform(lambda x: x.max() - x.min())
+ C D
+0 4 6.0
+1 3 8.0
+2 4 6.0
+3 3 8.0
+4 4 6.0
+5 3 8.0
+"""
+
+
+class GroupByPlot(PandasObject):
+ """
+ Class implementing the .plot attribute for groupby objects.
+ """
+
+ def __init__(self, groupby):
+ self._groupby = groupby
+
+ def __call__(self, *args, **kwargs):
+ def f(self):
+ return self.plot(*args, **kwargs)
+ f.__name__ = 'plot'
+ return self._groupby.apply(f)
+
+ def __getattr__(self, name):
+ def attr(*args, **kwargs):
+ def f(self):
+ return getattr(self.plot, name)(*args, **kwargs)
+ return self._groupby.apply(f)
+ return attr
+
+
+@contextmanager
+def _group_selection_context(groupby):
+ """
+ Set / reset the _group_selection_context.
+ """
+ groupby._set_group_selection()
+ yield groupby
+ groupby._reset_group_selection()
+
+
+class _GroupBy(PandasObject, SelectionMixin):
+ _group_selection = None
+ _apply_whitelist = frozenset()
+
+ def __init__(self, obj, keys=None, axis=0, level=None,
+ grouper=None, exclusions=None, selection=None, as_index=True,
+ sort=True, group_keys=True, squeeze=False,
+ observed=False, **kwargs):
+
+ self._selection = selection
+
+ if isinstance(obj, NDFrame):
+ obj._consolidate_inplace()
+
+ self.level = level
+
+ if not as_index:
+ if not isinstance(obj, DataFrame):
+ raise TypeError('as_index=False only valid with DataFrame')
+ if axis != 0:
+ raise ValueError('as_index=False only valid for axis=0')
+
+ self.as_index = as_index
+ self.keys = keys
+ self.sort = sort
+ self.group_keys = group_keys
+ self.squeeze = squeeze
+ self.observed = observed
+ self.mutated = kwargs.pop('mutated', False)
+
+ if grouper is None:
+ from pandas.core.groupby.grouper import _get_grouper
+ grouper, exclusions, obj = _get_grouper(obj, keys,
+ axis=axis,
+ level=level,
+ sort=sort,
+ observed=observed,
+ mutated=self.mutated)
+
+ self.obj = obj
+ self.axis = obj._get_axis_number(axis)
+ self.grouper = grouper
+ self.exclusions = set(exclusions) if exclusions else set()
+
+ # we accept no other args
+ validate_kwargs('group', kwargs, {})
+
+ def __len__(self):
+ return len(self.groups)
+
+ def __unicode__(self):
+ # TODO: Better unicode/repr for GroupBy object
+ return object.__repr__(self)
+
+ def _assure_grouper(self):
+ """
+ We create the grouper on instantiation; sub-classes may have a
+ different policy.
+ """
+ pass
+
+ @property
+ def groups(self):
+ """
+ Dict {group name -> group labels}.
+ """
+ self._assure_grouper()
+ return self.grouper.groups
+
+ @property
+ def ngroups(self):
+ self._assure_grouper()
+ return self.grouper.ngroups
+
+ @property
+ def indices(self):
+ """
+ Dict {group name -> group indices}.
+ """
+ self._assure_grouper()
+ return self.grouper.indices
+
+ def _get_indices(self, names):
+ """
+ Safely get multiple indices, translating datelike keys to their
+ underlying repr.
+ """
+
+ def get_converter(s):
+ # possibly convert to the actual key types
+ # in the indices, could be a Timestamp or a np.datetime64
+ if isinstance(s, (Timestamp, datetime.datetime)):
+ return lambda key: Timestamp(key)
+ elif isinstance(s, np.datetime64):
+ return lambda key: Timestamp(key).asm8
+ else:
+ return lambda key: key
+
+ if len(names) == 0:
+ return []
+
+ if len(self.indices) > 0:
+ index_sample = next(iter(self.indices))
+ else:
+ index_sample = None # Dummy sample
+
+ name_sample = names[0]
+ if isinstance(index_sample, tuple):
+ if not isinstance(name_sample, tuple):
+ msg = ("must supply a tuple to get_group with multiple"
+ " grouping keys")
+ raise ValueError(msg)
+ if not len(name_sample) == len(index_sample):
+ try:
+ # If the original grouper was a tuple
+ return [self.indices[name] for name in names]
+ except KeyError:
+ # turns out it wasn't a tuple
+ msg = ("must supply a same-length tuple to get_group"
+ " with multiple grouping keys")
+ raise ValueError(msg)
+
+ converters = [get_converter(s) for s in index_sample]
+ names = [tuple(f(n) for f, n in zip(converters, name))
+ for name in names]
+
+ else:
+ converter = get_converter(index_sample)
+ names = [converter(name) for name in names]
+
+ return [self.indices.get(name, []) for name in names]
+
+ def _get_index(self, name):
+ """
+ Safely get a single index, translating datelike keys to their underlying repr.
+ """
+ return self._get_indices([name])[0]
+
+ @cache_readonly
+ def _selected_obj(self):
+
+ if self._selection is None or isinstance(self.obj, Series):
+ if self._group_selection is not None:
+ return self.obj[self._group_selection]
+ return self.obj
+ else:
+ return self.obj[self._selection]
+
+ def _reset_group_selection(self):
+ """
+ Clear group based selection.
+
+ Used for methods needing to return info on each group regardless of
+ whether a group selection was previously set.
+ """
+ if self._group_selection is not None:
+ # GH12839 clear cached selection too when changing group selection
+ self._group_selection = None
+ self._reset_cache('_selected_obj')
+
+ def _set_group_selection(self):
+ """
+ Create group based selection.
+
+ Used when selection is not passed directly but instead via a grouper.
+
+ NOTE: this should be paired with a call to _reset_group_selection
+ """
+ grp = self.grouper
+ if not (self.as_index and
+ getattr(grp, 'groupings', None) is not None and
+ self.obj.ndim > 1 and
+ self._group_selection is None):
+ return
+
+ ax = self.obj._info_axis
+ groupers = [g.name for g in grp.groupings
+ if g.level is None and g.in_axis]
+
+ if len(groupers):
+ # GH12839 clear selected obj cache when group selection changes
+ self._group_selection = ax.difference(Index(groupers),
+ sort=False).tolist()
+ self._reset_cache('_selected_obj')
+
+ def _set_result_index_ordered(self, result):
+ # set the result index on the passed values object and
+ # return the new object, xref 8046
+
+ # the values/counts are repeated according to the group index
+ # shortcut if we have an already ordered grouper
+ if not self.grouper.is_monotonic:
+ index = Index(np.concatenate(
+ self._get_indices(self.grouper.result_index)))
+ result.set_axis(index, axis=self.axis, inplace=True)
+ result = result.sort_index(axis=self.axis)
+
+ result.set_axis(self.obj._get_axis(self.axis), axis=self.axis,
+ inplace=True)
+ return result
+
+ def _dir_additions(self):
+ return self.obj._dir_additions() | self._apply_whitelist
+
+ def __getattr__(self, attr):
+ if attr in self._internal_names_set:
+ return object.__getattribute__(self, attr)
+ if attr in self.obj:
+ return self[attr]
+ if hasattr(self.obj, attr):
+ return self._make_wrapper(attr)
+
+ raise AttributeError("%r object has no attribute %r" %
+ (type(self).__name__, attr))
+
+ @Substitution(klass='GroupBy',
+ versionadded='.. versionadded:: 0.21.0',
+ examples="""\
+>>> df = pd.DataFrame({'A': 'a b a b'.split(), 'B': [1, 2, 3, 4]})
+>>> df
+ A B
+0 a 1
+1 b 2
+2 a 3
+3 b 4
+
+To get the difference between each groups maximum and minimum value in one
+pass, you can do
+
+>>> df.groupby('A').pipe(lambda x: x.max() - x.min())
+ B
+A
+a 2
+b 2""")
+ @Appender(_pipe_template)
+ def pipe(self, func, *args, **kwargs):
+ return com._pipe(self, func, *args, **kwargs)
+
+ plot = property(GroupByPlot)
+
+ def _make_wrapper(self, name):
+ if name not in self._apply_whitelist:
+ is_callable = callable(getattr(self._selected_obj, name, None))
+ kind = ' callable ' if is_callable else ' '
+ msg = ("Cannot access{0}attribute {1!r} of {2!r} objects, try "
+ "using the 'apply' method".format(kind, name,
+ type(self).__name__))
+ raise AttributeError(msg)
+
+ self._set_group_selection()
+
+ # need to set up the selection,
+ # as it is not passed directly but via the grouper
+ f = getattr(self._selected_obj, name)
+ if not isinstance(f, types.MethodType):
+ return self.apply(lambda self: getattr(self, name))
+
+ f = getattr(type(self._selected_obj), name)
+
+ def wrapper(*args, **kwargs):
+ # a little trickery for aggregation functions that need an axis
+ # argument
+ kwargs_with_axis = kwargs.copy()
+ if ('axis' not in kwargs_with_axis or
+ kwargs_with_axis['axis'] is None):
+ kwargs_with_axis['axis'] = self.axis
+
+ def curried_with_axis(x):
+ return f(x, *args, **kwargs_with_axis)
+
+ def curried(x):
+ return f(x, *args, **kwargs)
+
+ # preserve the name so we can detect it when calling plot methods,
+ # to avoid duplicates
+ curried.__name__ = curried_with_axis.__name__ = name
+
+ # special case otherwise extra plots are created when catching the
+ # exception below
+ if name in base.plotting_methods:
+ return self.apply(curried)
+
+ try:
+ return self.apply(curried_with_axis)
+ except Exception:
+ try:
+ return self.apply(curried)
+ except Exception:
+
+ # related to: GH3688
+ # try item-by-item
+ # this can be called recursively, so we need to raise
+ # ValueError if we don't have this method, in order to
+ # indicate to aggregate that this column should be
+ # marked as an error
+ try:
+ return self._aggregate_item_by_item(name,
+ *args, **kwargs)
+ except (AttributeError):
+ raise ValueError
+
+ return wrapper
+
+ def get_group(self, name, obj=None):
+ """
+ Constructs NDFrame from group with provided name.
+
+ Parameters
+ ----------
+ name : object
+ the name of the group to get as a DataFrame
+ obj : NDFrame, default None
+ the NDFrame to take the DataFrame out of. If
+ it is None, the object groupby was called on will
+ be used
+
+ Returns
+ -------
+ group : same type as obj
+ """
+ if obj is None:
+ obj = self._selected_obj
+
+ inds = self._get_index(name)
+ if not len(inds):
+ raise KeyError(name)
+
+ return obj._take(inds, axis=self.axis)
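+
+ # Usage sketch (toy frame, illustrative column names):
+ #
+ # >>> df = pd.DataFrame({'key': list('aab'), 'val': [1, 2, 3]})
+ # >>> df.groupby('key').get_group('a')
+ #   key  val
+ # 0   a    1
+ # 1   a    2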
+
+ def __iter__(self):
+ """
+ Groupby iterator.
+
+ Returns
+ -------
+ Generator yielding sequence of (name, subsetted object)
+ for each group
+ """
+ return self.grouper.get_iterator(self.obj, axis=self.axis)
+
+ @Appender(_apply_docs['template']
+ .format(input="dataframe",
+ examples=_apply_docs['dataframe_examples']))
+ def apply(self, func, *args, **kwargs):
+
+ func = self._is_builtin_func(func)
+
+ # this is needed so we don't try and wrap strings. If we could
+ # resolve functions to their callable functions prior, this
+ # wouldn't be needed
+ if args or kwargs:
+ if callable(func):
+
+ @wraps(func)
+ def f(g):
+ with np.errstate(all='ignore'):
+ return func(g, *args, **kwargs)
+ else:
+ raise ValueError('func must be a callable if args or '
+ 'kwargs are supplied')
+ else:
+ f = func
+
+ # ignore SettingWithCopy here in case the user mutates
+ with option_context('mode.chained_assignment', None):
+ try:
+ result = self._python_apply_general(f)
+ except Exception:
+
+ # gh-20949
+ # try again, with .apply acting as a filtering
+ # operation, by excluding the grouping column
+ # This would normally not be triggered
+ # except if the udf is trying an operation that
+ # fails on *some* columns, e.g. a numeric operation
+ # on a string grouper column
+
+ with _group_selection_context(self):
+ return self._python_apply_general(f)
+
+ return result
+
+ def _python_apply_general(self, f):
+ keys, values, mutated = self.grouper.apply(f, self._selected_obj,
+ self.axis)
+
+ return self._wrap_applied_output(
+ keys,
+ values,
+ not_indexed_same=mutated or self.mutated)
+
+ def _iterate_slices(self):
+ yield self._selection_name, self._selected_obj
+
+ def transform(self, func, *args, **kwargs):
+ raise AbstractMethodError(self)
+
+ def _cumcount_array(self, ascending=True):
+ """
+ Parameters
+ ----------
+ ascending : bool, default True
+ If False, number in reverse, from length of group - 1 to 0.
+
+ Notes
+ -----
+ this is currently implementing sort=False
+ (though the default is sort=True) for groupby in general
+ """
+ ids, _, ngroups = self.grouper.group_info
+ sorter = get_group_index_sorter(ids, ngroups)
+ ids, count = ids[sorter], len(ids)
+
+ if count == 0:
+ return np.empty(0, dtype=np.int64)
+
+ run = np.r_[True, ids[:-1] != ids[1:]]
+ rep = np.diff(np.r_[np.nonzero(run)[0], count])
+ out = (~run).cumsum()
+
+ if ascending:
+ out -= np.repeat(out[run], rep)
+ else:
+ out = np.repeat(out[np.r_[run[1:], True]], rep) - out
+
+ rev = np.empty(count, dtype=np.intp)
+ rev[sorter] = np.arange(count, dtype=np.intp)
+ return out[rev].astype(np.int64, copy=False)
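+
+ # Small worked example of the run-length trick above for the ascending
+ # case, on an already-sorted ``ids`` array (illustrative values only):
+ #
+ # >>> ids = np.array([0, 0, 0, 1, 1])
+ # >>> run = np.r_[True, ids[:-1] != ids[1:]]
+ # >>> rep = np.diff(np.r_[np.nonzero(run)[0], len(ids)])
+ # >>> out = (~run).cumsum()
+ # >>> out - np.repeat(out[run], rep)
+ # array([0, 1, 2, 0, 1])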
+
+ def _try_cast(self, result, obj, numeric_only=False):
+ """
+ Try to cast the result to our obj's original type; we may have
+ roundtripped through object in the meantime.
+
+ If numeric_only is True, then only try to cast numerics
+ and not datetimelikes.
+
+ """
+ if obj.ndim > 1:
+ dtype = obj._values.dtype
+ else:
+ dtype = obj.dtype
+
+ if not is_scalar(result):
+ if is_extension_array_dtype(dtype):
+ # The function can return something of any type, so check
+ # if the type is compatible with the calling EA.
+ try:
+ result = obj._values._from_sequence(result, dtype=dtype)
+ except Exception:
+ # https://github.com/pandas-dev/pandas/issues/22850
+ # pandas has no control over what 3rd-party ExtensionArrays
+ # do in _values_from_sequence. We still want ops to work
+ # though, so we catch any regular Exception.
+ pass
+ elif numeric_only and is_numeric_dtype(dtype) or not numeric_only:
+ result = maybe_downcast_to_dtype(result, dtype)
+
+ return result
+
+ def _transform_should_cast(self, func_nm):
+ """
+ Parameters
+ ----------
+ func_nm : str
+ The name of the aggregation function being performed
+
+ Returns
+ -------
+ bool
+ Whether transform should attempt to cast the result of aggregation
+ """
+ return (self.size().fillna(0) > 0).any() and (
+ func_nm not in base.cython_cast_blacklist)
+
+ def _cython_transform(self, how, numeric_only=True, **kwargs):
+ output = collections.OrderedDict()
+ for name, obj in self._iterate_slices():
+ is_numeric = is_numeric_dtype(obj.dtype)
+ if numeric_only and not is_numeric:
+ continue
+
+ try:
+ result, names = self.grouper.transform(obj.values, how,
+ **kwargs)
+ except NotImplementedError:
+ continue
+ except AssertionError as e:
+ raise GroupByError(str(e))
+ if self._transform_should_cast(how):
+ output[name] = self._try_cast(result, obj)
+ else:
+ output[name] = result
+
+ if len(output) == 0:
+ raise DataError('No numeric types to aggregate')
+
+ return self._wrap_transformed_output(output, names)
+
+ def _cython_agg_general(self, how, alt=None, numeric_only=True,
+ min_count=-1):
+ output = {}
+ for name, obj in self._iterate_slices():
+ is_numeric = is_numeric_dtype(obj.dtype)
+ if numeric_only and not is_numeric:
+ continue
+
+ try:
+ result, names = self.grouper.aggregate(obj.values, how,
+ min_count=min_count)
+ except AssertionError as e:
+ raise GroupByError(str(e))
+ output[name] = self._try_cast(result, obj)
+
+ if len(output) == 0:
+ raise DataError('No numeric types to aggregate')
+
+ return self._wrap_aggregated_output(output, names)
+
+ def _python_agg_general(self, func, *args, **kwargs):
+ func = self._is_builtin_func(func)
+ f = lambda x: func(x, *args, **kwargs)
+
+ # iterate through "columns" ex exclusions to populate output dict
+ output = {}
+ for name, obj in self._iterate_slices():
+ try:
+ result, counts = self.grouper.agg_series(obj, f)
+ output[name] = self._try_cast(result, obj, numeric_only=True)
+ except TypeError:
+ continue
+
+ if len(output) == 0:
+ return self._python_apply_general(f)
+
+ if self.grouper._filter_empty_groups:
+
+ mask = counts.ravel() > 0
+ for name, result in compat.iteritems(output):
+
+ # since we are masking, make sure that we have a float object
+ values = result
+ if is_numeric_dtype(values.dtype):
+ values = ensure_float(values)
+
+ output[name] = self._try_cast(values[mask], result)
+
+ return self._wrap_aggregated_output(output)
+
+ def _wrap_applied_output(self, *args, **kwargs):
+ raise AbstractMethodError(self)
+
+ def _concat_objects(self, keys, values, not_indexed_same=False):
+ from pandas.core.reshape.concat import concat
+
+ def reset_identity(values):
+ # reset the identities of the components
+ # of the values to prevent aliasing
+ for v in com._not_none(*values):
+ ax = v._get_axis(self.axis)
+ ax._reset_identity()
+ return values
+
+ if not not_indexed_same:
+ result = concat(values, axis=self.axis)
+ ax = self._selected_obj._get_axis(self.axis)
+
+ if isinstance(result, Series):
+ result = result.reindex(ax)
+ else:
+
+ # this is a very unfortunate situation
+ # we have a multi-index that is NOT lexsorted
+ # and we have a result which is duplicated
+ # we can't reindex, so we resort to this
+ # GH 14776
+ if isinstance(ax, MultiIndex) and not ax.is_unique:
+ indexer = algorithms.unique1d(
+ result.index.get_indexer_for(ax.values))
+ result = result.take(indexer, axis=self.axis)
+ else:
+ result = result.reindex(ax, axis=self.axis)
+
+ elif self.group_keys:
+
+ values = reset_identity(values)
+ if self.as_index:
+
+ # possible MI return case
+ group_keys = keys
+ group_levels = self.grouper.levels
+ group_names = self.grouper.names
+
+ result = concat(values, axis=self.axis, keys=group_keys,
+ levels=group_levels, names=group_names,
+ sort=False)
+ else:
+
+ # GH5610, returns a MI, with the first level being a
+ # range index
+ keys = list(range(len(values)))
+ result = concat(values, axis=self.axis, keys=keys)
+ else:
+ values = reset_identity(values)
+ result = concat(values, axis=self.axis)
+
+ if (isinstance(result, Series) and
+ getattr(self, '_selection_name', None) is not None):
+
+ result.name = self._selection_name
+
+ return result
+
+ def _apply_filter(self, indices, dropna):
+ if len(indices) == 0:
+ indices = np.array([], dtype='int64')
+ else:
+ indices = np.sort(np.concatenate(indices))
+ if dropna:
+ filtered = self._selected_obj.take(indices, axis=self.axis)
+ else:
+ mask = np.empty(len(self._selected_obj.index), dtype=bool)
+ mask.fill(False)
+ mask[indices.astype(int)] = True
+ # mask fails to broadcast when passed to where; broadcast manually.
+ mask = np.tile(mask, list(self._selected_obj.shape[1:]) + [1]).T
+ filtered = self._selected_obj.where(mask) # Fill with NaNs.
+ return filtered
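+
+ # How this surfaces through ``filter`` in user code (toy frame; the
+ # column names are illustrative):
+ #
+ # >>> df = pd.DataFrame({'key': list('aab'), 'val': [1, 2, 3]})
+ # >>> df.groupby('key').filter(lambda g: len(g) > 1)
+ #   key  val
+ # 0   a    1
+ # 1   a    2
+ # With ``dropna=False`` the dropped rows are kept but filled with NaN
+ # instead of being removed.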
+
+
+class GroupBy(_GroupBy):
+
+ """
+ Class for grouping and aggregating relational data.
+
+ See aggregate, transform, and apply functions on this object.
+
+ It's easiest to use obj.groupby(...) to construct a GroupBy object, but you can also do:
+
+ ::
+
+ grouped = groupby(obj, ...)
+
+ Parameters
+ ----------
+ obj : pandas object
+ axis : int, default 0
+ level : int, default None
+ Level of MultiIndex
+ groupings : list of Grouping objects
+ Most users should ignore this
+ exclusions : array-like, optional
+ List of columns to exclude
+ name : string
+ Most users should ignore this
+
+ Returns
+ -------
+ **Attributes**
+ groups : dict
+ {group name -> group labels}
+ len(grouped) : int
+ Number of groups
+
+ Notes
+ -----
+ After grouping, see aggregate, apply, and transform functions. Here are
+ some other brief notes about usage. When grouping by multiple groups, the
+ result index will be a MultiIndex (hierarchical) by default.
+
+ Iteration produces (key, group) tuples, i.e. chunking the data by group. So
+ you can write code like:
+
+ ::
+
+ grouped = obj.groupby(keys, axis=axis)
+ for key, group in grouped:
+ # do something with the data
+
+ Function calls on GroupBy, if not specially implemented, "dispatch" to the
+ grouped data. So if you group a DataFrame and wish to invoke the std()
+ method on each group, you can simply do:
+
+ ::
+
+ df.groupby(mapper).std()
+
+ rather than
+
+ ::
+
+ df.groupby(mapper).aggregate(np.std)
+
+ You can pass arguments to these "wrapped" functions, too.
+
+ See the online documentation for full exposition on these topics and much
+ more
+ """
+ def _bool_agg(self, val_test, skipna):
+ """
+ Shared func to call any / all Cython GroupBy implementations.
+ """
+
+ def objs_to_bool(vals):
+ try:
+ vals = vals.astype(np.bool)
+ except ValueError: # for objects
+ vals = np.array([bool(x) for x in vals])
+
+ return vals.view(np.uint8)
+
+ def result_to_bool(result):
+ return result.astype(np.bool, copy=False)
+
+ return self._get_cythonized_result('group_any_all', self.grouper,
+ aggregate=True,
+ cython_dtype=np.uint8,
+ needs_values=True,
+ needs_mask=True,
+ pre_processing=objs_to_bool,
+ post_processing=result_to_bool,
+ val_test=val_test, skipna=skipna)
+
+ @Substitution(name='groupby')
+ @Appender(_common_see_also)
+ def any(self, skipna=True):
+ """
+ Return True if any value in the group is truthy, else False.
+
+ Parameters
+ ----------
+ skipna : bool, default True
+ Flag to ignore nan values during truth testing
+ """
+ return self._bool_agg('any', skipna)
+
+ @Substitution(name='groupby')
+ @Appender(_common_see_also)
+ def all(self, skipna=True):
+ """
+ Return True if all values in the group are truthy, else False.
+
+ Parameters
+ ----------
+ skipna : bool, default True
+ Flag to ignore nan values during truth testing
+ """
+ return self._bool_agg('all', skipna)
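+
+ # Usage sketch for the two boolean reductions above (toy frame,
+ # illustrative names):
+ #
+ # >>> df = pd.DataFrame({'key': ['a', 'a', 'b'], 'val': [True, False, True]})
+ # >>> df.groupby('key')['val'].any()
+ # key
+ # a     True
+ # b     True
+ # Name: val, dtype: bool
+ # >>> df.groupby('key')['val'].all()
+ # key
+ # a    False
+ # b     True
+ # Name: val, dtype: bool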
+
+ @Substitution(name='groupby')
+ @Appender(_common_see_also)
+ def count(self):
+ """
+ Compute count of group, excluding missing values.
+ """
+
+ # defined here for API doc
+ raise NotImplementedError
+
+ @Substitution(name='groupby', see_also=_common_see_also)
+ def mean(self, *args, **kwargs):
+ """
+ Compute mean of groups, excluding missing values.
+
+ Returns
+ -------
+ pandas.Series or pandas.DataFrame
+
+ %(see_also)s
+
+ Examples
+ --------
+ >>> df = pd.DataFrame({'A': [1, 1, 2, 1, 2],
+ ... 'B': [np.nan, 2, 3, 4, 5],
+ ... 'C': [1, 2, 1, 1, 2]}, columns=['A', 'B', 'C'])
+
+ Groupby one column and return the mean of the remaining columns in
+ each group.
+
+ >>> df.groupby('A').mean()
+ B C
+ A
+ 1 3.0 1.333333
+ 2 4.0 1.500000
+
+ Groupby two columns and return the mean of the remaining column.
+
+ >>> df.groupby(['A', 'B']).mean()
+ C
+ A B
+ 1 2.0 2
+ 4.0 1
+ 2 3.0 1
+ 5.0 2
+
+ Groupby one column and return the mean of only particular column in
+ the group.
+
+ >>> df.groupby('A')['B'].mean()
+ A
+ 1 3.0
+ 2 4.0
+ Name: B, dtype: float64
+ """
+ nv.validate_groupby_func('mean', args, kwargs, ['numeric_only'])
+ try:
+ return self._cython_agg_general('mean', **kwargs)
+ except GroupByError:
+ raise
+ except Exception: # pragma: no cover
+ with _group_selection_context(self):
+ f = lambda x: x.mean(axis=self.axis, **kwargs)
+ return self._python_agg_general(f)
+
+ @Substitution(name='groupby')
+ @Appender(_common_see_also)
+ def median(self, **kwargs):
+ """
+ Compute median of groups, excluding missing values.
+
+ For multiple groupings, the result index will be a MultiIndex
+ """
+ try:
+ return self._cython_agg_general('median', **kwargs)
+ except GroupByError:
+ raise
+ except Exception: # pragma: no cover
+
+ def f(x):
+ if isinstance(x, np.ndarray):
+ x = Series(x)
+ return x.median(axis=self.axis, **kwargs)
+ with _group_selection_context(self):
+ return self._python_agg_general(f)
+
+ @Substitution(name='groupby')
+ @Appender(_common_see_also)
+ def std(self, ddof=1, *args, **kwargs):
+ """
+ Compute standard deviation of groups, excluding missing values.
+
+ For multiple groupings, the result index will be a MultiIndex.
+
+ Parameters
+ ----------
+ ddof : integer, default 1
+ degrees of freedom
+ """
+
+ # TODO: implement at Cython level?
+ nv.validate_groupby_func('std', args, kwargs)
+ return np.sqrt(self.var(ddof=ddof, **kwargs))
+
+ @Substitution(name='groupby')
+ @Appender(_common_see_also)
+ def var(self, ddof=1, *args, **kwargs):
+ """
+ Compute variance of groups, excluding missing values.
+
+ For multiple groupings, the result index will be a MultiIndex.
+
+ Parameters
+ ----------
+ ddof : integer, default 1
+ degrees of freedom
+ """
+ nv.validate_groupby_func('var', args, kwargs)
+ if ddof == 1:
+ try:
+ return self._cython_agg_general('var', **kwargs)
+ except Exception:
+ f = lambda x: x.var(ddof=ddof, **kwargs)
+ with _group_selection_context(self):
+ return self._python_agg_general(f)
+ else:
+ f = lambda x: x.var(ddof=ddof, **kwargs)
+ with _group_selection_context(self):
+ return self._python_agg_general(f)
+
+ @Substitution(name='groupby')
+ @Appender(_common_see_also)
+ def sem(self, ddof=1):
+ """
+ Compute standard error of the mean of groups, excluding missing values.
+
+ For multiple groupings, the result index will be a MultiIndex.
+
+ Parameters
+ ----------
+ ddof : integer, default 1
+ degrees of freedom
+ """
+
+ return self.std(ddof=ddof) / np.sqrt(self.count())
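+
+ # The relationship used above, spelled out on a toy frame (names are
+ # illustrative):
+ #
+ # >>> df = pd.DataFrame({'key': list('aabb'), 'val': [1.0, 3.0, 2.0, 2.0]})
+ # >>> g = df.groupby('key')['val']
+ # >>> (g.sem() == g.std(ddof=1) / np.sqrt(g.count())).all()
+ # True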
+
+ @Substitution(name='groupby')
+ @Appender(_common_see_also)
+ def size(self):
+ """
+ Compute group sizes.
+ """
+ result = self.grouper.size()
+
+ if isinstance(self.obj, Series):
+ result.name = getattr(self.obj, 'name', None)
+ return result
+
+ @classmethod
+ def _add_numeric_operations(cls):
+ """
+ Add numeric operations to the GroupBy generically.
+ """
+
+ def groupby_function(name, alias, npfunc,
+ numeric_only=True, _convert=False,
+ min_count=-1):
+
+ _local_template = "Compute %(f)s of group values"
+
+ @Substitution(name='groupby', f=name)
+ @Appender(_common_see_also)
+ @Appender(_local_template)
+ def f(self, **kwargs):
+ if 'numeric_only' not in kwargs:
+ kwargs['numeric_only'] = numeric_only
+ if 'min_count' not in kwargs:
+ kwargs['min_count'] = min_count
+
+ self._set_group_selection()
+ try:
+ return self._cython_agg_general(
+ alias, alt=npfunc, **kwargs)
+ except AssertionError as e:
+ raise SpecificationError(str(e))
+ except Exception:
+ result = self.aggregate(
+ lambda x: npfunc(x, axis=self.axis))
+ if _convert:
+ result = result._convert(datetime=True)
+ return result
+
+ set_function_name(f, name, cls)
+
+ return f
+
+ def first_compat(x, axis=0):
+
+ def first(x):
+ x = x.to_numpy()
+
+ x = x[notna(x)]
+ if len(x) == 0:
+ return np.nan
+ return x[0]
+
+ if isinstance(x, DataFrame):
+ return x.apply(first, axis=axis)
+ else:
+ return first(x)
+
+ def last_compat(x, axis=0):
+
+ def last(x):
+ x = x.to_numpy()
+ x = x[notna(x)]
+ if len(x) == 0:
+ return np.nan
+ return x[-1]
+
+ if isinstance(x, DataFrame):
+ return x.apply(last, axis=axis)
+ else:
+ return last(x)
+
+ cls.sum = groupby_function('sum', 'add', np.sum, min_count=0)
+ cls.prod = groupby_function('prod', 'prod', np.prod, min_count=0)
+ cls.min = groupby_function('min', 'min', np.min, numeric_only=False)
+ cls.max = groupby_function('max', 'max', np.max, numeric_only=False)
+ cls.first = groupby_function('first', 'first', first_compat,
+ numeric_only=False)
+ cls.last = groupby_function('last', 'last', last_compat,
+ numeric_only=False)
+
+ @Substitution(name='groupby')
+ @Appender(_common_see_also)
+ def ohlc(self):
+ """
+ Compute open, high, low and close values of a group, excluding missing values.
+
+ For multiple groupings, the result index will be a MultiIndex
+ """
+
+ return self._apply_to_column_groupbys(
+ lambda x: x._cython_agg_general('ohlc'))
+
+ @Appender(DataFrame.describe.__doc__)
+ def describe(self, **kwargs):
+ with _group_selection_context(self):
+ result = self.apply(lambda x: x.describe(**kwargs))
+ if self.axis == 1:
+ return result.T
+ return result.unstack()
+
+ def resample(self, rule, *args, **kwargs):
+ """
+ Provide resampling when using a TimeGrouper.
+
+ Given a grouper, the function resamples it according to a
+ frequency string (e.g. "3T" -> resample into 3-minute bins).
+
+ See the :ref:`frequency aliases <timeseries.offset_aliases>`
+ documentation for more details.
+
+ Parameters
+ ----------
+ rule : str or DateOffset
+ The offset string or object representing target grouper conversion.
+ *args, **kwargs
+ Possible arguments are `how`, `fill_method`, `limit`, `kind` and
+ `on`, and other arguments of `TimeGrouper`.
+
+ Returns
+ -------
+ Grouper
+ Return a new grouper with our resampler appended.
+
+ See Also
+ --------
+ pandas.Grouper : Specify a frequency to resample with when
+ grouping by a key.
+ DatetimeIndex.resample : Frequency conversion and resampling of
+ time series.
+
+ Examples
+ --------
+ >>> idx = pd.date_range('1/1/2000', periods=4, freq='T')
+ >>> df = pd.DataFrame(data=4 * [range(2)],
+ ... index=idx,
+ ... columns=['a', 'b'])
+ >>> df.iloc[2, 0] = 5
+ >>> df
+ a b
+ 2000-01-01 00:00:00 0 1
+ 2000-01-01 00:01:00 0 1
+ 2000-01-01 00:02:00 5 1
+ 2000-01-01 00:03:00 0 1
+
+ Downsample the DataFrame into 3 minute bins and sum the values of
+ the timestamps falling into a bin.
+
+ >>> df.groupby('a').resample('3T').sum()
+ a b
+ a
+ 0 2000-01-01 00:00:00 0 2
+ 2000-01-01 00:03:00 0 1
+ 5 2000-01-01 00:00:00 5 1
+
+ Upsample the series into 30 second bins.
+
+ >>> df.groupby('a').resample('30S').sum()
+ a b
+ a
+ 0 2000-01-01 00:00:00 0 1
+ 2000-01-01 00:00:30 0 0
+ 2000-01-01 00:01:00 0 1
+ 2000-01-01 00:01:30 0 0
+ 2000-01-01 00:02:00 0 0
+ 2000-01-01 00:02:30 0 0
+ 2000-01-01 00:03:00 0 1
+ 5 2000-01-01 00:02:00 5 1
+
+ Resample by month. Values are assigned to the month of the period.
+
+ >>> df.groupby('a').resample('M').sum()
+ a b
+ a
+ 0 2000-01-31 0 3
+ 5 2000-01-31 5 1
+
+ Downsample the series into 3 minute bins as above, but close the right
+ side of the bin interval.
+
+ >>> df.groupby('a').resample('3T', closed='right').sum()
+ a b
+ a
+ 0 1999-12-31 23:57:00 0 1
+ 2000-01-01 00:00:00 0 2
+ 5 2000-01-01 00:00:00 5 1
+
+ Downsample the series into 3 minute bins and close the right side of
+ the bin interval, but label each bin using the right edge instead of
+ the left.
+
+ >>> df.groupby('a').resample('3T', closed='right', label='right').sum()
+ a b
+ a
+ 0 2000-01-01 00:00:00 0 1
+ 2000-01-01 00:03:00 0 2
+ 5 2000-01-01 00:03:00 5 1
+
+ Add an offset of twenty seconds.
+
+ >>> df.groupby('a').resample('3T', loffset='20s').sum()
+ a b
+ a
+ 0 2000-01-01 00:00:20 0 2
+ 2000-01-01 00:03:20 0 1
+ 5 2000-01-01 00:00:20 5 1
+ """
+ from pandas.core.resample import get_resampler_for_grouping
+ return get_resampler_for_grouping(self, rule, *args, **kwargs)
+
+ @Substitution(name='groupby')
+ @Appender(_common_see_also)
+ def rolling(self, *args, **kwargs):
+ """
+ Return a rolling grouper, providing rolling functionality per group.
+ """
+ from pandas.core.window import RollingGroupby
+ return RollingGroupby(self, *args, **kwargs)
+
+ @Substitution(name='groupby')
+ @Appender(_common_see_also)
+ def expanding(self, *args, **kwargs):
+ """
+ Return an expanding grouper, providing expanding
+ functionality per group.
+ """
+ from pandas.core.window import ExpandingGroupby
+ return ExpandingGroupby(self, *args, **kwargs)
+
+ def _fill(self, direction, limit=None):
+ """
+ Shared function for `pad` and `backfill` to call Cython method.
+
+ Parameters
+ ----------
+ direction : {'ffill', 'bfill'}
+ Direction passed to underlying Cython function. `bfill` will cause
+ values to be filled backwards. `ffill` and any other values will
+ default to a forward fill
+ limit : int, default None
+ Maximum number of consecutive values to fill. If `None`, this
+ method will convert to -1 prior to passing to Cython
+
+ Returns
+ -------
+ `Series` or `DataFrame` with filled values
+
+ See Also
+ --------
+ pad
+ backfill
+ """
+ # Need int value for Cython
+ if limit is None:
+ limit = -1
+
+ return self._get_cythonized_result('group_fillna_indexer',
+ self.grouper, needs_mask=True,
+ cython_dtype=np.int64,
+ result_is_index=True,
+ direction=direction, limit=limit)
+
+ @Substitution(name='groupby')
+ def pad(self, limit=None):
+ """
+ Forward fill the values.
+
+ Parameters
+ ----------
+ limit : integer, optional
+ limit of how many values to fill
+
+ See Also
+ --------
+ Series.pad
+ DataFrame.pad
+ Series.fillna
+ DataFrame.fillna
+ """
+ return self._fill('ffill', limit=limit)
+ ffill = pad
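+
+ # Minimal forward-fill illustration per group (toy data):
+ #
+ # >>> s = pd.Series([1.0, np.nan, np.nan, 3.0, np.nan], index=list('aabbb'))
+ # >>> s.groupby(s.index).ffill()
+ # a    1.0
+ # a    1.0
+ # b    NaN
+ # b    3.0
+ # b    3.0
+ # dtype: float64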
+
+ @Substitution(name='groupby')
+ def backfill(self, limit=None):
+ """
+ Backward fill the values.
+
+ Parameters
+ ----------
+ limit : integer, optional
+ limit of how many values to fill
+
+ See Also
+ --------
+ Series.backfill
+ DataFrame.backfill
+ Series.fillna
+ DataFrame.fillna
+ """
+ return self._fill('bfill', limit=limit)
+ bfill = backfill
+
+ @Substitution(name='groupby', see_also=_common_see_also)
+ def nth(self, n, dropna=None):
+ """
+ Take the nth row from each group if n is an int, or a subset of rows
+ if n is a list of ints.
+
+ If dropna, will take the nth non-null row. dropna is either
+ truthy (if a Series) or 'all'/'any' (if a DataFrame);
+ this is equivalent to calling dropna(how=dropna) before the
+ groupby.
+
+ Parameters
+ ----------
+ n : int or list of ints
+ a single nth value for the row or a list of nth values
+ dropna : None or str, optional
+ apply the specified dropna operation before counting which row is
+ the nth row. Needs to be None, 'any' or 'all'
+
+ %(see_also)s
+
+ Examples
+ --------
+
+ >>> df = pd.DataFrame({'A': [1, 1, 2, 1, 2],
+ ... 'B': [np.nan, 2, 3, 4, 5]}, columns=['A', 'B'])
+ >>> g = df.groupby('A')
+ >>> g.nth(0)
+ B
+ A
+ 1 NaN
+ 2 3.0
+ >>> g.nth(1)
+ B
+ A
+ 1 2.0
+ 2 5.0
+ >>> g.nth(-1)
+ B
+ A
+ 1 4.0
+ 2 5.0
+ >>> g.nth([0, 1])
+ B
+ A
+ 1 NaN
+ 1 2.0
+ 2 3.0
+ 2 5.0
+
+ Specifying `dropna` allows ``NaN`` values to be ignored when counting
+
+ >>> g.nth(0, dropna='any')
+ B
+ A
+ 1 2.0
+ 2 3.0
+
+ NaNs denote that the group was exhausted when using dropna
+
+ >>> g.nth(3, dropna='any')
+ B
+ A
+ 1 NaN
+ 2 NaN
+
+ Specifying `as_index=False` in `groupby` keeps the original index.
+
+ >>> df.groupby('A', as_index=False).nth(1)
+ A B
+ 1 1 2.0
+ 4 2 5.0
+ """
+
+ if isinstance(n, int):
+ nth_values = [n]
+ elif isinstance(n, (set, list, tuple)):
+ nth_values = list(set(n))
+ if dropna is not None:
+ raise ValueError(
+ "dropna option with a list of nth values is not supported")
+ else:
+ raise TypeError("n needs to be an int or a list/set/tuple of ints")
+
+ nth_values = np.array(nth_values, dtype=np.intp)
+ self._set_group_selection()
+
+ if not dropna:
+ mask_left = np.in1d(self._cumcount_array(), nth_values)
+ mask_right = np.in1d(self._cumcount_array(ascending=False) + 1,
+ -nth_values)
+ mask = mask_left | mask_right
+
+ out = self._selected_obj[mask]
+ if not self.as_index:
+ return out
+
+ ids, _, _ = self.grouper.group_info
+ out.index = self.grouper.result_index[ids[mask]]
+
+ return out.sort_index() if self.sort else out
+
+ if dropna not in ['any', 'all']:
+ if isinstance(self._selected_obj, Series) and dropna is True:
+ warnings.warn("the dropna={dropna} keyword is deprecated,"
+ "use dropna='all' instead. "
+ "For a Series groupby, dropna must be "
+ "either None, 'any' or 'all'.".format(
+ dropna=dropna),
+ FutureWarning,
+ stacklevel=2)
+ dropna = 'all'
+ else:
+ # Note: when agg-ing picker doesn't raise this,
+ # just returns NaN
+ raise ValueError("For a DataFrame groupby, dropna must be "
+ "either None, 'any' or 'all', "
+ "(was passed {dropna}).".format(
+ dropna=dropna))
+
+ # old behaviour, but with all and any support for DataFrames.
+ # modified in GH 7559 to have better perf
+ max_len = n if n >= 0 else - 1 - n
+ dropped = self.obj.dropna(how=dropna, axis=self.axis)
+
+ # get a new grouper for our dropped obj
+ if self.keys is None and self.level is None:
+
+ # we don't have the grouper info available
+ # (e.g. we have selected out
+ # a column that is not in the current object)
+ axis = self.grouper.axis
+ grouper = axis[axis.isin(dropped.index)]
+
+ else:
+
+ # create a grouper with the original parameters, but on the dropped
+ # object
+ from pandas.core.groupby.grouper import _get_grouper
+ grouper, _, _ = _get_grouper(dropped, key=self.keys,
+ axis=self.axis, level=self.level,
+ sort=self.sort,
+ mutated=self.mutated)
+
+ grb = dropped.groupby(grouper, as_index=self.as_index, sort=self.sort)
+ sizes, result = grb.size(), grb.nth(n)
+ mask = (sizes < max_len).values
+
+ # set the results which don't meet the criteria
+ if len(result) and mask.any():
+ result.loc[mask] = np.nan
+
+ # reset/reindex to the original groups
+ if (len(self.obj) == len(dropped) or
+ len(result) == len(self.grouper.result_index)):
+ result.index = self.grouper.result_index
+ else:
+ result = result.reindex(self.grouper.result_index)
+
+ return result
+
+ @Substitution(name='groupby')
+ def ngroup(self, ascending=True):
+ """
+ Number each group from 0 to the number of groups - 1.
+
+ This is the enumerative complement of cumcount. Note that the
+ numbers given to the groups match the order in which the groups
+ would be seen when iterating over the groupby object, not the
+ order they are first observed.
+
+ .. versionadded:: 0.20.2
+
+ Parameters
+ ----------
+ ascending : bool, default True
+ If False, number in reverse, from number of groups - 1 to 0.
+
+ See Also
+ --------
+ .cumcount : Number the rows in each group.
+
+ Examples
+ --------
+
+ >>> df = pd.DataFrame({"A": list("aaabba")})
+ >>> df
+ A
+ 0 a
+ 1 a
+ 2 a
+ 3 b
+ 4 b
+ 5 a
+ >>> df.groupby('A').ngroup()
+ 0 0
+ 1 0
+ 2 0
+ 3 1
+ 4 1
+ 5 0
+ dtype: int64
+ >>> df.groupby('A').ngroup(ascending=False)
+ 0 1
+ 1 1
+ 2 1
+ 3 0
+ 4 0
+ 5 1
+ dtype: int64
+ >>> df.groupby(["A", [1,1,2,3,2,1]]).ngroup()
+ 0 0
+ 1 0
+ 2 1
+ 3 3
+ 4 2
+ 5 0
+ dtype: int64
+ """
+
+ with _group_selection_context(self):
+ index = self._selected_obj.index
+ result = Series(self.grouper.group_info[0], index)
+ if not ascending:
+ result = self.ngroups - 1 - result
+ return result
+
+ @Substitution(name='groupby')
+ def cumcount(self, ascending=True):
+ """
+ Number each item in each group from 0 to the length of that group - 1.
+
+ Essentially this is equivalent to
+
+ >>> self.apply(lambda x: pd.Series(np.arange(len(x)), x.index))
+
+ Parameters
+ ----------
+ ascending : bool, default True
+ If False, number in reverse, from length of group - 1 to 0.
+
+ See Also
+ --------
+ .ngroup : Number the groups themselves.
+
+ Examples
+ --------
+
+ >>> df = pd.DataFrame([['a'], ['a'], ['a'], ['b'], ['b'], ['a']],
+ ... columns=['A'])
+ >>> df
+ A
+ 0 a
+ 1 a
+ 2 a
+ 3 b
+ 4 b
+ 5 a
+ >>> df.groupby('A').cumcount()
+ 0 0
+ 1 1
+ 2 2
+ 3 0
+ 4 1
+ 5 3
+ dtype: int64
+ >>> df.groupby('A').cumcount(ascending=False)
+ 0 3
+ 1 2
+ 2 1
+ 3 1
+ 4 0
+ 5 0
+ dtype: int64
+ """
+
+ with _group_selection_context(self):
+ index = self._selected_obj.index
+ cumcounts = self._cumcount_array(ascending=ascending)
+ return Series(cumcounts, index)
+
+ @Substitution(name='groupby')
+ @Appender(_common_see_also)
+ def rank(self, method='average', ascending=True, na_option='keep',
+ pct=False, axis=0):
+ """
+ Provides the rank of values within each group.
+
+ Parameters
+ ----------
+ method : {'average', 'min', 'max', 'first', 'dense'}, default 'average'
+ * average: average rank of group
+ * min: lowest rank in group
+ * max: highest rank in group
+ * first: ranks assigned in order they appear in the array
+ * dense: like 'min', but rank always increases by 1 between groups
+ ascending : boolean, default True
+ False for ranks by high (1) to low (N)
+ na_option : {'keep', 'top', 'bottom'}, default 'keep'
+ * keep: leave NA values where they are
+ * top: smallest rank if ascending
+ * bottom: smallest rank if descending
+ pct : boolean, default False
+ Compute percentage rank of data within each group
+ axis : int, default 0
+ The axis of the object over which to compute the rank.
+
+ Returns
+        -------
+ DataFrame with ranking of values within each group
+ """
+ if na_option not in {'keep', 'top', 'bottom'}:
+ msg = "na_option must be one of 'keep', 'top', or 'bottom'"
+ raise ValueError(msg)
+ return self._cython_transform('rank', numeric_only=False,
+ ties_method=method, ascending=ascending,
+ na_option=na_option, pct=pct, axis=axis)
+
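+    # Editor's illustrative sketch (not part of the original pandas source):
+    # minimal use of the rank transform defined above, on a hypothetical
+    # frame with columns 'key' and 'val'.
+    #
+    #   >>> df = pd.DataFrame({'key': ['a', 'a', 'b'], 'val': [2, 1, 3]})
+    #   >>> df.groupby('key')['val'].rank(method='dense')
+    #   0    2.0
+    #   1    1.0
+    #   2    1.0
+    #   Name: val, dtype: float64
+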
+ @Substitution(name='groupby')
+ @Appender(_common_see_also)
+ def cumprod(self, axis=0, *args, **kwargs):
+ """
+ Cumulative product for each group.
+ """
+ nv.validate_groupby_func('cumprod', args, kwargs,
+ ['numeric_only', 'skipna'])
+ if axis != 0:
+ return self.apply(lambda x: x.cumprod(axis=axis, **kwargs))
+
+ return self._cython_transform('cumprod', **kwargs)
+
+ @Substitution(name='groupby')
+ @Appender(_common_see_also)
+ def cumsum(self, axis=0, *args, **kwargs):
+ """
+ Cumulative sum for each group.
+ """
+ nv.validate_groupby_func('cumsum', args, kwargs,
+ ['numeric_only', 'skipna'])
+ if axis != 0:
+ return self.apply(lambda x: x.cumsum(axis=axis, **kwargs))
+
+ return self._cython_transform('cumsum', **kwargs)
+
+ @Substitution(name='groupby')
+ @Appender(_common_see_also)
+ def cummin(self, axis=0, **kwargs):
+ """
+ Cumulative min for each group.
+ """
+ if axis != 0:
+ return self.apply(lambda x: np.minimum.accumulate(x, axis))
+
+ return self._cython_transform('cummin', numeric_only=False)
+
+ @Substitution(name='groupby')
+ @Appender(_common_see_also)
+ def cummax(self, axis=0, **kwargs):
+ """
+ Cumulative max for each group.
+ """
+ if axis != 0:
+ return self.apply(lambda x: np.maximum.accumulate(x, axis))
+
+ return self._cython_transform('cummax', numeric_only=False)
+
+ def _get_cythonized_result(self, how, grouper, aggregate=False,
+ cython_dtype=None, needs_values=False,
+ needs_mask=False, needs_ngroups=False,
+ result_is_index=False,
+ pre_processing=None, post_processing=None,
+ **kwargs):
+ """
+ Get result for Cythonized functions.
+
+ Parameters
+ ----------
+ how : str, Cythonized function name to be called
+ grouper : Grouper object containing pertinent group info
+ aggregate : bool, default False
+ Whether the result should be aggregated to match the number of
+ groups
+ cython_dtype : default None
+ Type of the array that will be modified by the Cython call. If
+ `None`, the type will be inferred from the values of each slice
+ needs_values : bool, default False
+ Whether the values should be a part of the Cython call
+ signature
+ needs_mask : bool, default False
+ Whether boolean mask needs to be part of the Cython call
+ signature
+ needs_ngroups : bool, default False
+ Whether number of groups is part of the Cython call signature
+ result_is_index : bool, default False
+ Whether the result of the Cython operation is an index of
+ values to be retrieved, instead of the actual values themselves
+ pre_processing : function, default None
+ Function to be applied to `values` prior to passing to Cython
+ Raises if `needs_values` is False
+ post_processing : function, default None
+ Function to be applied to result of Cython function
+ **kwargs : dict
+ Extra arguments to be passed back to Cython funcs
+
+ Returns
+ -------
+ `Series` or `DataFrame` with filled values
+ """
+ if result_is_index and aggregate:
+ raise ValueError("'result_is_index' and 'aggregate' cannot both "
+ "be True!")
+ if post_processing:
+            if not callable(post_processing):
+ raise ValueError("'post_processing' must be a callable!")
+ if pre_processing:
+ if not callable(pre_processing):
+ raise ValueError("'pre_processing' must be a callable!")
+ if not needs_values:
+ raise ValueError("Cannot use 'pre_processing' without "
+ "specifying 'needs_values'!")
+
+ labels, _, ngroups = grouper.group_info
+ output = collections.OrderedDict()
+ base_func = getattr(libgroupby, how)
+
+ for name, obj in self._iterate_slices():
+ if aggregate:
+ result_sz = ngroups
+ else:
+ result_sz = len(obj.values)
+
+ if not cython_dtype:
+ cython_dtype = obj.values.dtype
+
+ result = np.zeros(result_sz, dtype=cython_dtype)
+ func = partial(base_func, result, labels)
+ if needs_values:
+ vals = obj.values
+ if pre_processing:
+ vals = pre_processing(vals)
+ func = partial(func, vals)
+
+ if needs_mask:
+ mask = isna(obj.values).view(np.uint8)
+ func = partial(func, mask)
+
+ if needs_ngroups:
+ func = partial(func, ngroups)
+
+ func(**kwargs) # Call func to modify indexer values in place
+
+ if result_is_index:
+ result = algorithms.take_nd(obj.values, result)
+
+ if post_processing:
+ result = post_processing(result)
+
+ output[name] = result
+
+ if aggregate:
+ return self._wrap_aggregated_output(output)
+ else:
+ return self._wrap_transformed_output(output)
+
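+    # Editor's note (not part of the original source): the helper above
+    # curries the Cython routine positionally as
+    #   base_func(result, labels[, values][, mask][, ngroups], **kwargs)
+    # so callers such as shift() below only set the needs_* flags that match
+    # the signature of the libgroupby function they name.
+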
+ @Substitution(name='groupby')
+ @Appender(_common_see_also)
+ def shift(self, periods=1, freq=None, axis=0, fill_value=None):
+ """
+ Shift each group by periods observations.
+
+ Parameters
+ ----------
+ periods : integer, default 1
+ number of periods to shift
+ freq : frequency string
+ axis : axis to shift, default 0
+ fill_value : optional
+
+ .. versionadded:: 0.24.0
+ """
+
+ if freq is not None or axis != 0 or not isna(fill_value):
+ return self.apply(lambda x: x.shift(periods, freq,
+ axis, fill_value))
+
+ return self._get_cythonized_result('group_shift_indexer',
+ self.grouper, cython_dtype=np.int64,
+ needs_ngroups=True,
+ result_is_index=True,
+ periods=periods)
+
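+    # Editor's illustrative sketch (not part of the original source): the
+    # group-wise shift above never crosses group boundaries, e.g. on a
+    # hypothetical frame
+    #
+    #   >>> df = pd.DataFrame({'key': ['a', 'a', 'b', 'b'], 'val': [1, 2, 3, 4]})
+    #   >>> df.groupby('key')['val'].shift(1)
+    #   0    NaN
+    #   1    1.0
+    #   2    NaN
+    #   3    3.0
+    #   Name: val, dtype: float64
+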
+ @Substitution(name='groupby')
+ @Appender(_common_see_also)
+ def pct_change(self, periods=1, fill_method='pad', limit=None, freq=None,
+ axis=0):
+ """
+ Calculate pct_change of each value to previous entry in group.
+ """
+ if freq is not None or axis != 0:
+ return self.apply(lambda x: x.pct_change(periods=periods,
+ fill_method=fill_method,
+ limit=limit, freq=freq,
+ axis=axis))
+ filled = getattr(self, fill_method)(limit=limit)
+ filled = filled.drop(self.grouper.names, axis=1)
+ fill_grp = filled.groupby(self.grouper.labels)
+ shifted = fill_grp.shift(periods=periods, freq=freq)
+ return (filled / shifted) - 1
+
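+    # Editor's illustrative sketch (not part of the original source):
+    # pct_change compares each value with the previous entry within its own
+    # group, so the first row of every group is NaN (column names are
+    # hypothetical).
+    #
+    #   >>> df = pd.DataFrame({'key': ['a', 'a', 'b', 'b'],
+    #   ...                    'val': [2.0, 3.0, 10.0, 15.0]})
+    #   >>> df.groupby('key')['val'].pct_change()
+    #   0    NaN
+    #   1    0.5
+    #   2    NaN
+    #   3    0.5
+    #   Name: val, dtype: float64
+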
+ @Substitution(name='groupby', see_also=_common_see_also)
+ def head(self, n=5):
+ """
+ Returns first n rows of each group.
+
+ Essentially equivalent to ``.apply(lambda x: x.head(n))``,
+ except ignores as_index flag.
+
+ %(see_also)s
+
+ Examples
+ --------
+
+ >>> df = pd.DataFrame([[1, 2], [1, 4], [5, 6]],
+        ...                   columns=['A', 'B'])
+ >>> df.groupby('A', as_index=False).head(1)
+ A B
+ 0 1 2
+ 2 5 6
+ >>> df.groupby('A').head(1)
+ A B
+ 0 1 2
+ 2 5 6
+ """
+ self._reset_group_selection()
+ mask = self._cumcount_array() < n
+ return self._selected_obj[mask]
+
+ @Substitution(name='groupby', see_also=_common_see_also)
+ def tail(self, n=5):
+ """
+ Returns last n rows of each group.
+
+ Essentially equivalent to ``.apply(lambda x: x.tail(n))``,
+ except ignores as_index flag.
+
+ %(see_also)s
+
+ Examples
+ --------
+
+ >>> df = pd.DataFrame([['a', 1], ['a', 2], ['b', 1], ['b', 2]],
+        ...                   columns=['A', 'B'])
+ >>> df.groupby('A').tail(1)
+ A B
+ 1 a 2
+ 3 b 2
+ >>> df.groupby('A').head(1)
+ A B
+ 0 a 1
+ 2 b 1
+ """
+ self._reset_group_selection()
+ mask = self._cumcount_array(ascending=False) < n
+ return self._selected_obj[mask]
+
+
+GroupBy._add_numeric_operations()
+
+
+@Appender(GroupBy.__doc__)
+def groupby(obj, by, **kwds):
+ if isinstance(obj, Series):
+ from pandas.core.groupby.generic import SeriesGroupBy
+ klass = SeriesGroupBy
+ elif isinstance(obj, DataFrame):
+ from pandas.core.groupby.generic import DataFrameGroupBy
+ klass = DataFrameGroupBy
+ else: # pragma: no cover
+ raise TypeError('invalid type: {}'.format(obj))
+
+ return klass(obj, by, **kwds)
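+
+# Editor's note (not part of the original source): this module-level factory
+# is internal and simply dispatches on the type of ``obj``, e.g. (assuming the
+# usual import path)
+#
+#   >>> from pandas.core.groupby.groupby import groupby
+#   >>> type(groupby(pd.Series([1, 2, 1]), [0, 0, 1])).__name__
+#   'SeriesGroupBy'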
diff --git a/contrib/python/pandas/py2/pandas/core/groupby/grouper.py b/contrib/python/pandas/py2/pandas/core/groupby/grouper.py
new file mode 100644
index 00000000000..260417bc0d5
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/core/groupby/grouper.py
@@ -0,0 +1,632 @@
+"""
+Provide user facing operators for doing the split part of the
+split-apply-combine paradigm.
+"""
+
+import warnings
+
+import numpy as np
+
+import pandas.compat as compat
+from pandas.compat import callable, zip
+from pandas.util._decorators import cache_readonly
+
+from pandas.core.dtypes.common import (
+ ensure_categorical, is_categorical_dtype, is_datetime64_dtype, is_hashable,
+ is_list_like, is_scalar, is_timedelta64_dtype)
+from pandas.core.dtypes.generic import ABCSeries
+
+import pandas.core.algorithms as algorithms
+from pandas.core.arrays import Categorical, ExtensionArray
+import pandas.core.common as com
+from pandas.core.frame import DataFrame
+from pandas.core.groupby.ops import BaseGrouper
+from pandas.core.index import CategoricalIndex, Index, MultiIndex
+from pandas.core.series import Series
+
+from pandas.io.formats.printing import pprint_thing
+
+
+class Grouper(object):
+ """
+ A Grouper allows the user to specify a groupby instruction for a target
+ object
+
+ This specification will select a column via the key parameter, or if the
+ level and/or axis parameters are given, a level of the index of the target
+ object.
+
+ These are local specifications and will override 'global' settings,
+ that is the parameters axis and level which are passed to the groupby
+ itself.
+
+ Parameters
+ ----------
+ key : string, defaults to None
+ groupby key, which selects the grouping column of the target
+ level : name/number, defaults to None
+ the level for the target index
+ freq : string / frequency object, defaults to None
+ This will groupby the specified frequency if the target selection
+ (via key or level) is a datetime-like object. For full specification
+ of available frequencies, please see `here
+ <http://pandas.pydata.org/pandas-docs/stable/timeseries.html#offset-aliases>`_.
+ axis : number/name of the axis, defaults to 0
+ sort : boolean, default to False
+ whether to sort the resulting labels
+
+ additional kwargs to control time-like groupers (when `freq` is passed)
+
+ closed : closed end of interval; 'left' or 'right'
+ label : interval boundary to use for labeling; 'left' or 'right'
+ convention : {'start', 'end', 'e', 's'}
+ If grouper is PeriodIndex
+ base, loffset
+
+ Returns
+ -------
+ A specification for a groupby instruction
+
+ Examples
+ --------
+
+ Syntactic sugar for ``df.groupby('A')``
+
+ >>> df.groupby(Grouper(key='A'))
+
+ Specify a resample operation on the column 'date'
+
+ >>> df.groupby(Grouper(key='date', freq='60s'))
+
+ Specify a resample operation on the level 'date' on the columns axis
+ with a frequency of 60s
+
+ >>> df.groupby(Grouper(level='date', freq='60s', axis=1))
+ """
+ _attributes = ('key', 'level', 'freq', 'axis', 'sort')
+
+ def __new__(cls, *args, **kwargs):
+ if kwargs.get('freq') is not None:
+ from pandas.core.resample import TimeGrouper
+ cls = TimeGrouper
+ return super(Grouper, cls).__new__(cls)
+
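+    # Editor's note (not part of the original source): because of the
+    # __new__ hook above, passing ``freq`` transparently returns a
+    # resampling grouper, e.g. ``pd.Grouper(key='date', freq='60s')`` is a
+    # TimeGrouper instance rather than a plain Grouper.
+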
+ def __init__(self, key=None, level=None, freq=None, axis=0, sort=False):
+ self.key = key
+ self.level = level
+ self.freq = freq
+ self.axis = axis
+ self.sort = sort
+
+ self.grouper = None
+ self.obj = None
+ self.indexer = None
+ self.binner = None
+ self._grouper = None
+
+ @property
+ def ax(self):
+ return self.grouper
+
+ def _get_grouper(self, obj, validate=True):
+ """
+ Parameters
+ ----------
+ obj : the subject object
+ validate : boolean, default True
+ if True, validate the grouper
+
+ Returns
+ -------
+ a tuple of binner, grouper, obj (possibly sorted)
+ """
+
+ self._set_grouper(obj)
+ self.grouper, exclusions, self.obj = _get_grouper(self.obj, [self.key],
+ axis=self.axis,
+ level=self.level,
+ sort=self.sort,
+ validate=validate)
+ return self.binner, self.grouper, self.obj
+
+ def _set_grouper(self, obj, sort=False):
+ """
+ given an object and the specifications, setup the internal grouper
+ for this particular specification
+
+ Parameters
+ ----------
+ obj : the subject object
+ sort : bool, default False
+ whether the resulting grouper should be sorted
+ """
+
+ if self.key is not None and self.level is not None:
+ raise ValueError(
+ "The Grouper cannot specify both a key and a level!")
+
+ # Keep self.grouper value before overriding
+ if self._grouper is None:
+ self._grouper = self.grouper
+
+ # the key must be a valid info item
+ if self.key is not None:
+ key = self.key
+ # The 'on' is already defined
+ if (getattr(self.grouper, 'name', None) == key and
+ isinstance(obj, ABCSeries)):
+ ax = self._grouper.take(obj.index)
+ else:
+ if key not in obj._info_axis:
+ raise KeyError(
+ "The grouper name {0} is not found".format(key))
+ ax = Index(obj[key], name=key)
+
+ else:
+ ax = obj._get_axis(self.axis)
+ if self.level is not None:
+ level = self.level
+
+ # if a level is given it must be a mi level or
+ # equivalent to the axis name
+ if isinstance(ax, MultiIndex):
+ level = ax._get_level_number(level)
+ ax = Index(ax._get_level_values(level),
+ name=ax.names[level])
+
+ else:
+ if level not in (0, ax.name):
+ raise ValueError(
+ "The level {0} is not valid".format(level))
+
+ # possibly sort
+ if (self.sort or sort) and not ax.is_monotonic:
+ # use stable sort to support first, last, nth
+ indexer = self.indexer = ax.argsort(kind='mergesort')
+ ax = ax.take(indexer)
+ obj = obj._take(indexer, axis=self.axis, is_copy=False)
+
+ self.obj = obj
+ self.grouper = ax
+ return self.grouper
+
+ @property
+ def groups(self):
+ return self.grouper.groups
+
+ def __repr__(self):
+ attrs_list = ["{}={!r}".format(attr_name, getattr(self, attr_name))
+ for attr_name in self._attributes
+ if getattr(self, attr_name) is not None]
+ attrs = ", ".join(attrs_list)
+ cls_name = self.__class__.__name__
+ return "{}({})".format(cls_name, attrs)
+
+
+class Grouping(object):
+
+ """
+ Holds the grouping information for a single key
+
+ Parameters
+ ----------
+ index : Index
+ grouper :
+ obj :
+ name :
+ level :
+ observed : boolean, default False
+ If we are a Categorical, use the observed values
+ in_axis : if the Grouping is a column in self.obj and hence among
+ Groupby.exclusions list
+
+ Returns
+ -------
+ **Attributes**:
+ * indices : dict of {group -> index_list}
+ * labels : ndarray, group labels
+ * ids : mapping of label -> group
+ * counts : array of group counts
+ * group_index : unique groups
+ * groups : dict of {group -> label_list}
+ """
+
+ def __init__(self, index, grouper=None, obj=None, name=None, level=None,
+ sort=True, observed=False, in_axis=False):
+
+ self.name = name
+ self.level = level
+ self.grouper = _convert_grouper(index, grouper)
+ self.all_grouper = None
+ self.index = index
+ self.sort = sort
+ self.obj = obj
+ self.observed = observed
+ self.in_axis = in_axis
+
+ # right place for this?
+ if isinstance(grouper, (Series, Index)) and name is None:
+ self.name = grouper.name
+
+ if isinstance(grouper, MultiIndex):
+ self.grouper = grouper.values
+
+ # we have a single grouper which may be a myriad of things,
+        # some of which are dependent on the passed-in level
+
+ if level is not None:
+ if not isinstance(level, int):
+ if level not in index.names:
+ raise AssertionError('Level {} not in index'.format(level))
+ level = index.names.index(level)
+
+ if self.name is None:
+ self.name = index.names[level]
+
+ self.grouper, self._labels, self._group_index = \
+ index._get_grouper_for_level(self.grouper, level)
+
+ # a passed Grouper like, directly get the grouper in the same way
+ # as single grouper groupby, use the group_info to get labels
+ elif isinstance(self.grouper, Grouper):
+ # get the new grouper; we already have disambiguated
+ # what key/level refer to exactly, don't need to
+ # check again as we have by this point converted these
+ # to an actual value (rather than a pd.Grouper)
+ _, grouper, _ = self.grouper._get_grouper(self.obj, validate=False)
+ if self.name is None:
+ self.name = grouper.result_index.name
+ self.obj = self.grouper.obj
+ self.grouper = grouper
+
+ else:
+ if self.grouper is None and self.name is not None:
+ self.grouper = self.obj[self.name]
+
+ elif isinstance(self.grouper, (list, tuple)):
+ self.grouper = com.asarray_tuplesafe(self.grouper)
+
+ # a passed Categorical
+ elif is_categorical_dtype(self.grouper):
+
+ from pandas.core.groupby.categorical import recode_for_groupby
+ self.grouper, self.all_grouper = recode_for_groupby(
+ self.grouper, self.sort, observed)
+ categories = self.grouper.categories
+
+ # we make a CategoricalIndex out of the cat grouper
+ # preserving the categories / ordered attributes
+ self._labels = self.grouper.codes
+ if observed:
+ codes = algorithms.unique1d(self.grouper.codes)
+ codes = codes[codes != -1]
+ else:
+ codes = np.arange(len(categories))
+
+ self._group_index = CategoricalIndex(
+ Categorical.from_codes(
+ codes=codes,
+ categories=categories,
+ ordered=self.grouper.ordered))
+
+ # we are done
+ if isinstance(self.grouper, Grouping):
+ self.grouper = self.grouper.grouper
+
+ # no level passed
+ elif not isinstance(self.grouper,
+ (Series, Index, ExtensionArray, np.ndarray)):
+ if getattr(self.grouper, 'ndim', 1) != 1:
+ t = self.name or str(type(self.grouper))
+ raise ValueError(
+ "Grouper for '{}' not 1-dimensional".format(t))
+ self.grouper = self.index.map(self.grouper)
+ if not (hasattr(self.grouper, "__len__") and
+ len(self.grouper) == len(self.index)):
+ errmsg = ('Grouper result violates len(labels) == '
+ 'len(data)\nresult: %s' %
+ pprint_thing(self.grouper))
+ self.grouper = None # Try for sanity
+ raise AssertionError(errmsg)
+
+ # if we have a date/time-like grouper, make sure that we have
+ # Timestamps like
+ if getattr(self.grouper, 'dtype', None) is not None:
+ if is_datetime64_dtype(self.grouper):
+ from pandas import to_datetime
+ self.grouper = to_datetime(self.grouper)
+ elif is_timedelta64_dtype(self.grouper):
+ from pandas import to_timedelta
+ self.grouper = to_timedelta(self.grouper)
+
+ def __repr__(self):
+ return 'Grouping({0})'.format(self.name)
+
+ def __iter__(self):
+ return iter(self.indices)
+
+ _labels = None
+ _group_index = None
+
+ @property
+ def ngroups(self):
+ return len(self.group_index)
+
+ @cache_readonly
+ def indices(self):
+ # we have a list of groupers
+ if isinstance(self.grouper, BaseGrouper):
+ return self.grouper.indices
+
+ values = ensure_categorical(self.grouper)
+ return values._reverse_indexer()
+
+ @property
+ def labels(self):
+ if self._labels is None:
+ self._make_labels()
+ return self._labels
+
+ @cache_readonly
+ def result_index(self):
+ if self.all_grouper is not None:
+ from pandas.core.groupby.categorical import recode_from_groupby
+ return recode_from_groupby(self.all_grouper,
+ self.sort, self.group_index)
+ return self.group_index
+
+ @property
+ def group_index(self):
+ if self._group_index is None:
+ self._make_labels()
+ return self._group_index
+
+ def _make_labels(self):
+ if self._labels is None or self._group_index is None:
+ # we have a list of groupers
+ if isinstance(self.grouper, BaseGrouper):
+ labels = self.grouper.label_info
+ uniques = self.grouper.result_index
+ else:
+ labels, uniques = algorithms.factorize(
+ self.grouper, sort=self.sort)
+ uniques = Index(uniques, name=self.name)
+ self._labels = labels
+ self._group_index = uniques
+
+ @cache_readonly
+ def groups(self):
+ return self.index.groupby(Categorical.from_codes(self.labels,
+ self.group_index))
+
+
+def _get_grouper(obj, key=None, axis=0, level=None, sort=True,
+ observed=False, mutated=False, validate=True):
+ """
+ create and return a BaseGrouper, which is an internal
+ mapping of how to create the grouper indexers.
+ This may be composed of multiple Grouping objects, indicating
+ multiple groupers
+
+ Groupers are ultimately index mappings. They can originate as:
+ index mappings, keys to columns, functions, or Groupers
+
+ Groupers enable local references to axis,level,sort, while
+ the passed in axis, level, and sort are 'global'.
+
+    This routine tries to figure out what the passed-in references
+ are and then creates a Grouping for each one, combined into
+ a BaseGrouper.
+
+ If observed & we have a categorical grouper, only show the observed
+ values
+
+ If validate, then check for key/level overlaps
+
+ """
+ group_axis = obj._get_axis(axis)
+
+ # validate that the passed single level is compatible with the passed
+ # axis of the object
+ if level is not None:
+ # TODO: These if-block and else-block are almost same.
+ # MultiIndex instance check is removable, but it seems that there are
+ # some processes only for non-MultiIndex in else-block,
+ # eg. `obj.index.name != level`. We have to consider carefully whether
+ # these are applicable for MultiIndex. Even if these are applicable,
+ # we need to check if it makes no side effect to subsequent processes
+ # on the outside of this condition.
+ # (GH 17621)
+ if isinstance(group_axis, MultiIndex):
+ if is_list_like(level) and len(level) == 1:
+ level = level[0]
+
+ if key is None and is_scalar(level):
+ # Get the level values from group_axis
+ key = group_axis.get_level_values(level)
+ level = None
+
+ else:
+ # allow level to be a length-one list-like object
+ # (e.g., level=[0])
+ # GH 13901
+ if is_list_like(level):
+ nlevels = len(level)
+ if nlevels == 1:
+ level = level[0]
+ elif nlevels == 0:
+ raise ValueError('No group keys passed!')
+ else:
+ raise ValueError('multiple levels only valid with '
+ 'MultiIndex')
+
+ if isinstance(level, compat.string_types):
+ if obj.index.name != level:
+ raise ValueError('level name {} is not the name of the '
+ 'index'.format(level))
+ elif level > 0 or level < -1:
+ raise ValueError(
+ 'level > 0 or level < -1 only valid with MultiIndex')
+
+ # NOTE: `group_axis` and `group_axis.get_level_values(level)`
+ # are same in this section.
+ level = None
+ key = group_axis
+
+ # a passed-in Grouper, directly convert
+ if isinstance(key, Grouper):
+ binner, grouper, obj = key._get_grouper(obj, validate=False)
+ if key.key is None:
+ return grouper, [], obj
+ else:
+ return grouper, {key.key}, obj
+
+ # already have a BaseGrouper, just return it
+ elif isinstance(key, BaseGrouper):
+ return key, [], obj
+
+ # In the future, a tuple key will always mean an actual key,
+ # not an iterable of keys. In the meantime, we attempt to provide
+ # a warning. We can assume that the user wanted a list of keys when
+ # the key is not in the index. We just have to be careful with
+    # unhashable elements of `key`. Any unhashable elements imply that
+ # they wanted a list of keys.
+ # https://github.com/pandas-dev/pandas/issues/18314
+ is_tuple = isinstance(key, tuple)
+ all_hashable = is_tuple and is_hashable(key)
+
+ if is_tuple:
+ if ((all_hashable and key not in obj and set(key).issubset(obj))
+ or not all_hashable):
+ # column names ('a', 'b') -> ['a', 'b']
+ # arrays like (a, b) -> [a, b]
+ msg = ("Interpreting tuple 'by' as a list of keys, rather than "
+ "a single key. Use 'by=[...]' instead of 'by=(...)'. In "
+ "the future, a tuple will always mean a single key.")
+ warnings.warn(msg, FutureWarning, stacklevel=5)
+ key = list(key)
+
+ if not isinstance(key, list):
+ keys = [key]
+ match_axis_length = False
+ else:
+ keys = key
+ match_axis_length = len(keys) == len(group_axis)
+
+ # what are we after, exactly?
+ any_callable = any(callable(g) or isinstance(g, dict) for g in keys)
+ any_groupers = any(isinstance(g, Grouper) for g in keys)
+ any_arraylike = any(isinstance(g, (list, tuple, Series, Index, np.ndarray))
+ for g in keys)
+
+ try:
+ if isinstance(obj, DataFrame):
+ all_in_columns_index = all(g in obj.columns or g in obj.index.names
+ for g in keys)
+ else:
+ all_in_columns_index = False
+ except Exception:
+ all_in_columns_index = False
+
+ if (not any_callable and not all_in_columns_index and
+ not any_arraylike and not any_groupers and
+ match_axis_length and level is None):
+ keys = [com.asarray_tuplesafe(keys)]
+
+ if isinstance(level, (tuple, list)):
+ if key is None:
+ keys = [None] * len(level)
+ levels = level
+ else:
+ levels = [level] * len(keys)
+
+ groupings = []
+ exclusions = []
+
+ # if the actual grouper should be obj[key]
+ def is_in_axis(key):
+ if not _is_label_like(key):
+ try:
+ obj._data.items.get_loc(key)
+ except Exception:
+ return False
+
+ return True
+
+ # if the grouper is obj[name]
+ def is_in_obj(gpr):
+ try:
+ return id(gpr) == id(obj[gpr.name])
+ except Exception:
+ return False
+
+ for i, (gpr, level) in enumerate(zip(keys, levels)):
+
+ if is_in_obj(gpr): # df.groupby(df['name'])
+ in_axis, name = True, gpr.name
+ exclusions.append(name)
+
+ elif is_in_axis(gpr): # df.groupby('name')
+ if gpr in obj:
+ if validate:
+ obj._check_label_or_level_ambiguity(gpr)
+ in_axis, name, gpr = True, gpr, obj[gpr]
+ exclusions.append(name)
+ elif obj._is_level_reference(gpr):
+ in_axis, name, level, gpr = False, None, gpr, None
+ else:
+ raise KeyError(gpr)
+ elif isinstance(gpr, Grouper) and gpr.key is not None:
+ # Add key to exclusions
+ exclusions.append(gpr.key)
+ in_axis, name = False, None
+ else:
+ in_axis, name = False, None
+
+ if is_categorical_dtype(gpr) and len(gpr) != obj.shape[axis]:
+ raise ValueError(
+ ("Length of grouper ({len_gpr}) and axis ({len_axis})"
+ " must be same length"
+ .format(len_gpr=len(gpr), len_axis=obj.shape[axis])))
+
+ # create the Grouping
+        # allow passing the actual Grouping as the gpr
+ ping = (Grouping(group_axis,
+ gpr,
+ obj=obj,
+ name=name,
+ level=level,
+ sort=sort,
+ observed=observed,
+ in_axis=in_axis)
+ if not isinstance(gpr, Grouping) else gpr)
+
+ groupings.append(ping)
+
+ if len(groupings) == 0:
+ raise ValueError('No group keys passed!')
+
+ # create the internals grouper
+ grouper = BaseGrouper(group_axis, groupings, sort=sort, mutated=mutated)
+ return grouper, exclusions, obj
+
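+# Editor's illustrative sketch (not part of the original source): internal
+# callers unpack this helper as
+#
+#   grouper, exclusions, obj = _get_grouper(df, key='a')   # 'a' is hypothetical
+#
+# where ``grouper`` is a BaseGrouper, ``exclusions`` lists the column names
+# consumed as keys (here ['a']), and ``obj`` is the target object, re-sorted
+# only when a passed pd.Grouper requires it.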
+
+def _is_label_like(val):
+ return (isinstance(val, (compat.string_types, tuple)) or
+ (val is not None and is_scalar(val)))
+
+
+def _convert_grouper(axis, grouper):
+ if isinstance(grouper, dict):
+ return grouper.get
+ elif isinstance(grouper, Series):
+ if grouper.index.equals(axis):
+ return grouper._values
+ else:
+ return grouper.reindex(axis)._values
+ elif isinstance(grouper, (list, Series, Index, np.ndarray)):
+ if len(grouper) != len(axis):
+ raise ValueError('Grouper and axis must be same length')
+ return grouper
+ else:
+ return grouper
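+
+
+# Editor's illustrative sketch (not part of the original source): the
+# normalisation above turns a dict grouper into its ``.get`` method and an
+# index-aligned Series into its underlying values, e.g. (names hypothetical)
+#
+#   _convert_grouper(df.index, {'x': 1, 'y': 2})   # -> bound dict.get
+#   _convert_grouper(df.index, df['a'])            # -> ndarray of df['a'] values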
diff --git a/contrib/python/pandas/py2/pandas/core/groupby/ops.py b/contrib/python/pandas/py2/pandas/core/groupby/ops.py
new file mode 100644
index 00000000000..87f48d5a405
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/core/groupby/ops.py
@@ -0,0 +1,898 @@
+"""
+Provide classes to perform the groupby aggregate operations.
+
+These are not exposed to the user and provide implementations of the grouping
+operations, primarily in cython. These classes (BaseGrouper and BinGrouper)
+are contained *in* the SeriesGroupBy and DataFrameGroupBy objects.
+"""
+
+import collections
+
+import numpy as np
+
+from pandas._libs import NaT, groupby as libgroupby, iNaT, lib, reduction
+from pandas.compat import lzip, range, zip
+from pandas.errors import AbstractMethodError
+from pandas.util._decorators import cache_readonly
+
+from pandas.core.dtypes.common import (
+ ensure_float64, ensure_int64, ensure_int64_or_float64, ensure_object,
+ ensure_platform_int, is_bool_dtype, is_categorical_dtype, is_complex_dtype,
+ is_datetime64_any_dtype, is_integer_dtype, is_numeric_dtype,
+ is_timedelta64_dtype, needs_i8_conversion)
+from pandas.core.dtypes.missing import _maybe_fill, isna
+
+import pandas.core.algorithms as algorithms
+from pandas.core.base import SelectionMixin
+import pandas.core.common as com
+from pandas.core.frame import DataFrame
+from pandas.core.generic import NDFrame
+from pandas.core.groupby import base
+from pandas.core.index import Index, MultiIndex, ensure_index
+from pandas.core.series import Series
+from pandas.core.sorting import (
+ compress_group_index, decons_obs_group_ids, get_flattened_iterator,
+ get_group_index, get_group_index_sorter, get_indexer_dict)
+
+
+def generate_bins_generic(values, binner, closed):
+ """
+ Generate bin edge offsets and bin labels for one array using another array
+ which has bin edge values. Both arrays must be sorted.
+
+ Parameters
+ ----------
+ values : array of values
+ binner : a comparable array of values representing bins into which to bin
+ the first array. Note, 'values' end-points must fall within 'binner'
+ end-points.
+ closed : which end of bin is closed; left (default), right
+
+ Returns
+ -------
+ bins : array of offsets (into 'values' argument) of bins.
+ Zero and last edge are excluded in result, so for instance the first
+        bin is values[0:bins[0]] and the last is values[bins[-1]:]
+ """
+ lenidx = len(values)
+ lenbin = len(binner)
+
+ if lenidx <= 0 or lenbin <= 0:
+ raise ValueError("Invalid length for values or for binner")
+
+ # check binner fits data
+ if values[0] < binner[0]:
+ raise ValueError("Values falls before first bin")
+
+ if values[lenidx - 1] > binner[lenbin - 1]:
+ raise ValueError("Values falls after last bin")
+
+ bins = np.empty(lenbin - 1, dtype=np.int64)
+
+ j = 0 # index into values
+ bc = 0 # bin count
+
+ # linear scan, presume nothing about values/binner except that it fits ok
+ for i in range(0, lenbin - 1):
+ r_bin = binner[i + 1]
+
+ # count values in current bin, advance to next bin
+ while j < lenidx and (values[j] < r_bin or
+ (closed == 'right' and values[j] == r_bin)):
+ j += 1
+
+ bins[bc] = j
+ bc += 1
+
+ return bins
+
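+# Editor's worked example (not part of the original source), tracing the loop
+# above:
+#
+#   values = np.array([1, 2, 3, 4, 5, 6])
+#   binner = np.array([1, 3, 6])
+#   generate_bins_generic(values, binner, closed='right')  # -> array([3, 6])
+#
+# i.e. the first bin is values[0:3] == [1, 2, 3] (right edge included) and
+# the second is values[3:6] == [4, 5, 6].
+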
+
+class BaseGrouper(object):
+ """
+ This is an internal Grouper class, which actually holds
+ the generated groups
+
+ Parameters
+ ----------
+ axis : int
+ the axis to group
+ groupings : array of grouping
+ all the grouping instances to handle in this grouper
+        for example, when grouping by a list of groupers, pass that list here
+ sort : boolean, default True
+ whether this grouper will give sorted result or not
+ group_keys : boolean, default True
+ mutated : boolean, default False
+ indexer : intp array, optional
+ the indexer created by Grouper
+        some groupers (TimeGrouper) sort their axis and their group_info
+        is also sorted, so the indexer is needed to reorder the result
+
+ """
+
+ def __init__(self, axis, groupings, sort=True, group_keys=True,
+ mutated=False, indexer=None):
+ self._filter_empty_groups = self.compressed = len(groupings) != 1
+ self.axis = axis
+ self.groupings = groupings
+ self.sort = sort
+ self.group_keys = group_keys
+ self.mutated = mutated
+ self.indexer = indexer
+
+ @property
+ def shape(self):
+ return tuple(ping.ngroups for ping in self.groupings)
+
+ def __iter__(self):
+ return iter(self.indices)
+
+ @property
+ def nkeys(self):
+ return len(self.groupings)
+
+ def get_iterator(self, data, axis=0):
+ """
+ Groupby iterator
+
+ Returns
+ -------
+ Generator yielding sequence of (name, subsetted object)
+ for each group
+ """
+ splitter = self._get_splitter(data, axis=axis)
+ keys = self._get_group_keys()
+ for key, (i, group) in zip(keys, splitter):
+ yield key, group
+
+ def _get_splitter(self, data, axis=0):
+ comp_ids, _, ngroups = self.group_info
+ return get_splitter(data, comp_ids, ngroups, axis=axis)
+
+ def _get_group_keys(self):
+ if len(self.groupings) == 1:
+ return self.levels[0]
+ else:
+ comp_ids, _, ngroups = self.group_info
+
+ # provide "flattened" iterator for multi-group setting
+ return get_flattened_iterator(comp_ids,
+ ngroups,
+ self.levels,
+ self.labels)
+
+ def apply(self, f, data, axis=0):
+ mutated = self.mutated
+ splitter = self._get_splitter(data, axis=axis)
+ group_keys = self._get_group_keys()
+
+ # oh boy
+ f_name = com.get_callable_name(f)
+ if (f_name not in base.plotting_methods and
+ hasattr(splitter, 'fast_apply') and axis == 0):
+ try:
+ values, mutated = splitter.fast_apply(f, group_keys)
+ return group_keys, values, mutated
+ except reduction.InvalidApply:
+ # we detect a mutation of some kind
+ # so take slow path
+ pass
+ except Exception:
+ # raise this error to the caller
+ pass
+
+ result_values = []
+ for key, (i, group) in zip(group_keys, splitter):
+ object.__setattr__(group, 'name', key)
+
+ # group might be modified
+ group_axes = _get_axes(group)
+ res = f(group)
+ if not _is_indexed_like(res, group_axes):
+ mutated = True
+ result_values.append(res)
+
+ return group_keys, result_values, mutated
+
+ @cache_readonly
+ def indices(self):
+ """ dict {group name -> group indices} """
+ if len(self.groupings) == 1:
+ return self.groupings[0].indices
+ else:
+ label_list = [ping.labels for ping in self.groupings]
+ keys = [com.values_from_object(ping.group_index)
+ for ping in self.groupings]
+ return get_indexer_dict(label_list, keys)
+
+ @property
+ def labels(self):
+ return [ping.labels for ping in self.groupings]
+
+ @property
+ def levels(self):
+ return [ping.group_index for ping in self.groupings]
+
+ @property
+ def names(self):
+ return [ping.name for ping in self.groupings]
+
+ def size(self):
+ """
+ Compute group sizes
+
+ """
+ ids, _, ngroup = self.group_info
+ ids = ensure_platform_int(ids)
+ if ngroup:
+ out = np.bincount(ids[ids != -1], minlength=ngroup)
+ else:
+ out = ids
+ return Series(out,
+ index=self.result_index,
+ dtype='int64')
+
+ @cache_readonly
+ def groups(self):
+ """ dict {group name -> group labels} """
+ if len(self.groupings) == 1:
+ return self.groupings[0].groups
+ else:
+ to_groupby = lzip(*(ping.grouper for ping in self.groupings))
+ to_groupby = Index(to_groupby)
+ return self.axis.groupby(to_groupby)
+
+ @cache_readonly
+ def is_monotonic(self):
+ # return if my group orderings are monotonic
+ return Index(self.group_info[0]).is_monotonic
+
+ @cache_readonly
+ def group_info(self):
+ comp_ids, obs_group_ids = self._get_compressed_labels()
+
+ ngroups = len(obs_group_ids)
+ comp_ids = ensure_int64(comp_ids)
+ return comp_ids, obs_group_ids, ngroups
+
+ @cache_readonly
+ def label_info(self):
+ # return the labels of items in original grouped axis
+ labels, _, _ = self.group_info
+ if self.indexer is not None:
+ sorter = np.lexsort((labels, self.indexer))
+ labels = labels[sorter]
+ return labels
+
+ def _get_compressed_labels(self):
+ all_labels = [ping.labels for ping in self.groupings]
+ if len(all_labels) > 1:
+ group_index = get_group_index(all_labels, self.shape,
+ sort=True, xnull=True)
+ return compress_group_index(group_index, sort=self.sort)
+
+ ping = self.groupings[0]
+ return ping.labels, np.arange(len(ping.group_index))
+
+ @cache_readonly
+ def ngroups(self):
+ return len(self.result_index)
+
+ @property
+ def recons_labels(self):
+ comp_ids, obs_ids, _ = self.group_info
+ labels = (ping.labels for ping in self.groupings)
+ return decons_obs_group_ids(
+ comp_ids, obs_ids, self.shape, labels, xnull=True)
+
+ @cache_readonly
+ def result_index(self):
+ if not self.compressed and len(self.groupings) == 1:
+ return self.groupings[0].result_index.rename(self.names[0])
+
+ codes = self.recons_labels
+ levels = [ping.result_index for ping in self.groupings]
+ result = MultiIndex(levels=levels,
+ codes=codes,
+ verify_integrity=False,
+ names=self.names)
+ return result
+
+ def get_group_levels(self):
+ if not self.compressed and len(self.groupings) == 1:
+ return [self.groupings[0].result_index]
+
+ name_list = []
+ for ping, labels in zip(self.groupings, self.recons_labels):
+ labels = ensure_platform_int(labels)
+ levels = ping.result_index.take(labels)
+
+ name_list.append(levels)
+
+ return name_list
+
+ # ------------------------------------------------------------
+ # Aggregation functions
+
+ _cython_functions = {
+ 'aggregate': {
+ 'add': 'group_add',
+ 'prod': 'group_prod',
+ 'min': 'group_min',
+ 'max': 'group_max',
+ 'mean': 'group_mean',
+ 'median': {
+ 'name': 'group_median'
+ },
+ 'var': 'group_var',
+ 'first': {
+ 'name': 'group_nth',
+ 'f': lambda func, a, b, c, d, e: func(a, b, c, d, 1, -1)
+ },
+ 'last': 'group_last',
+ 'ohlc': 'group_ohlc',
+ },
+
+ 'transform': {
+ 'cumprod': 'group_cumprod',
+ 'cumsum': 'group_cumsum',
+ 'cummin': 'group_cummin',
+ 'cummax': 'group_cummax',
+ 'rank': {
+ 'name': 'group_rank',
+ 'f': lambda func, a, b, c, d, **kwargs: func(
+ a, b, c, d,
+ kwargs.get('ties_method', 'average'),
+ kwargs.get('ascending', True),
+ kwargs.get('pct', False),
+ kwargs.get('na_option', 'keep')
+ )
+ }
+ }
+ }
+
+ _cython_arity = {
+ 'ohlc': 4, # OHLC
+ }
+
+ _name_functions = {
+ 'ohlc': lambda *args: ['open', 'high', 'low', 'close']
+ }
+
+ def _is_builtin_func(self, arg):
+ """
+ if we define an builtin function for this argument, return it,
+ otherwise return the arg
+ """
+ return SelectionMixin._builtin_table.get(arg, arg)
+
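+    # Editor's note (not part of the original source): the lookup above maps
+    # Python builtins onto their NumPy counterparts (e.g. sum -> np.sum,
+    # max -> np.max) so that later dispatch can treat them like the named
+    # reductions; any other callable is returned unchanged.
+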
+ def _get_cython_function(self, kind, how, values, is_numeric):
+
+ dtype_str = values.dtype.name
+
+ def get_func(fname):
+ # see if there is a fused-type version of function
+ # only valid for numeric
+ f = getattr(libgroupby, fname, None)
+ if f is not None and is_numeric:
+ return f
+
+ # otherwise find dtype-specific version, falling back to object
+ for dt in [dtype_str, 'object']:
+                f = getattr(libgroupby, "{fname}_{dt}".format(
+                    fname=fname, dt=dt), None)
+ if f is not None:
+ return f
+
+ ftype = self._cython_functions[kind][how]
+
+ if isinstance(ftype, dict):
+ func = afunc = get_func(ftype['name'])
+
+ # a sub-function
+ f = ftype.get('f')
+ if f is not None:
+
+ def wrapper(*args, **kwargs):
+ return f(afunc, *args, **kwargs)
+
+ # need to curry our sub-function
+ func = wrapper
+
+ else:
+ func = get_func(ftype)
+
+ if func is None:
+ raise NotImplementedError(
+ "function is not implemented for this dtype: "
+ "[how->{how},dtype->{dtype_str}]".format(how=how,
+ dtype_str=dtype_str))
+
+ return func
+
+ def _cython_operation(self, kind, values, how, axis, min_count=-1,
+ **kwargs):
+ assert kind in ['transform', 'aggregate']
+
+ # can we do this operation with our cython functions
+ # if not raise NotImplementedError
+
+ # we raise NotImplemented if this is an invalid operation
+ # entirely, e.g. adding datetimes
+
+ # categoricals are only 1d, so we
+ # are not setup for dim transforming
+ if is_categorical_dtype(values):
+ raise NotImplementedError(
+ "categoricals are not support in cython ops ATM")
+ elif is_datetime64_any_dtype(values):
+ if how in ['add', 'prod', 'cumsum', 'cumprod']:
+ raise NotImplementedError(
+ "datetime64 type does not support {} "
+ "operations".format(how))
+ elif is_timedelta64_dtype(values):
+ if how in ['prod', 'cumprod']:
+ raise NotImplementedError(
+ "timedelta64 type does not support {} "
+ "operations".format(how))
+
+ arity = self._cython_arity.get(how, 1)
+
+ vdim = values.ndim
+ swapped = False
+ if vdim == 1:
+ values = values[:, None]
+ out_shape = (self.ngroups, arity)
+ else:
+ if axis > 0:
+ swapped = True
+ values = values.swapaxes(0, axis)
+ if arity > 1:
+ raise NotImplementedError("arity of more than 1 is not "
+ "supported for the 'how' argument")
+ out_shape = (self.ngroups,) + values.shape[1:]
+
+ is_datetimelike = needs_i8_conversion(values.dtype)
+ is_numeric = is_numeric_dtype(values.dtype)
+
+ if is_datetimelike:
+ values = values.view('int64')
+ is_numeric = True
+ elif is_bool_dtype(values.dtype):
+ values = ensure_float64(values)
+ elif is_integer_dtype(values):
+ # we use iNaT for the missing value on ints
+ # so pre-convert to guard this condition
+ if (values == iNaT).any():
+ values = ensure_float64(values)
+ else:
+ values = ensure_int64_or_float64(values)
+ elif is_numeric and not is_complex_dtype(values):
+ values = ensure_float64(values)
+ else:
+ values = values.astype(object)
+
+ try:
+ func = self._get_cython_function(
+ kind, how, values, is_numeric)
+ except NotImplementedError:
+ if is_numeric:
+ values = ensure_float64(values)
+ func = self._get_cython_function(
+ kind, how, values, is_numeric)
+ else:
+ raise
+
+ if how == 'rank':
+ out_dtype = 'float'
+ else:
+ if is_numeric:
+ out_dtype = '{kind}{itemsize}'.format(
+ kind=values.dtype.kind, itemsize=values.dtype.itemsize)
+ else:
+ out_dtype = 'object'
+
+ labels, _, _ = self.group_info
+
+ if kind == 'aggregate':
+ result = _maybe_fill(np.empty(out_shape, dtype=out_dtype),
+ fill_value=np.nan)
+ counts = np.zeros(self.ngroups, dtype=np.int64)
+ result = self._aggregate(
+ result, counts, values, labels, func, is_numeric,
+ is_datetimelike, min_count)
+ elif kind == 'transform':
+ result = _maybe_fill(np.empty_like(values, dtype=out_dtype),
+ fill_value=np.nan)
+
+ # TODO: min_count
+ result = self._transform(
+ result, values, labels, func, is_numeric, is_datetimelike,
+ **kwargs)
+
+ if is_integer_dtype(result) and not is_datetimelike:
+ mask = result == iNaT
+ if mask.any():
+ result = result.astype('float64')
+ result[mask] = np.nan
+
+ if (kind == 'aggregate' and
+ self._filter_empty_groups and not counts.all()):
+ if result.ndim == 2:
+ try:
+ result = lib.row_bool_subset(
+ result, (counts > 0).view(np.uint8))
+ except ValueError:
+ result = lib.row_bool_subset_object(
+ ensure_object(result),
+ (counts > 0).view(np.uint8))
+ else:
+ result = result[counts > 0]
+
+ if vdim == 1 and arity == 1:
+ result = result[:, 0]
+
+ if how in self._name_functions:
+ # TODO
+ names = self._name_functions[how]()
+ else:
+ names = None
+
+ if swapped:
+ result = result.swapaxes(0, axis)
+
+ return result, names
+
+ def aggregate(self, values, how, axis=0, min_count=-1):
+ return self._cython_operation('aggregate', values, how, axis,
+ min_count=min_count)
+
+ def transform(self, values, how, axis=0, **kwargs):
+ return self._cython_operation('transform', values, how, axis, **kwargs)
+
+ def _aggregate(self, result, counts, values, comp_ids, agg_func,
+ is_numeric, is_datetimelike, min_count=-1):
+ if values.ndim > 3:
+ # punting for now
+ raise NotImplementedError("number of dimensions is currently "
+ "limited to 3")
+ elif values.ndim > 2:
+ for i, chunk in enumerate(values.transpose(2, 0, 1)):
+
+ chunk = chunk.squeeze()
+ agg_func(result[:, :, i], counts, chunk, comp_ids,
+ min_count)
+ else:
+ agg_func(result, counts, values, comp_ids, min_count)
+
+ return result
+
+ def _transform(self, result, values, comp_ids, transform_func,
+ is_numeric, is_datetimelike, **kwargs):
+
+ comp_ids, _, ngroups = self.group_info
+ if values.ndim > 3:
+ # punting for now
+ raise NotImplementedError("number of dimensions is currently "
+ "limited to 3")
+ elif values.ndim > 2:
+ for i, chunk in enumerate(values.transpose(2, 0, 1)):
+
+ transform_func(result[:, :, i], values,
+ comp_ids, is_datetimelike, **kwargs)
+ else:
+ transform_func(result, values, comp_ids, is_datetimelike, **kwargs)
+
+ return result
+
+ def agg_series(self, obj, func):
+ try:
+ return self._aggregate_series_fast(obj, func)
+ except Exception:
+ return self._aggregate_series_pure_python(obj, func)
+
+ def _aggregate_series_fast(self, obj, func):
+ func = self._is_builtin_func(func)
+
+ if obj.index._has_complex_internals:
+ raise TypeError('Incompatible index for Cython grouper')
+
+ group_index, _, ngroups = self.group_info
+
+ # avoids object / Series creation overhead
+ dummy = obj._get_values(slice(None, 0)).to_dense()
+ indexer = get_group_index_sorter(group_index, ngroups)
+ obj = obj._take(indexer).to_dense()
+ group_index = algorithms.take_nd(
+ group_index, indexer, allow_fill=False)
+ grouper = reduction.SeriesGrouper(obj, func, group_index, ngroups,
+ dummy)
+ result, counts = grouper.get_result()
+ return result, counts
+
+ def _aggregate_series_pure_python(self, obj, func):
+
+ group_index, _, ngroups = self.group_info
+
+ counts = np.zeros(ngroups, dtype=int)
+ result = None
+
+ splitter = get_splitter(obj, group_index, ngroups, axis=self.axis)
+
+ for label, group in splitter:
+ res = func(group)
+ if result is None:
+ if (isinstance(res, (Series, Index, np.ndarray))):
+ raise ValueError('Function does not reduce')
+ result = np.empty(ngroups, dtype='O')
+
+ counts[label] = group.shape[0]
+ result[label] = res
+
+ result = lib.maybe_convert_objects(result, try_float=0)
+ return result, counts
+
+
+class BinGrouper(BaseGrouper):
+
+ """
+ This is an internal Grouper class
+
+ Parameters
+ ----------
+ bins : the split index of binlabels to group the item of axis
+ binlabels : the label list
+ filter_empty : boolean, default False
+ mutated : boolean, default False
+    indexer : an intp array
+
+ Examples
+ --------
+ bins: [2, 4, 6, 8, 10]
+ binlabels: DatetimeIndex(['2005-01-01', '2005-01-03',
+ '2005-01-05', '2005-01-07', '2005-01-09'],
+ dtype='datetime64[ns]', freq='2D')
+
+    the group_info, which contains the label of each item on the grouped
+    axis, the index of each label in the label list, and the number of
+    groups, is
+
+    (array([0, 0, 1, 1, 2, 2, 3, 3, 4, 4]), array([0, 1, 2, 3, 4]), 5)
+
+    meaning that the grouped axis has 10 items grouped into 5 labels: the
+    first and second items belong to the first label, the third and
+    fourth items belong to the second label, and so on
+
+ """
+
+ def __init__(self, bins, binlabels, filter_empty=False, mutated=False,
+ indexer=None):
+ self.bins = ensure_int64(bins)
+ self.binlabels = ensure_index(binlabels)
+ self._filter_empty_groups = filter_empty
+ self.mutated = mutated
+ self.indexer = indexer
+
+ @cache_readonly
+ def groups(self):
+ """ dict {group name -> group labels} """
+
+ # this is mainly for compat
+ # GH 3881
+ result = {key: value for key, value in zip(self.binlabels, self.bins)
+ if key is not NaT}
+ return result
+
+ @property
+ def nkeys(self):
+ return 1
+
+ def get_iterator(self, data, axis=0):
+ """
+ Groupby iterator
+
+ Returns
+ -------
+ Generator yielding sequence of (name, subsetted object)
+ for each group
+ """
+ if isinstance(data, NDFrame):
+ slicer = lambda start, edge: data._slice(
+ slice(start, edge), axis=axis)
+ length = len(data.axes[axis])
+ else:
+ slicer = lambda start, edge: data[slice(start, edge)]
+ length = len(data)
+
+ start = 0
+ for edge, label in zip(self.bins, self.binlabels):
+ if label is not NaT:
+ yield label, slicer(start, edge)
+ start = edge
+
+ if start < length:
+ yield self.binlabels[-1], slicer(start, None)
+
+ @cache_readonly
+ def indices(self):
+ indices = collections.defaultdict(list)
+
+ i = 0
+ for label, bin in zip(self.binlabels, self.bins):
+ if i < bin:
+ if label is not NaT:
+ indices[label] = list(range(i, bin))
+ i = bin
+ return indices
+
+ @cache_readonly
+ def group_info(self):
+ ngroups = self.ngroups
+ obs_group_ids = np.arange(ngroups)
+ rep = np.diff(np.r_[0, self.bins])
+
+ rep = ensure_platform_int(rep)
+ if ngroups == len(self.bins):
+ comp_ids = np.repeat(np.arange(ngroups), rep)
+ else:
+ comp_ids = np.repeat(np.r_[-1, np.arange(ngroups)], rep)
+
+ return (comp_ids.astype('int64', copy=False),
+ obs_group_ids.astype('int64', copy=False),
+ ngroups)
+
+ @cache_readonly
+ def result_index(self):
+ if len(self.binlabels) != 0 and isna(self.binlabels[0]):
+ return self.binlabels[1:]
+
+ return self.binlabels
+
+ @property
+ def levels(self):
+ return [self.binlabels]
+
+ @property
+ def names(self):
+ return [self.binlabels.name]
+
+ @property
+ def groupings(self):
+ from pandas.core.groupby.grouper import Grouping
+ return [Grouping(lvl, lvl, in_axis=False, level=None, name=name)
+ for lvl, name in zip(self.levels, self.names)]
+
+ def agg_series(self, obj, func):
+ dummy = obj[:0]
+ grouper = reduction.SeriesBinGrouper(obj, func, self.bins, dummy)
+ return grouper.get_result()
+
+
+def _get_axes(group):
+ if isinstance(group, Series):
+ return [group.index]
+ else:
+ return group.axes
+
+
+def _is_indexed_like(obj, axes):
+ if isinstance(obj, Series):
+ if len(axes) > 1:
+ return False
+ return obj.index.equals(axes[0])
+ elif isinstance(obj, DataFrame):
+ return obj.index.equals(axes[0])
+
+ return False
+
+
+# ----------------------------------------------------------------------
+# Splitting / application
+
+
+class DataSplitter(object):
+
+ def __init__(self, data, labels, ngroups, axis=0):
+ self.data = data
+ self.labels = ensure_int64(labels)
+ self.ngroups = ngroups
+
+ self.axis = axis
+
+ @cache_readonly
+ def slabels(self):
+ # Sorted labels
+ return algorithms.take_nd(self.labels, self.sort_idx, allow_fill=False)
+
+ @cache_readonly
+ def sort_idx(self):
+ # Counting sort indexer
+ return get_group_index_sorter(self.labels, self.ngroups)
+
+ def __iter__(self):
+ sdata = self._get_sorted_data()
+
+ if self.ngroups == 0:
+ # we are inside a generator, rather than raise StopIteration
+ # we merely return signal the end
+ return
+
+ starts, ends = lib.generate_slices(self.slabels, self.ngroups)
+
+ for i, (start, end) in enumerate(zip(starts, ends)):
+ # Since I'm now compressing the group ids, it's now not "possible"
+ # to produce empty slices because such groups would not be observed
+ # in the data
+ # if start >= end:
+ # raise AssertionError('Start %s must be less than end %s'
+ # % (str(start), str(end)))
+ yield i, self._chop(sdata, slice(start, end))
+
+ def _get_sorted_data(self):
+ return self.data._take(self.sort_idx, axis=self.axis)
+
+ def _chop(self, sdata, slice_obj):
+ return sdata.iloc[slice_obj]
+
+ def apply(self, f):
+ raise AbstractMethodError(self)
+
+
+class SeriesSplitter(DataSplitter):
+
+ def _chop(self, sdata, slice_obj):
+ return sdata._get_values(slice_obj).to_dense()
+
+
+class FrameSplitter(DataSplitter):
+
+ def fast_apply(self, f, names):
+ # must return keys::list, values::list, mutated::bool
+ try:
+ starts, ends = lib.generate_slices(self.slabels, self.ngroups)
+ except Exception:
+ # fails when all -1
+ return [], True
+
+ sdata = self._get_sorted_data()
+ results, mutated = reduction.apply_frame_axis0(sdata, f, names,
+ starts, ends)
+
+ return results, mutated
+
+ def _chop(self, sdata, slice_obj):
+ if self.axis == 0:
+ return sdata.iloc[slice_obj]
+ else:
+ return sdata._slice(slice_obj, axis=1) # .loc[:, slice_obj]
+
+
+class NDFrameSplitter(DataSplitter):
+
+ def __init__(self, data, labels, ngroups, axis=0):
+ super(NDFrameSplitter, self).__init__(data, labels, ngroups, axis=axis)
+
+ self.factory = data._constructor
+
+ def _get_sorted_data(self):
+ # this is the BlockManager
+ data = self.data._data
+
+ # this is sort of wasteful but...
+ sorted_axis = data.axes[self.axis].take(self.sort_idx)
+ sorted_data = data.reindex_axis(sorted_axis, axis=self.axis)
+
+ return sorted_data
+
+ def _chop(self, sdata, slice_obj):
+ return self.factory(sdata.get_slice(slice_obj, axis=self.axis))
+
+
+def get_splitter(data, *args, **kwargs):
+ if isinstance(data, Series):
+ klass = SeriesSplitter
+ elif isinstance(data, DataFrame):
+ klass = FrameSplitter
+ else:
+ klass = NDFrameSplitter
+
+ return klass(data, *args, **kwargs)
diff --git a/contrib/python/pandas/py2/pandas/core/index.py b/contrib/python/pandas/py2/pandas/core/index.py
new file mode 100644
index 00000000000..2d1c22f5623
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/core/index.py
@@ -0,0 +1,3 @@
+# flake8: noqa
+from pandas.core.indexes.api import *
+from pandas.core.indexes.multi import _sparsify
diff --git a/contrib/python/pandas/py2/pandas/core/indexes/__init__.py b/contrib/python/pandas/py2/pandas/core/indexes/__init__.py
new file mode 100644
index 00000000000..e69de29bb2d
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/core/indexes/__init__.py
diff --git a/contrib/python/pandas/py2/pandas/core/indexes/accessors.py b/contrib/python/pandas/py2/pandas/core/indexes/accessors.py
new file mode 100644
index 00000000000..c43469d3c3a
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/core/indexes/accessors.py
@@ -0,0 +1,325 @@
+"""
+datetimelike delegation
+"""
+import numpy as np
+
+from pandas.core.dtypes.common import (
+ is_categorical_dtype, is_datetime64_dtype, is_datetime64tz_dtype,
+ is_datetime_arraylike, is_integer_dtype, is_list_like, is_period_arraylike,
+ is_timedelta64_dtype)
+from pandas.core.dtypes.generic import ABCSeries
+
+from pandas.core.accessor import PandasDelegate, delegate_names
+from pandas.core.algorithms import take_1d
+from pandas.core.arrays import DatetimeArray, PeriodArray, TimedeltaArray
+from pandas.core.base import NoNewAttributesMixin, PandasObject
+from pandas.core.indexes.datetimes import DatetimeIndex
+from pandas.core.indexes.timedeltas import TimedeltaIndex
+
+
+class Properties(PandasDelegate, PandasObject, NoNewAttributesMixin):
+
+ def __init__(self, data, orig):
+ if not isinstance(data, ABCSeries):
+ raise TypeError("cannot convert an object of type {0} to a "
+ "datetimelike index".format(type(data)))
+
+ self._parent = data
+ self.orig = orig
+ self.name = getattr(data, 'name', None)
+ self._freeze()
+
+ def _get_values(self):
+ data = self._parent
+ if is_datetime64_dtype(data.dtype):
+ return DatetimeIndex(data, copy=False, name=self.name)
+
+ elif is_datetime64tz_dtype(data.dtype):
+ return DatetimeIndex(data, copy=False, name=self.name)
+
+ elif is_timedelta64_dtype(data.dtype):
+ return TimedeltaIndex(data, copy=False, name=self.name)
+
+ else:
+ if is_period_arraylike(data):
+ # TODO: use to_period_array
+ return PeriodArray(data, copy=False)
+ if is_datetime_arraylike(data):
+ return DatetimeIndex(data, copy=False, name=self.name)
+
+ raise TypeError("cannot convert an object of type {0} to a "
+ "datetimelike index".format(type(data)))
+
+ def _delegate_property_get(self, name):
+ from pandas import Series
+ values = self._get_values()
+
+ result = getattr(values, name)
+
+ # maybe need to upcast (ints)
+ if isinstance(result, np.ndarray):
+ if is_integer_dtype(result):
+ result = result.astype('int64')
+ elif not is_list_like(result):
+ return result
+
+ result = np.asarray(result)
+
+ # blow up if we operate on categories
+ if self.orig is not None:
+ result = take_1d(result, self.orig.cat.codes)
+ index = self.orig.index
+ else:
+ index = self._parent.index
+ # return the result as a Series, which is by definition a copy
+ result = Series(result, index=index, name=self.name)
+
+ # setting this object will show a SettingWithCopyWarning/Error
+ result._is_copy = ("modifications to a property of a datetimelike "
+ "object are not supported and are discarded. "
+ "Change values on the original.")
+
+ return result
+
+ def _delegate_property_set(self, name, value, *args, **kwargs):
+ raise ValueError("modifications to a property of a datetimelike "
+ "object are not supported. Change values on the "
+ "original.")
+
+ def _delegate_method(self, name, *args, **kwargs):
+ from pandas import Series
+ values = self._get_values()
+
+ method = getattr(values, name)
+ result = method(*args, **kwargs)
+
+ if not is_list_like(result):
+ return result
+
+ result = Series(result, index=self._parent.index, name=self.name)
+
+ # setting this object will show a SettingWithCopyWarning/Error
+ result._is_copy = ("modifications to a method of a datetimelike "
+ "object are not supported and are discarded. "
+ "Change values on the original.")
+
+ return result
+
+
+@delegate_names(delegate=DatetimeArray,
+ accessors=DatetimeArray._datetimelike_ops,
+ typ="property")
+@delegate_names(delegate=DatetimeArray,
+ accessors=DatetimeArray._datetimelike_methods,
+ typ="method")
+class DatetimeProperties(Properties):
+ """
+ Accessor object for datetimelike properties of the Series values.
+
+ Examples
+ --------
+ >>> s.dt.hour
+ >>> s.dt.second
+ >>> s.dt.quarter
+
+ Returns a Series indexed like the original Series.
+ Raises TypeError if the Series does not contain datetimelike values.
+ """
+
+ def to_pydatetime(self):
+ """
+ Return the data as an array of native Python datetime objects.
+
+ Timezone information is retained if present.
+
+ .. warning::
+
+ Python's datetime uses microsecond resolution, which is lower than
+ pandas (nanosecond). The values are truncated.
+
+ Returns
+ -------
+ numpy.ndarray
+ object dtype array containing native Python datetime objects.
+
+ See Also
+ --------
+ datetime.datetime : Standard library value for a datetime.
+
+ Examples
+ --------
+ >>> s = pd.Series(pd.date_range('20180310', periods=2))
+ >>> s
+ 0 2018-03-10
+ 1 2018-03-11
+ dtype: datetime64[ns]
+
+ >>> s.dt.to_pydatetime()
+ array([datetime.datetime(2018, 3, 10, 0, 0),
+ datetime.datetime(2018, 3, 11, 0, 0)], dtype=object)
+
+ pandas' nanosecond precision is truncated to microseconds.
+
+ >>> s = pd.Series(pd.date_range('20180310', periods=2, freq='ns'))
+ >>> s
+ 0 2018-03-10 00:00:00.000000000
+ 1 2018-03-10 00:00:00.000000001
+ dtype: datetime64[ns]
+
+ >>> s.dt.to_pydatetime()
+ array([datetime.datetime(2018, 3, 10, 0, 0),
+ datetime.datetime(2018, 3, 10, 0, 0)], dtype=object)
+ """
+ return self._get_values().to_pydatetime()
+
+ @property
+ def freq(self):
+ return self._get_values().inferred_freq
+
+
+@delegate_names(delegate=TimedeltaArray,
+ accessors=TimedeltaArray._datetimelike_ops,
+ typ="property")
+@delegate_names(delegate=TimedeltaArray,
+ accessors=TimedeltaArray._datetimelike_methods,
+ typ="method")
+class TimedeltaProperties(Properties):
+ """
+ Accessor object for datetimelike properties of the Series values.
+
+ Examples
+ --------
+    >>> s.dt.days
+    >>> s.dt.seconds
+
+ Returns a Series indexed like the original Series.
+ Raises TypeError if the Series does not contain datetimelike values.
+ """
+
+ def to_pytimedelta(self):
+ """
+ Return an array of native `datetime.timedelta` objects.
+
+        Python's standard `datetime` library uses a different representation
+        for timedelta objects. This method converts a Series of pandas
+        Timedeltas to `datetime.timedelta` objects with the same length as
+        the original Series.
+
+ Returns
+ -------
+        numpy.ndarray
+            1D array containing data with `datetime.timedelta` type.
+
+ See Also
+ --------
+ datetime.timedelta
+
+ Examples
+ --------
+ >>> s = pd.Series(pd.to_timedelta(np.arange(5), unit='d'))
+ >>> s
+ 0 0 days
+ 1 1 days
+ 2 2 days
+ 3 3 days
+ 4 4 days
+ dtype: timedelta64[ns]
+
+ >>> s.dt.to_pytimedelta()
+ array([datetime.timedelta(0), datetime.timedelta(1),
+ datetime.timedelta(2), datetime.timedelta(3),
+ datetime.timedelta(4)], dtype=object)
+ """
+ return self._get_values().to_pytimedelta()
+
+ @property
+ def components(self):
+ """
+        Return a DataFrame of the components of the Timedeltas.
+
+ Returns
+ -------
+ DataFrame
+
+ Examples
+ --------
+ >>> s = pd.Series(pd.to_timedelta(np.arange(5), unit='s'))
+ >>> s
+ 0 00:00:00
+ 1 00:00:01
+ 2 00:00:02
+ 3 00:00:03
+ 4 00:00:04
+ dtype: timedelta64[ns]
+ >>> s.dt.components
+ days hours minutes seconds milliseconds microseconds nanoseconds
+ 0 0 0 0 0 0 0 0
+ 1 0 0 0 1 0 0 0
+ 2 0 0 0 2 0 0 0
+ 3 0 0 0 3 0 0 0
+ 4 0 0 0 4 0 0 0
+ """ # noqa: E501
+ return self._get_values().components.set_index(self._parent.index)
+
+ @property
+ def freq(self):
+ return self._get_values().inferred_freq
+
+
+@delegate_names(delegate=PeriodArray,
+ accessors=PeriodArray._datetimelike_ops,
+ typ="property")
+@delegate_names(delegate=PeriodArray,
+ accessors=PeriodArray._datetimelike_methods,
+ typ="method")
+class PeriodProperties(Properties):
+ """
+ Accessor object for datetimelike properties of the Series values.
+
+ Examples
+ --------
+ >>> s.dt.hour
+ >>> s.dt.second
+ >>> s.dt.quarter
+
+ Returns a Series indexed like the original Series.
+ Raises TypeError if the Series does not contain datetimelike values.
+ """
+
+
+class CombinedDatetimelikeProperties(DatetimeProperties,
+ TimedeltaProperties, PeriodProperties):
+
+ def __new__(cls, data):
+ # CombinedDatetimelikeProperties isn't really instantiated. Instead
+ # we need to choose which parent (datetime or timedelta) is
+ # appropriate. Since we're checking the dtypes anyway, we'll just
+ # do all the validation here.
+ from pandas import Series
+
+ if not isinstance(data, Series):
+ raise TypeError("cannot convert an object of type {0} to a "
+ "datetimelike index".format(type(data)))
+
+ orig = data if is_categorical_dtype(data) else None
+ if orig is not None:
+ data = Series(orig.values.categories,
+ name=orig.name,
+ copy=False)
+
+ try:
+ if is_datetime64_dtype(data.dtype):
+ return DatetimeProperties(data, orig)
+ elif is_datetime64tz_dtype(data.dtype):
+ return DatetimeProperties(data, orig)
+ elif is_timedelta64_dtype(data.dtype):
+ return TimedeltaProperties(data, orig)
+ elif is_period_arraylike(data):
+ return PeriodProperties(data, orig)
+ elif is_datetime_arraylike(data):
+ return DatetimeProperties(data, orig)
+ except Exception:
+ pass # we raise an attribute error anyway
+
+ raise AttributeError("Can only use .dt accessor with datetimelike "
+ "values")
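+
+
+# A minimal illustrative sketch of the dtype-based dispatch performed above
+# (the Series values below are assumed examples, not part of the accessor
+# machinery itself):
+#
+# >>> import pandas as pd
+# >>> type(pd.Series(pd.date_range('2018-03-10', periods=2)).dt).__name__
+# 'DatetimeProperties'
+# >>> type(pd.Series(pd.to_timedelta(['1 days', '2 days'])).dt).__name__
+# 'TimedeltaProperties'
+# >>> pd.Series(['a', 'b']).dt
+# Traceback (most recent call last):
+# ...
+# AttributeError: Can only use .dt accessor with datetimelike values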
diff --git a/contrib/python/pandas/py2/pandas/core/indexes/api.py b/contrib/python/pandas/py2/pandas/core/indexes/api.py
new file mode 100644
index 00000000000..6299fc482d0
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/core/indexes/api.py
@@ -0,0 +1,286 @@
+import textwrap
+import warnings
+
+from pandas._libs import NaT, lib
+
+import pandas.core.common as com
+from pandas.core.indexes.base import (
+ Index, _new_Index, ensure_index, ensure_index_from_sequences)
+from pandas.core.indexes.base import InvalidIndexError # noqa:F401
+from pandas.core.indexes.category import CategoricalIndex # noqa:F401
+from pandas.core.indexes.datetimes import DatetimeIndex
+from pandas.core.indexes.interval import IntervalIndex # noqa:F401
+from pandas.core.indexes.multi import MultiIndex # noqa:F401
+from pandas.core.indexes.numeric import ( # noqa:F401
+ Float64Index, Int64Index, NumericIndex, UInt64Index)
+from pandas.core.indexes.period import PeriodIndex
+from pandas.core.indexes.range import RangeIndex # noqa:F401
+from pandas.core.indexes.timedeltas import TimedeltaIndex
+
+_sort_msg = textwrap.dedent("""\
+Sorting because non-concatenation axis is not aligned. A future version
+of pandas will change to not sort by default.
+
+To accept the future behavior, pass 'sort=False'.
+
+To retain the current behavior and silence the warning, pass 'sort=True'.
+""")
+
+
+# TODO: there are many places that rely on these private methods existing in
+# pandas.core.index
+__all__ = ['Index', 'MultiIndex', 'NumericIndex', 'Float64Index', 'Int64Index',
+ 'CategoricalIndex', 'IntervalIndex', 'RangeIndex', 'UInt64Index',
+ 'InvalidIndexError', 'TimedeltaIndex',
+ 'PeriodIndex', 'DatetimeIndex',
+ '_new_Index', 'NaT',
+ 'ensure_index', 'ensure_index_from_sequences',
+ '_get_combined_index',
+ '_get_objs_combined_axis', '_union_indexes',
+ '_get_consensus_names',
+ '_all_indexes_same']
+
+
+def _get_objs_combined_axis(objs, intersect=False, axis=0, sort=True):
+ """
+ Extract combined index: return intersection or union (depending on the
+ value of "intersect") of indexes on given axis, or None if all objects
+ lack indexes (e.g. they are numpy arrays).
+
+ Parameters
+ ----------
+ objs : list of objects
+ Each object will only be considered if it has a _get_axis
+ attribute.
+ intersect : bool, default False
+ If True, calculate the intersection between indexes. Otherwise,
+ calculate the union.
+    axis : {0 or 'index', 1 or 'columns'}, default 0
+ The axis to extract indexes from.
+ sort : bool, default True
+ Whether the result index should come out sorted or not.
+
+ Returns
+ -------
+ Index
+ """
+ obs_idxes = [obj._get_axis(axis) for obj in objs
+ if hasattr(obj, '_get_axis')]
+ if obs_idxes:
+ return _get_combined_index(obs_idxes, intersect=intersect, sort=sort)
+
+
+def _get_distinct_objs(objs):
+ """
+ Return a list with distinct elements of "objs" (different ids).
+ Preserves order.
+ """
+ ids = set()
+ res = []
+ for obj in objs:
+ if not id(obj) in ids:
+ ids.add(id(obj))
+ res.append(obj)
+ return res
+
+
+def _get_combined_index(indexes, intersect=False, sort=False):
+ """
+ Return the union or intersection of indexes.
+
+ Parameters
+ ----------
+ indexes : list of Index or list objects
+ When intersect=True, do not accept list of lists.
+ intersect : bool, default False
+ If True, calculate the intersection between indexes. Otherwise,
+ calculate the union.
+ sort : bool, default False
+ Whether the result index should come out sorted or not.
+
+ Returns
+ -------
+ Index
+ """
+
+ # TODO: handle index names!
+ indexes = _get_distinct_objs(indexes)
+ if len(indexes) == 0:
+ index = Index([])
+ elif len(indexes) == 1:
+ index = indexes[0]
+ elif intersect:
+ index = indexes[0]
+ for other in indexes[1:]:
+ index = index.intersection(other)
+ else:
+ index = _union_indexes(indexes, sort=sort)
+ index = ensure_index(index)
+
+ if sort:
+ try:
+ index = index.sort_values()
+ except TypeError:
+ pass
+ return index
+
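+# A minimal illustrative sketch of the helper above (the indexes are assumed
+# example values):
+#
+# >>> left, right = Index([1, 2, 3]), Index([2, 3, 4])
+# >>> _get_combined_index([left, right])                  # union (default)
+# Int64Index([1, 2, 3, 4], dtype='int64')
+# >>> _get_combined_index([left, right], intersect=True)  # intersection
+# Int64Index([2, 3], dtype='int64')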
+
+def _union_indexes(indexes, sort=True):
+ """
+ Return the union of indexes.
+
+    The behavior of ``sort`` and the resulting ``names`` is not consistent
+    across the different kinds of indexes handled here.
+
+ Parameters
+ ----------
+ indexes : list of Index or list objects
+ sort : bool, default True
+ Whether the result index should come out sorted or not.
+
+ Returns
+ -------
+ Index
+ """
+ if len(indexes) == 0:
+ raise AssertionError('Must have at least 1 Index to union')
+ if len(indexes) == 1:
+ result = indexes[0]
+ if isinstance(result, list):
+ result = Index(sorted(result))
+ return result
+
+ indexes, kind = _sanitize_and_check(indexes)
+
+ def _unique_indices(inds):
+ """
+ Convert indexes to lists and concatenate them, removing duplicates.
+
+ The final dtype is inferred.
+
+ Parameters
+ ----------
+ inds : list of Index or list objects
+
+ Returns
+ -------
+ Index
+ """
+ def conv(i):
+ if isinstance(i, Index):
+ i = i.tolist()
+ return i
+
+ return Index(
+ lib.fast_unique_multiple_list([conv(i) for i in inds], sort=sort))
+
+ if kind == 'special':
+ result = indexes[0]
+
+ if hasattr(result, 'union_many'):
+ return result.union_many(indexes[1:])
+ else:
+ for other in indexes[1:]:
+ result = result.union(other)
+ return result
+ elif kind == 'array':
+ index = indexes[0]
+ for other in indexes[1:]:
+ if not index.equals(other):
+
+ if sort is None:
+ # TODO: remove once pd.concat sort default changes
+ warnings.warn(_sort_msg, FutureWarning, stacklevel=8)
+ sort = True
+
+ return _unique_indices(indexes)
+
+ name = _get_consensus_names(indexes)[0]
+ if name != index.name:
+ index = index._shallow_copy(name=name)
+ return index
+ else: # kind='list'
+ return _unique_indices(indexes)
+
+
+def _sanitize_and_check(indexes):
+ """
+ Verify the type of indexes and convert lists to Index.
+
+ Cases:
+
+ - [list, list, ...]: Return ([list, list, ...], 'list')
+ - [list, Index, ...]: Return _sanitize_and_check([Index, Index, ...])
+ Lists are sorted and converted to Index.
+ - [Index, Index, ...]: Return ([Index, Index, ...], TYPE)
+ TYPE = 'special' if at least one special type, 'array' otherwise.
+
+ Parameters
+ ----------
+ indexes : list of Index or list objects
+
+ Returns
+ -------
+ sanitized_indexes : list of Index or list objects
+ type : {'list', 'array', 'special'}
+ """
+ kinds = list({type(index) for index in indexes})
+
+ if list in kinds:
+ if len(kinds) > 1:
+ indexes = [Index(com.try_sort(x))
+ if not isinstance(x, Index) else
+ x for x in indexes]
+ kinds.remove(list)
+ else:
+ return indexes, 'list'
+
+ if len(kinds) > 1 or Index not in kinds:
+ return indexes, 'special'
+ else:
+ return indexes, 'array'
+
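+# A minimal illustrative sketch of the classification above (the inputs are
+# assumed examples):
+#
+# >>> _sanitize_and_check([[1, 2], [2, 3]])[1]
+# 'list'
+# >>> _sanitize_and_check([Index(['a', 'b']), Index(['b', 'c'])])[1]
+# 'array'
+# >>> dti = DatetimeIndex(['2018-01-01'])
+# >>> _sanitize_and_check([Index(['a', 'b']), dti])[1]
+# 'special'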
+
+def _get_consensus_names(indexes):
+ """
+ Give a consensus 'names' to indexes.
+
+    If there is exactly one non-empty 'names', return it;
+    otherwise, return a list of None values.
+
+ Parameters
+ ----------
+ indexes : list of Index objects
+
+ Returns
+ -------
+ list
+ A list representing the consensus 'names' found.
+ """
+
+ # find the non-none names, need to tupleify to make
+ # the set hashable, then reverse on return
+ consensus_names = {tuple(i.names) for i in indexes
+ if com._any_not_none(*i.names)}
+ if len(consensus_names) == 1:
+ return list(list(consensus_names)[0])
+ return [None] * indexes[0].nlevels
+
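+# A minimal illustrative sketch of the consensus rule above (the names are
+# assumed examples):
+#
+# >>> _get_consensus_names([Index([1, 2], name='x'), Index([3], name='x')])
+# ['x']
+# >>> _get_consensus_names([Index([1, 2], name='x'), Index([3], name='y')])
+# [None]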
+
+def _all_indexes_same(indexes):
+ """
+ Determine if all indexes contain the same elements.
+
+ Parameters
+ ----------
+ indexes : list of Index objects
+
+ Returns
+ -------
+ bool
+ True if all indexes contain the same elements, False otherwise.
+ """
+ first = indexes[0]
+ for index in indexes[1:]:
+ if not first.equals(index):
+ return False
+ return True
diff --git a/contrib/python/pandas/py2/pandas/core/indexes/base.py b/contrib/python/pandas/py2/pandas/core/indexes/base.py
new file mode 100644
index 00000000000..55a32f1fd4f
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/core/indexes/base.py
@@ -0,0 +1,5410 @@
+from datetime import datetime, timedelta
+import operator
+from textwrap import dedent
+import warnings
+
+import numpy as np
+
+from pandas._libs import (
+ Timedelta, algos as libalgos, index as libindex, join as libjoin, lib,
+ tslibs)
+from pandas._libs.lib import is_datetime_array
+import pandas.compat as compat
+from pandas.compat import range, set_function_name, u
+from pandas.compat.numpy import function as nv
+from pandas.util._decorators import Appender, Substitution, cache_readonly
+
+from pandas.core.dtypes.cast import maybe_cast_to_integer_array
+from pandas.core.dtypes.common import (
+ ensure_categorical, ensure_int64, ensure_object, ensure_platform_int,
+ is_bool, is_bool_dtype, is_categorical, is_categorical_dtype,
+ is_datetime64_any_dtype, is_datetime64tz_dtype, is_dtype_equal,
+ is_dtype_union_equal, is_extension_array_dtype, is_float, is_float_dtype,
+ is_hashable, is_integer, is_integer_dtype, is_interval_dtype, is_iterator,
+ is_list_like, is_object_dtype, is_period_dtype, is_scalar,
+ is_signed_integer_dtype, is_timedelta64_dtype, is_unsigned_integer_dtype,
+ pandas_dtype)
+import pandas.core.dtypes.concat as _concat
+from pandas.core.dtypes.generic import (
+ ABCDataFrame, ABCDateOffset, ABCDatetimeArray, ABCIndexClass,
+ ABCMultiIndex, ABCPandasArray, ABCPeriodIndex, ABCSeries,
+ ABCTimedeltaArray, ABCTimedeltaIndex)
+from pandas.core.dtypes.missing import array_equivalent, isna
+
+from pandas.core import ops
+from pandas.core.accessor import CachedAccessor, DirNamesMixin
+import pandas.core.algorithms as algos
+from pandas.core.arrays import ExtensionArray
+from pandas.core.base import IndexOpsMixin, PandasObject
+import pandas.core.common as com
+from pandas.core.indexes.frozen import FrozenList
+import pandas.core.missing as missing
+from pandas.core.ops import get_op_result_name, make_invalid_op
+import pandas.core.sorting as sorting
+from pandas.core.strings import StringMethods
+
+from pandas.io.formats.printing import (
+ default_pprint, format_object_attrs, format_object_summary, pprint_thing)
+
+__all__ = ['Index']
+
+_unsortable_types = frozenset(('mixed', 'mixed-integer'))
+
+_index_doc_kwargs = dict(klass='Index', inplace='',
+ target_klass='Index',
+ unique='Index', duplicated='np.ndarray')
+_index_shared_docs = dict()
+
+
+def _try_get_item(x):
+ try:
+ return x.item()
+ except AttributeError:
+ return x
+
+
+def _make_comparison_op(op, cls):
+ def cmp_method(self, other):
+ if isinstance(other, (np.ndarray, Index, ABCSeries)):
+ if other.ndim > 0 and len(self) != len(other):
+ raise ValueError('Lengths must match to compare')
+
+ if is_object_dtype(self) and not isinstance(self, ABCMultiIndex):
+ # don't pass MultiIndex
+ with np.errstate(all='ignore'):
+ result = ops._comp_method_OBJECT_ARRAY(op, self.values, other)
+
+ else:
+
+ # numpy will show a DeprecationWarning on invalid elementwise
+ # comparisons, this will raise in the future
+ with warnings.catch_warnings(record=True):
+ warnings.filterwarnings("ignore", "elementwise", FutureWarning)
+ with np.errstate(all='ignore'):
+ result = op(self.values, np.asarray(other))
+
+ # technically we could support bool dtyped Index
+ # for now just return the indexing array directly
+ if is_bool_dtype(result):
+ return result
+ try:
+ return Index(result)
+ except TypeError:
+ return result
+
+ name = '__{name}__'.format(name=op.__name__)
+ # TODO: docstring?
+ return set_function_name(cmp_method, name, cls)
+
+
+def _make_arithmetic_op(op, cls):
+ def index_arithmetic_method(self, other):
+ if isinstance(other, (ABCSeries, ABCDataFrame)):
+ return NotImplemented
+ elif isinstance(other, ABCTimedeltaIndex):
+ # Defer to subclass implementation
+ return NotImplemented
+ elif (isinstance(other, (np.ndarray, ABCTimedeltaArray)) and
+ is_timedelta64_dtype(other)):
+ # GH#22390; wrap in Series for op, this will in turn wrap in
+ # TimedeltaIndex, but will correctly raise TypeError instead of
+ # NullFrequencyError for add/sub ops
+ from pandas import Series
+ other = Series(other)
+ out = op(self, other)
+ return Index(out, name=self.name)
+
+ other = self._validate_for_numeric_binop(other, op)
+
+ # handle time-based others
+ if isinstance(other, (ABCDateOffset, np.timedelta64, timedelta)):
+ return self._evaluate_with_timedelta_like(other, op)
+ elif isinstance(other, (datetime, np.datetime64)):
+ return self._evaluate_with_datetime_like(other, op)
+
+ values = self.values
+ with np.errstate(all='ignore'):
+ result = op(values, other)
+
+ result = missing.dispatch_missing(op, values, other, result)
+
+ attrs = self._get_attributes_dict()
+ attrs = self._maybe_update_attributes(attrs)
+ if op is divmod:
+ result = (Index(result[0], **attrs), Index(result[1], **attrs))
+ else:
+ result = Index(result, **attrs)
+ return result
+
+ name = '__{name}__'.format(name=op.__name__)
+ # TODO: docstring?
+ return set_function_name(index_arithmetic_method, name, cls)
+
+
+class InvalidIndexError(Exception):
+ pass
+
+
+_o_dtype = np.dtype(object)
+_Identity = object
+
+
+def _new_Index(cls, d):
+ """
+ This is called upon unpickling, rather than the default which doesn't
+ have arguments and breaks __new__.
+ """
+ # required for backward compat, because PI can't be instantiated with
+ # ordinals through __new__ GH #13277
+ if issubclass(cls, ABCPeriodIndex):
+ from pandas.core.indexes.period import _new_PeriodIndex
+ return _new_PeriodIndex(cls, **d)
+ return cls.__new__(cls, **d)
+
+
+class Index(IndexOpsMixin, PandasObject):
+ """
+ Immutable ndarray implementing an ordered, sliceable set. The basic object
+ storing axis labels for all pandas objects.
+
+ Parameters
+ ----------
+ data : array-like (1-dimensional)
+ dtype : NumPy dtype (default: object)
+ If dtype is None, we find the dtype that best fits the data.
+ If an actual dtype is provided, we coerce to that dtype if it's safe.
+ Otherwise, an error will be raised.
+ copy : bool
+ Make a copy of input ndarray
+ name : object
+ Name to be stored in the index
+ tupleize_cols : bool (default: True)
+ When True, attempt to create a MultiIndex if possible
+
+ See Also
+    --------
+ RangeIndex : Index implementing a monotonic integer range.
+ CategoricalIndex : Index of :class:`Categorical` s.
+ MultiIndex : A multi-level, or hierarchical, Index.
+ IntervalIndex : An Index of :class:`Interval` s.
+ DatetimeIndex, TimedeltaIndex, PeriodIndex
+ Int64Index, UInt64Index, Float64Index
+
+ Notes
+ -----
+    An Index instance can **only** contain hashable objects.
+
+ Examples
+ --------
+ >>> pd.Index([1, 2, 3])
+ Int64Index([1, 2, 3], dtype='int64')
+
+ >>> pd.Index(list('abc'))
+ Index(['a', 'b', 'c'], dtype='object')
+ """
+ # tolist is not actually deprecated, just suppressed in the __dir__
+ _deprecations = DirNamesMixin._deprecations | frozenset(['tolist'])
+
+ # To hand over control to subclasses
+ _join_precedence = 1
+
+ # Cython methods; see github.com/cython/cython/issues/2647
+ # for why we need to wrap these instead of making them class attributes
+ # Moreover, cython will choose the appropriate-dtyped sub-function
+ # given the dtypes of the passed arguments
+ def _left_indexer_unique(self, left, right):
+ return libjoin.left_join_indexer_unique(left, right)
+
+ def _left_indexer(self, left, right):
+ return libjoin.left_join_indexer(left, right)
+
+ def _inner_indexer(self, left, right):
+ return libjoin.inner_join_indexer(left, right)
+
+ def _outer_indexer(self, left, right):
+ return libjoin.outer_join_indexer(left, right)
+
+ _typ = 'index'
+ _data = None
+ _id = None
+ name = None
+ asi8 = None
+ _comparables = ['name']
+ _attributes = ['name']
+ _is_numeric_dtype = False
+ _can_hold_na = True
+
+ # would we like our indexing holder to defer to us
+ _defer_to_indexing = False
+
+ # prioritize current class for _shallow_copy_with_infer,
+ # used to infer integers as datetime-likes
+ _infer_as_myclass = False
+
+ _engine_type = libindex.ObjectEngine
+
+ _accessors = {'str'}
+
+ str = CachedAccessor("str", StringMethods)
+
+ # --------------------------------------------------------------------
+ # Constructors
+
+ def __new__(cls, data=None, dtype=None, copy=False, name=None,
+ fastpath=None, tupleize_cols=True, **kwargs):
+
+ if name is None and hasattr(data, 'name'):
+ name = data.name
+
+ if fastpath is not None:
+ warnings.warn("The 'fastpath' keyword is deprecated, and will be "
+ "removed in a future version.",
+ FutureWarning, stacklevel=2)
+ if fastpath:
+ return cls._simple_new(data, name)
+
+ from .range import RangeIndex
+ if isinstance(data, ABCPandasArray):
+ # ensure users don't accidentally put a PandasArray in an index.
+ data = data.to_numpy()
+
+ # range
+ if isinstance(data, RangeIndex):
+ return RangeIndex(start=data, copy=copy, dtype=dtype, name=name)
+ elif isinstance(data, range):
+ return RangeIndex.from_range(data, copy=copy, dtype=dtype,
+ name=name)
+
+ # categorical
+ elif is_categorical_dtype(data) or is_categorical_dtype(dtype):
+ from .category import CategoricalIndex
+ return CategoricalIndex(data, dtype=dtype, copy=copy, name=name,
+ **kwargs)
+
+ # interval
+ elif ((is_interval_dtype(data) or is_interval_dtype(dtype)) and
+ not is_object_dtype(dtype)):
+ from .interval import IntervalIndex
+ closed = kwargs.get('closed', None)
+ return IntervalIndex(data, dtype=dtype, name=name, copy=copy,
+ closed=closed)
+
+ elif (is_datetime64_any_dtype(data) or
+ (dtype is not None and is_datetime64_any_dtype(dtype)) or
+ 'tz' in kwargs):
+ from pandas import DatetimeIndex
+
+ if dtype is not None and is_dtype_equal(_o_dtype, dtype):
+ # GH#23524 passing `dtype=object` to DatetimeIndex is invalid,
+                # will raise in the case where `data` is already tz-aware. So
+ # we leave it out of this step and cast to object-dtype after
+ # the DatetimeIndex construction.
+ # Note we can pass copy=False because the .astype below
+ # will always make a copy
+ result = DatetimeIndex(data, copy=False, name=name, **kwargs)
+ return result.astype(object)
+ else:
+ result = DatetimeIndex(data, copy=copy, name=name,
+ dtype=dtype, **kwargs)
+ return result
+
+ elif (is_timedelta64_dtype(data) or
+ (dtype is not None and is_timedelta64_dtype(dtype))):
+ from pandas import TimedeltaIndex
+ if dtype is not None and is_dtype_equal(_o_dtype, dtype):
+ # Note we can pass copy=False because the .astype below
+ # will always make a copy
+ result = TimedeltaIndex(data, copy=False, name=name, **kwargs)
+ return result.astype(object)
+ else:
+ result = TimedeltaIndex(data, copy=copy, name=name,
+ dtype=dtype, **kwargs)
+ return result
+
+ elif is_period_dtype(data) and not is_object_dtype(dtype):
+ from pandas import PeriodIndex
+ result = PeriodIndex(data, copy=copy, name=name, **kwargs)
+ return result
+
+ # extension dtype
+ elif is_extension_array_dtype(data) or is_extension_array_dtype(dtype):
+ data = np.asarray(data)
+ if not (dtype is None or is_object_dtype(dtype)):
+
+ # coerce to the provided dtype
+ data = dtype.construct_array_type()._from_sequence(
+ data, dtype=dtype, copy=False)
+
+ # coerce to the object dtype
+ data = data.astype(object)
+ return Index(data, dtype=object, copy=copy, name=name,
+ **kwargs)
+
+ # index-like
+ elif isinstance(data, (np.ndarray, Index, ABCSeries)):
+ if dtype is not None:
+ try:
+
+ # we need to avoid having numpy coerce
+ # things that look like ints/floats to ints unless
+ # they are actually ints, e.g. '0' and 0.0
+ # should not be coerced
+ # GH 11836
+ if is_integer_dtype(dtype):
+ inferred = lib.infer_dtype(data, skipna=False)
+ if inferred == 'integer':
+ data = maybe_cast_to_integer_array(data, dtype,
+ copy=copy)
+ elif inferred in ['floating', 'mixed-integer-float']:
+ if isna(data).any():
+ raise ValueError('cannot convert float '
+ 'NaN to integer')
+
+ if inferred == "mixed-integer-float":
+ data = maybe_cast_to_integer_array(data, dtype)
+
+ # If we are actually all equal to integers,
+ # then coerce to integer.
+ try:
+ return cls._try_convert_to_int_index(
+ data, copy, name, dtype)
+ except ValueError:
+ pass
+
+ # Return an actual float index.
+ from .numeric import Float64Index
+ return Float64Index(data, copy=copy, dtype=dtype,
+ name=name)
+
+ elif inferred == 'string':
+ pass
+ else:
+ data = data.astype(dtype)
+ elif is_float_dtype(dtype):
+ inferred = lib.infer_dtype(data, skipna=False)
+ if inferred == 'string':
+ pass
+ else:
+ data = data.astype(dtype)
+ else:
+ data = np.array(data, dtype=dtype, copy=copy)
+
+ except (TypeError, ValueError) as e:
+ msg = str(e)
+ if ("cannot convert float" in msg or
+ "Trying to coerce float values to integer" in msg):
+ raise
+
+ # maybe coerce to a sub-class
+ from pandas.core.indexes.period import (
+ PeriodIndex, IncompatibleFrequency)
+
+ if is_signed_integer_dtype(data.dtype):
+ from .numeric import Int64Index
+ return Int64Index(data, copy=copy, dtype=dtype, name=name)
+ elif is_unsigned_integer_dtype(data.dtype):
+ from .numeric import UInt64Index
+ return UInt64Index(data, copy=copy, dtype=dtype, name=name)
+ elif is_float_dtype(data.dtype):
+ from .numeric import Float64Index
+ return Float64Index(data, copy=copy, dtype=dtype, name=name)
+ elif issubclass(data.dtype.type, np.bool) or is_bool_dtype(data):
+ subarr = data.astype('object')
+ else:
+ subarr = com.asarray_tuplesafe(data, dtype=object)
+
+ # asarray_tuplesafe does not always copy underlying data,
+ # so need to make sure that this happens
+ if copy:
+ subarr = subarr.copy()
+
+ if dtype is None:
+ inferred = lib.infer_dtype(subarr, skipna=False)
+ if inferred == 'integer':
+ try:
+ return cls._try_convert_to_int_index(
+ subarr, copy, name, dtype)
+ except ValueError:
+ pass
+
+ return Index(subarr, copy=copy,
+ dtype=object, name=name)
+ elif inferred in ['floating', 'mixed-integer-float']:
+ from .numeric import Float64Index
+ return Float64Index(subarr, copy=copy, name=name)
+ elif inferred == 'interval':
+ from .interval import IntervalIndex
+ return IntervalIndex(subarr, name=name, copy=copy)
+ elif inferred == 'boolean':
+ # don't support boolean explicitly ATM
+ pass
+ elif inferred != 'string':
+ if inferred.startswith('datetime'):
+ if (lib.is_datetime_with_singletz_array(subarr) or
+ 'tz' in kwargs):
+ # only when subarr has the same tz
+ from pandas import DatetimeIndex
+ try:
+ return DatetimeIndex(subarr, copy=copy,
+ name=name, **kwargs)
+ except tslibs.OutOfBoundsDatetime:
+ pass
+
+ elif inferred.startswith('timedelta'):
+ from pandas import TimedeltaIndex
+ return TimedeltaIndex(subarr, copy=copy, name=name,
+ **kwargs)
+ elif inferred == 'period':
+ try:
+ return PeriodIndex(subarr, name=name, **kwargs)
+ except IncompatibleFrequency:
+ pass
+ return cls._simple_new(subarr, name)
+
+ elif hasattr(data, '__array__'):
+ return Index(np.asarray(data), dtype=dtype, copy=copy, name=name,
+ **kwargs)
+ elif data is None or is_scalar(data):
+ cls._scalar_data_error(data)
+ else:
+ if tupleize_cols and is_list_like(data):
+ # GH21470: convert iterable to list before determining if empty
+ if is_iterator(data):
+ data = list(data)
+
+ if data and all(isinstance(e, tuple) for e in data):
+ # we must be all tuples, otherwise don't construct
+ # 10697
+ from .multi import MultiIndex
+ return MultiIndex.from_tuples(
+ data, names=name or kwargs.get('names'))
+ # other iterable of some kind
+ subarr = com.asarray_tuplesafe(data, dtype=object)
+ return Index(subarr, dtype=dtype, copy=copy, name=name, **kwargs)
+
+ """
+ NOTE for new Index creation:
+
+ - _simple_new: It returns new Index with the same type as the caller.
+ All metadata (such as name) must be provided by caller's responsibility.
+ Using _shallow_copy is recommended because it fills these metadata
+ otherwise specified.
+
+ - _shallow_copy: It returns new Index with the same type (using
+ _simple_new), but fills caller's metadata otherwise specified. Passed
+ kwargs will overwrite corresponding metadata.
+
+ - _shallow_copy_with_infer: It returns new Index inferring its type
+ from passed values. It fills caller's metadata otherwise specified as the
+ same as _shallow_copy.
+
+ See each method's docstring.
+ """
+
+ @classmethod
+ def _simple_new(cls, values, name=None, dtype=None, **kwargs):
+ """
+        We require that the values have a compatible dtype. If we are passed
+        values with an incompatible dtype, coerce using the constructor.
+
+ Must be careful not to recurse.
+ """
+ if not hasattr(values, 'dtype'):
+ if (values is None or not len(values)) and dtype is not None:
+ values = np.empty(0, dtype=dtype)
+ else:
+ values = np.array(values, copy=False)
+ if is_object_dtype(values):
+ values = cls(values, name=name, dtype=dtype,
+ **kwargs)._ndarray_values
+
+ if isinstance(values, (ABCSeries, ABCIndexClass)):
+ # Index._data must always be an ndarray.
+ # This is no-copy for when _values is an ndarray,
+ # which should be always at this point.
+ values = np.asarray(values._values)
+
+ result = object.__new__(cls)
+ result._data = values
+ # _index_data is a (temporary?) fix to ensure that the direct data
+ # manipulation we do in `_libs/reduction.pyx` continues to work.
+ # We need access to the actual ndarray, since we're messing with
+ # data buffers and strides. We don't re-use `_ndarray_values`, since
+ # we actually set this value too.
+ result._index_data = values
+ result.name = name
+ for k, v in compat.iteritems(kwargs):
+ setattr(result, k, v)
+ return result._reset_identity()
+
+ @cache_readonly
+ def _constructor(self):
+ return type(self)
+
+ # --------------------------------------------------------------------
+ # Index Internals Methods
+
+ def _get_attributes_dict(self):
+ """
+ Return an attributes dict for my class.
+ """
+ return {k: getattr(self, k, None) for k in self._attributes}
+
+ _index_shared_docs['_shallow_copy'] = """
+        Create a new Index with the same class as the caller; don't copy the
+        data, and use the same object attributes, with passed-in attributes
+        taking precedence.
+
+ *this is an internal non-public method*
+
+ Parameters
+ ----------
+ values : the values to create the new Index, optional
+ kwargs : updates the default attributes for this Index
+ """
+
+ @Appender(_index_shared_docs['_shallow_copy'])
+ def _shallow_copy(self, values=None, **kwargs):
+ if values is None:
+ values = self.values
+ attributes = self._get_attributes_dict()
+ attributes.update(kwargs)
+ if not len(values) and 'dtype' not in kwargs:
+ attributes['dtype'] = self.dtype
+
+        # _simple_new expects values of the same type as self._data
+ values = getattr(values, '_values', values)
+ if isinstance(values, ABCDatetimeArray):
+ # `self.values` returns `self` for tz-aware, so we need to unwrap
+ # more specifically
+ values = values.asi8
+
+ return self._simple_new(values, **attributes)
+
+ def _shallow_copy_with_infer(self, values, **kwargs):
+ """
+        Create a new Index, inferring the class from the passed values; don't
+        copy the data, and use the same object attributes, with passed-in
+        attributes taking precedence.
+
+ *this is an internal non-public method*
+
+ Parameters
+ ----------
+ values : the values to create the new Index, optional
+ kwargs : updates the default attributes for this Index
+ """
+ attributes = self._get_attributes_dict()
+ attributes.update(kwargs)
+ attributes['copy'] = False
+ if not len(values) and 'dtype' not in kwargs:
+ attributes['dtype'] = self.dtype
+ if self._infer_as_myclass:
+ try:
+ return self._constructor(values, **attributes)
+ except (TypeError, ValueError):
+ pass
+ return Index(values, **attributes)
+
+ def _update_inplace(self, result, **kwargs):
+ # guard when called from IndexOpsMixin
+ raise TypeError("Index can't be updated inplace")
+
+ def is_(self, other):
+ """
+ More flexible, faster check like ``is`` but that works through views.
+
+ Note: this is *not* the same as ``Index.identical()``, which checks
+ that metadata is also the same.
+
+ Parameters
+ ----------
+ other : object
+ other object to compare against.
+
+ Returns
+ -------
+ True if both have same underlying data, False otherwise : bool
+ """
+ # use something other than None to be clearer
+ return self._id is getattr(
+ other, '_id', Ellipsis) and self._id is not None
+
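+    # A minimal illustrative sketch of ``is_`` (the values are assumed
+    # examples): a view shares identity with the original, a rebuilt Index
+    # does not.
+    #
+    # >>> idx = pd.Index(['a', 'b', 'c'])
+    # >>> idx.is_(idx.view())
+    # True
+    # >>> idx.is_(pd.Index(['a', 'b', 'c']))
+    # False
+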
+ def _reset_identity(self):
+ """
+ Initializes or resets ``_id`` attribute with new object.
+ """
+ self._id = _Identity()
+ return self
+
+ def _cleanup(self):
+ self._engine.clear_mapping()
+
+ @cache_readonly
+ def _engine(self):
+ # property, for now, slow to look up
+ return self._engine_type(lambda: self._ndarray_values, len(self))
+
+ # --------------------------------------------------------------------
+ # Array-Like Methods
+
+ # ndarray compat
+ def __len__(self):
+ """
+ Return the length of the Index.
+ """
+ return len(self._data)
+
+ def __array__(self, dtype=None):
+ """
+ The array interface, return my values.
+ """
+ return np.asarray(self._data, dtype=dtype)
+
+ def __array_wrap__(self, result, context=None):
+ """
+ Gets called after a ufunc.
+ """
+ result = lib.item_from_zerodim(result)
+ if is_bool_dtype(result) or lib.is_scalar(result):
+ return result
+
+ attrs = self._get_attributes_dict()
+ attrs = self._maybe_update_attributes(attrs)
+ return Index(result, **attrs)
+
+ @cache_readonly
+ def dtype(self):
+ """
+ Return the dtype object of the underlying data.
+ """
+ return self._data.dtype
+
+ @cache_readonly
+ def dtype_str(self):
+ """
+ Return the dtype str of the underlying data.
+ """
+ return str(self.dtype)
+
+ def ravel(self, order='C'):
+ """
+ Return an ndarray of the flattened values of the underlying data.
+
+ See Also
+ --------
+ numpy.ndarray.ravel
+ """
+ return self._ndarray_values.ravel(order=order)
+
+ def view(self, cls=None):
+
+ # we need to see if we are subclassing an
+ # index type here
+ if cls is not None and not hasattr(cls, '_typ'):
+ result = self._data.view(cls)
+ else:
+ result = self._shallow_copy()
+ if isinstance(result, Index):
+ result._id = self._id
+ return result
+
+ _index_shared_docs['astype'] = """
+        Create an Index with values cast to dtypes. The class of a new Index
+        is determined by dtype. When conversion is impossible, a TypeError
+        exception is raised.
+
+ Parameters
+ ----------
+ dtype : numpy dtype or pandas type
+ Note that any signed integer `dtype` is treated as ``'int64'``,
+ and any unsigned integer `dtype` is treated as ``'uint64'``,
+ regardless of the size.
+ copy : bool, default True
+ By default, astype always returns a newly allocated object.
+ If copy is set to False and internal requirements on dtype are
+ satisfied, the original data is used to create a new Index
+ or the original Index is returned.
+
+ .. versionadded:: 0.19.0
+ """
+
+ @Appender(_index_shared_docs['astype'])
+ def astype(self, dtype, copy=True):
+ if is_dtype_equal(self.dtype, dtype):
+ return self.copy() if copy else self
+
+ elif is_categorical_dtype(dtype):
+ from .category import CategoricalIndex
+ return CategoricalIndex(self.values, name=self.name, dtype=dtype,
+ copy=copy)
+ elif is_datetime64tz_dtype(dtype):
+ # TODO(GH-24559): Remove this block, use the following elif.
+ # avoid FutureWarning from DatetimeIndex constructor.
+ from pandas import DatetimeIndex
+ tz = pandas_dtype(dtype).tz
+ return (DatetimeIndex(np.asarray(self))
+ .tz_localize("UTC").tz_convert(tz))
+
+ elif is_extension_array_dtype(dtype):
+ return Index(np.asarray(self), dtype=dtype, copy=copy)
+
+ try:
+ if is_datetime64tz_dtype(dtype):
+ from pandas import DatetimeIndex
+ return DatetimeIndex(self.values, name=self.name, dtype=dtype,
+ copy=copy)
+ return Index(self.values.astype(dtype, copy=copy), name=self.name,
+ dtype=dtype)
+ except (TypeError, ValueError):
+ msg = 'Cannot cast {name} to dtype {dtype}'
+ raise TypeError(msg.format(name=type(self).__name__, dtype=dtype))
+
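+    # A minimal illustrative sketch of ``astype`` (the values are assumed
+    # examples):
+    #
+    # >>> pd.Index([1, 2, 3]).astype('float64')
+    # Float64Index([1.0, 2.0, 3.0], dtype='float64')
+    # >>> pd.Index(['a', 'b']).astype('int64')
+    # Traceback (most recent call last):
+    # ...
+    # TypeError: Cannot cast Index to dtype int64
+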
+ _index_shared_docs['take'] = """
+ Return a new %(klass)s of the values selected by the indices.
+
+ For internal compatibility with numpy arrays.
+
+ Parameters
+ ----------
+ indices : list
+ Indices to be taken
+ axis : int, optional
+ The axis over which to select values, always 0.
+ allow_fill : bool, default True
+        fill_value : scalar, default None
+            If allow_fill=True and fill_value is not None, indices specified by
+            -1 are regarded as NA. If the Index doesn't hold NA, raise
+            ValueError.
+
+ See Also
+ --------
+ numpy.ndarray.take
+ """
+
+ @Appender(_index_shared_docs['take'] % _index_doc_kwargs)
+ def take(self, indices, axis=0, allow_fill=True,
+ fill_value=None, **kwargs):
+ if kwargs:
+ nv.validate_take(tuple(), kwargs)
+ indices = ensure_platform_int(indices)
+ if self._can_hold_na:
+ taken = self._assert_take_fillable(self.values, indices,
+ allow_fill=allow_fill,
+ fill_value=fill_value,
+ na_value=self._na_value)
+ else:
+ if allow_fill and fill_value is not None:
+ msg = 'Unable to fill values because {0} cannot contain NA'
+ raise ValueError(msg.format(self.__class__.__name__))
+ taken = self.values.take(indices)
+ return self._shallow_copy(taken)
+
+ def _assert_take_fillable(self, values, indices, allow_fill=True,
+ fill_value=None, na_value=np.nan):
+ """
+ Internal method to handle NA filling of take.
+ """
+ indices = ensure_platform_int(indices)
+
+ # only fill if we are passing a non-None fill_value
+ if allow_fill and fill_value is not None:
+ if (indices < -1).any():
+ msg = ('When allow_fill=True and fill_value is not None, '
+ 'all indices must be >= -1')
+ raise ValueError(msg)
+ taken = algos.take(values,
+ indices,
+ allow_fill=allow_fill,
+ fill_value=na_value)
+ else:
+ taken = values.take(indices)
+ return taken
+
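+    # A minimal illustrative sketch of the fill behaviour in ``take`` (the
+    # values are assumed examples): with a fill_value, ``-1`` marks missing
+    # entries instead of indexing from the end.
+    #
+    # >>> idx = pd.Index(['a', 'b', 'c'])
+    # >>> idx.take([0, -1])
+    # Index(['a', 'c'], dtype='object')
+    # >>> idx.take([0, -1], fill_value=np.nan)
+    # Index(['a', nan], dtype='object')
+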
+ _index_shared_docs['repeat'] = """
+ Repeat elements of a %(klass)s.
+
+ Returns a new %(klass)s where each element of the current %(klass)s
+ is repeated consecutively a given number of times.
+
+ Parameters
+ ----------
+ repeats : int or array of ints
+ The number of repetitions for each element. This should be a
+ non-negative integer. Repeating 0 times will return an empty
+ %(klass)s.
+ axis : None
+ Must be ``None``. Has no effect but is accepted for compatibility
+ with numpy.
+
+ Returns
+ -------
+ repeated_index : %(klass)s
+ Newly created %(klass)s with repeated elements.
+
+ See Also
+ --------
+ Series.repeat : Equivalent function for Series.
+ numpy.repeat : Similar method for :class:`numpy.ndarray`.
+
+ Examples
+ --------
+ >>> idx = pd.Index(['a', 'b', 'c'])
+ >>> idx
+ Index(['a', 'b', 'c'], dtype='object')
+ >>> idx.repeat(2)
+ Index(['a', 'a', 'b', 'b', 'c', 'c'], dtype='object')
+ >>> idx.repeat([1, 2, 3])
+ Index(['a', 'b', 'b', 'c', 'c', 'c'], dtype='object')
+ """
+
+ @Appender(_index_shared_docs['repeat'] % _index_doc_kwargs)
+ def repeat(self, repeats, axis=None):
+ nv.validate_repeat(tuple(), dict(axis=axis))
+ return self._shallow_copy(self._values.repeat(repeats))
+
+ # --------------------------------------------------------------------
+ # Copying Methods
+
+ _index_shared_docs['copy'] = """
+        Make a copy of this object. Name and dtype set those attributes on
+ the new object.
+
+ Parameters
+ ----------
+ name : string, optional
+ deep : boolean, default False
+ dtype : numpy dtype or pandas type
+
+ Returns
+ -------
+ copy : Index
+
+ Notes
+ -----
+ In most cases, there should be no functional difference from using
+ ``deep``, but if ``deep`` is passed it will attempt to deepcopy.
+ """
+
+ @Appender(_index_shared_docs['copy'])
+ def copy(self, name=None, deep=False, dtype=None, **kwargs):
+ if deep:
+ new_index = self._shallow_copy(self._data.copy())
+ else:
+ new_index = self._shallow_copy()
+
+ names = kwargs.get('names')
+ names = self._validate_names(name=name, names=names, deep=deep)
+ new_index = new_index.set_names(names)
+
+ if dtype:
+ new_index = new_index.astype(dtype)
+ return new_index
+
+ def __copy__(self, **kwargs):
+ return self.copy(**kwargs)
+
+ def __deepcopy__(self, memo=None):
+ """
+ Parameters
+ ----------
+ memo, default None
+ Standard signature. Unused
+ """
+ if memo is None:
+ memo = {}
+ return self.copy(deep=True)
+
+ # --------------------------------------------------------------------
+ # Rendering Methods
+
+ def __unicode__(self):
+ """
+ Return a string representation for this object.
+
+ Invoked by unicode(df) in py2 only. Yields a Unicode String in both
+ py2/py3.
+ """
+ klass = self.__class__.__name__
+ data = self._format_data()
+ attrs = self._format_attrs()
+ space = self._format_space()
+
+ prepr = (u(",%s") %
+ space).join(u("%s=%s") % (k, v) for k, v in attrs)
+
+ # no data provided, just attributes
+ if data is None:
+ data = ''
+
+ res = u("%s(%s%s)") % (klass, data, prepr)
+
+ return res
+
+ def _format_space(self):
+
+ # using space here controls if the attributes
+ # are line separated or not (the default)
+
+ # max_seq_items = get_option('display.max_seq_items')
+ # if len(self) > max_seq_items:
+ # space = "\n%s" % (' ' * (len(klass) + 1))
+ return " "
+
+ @property
+ def _formatter_func(self):
+ """
+ Return the formatter function.
+ """
+ return default_pprint
+
+ def _format_data(self, name=None):
+ """
+ Return the formatted data as a unicode string.
+ """
+
+ # do we want to justify (only do so for non-objects)
+ is_justify = not (self.inferred_type in ('string', 'unicode') or
+ (self.inferred_type == 'categorical' and
+ is_object_dtype(self.categories)))
+
+ return format_object_summary(self, self._formatter_func,
+ is_justify=is_justify, name=name)
+
+ def _format_attrs(self):
+ """
+ Return a list of tuples of the (attr,formatted_value).
+ """
+ return format_object_attrs(self)
+
+ def _mpl_repr(self):
+ # how to represent ourselves to matplotlib
+ return self.values
+
+ def format(self, name=False, formatter=None, **kwargs):
+ """
+ Render a string representation of the Index.
+ """
+ header = []
+ if name:
+ header.append(pprint_thing(self.name,
+ escape_chars=('\t', '\r', '\n')) if
+ self.name is not None else '')
+
+ if formatter is not None:
+ return header + list(self.map(formatter))
+
+ return self._format_with_header(header, **kwargs)
+
+ def _format_with_header(self, header, na_rep='NaN', **kwargs):
+ values = self.values
+
+ from pandas.io.formats.format import format_array
+
+ if is_categorical_dtype(values.dtype):
+ values = np.array(values)
+
+ elif is_object_dtype(values.dtype):
+ values = lib.maybe_convert_objects(values, safe=1)
+
+ if is_object_dtype(values.dtype):
+ result = [pprint_thing(x, escape_chars=('\t', '\r', '\n'))
+ for x in values]
+
+ # could have nans
+ mask = isna(values)
+ if mask.any():
+ result = np.array(result)
+ result[mask] = na_rep
+ result = result.tolist()
+
+ else:
+ result = _trim_front(format_array(values, None, justify='left'))
+ return header + result
+
+ def to_native_types(self, slicer=None, **kwargs):
+ """
+ Format specified values of `self` and return them.
+
+ Parameters
+ ----------
+ slicer : int, array-like
+ An indexer into `self` that specifies which values
+ are used in the formatting process.
+ kwargs : dict
+ Options for specifying how the values should be formatted.
+ These options include the following:
+
+ 1) na_rep : str
+ The value that serves as a placeholder for NULL values
+ 2) quoting : bool or None
+ Whether or not there are quoted values in `self`
+ 3) date_format : str
+ The format used to represent date-like values
+ """
+
+ values = self
+ if slicer is not None:
+ values = values[slicer]
+ return values._format_native_types(**kwargs)
+
+ def _format_native_types(self, na_rep='', quoting=None, **kwargs):
+ """
+ Actually format specific types of the index.
+ """
+ mask = isna(self)
+ if not self.is_object() and not quoting:
+ values = np.asarray(self).astype(str)
+ else:
+ values = np.array(self, dtype=object, copy=True)
+
+ values[mask] = na_rep
+ return values
+
+ def _summary(self, name=None):
+ """
+ Return a summarized representation.
+
+ Parameters
+ ----------
+ name : str
+ name to use in the summary representation
+
+ Returns
+ -------
+ String with a summarized representation of the index
+ """
+ if len(self) > 0:
+ head = self[0]
+ if (hasattr(head, 'format') and
+ not isinstance(head, compat.string_types)):
+ head = head.format()
+ tail = self[-1]
+ if (hasattr(tail, 'format') and
+ not isinstance(tail, compat.string_types)):
+ tail = tail.format()
+ index_summary = ', %s to %s' % (pprint_thing(head),
+ pprint_thing(tail))
+ else:
+ index_summary = ''
+
+ if name is None:
+ name = type(self).__name__
+ return '%s: %s entries%s' % (name, len(self), index_summary)
+
+ def summary(self, name=None):
+ """
+ Return a summarized representation.
+
+ .. deprecated:: 0.23.0
+ """
+ warnings.warn("'summary' is deprecated and will be removed in a "
+ "future version.", FutureWarning, stacklevel=2)
+ return self._summary(name)
+
+ # --------------------------------------------------------------------
+ # Conversion Methods
+
+ def to_flat_index(self):
+ """
+ Identity method.
+
+ .. versionadded:: 0.24.0
+
+        This is implemented for compatibility with subclass implementations
+ when chaining.
+
+ Returns
+ -------
+ pd.Index
+ Caller.
+
+ See Also
+ --------
+ MultiIndex.to_flat_index : Subclass implementation.
+ """
+ return self
+
+ def to_series(self, index=None, name=None):
+ """
+        Create a Series with both index and values equal to the index keys,
+        useful with map for returning an indexer based on an index.
+
+ Parameters
+ ----------
+ index : Index, optional
+ index of resulting Series. If None, defaults to original index
+ name : string, optional
+ name of resulting Series. If None, defaults to name of original
+ index
+
+ Returns
+ -------
+ Series : dtype will be based on the type of the Index values.
+ """
+
+ from pandas import Series
+
+ if index is None:
+ index = self._shallow_copy()
+ if name is None:
+ name = self.name
+
+ return Series(self.values.copy(), index=index, name=name)
+
+ def to_frame(self, index=True, name=None):
+ """
+ Create a DataFrame with a column containing the Index.
+
+ .. versionadded:: 0.24.0
+
+ Parameters
+ ----------
+ index : boolean, default True
+ Set the index of the returned DataFrame as the original Index.
+
+ name : object, default None
+ The passed name should substitute for the index name (if it has
+ one).
+
+ Returns
+ -------
+ DataFrame
+ DataFrame containing the original Index data.
+
+ See Also
+ --------
+ Index.to_series : Convert an Index to a Series.
+ Series.to_frame : Convert Series to DataFrame.
+
+ Examples
+ --------
+ >>> idx = pd.Index(['Ant', 'Bear', 'Cow'], name='animal')
+ >>> idx.to_frame()
+ animal
+ animal
+ Ant Ant
+ Bear Bear
+ Cow Cow
+
+ By default, the original Index is reused. To enforce a new Index:
+
+ >>> idx.to_frame(index=False)
+ animal
+ 0 Ant
+ 1 Bear
+ 2 Cow
+
+ To override the name of the resulting column, specify `name`:
+
+ >>> idx.to_frame(index=False, name='zoo')
+ zoo
+ 0 Ant
+ 1 Bear
+ 2 Cow
+ """
+
+ from pandas import DataFrame
+ if name is None:
+ name = self.name or 0
+ result = DataFrame({name: self.values.copy()})
+
+ if index:
+ result.index = self
+ return result
+
+ # --------------------------------------------------------------------
+ # Name-Centric Methods
+
+ def _validate_names(self, name=None, names=None, deep=False):
+ """
+ Handles the quirks of having a singular 'name' parameter for general
+ Index and plural 'names' parameter for MultiIndex.
+ """
+ from copy import deepcopy
+ if names is not None and name is not None:
+ raise TypeError("Can only provide one of `names` and `name`")
+ elif names is None and name is None:
+ return deepcopy(self.names) if deep else self.names
+ elif names is not None:
+ if not is_list_like(names):
+ raise TypeError("Must pass list-like as `names`.")
+ return names
+ else:
+ if not is_list_like(name):
+ return [name]
+ return name
+
+ def _get_names(self):
+ return FrozenList((self.name, ))
+
+ def _set_names(self, values, level=None):
+ """
+ Set new names on index. Each name has to be a hashable type.
+
+ Parameters
+ ----------
+ values : str or sequence
+ name(s) to set
+ level : int, level name, or sequence of int/level names (default None)
+ If the index is a MultiIndex (hierarchical), level(s) to set (None
+ for all levels). Otherwise level must be None
+
+ Raises
+ ------
+ TypeError if each name is not hashable.
+ """
+ if not is_list_like(values):
+ raise ValueError('Names must be a list-like')
+ if len(values) != 1:
+ raise ValueError('Length of new names must be 1, got %d' %
+ len(values))
+
+ # GH 20527
+ # All items in 'name' need to be hashable:
+ for name in values:
+ if not is_hashable(name):
+ raise TypeError('{}.name must be a hashable type'
+ .format(self.__class__.__name__))
+ self.name = values[0]
+
+ names = property(fset=_set_names, fget=_get_names)
+
+ def set_names(self, names, level=None, inplace=False):
+ """
+ Set Index or MultiIndex name.
+
+ Able to set new names partially and by level.
+
+ Parameters
+ ----------
+ names : label or list of label
+ Name(s) to set.
+ level : int, label or list of int or label, optional
+ If the index is a MultiIndex, level(s) to set (None for all
+ levels). Otherwise level must be None.
+ inplace : bool, default False
+ Modifies the object directly, instead of creating a new Index or
+ MultiIndex.
+
+ Returns
+ -------
+ Index
+ The same type as the caller or None if inplace is True.
+
+ See Also
+ --------
+ Index.rename : Able to set new names without level.
+
+ Examples
+ --------
+ >>> idx = pd.Index([1, 2, 3, 4])
+ >>> idx
+ Int64Index([1, 2, 3, 4], dtype='int64')
+ >>> idx.set_names('quarter')
+ Int64Index([1, 2, 3, 4], dtype='int64', name='quarter')
+
+ >>> idx = pd.MultiIndex.from_product([['python', 'cobra'],
+ ... [2018, 2019]])
+ >>> idx
+ MultiIndex(levels=[['cobra', 'python'], [2018, 2019]],
+ codes=[[1, 1, 0, 0], [0, 1, 0, 1]])
+ >>> idx.set_names(['kind', 'year'], inplace=True)
+ >>> idx
+ MultiIndex(levels=[['cobra', 'python'], [2018, 2019]],
+ codes=[[1, 1, 0, 0], [0, 1, 0, 1]],
+ names=['kind', 'year'])
+ >>> idx.set_names('species', level=0)
+ MultiIndex(levels=[['cobra', 'python'], [2018, 2019]],
+ codes=[[1, 1, 0, 0], [0, 1, 0, 1]],
+ names=['species', 'year'])
+ """
+
+ if level is not None and not isinstance(self, ABCMultiIndex):
+ raise ValueError('Level must be None for non-MultiIndex')
+
+ if level is not None and not is_list_like(level) and is_list_like(
+ names):
+ msg = "Names must be a string when a single level is provided."
+ raise TypeError(msg)
+
+ if not is_list_like(names) and level is None and self.nlevels > 1:
+ raise TypeError("Must pass list-like as `names`.")
+
+ if not is_list_like(names):
+ names = [names]
+ if level is not None and not is_list_like(level):
+ level = [level]
+
+ if inplace:
+ idx = self
+ else:
+ idx = self._shallow_copy()
+ idx._set_names(names, level=level)
+ if not inplace:
+ return idx
+
+ def rename(self, name, inplace=False):
+ """
+ Alter Index or MultiIndex name.
+
+ Able to set new names without level. Defaults to returning new index.
+ Length of names must match number of levels in MultiIndex.
+
+ Parameters
+ ----------
+ name : label or list of labels
+ Name(s) to set.
+ inplace : boolean, default False
+ Modifies the object directly, instead of creating a new Index or
+ MultiIndex.
+
+ Returns
+ -------
+ Index
+ The same type as the caller or None if inplace is True.
+
+ See Also
+ --------
+ Index.set_names : Able to set new names partially and by level.
+
+ Examples
+ --------
+ >>> idx = pd.Index(['A', 'C', 'A', 'B'], name='score')
+ >>> idx.rename('grade')
+ Index(['A', 'C', 'A', 'B'], dtype='object', name='grade')
+
+ >>> idx = pd.MultiIndex.from_product([['python', 'cobra'],
+ ... [2018, 2019]],
+ ... names=['kind', 'year'])
+ >>> idx
+ MultiIndex(levels=[['cobra', 'python'], [2018, 2019]],
+ codes=[[1, 1, 0, 0], [0, 1, 0, 1]],
+ names=['kind', 'year'])
+ >>> idx.rename(['species', 'year'])
+ MultiIndex(levels=[['cobra', 'python'], [2018, 2019]],
+ codes=[[1, 1, 0, 0], [0, 1, 0, 1]],
+ names=['species', 'year'])
+ >>> idx.rename('species')
+ Traceback (most recent call last):
+ TypeError: Must pass list-like as `names`.
+ """
+ return self.set_names([name], inplace=inplace)
+
+ # --------------------------------------------------------------------
+ # Level-Centric Methods
+
+ @property
+ def nlevels(self):
+ return 1
+
+ def _sort_levels_monotonic(self):
+ """
+ Compat with MultiIndex.
+ """
+ return self
+
+ def _validate_index_level(self, level):
+ """
+ Validate index level.
+
+        For a single-level Index, getting the level number is a no-op, but
+        some verification must be done, as in MultiIndex.
+
+ """
+ if isinstance(level, int):
+ if level < 0 and level != -1:
+ raise IndexError("Too many levels: Index has only 1 level,"
+ " %d is not a valid level number" % (level, ))
+ elif level > 0:
+ raise IndexError("Too many levels:"
+ " Index has only 1 level, not %d" %
+ (level + 1))
+ elif level != self.name:
+ raise KeyError('Level %s must be same as name (%s)' %
+ (level, self.name))
+
+ def _get_level_number(self, level):
+ self._validate_index_level(level)
+ return 0
+
+ def sortlevel(self, level=None, ascending=True, sort_remaining=None):
+ """
+        For internal compatibility with the Index API.
+
+ Sort the Index. This is for compat with MultiIndex
+
+ Parameters
+ ----------
+ ascending : boolean, default True
+ False to sort in descending order
+
+ level, sort_remaining are compat parameters
+
+ Returns
+ -------
+ sorted_index : Index
+ """
+ return self.sort_values(return_indexer=True, ascending=ascending)
+
+ def _get_level_values(self, level):
+ """
+ Return an Index of values for requested level.
+
+ This is primarily useful to get an individual level of values from a
+        MultiIndex, but is provided on Index as well for compatibility.
+
+ Parameters
+ ----------
+ level : int or str
+ It is either the integer position or the name of the level.
+
+ Returns
+ -------
+ values : Index
+ Calling object, as there is only one level in the Index.
+
+ See Also
+ --------
+ MultiIndex.get_level_values : Get values for a level of a MultiIndex.
+
+ Notes
+ -----
+ For Index, level should be 0, since there are no multiple levels.
+
+ Examples
+ --------
+
+ >>> idx = pd.Index(list('abc'))
+ >>> idx
+ Index(['a', 'b', 'c'], dtype='object')
+
+ Get level values by supplying `level` as integer:
+
+ >>> idx.get_level_values(0)
+ Index(['a', 'b', 'c'], dtype='object')
+ """
+ self._validate_index_level(level)
+ return self
+
+ get_level_values = _get_level_values
+
+ def droplevel(self, level=0):
+ """
+ Return index with requested level(s) removed.
+
+ If resulting index has only 1 level left, the result will be
+ of Index type, not MultiIndex.
+
+ .. versionadded:: 0.23.1 (support for non-MultiIndex)
+
+ Parameters
+ ----------
+ level : int, str, or list-like, default 0
+ If a string is given, must be the name of a level
+ If list-like, elements must be names or indexes of levels.
+
+ Returns
+ -------
+ index : Index or MultiIndex
+ """
+ if not isinstance(level, (tuple, list)):
+ level = [level]
+
+ levnums = sorted(self._get_level_number(lev) for lev in level)[::-1]
+
+ if len(level) == 0:
+ return self
+ if len(level) >= self.nlevels:
+ raise ValueError("Cannot remove {} levels from an index with {} "
+ "levels: at least one level must be "
+ "left.".format(len(level), self.nlevels))
+ # The two checks above guarantee that here self is a MultiIndex
+
+ new_levels = list(self.levels)
+ new_codes = list(self.codes)
+ new_names = list(self.names)
+
+ for i in levnums:
+ new_levels.pop(i)
+ new_codes.pop(i)
+ new_names.pop(i)
+
+ if len(new_levels) == 1:
+
+ # set nan if needed
+ mask = new_codes[0] == -1
+ result = new_levels[0].take(new_codes[0])
+ if mask.any():
+ result = result.putmask(mask, np.nan)
+
+ result.name = new_names[0]
+ return result
+ else:
+ from .multi import MultiIndex
+ return MultiIndex(levels=new_levels, codes=new_codes,
+ names=new_names, verify_integrity=False)
+
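+    # A minimal illustrative sketch of ``droplevel`` on a MultiIndex (the
+    # values are assumed examples):
+    #
+    # >>> mi = pd.MultiIndex.from_arrays([[1, 2], [3, 4], [5, 6]],
+    # ...                                names=['x', 'y', 'z'])
+    # >>> mi.droplevel()
+    # MultiIndex(levels=[[3, 4], [5, 6]],
+    #            codes=[[0, 1], [0, 1]],
+    #            names=['y', 'z'])
+    # >>> mi.droplevel(['x', 'y'])
+    # Int64Index([5, 6], dtype='int64', name='z')
+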
+ _index_shared_docs['_get_grouper_for_level'] = """
+ Get index grouper corresponding to an index level
+
+ Parameters
+ ----------
+    mapper : Group mapping function or None
+ Function mapping index values to groups
+ level : int or None
+ Index level
+
+ Returns
+ -------
+ grouper : Index
+ Index of values to group on
+ labels : ndarray of int or None
+ Array of locations in level_index
+ uniques : Index or None
+ Index of unique values for level
+ """
+
+ @Appender(_index_shared_docs['_get_grouper_for_level'])
+ def _get_grouper_for_level(self, mapper, level=None):
+ assert level is None or level == 0
+ if mapper is None:
+ grouper = self
+ else:
+ grouper = self.map(mapper)
+
+ return grouper, None, None
+
+ # --------------------------------------------------------------------
+ # Introspection Methods
+
+ @property
+ def is_monotonic(self):
+ """
+ Alias for is_monotonic_increasing.
+ """
+ return self.is_monotonic_increasing
+
+ @property
+ def is_monotonic_increasing(self):
+ """
+        Return True if the index holds monotonically increasing
+        (only equal or increasing) values, False otherwise.
+
+ Examples
+ --------
+ >>> Index([1, 2, 3]).is_monotonic_increasing
+ True
+ >>> Index([1, 2, 2]).is_monotonic_increasing
+ True
+ >>> Index([1, 3, 2]).is_monotonic_increasing
+ False
+ """
+ return self._engine.is_monotonic_increasing
+
+ @property
+ def is_monotonic_decreasing(self):
+ """
+        Return True if the index holds monotonically decreasing
+        (only equal or decreasing) values, False otherwise.
+
+ Examples
+ --------
+ >>> Index([3, 2, 1]).is_monotonic_decreasing
+ True
+ >>> Index([3, 2, 2]).is_monotonic_decreasing
+ True
+ >>> Index([3, 1, 2]).is_monotonic_decreasing
+ False
+ """
+ return self._engine.is_monotonic_decreasing
+
+ @property
+ def _is_strictly_monotonic_increasing(self):
+ """
+ Return True if the index is strictly monotonic increasing,
+ i.e. its values are strictly increasing.
+
+ Examples
+ --------
+ >>> Index([1, 2, 3])._is_strictly_monotonic_increasing
+ True
+ >>> Index([1, 2, 2])._is_strictly_monotonic_increasing
+ False
+ >>> Index([1, 3, 2])._is_strictly_monotonic_increasing
+ False
+ """
+ return self.is_unique and self.is_monotonic_increasing
+
+ @property
+ def _is_strictly_monotonic_decreasing(self):
+ """
+ Return True if the index is strictly monotonic decreasing,
+ i.e. its values are strictly decreasing.
+
+ Examples
+ --------
+ >>> Index([3, 2, 1])._is_strictly_monotonic_decreasing
+ True
+ >>> Index([3, 2, 2])._is_strictly_monotonic_decreasing
+ False
+ >>> Index([3, 1, 2])._is_strictly_monotonic_decreasing
+ False
+ """
+ return self.is_unique and self.is_monotonic_decreasing
+
+ def is_lexsorted_for_tuple(self, tup):
+ return True
+
+ @cache_readonly
+ def is_unique(self):
+ """
+ Return if the index has unique values.
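+
+ Examples
+ --------
+ A unique index versus one with a repeated value:
+
+ >>> Index([1, 2, 3]).is_unique
+ True
+ >>> Index([1, 2, 2]).is_unique
+ False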
+ """
+ return self._engine.is_unique
+
+ @property
+ def has_duplicates(self):
+ return not self.is_unique
+
+ def is_boolean(self):
+ return self.inferred_type in ['boolean']
+
+ def is_integer(self):
+ return self.inferred_type in ['integer']
+
+ def is_floating(self):
+ return self.inferred_type in ['floating', 'mixed-integer-float']
+
+ def is_numeric(self):
+ return self.inferred_type in ['integer', 'floating']
+
+ def is_object(self):
+ return is_object_dtype(self.dtype)
+
+ def is_categorical(self):
+ """
+ Check if the Index holds categorical data.
+
+ Returns
+ -------
+ boolean
+ True if the Index is categorical.
+
+ See Also
+ --------
+ CategoricalIndex : Index for categorical data.
+
+ Examples
+ --------
+ >>> idx = pd.Index(["Watermelon", "Orange", "Apple",
+ ... "Watermelon"]).astype("category")
+ >>> idx.is_categorical()
+ True
+
+ >>> idx = pd.Index([1, 3, 5, 7])
+ >>> idx.is_categorical()
+ False
+
+ >>> s = pd.Series(["Peter", "Victor", "Elisabeth", "Mar"])
+ >>> s
+ 0 Peter
+ 1 Victor
+ 2 Elisabeth
+ 3 Mar
+ dtype: object
+ >>> s.index.is_categorical()
+ False
+ """
+ return self.inferred_type in ['categorical']
+
+ def is_interval(self):
+ return self.inferred_type in ['interval']
+
+ def is_mixed(self):
+ return self.inferred_type in ['mixed']
+
+ def holds_integer(self):
+ return self.inferred_type in ['integer', 'mixed-integer']
+
+ @cache_readonly
+ def inferred_type(self):
+ """
+ Return a string of the type inferred from the values.
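+
+ Examples
+ --------
+ The inferred type name for two common cases:
+
+ >>> Index([1, 2, 3]).inferred_type
+ 'integer'
+ >>> Index([1.0, 2.0]).inferred_type
+ 'floating'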
+ """
+ return lib.infer_dtype(self, skipna=False)
+
+ @cache_readonly
+ def is_all_dates(self):
+ if self._data is None:
+ return False
+ return is_datetime_array(ensure_object(self.values))
+
+ # --------------------------------------------------------------------
+ # Pickle Methods
+
+ def __reduce__(self):
+ d = dict(data=self._data)
+ d.update(self._get_attributes_dict())
+ return _new_Index, (self.__class__, d), None
+
+ def __setstate__(self, state):
+ """
+ Necessary for making this object picklable.
+ """
+
+ if isinstance(state, dict):
+ self._data = state.pop('data')
+ for k, v in compat.iteritems(state):
+ setattr(self, k, v)
+
+ elif isinstance(state, tuple):
+
+ if len(state) == 2:
+ nd_state, own_state = state
+ data = np.empty(nd_state[1], dtype=nd_state[2])
+ np.ndarray.__setstate__(data, nd_state)
+ self.name = own_state[0]
+
+ else: # pragma: no cover
+ data = np.empty(state)
+ np.ndarray.__setstate__(data, state)
+
+ self._data = data
+ self._reset_identity()
+ else:
+ raise Exception("invalid pickle state")
+
+ _unpickle_compat = __setstate__
+
+ # --------------------------------------------------------------------
+ # Null Handling Methods
+
+ _na_value = np.nan
+ """The expected NA value to use with this index."""
+
+ @cache_readonly
+ def _isnan(self):
+ """
+ Return if each value is NaN.
+ """
+ if self._can_hold_na:
+ return isna(self)
+ else:
+ # shouldn't reach this condition if hasnans is checked beforehand
+ values = np.empty(len(self), dtype=np.bool_)
+ values.fill(False)
+ return values
+
+ @cache_readonly
+ def _nan_idxs(self):
+ if self._can_hold_na:
+ w, = self._isnan.nonzero()
+ return w
+ else:
+ return np.array([], dtype=np.int64)
+
+ @cache_readonly
+ def hasnans(self):
+ """
+ Return True if there are any NaNs; enables various performance speedups.
+ """
+ if self._can_hold_na:
+ return bool(self._isnan.any())
+ else:
+ return False
+
+ def isna(self):
+ """
+ Detect missing values.
+
+ Return a boolean same-sized object indicating if the values are NA.
+ NA values, such as ``None``, :attr:`numpy.NaN` or :attr:`pd.NaT`, get
+ mapped to ``True`` values.
+ Everything else gets mapped to ``False`` values. Characters such as
+ empty strings ``''`` or :attr:`numpy.inf` are not considered NA values
+ (unless you set ``pandas.options.mode.use_inf_as_na = True``).
+
+ .. versionadded:: 0.20.0
+
+ Returns
+ -------
+ numpy.ndarray
+ A boolean array indicating which entries are NA.
+
+ See Also
+ --------
+ pandas.Index.notna : Boolean inverse of isna.
+ pandas.Index.dropna : Omit entries with missing values.
+ pandas.isna : Top-level isna.
+ Series.isna : Detect missing values in Series object.
+
+ Examples
+ --------
+ Show which entries in a pandas.Index are NA. The result is an
+ array.
+
+ >>> idx = pd.Index([5.2, 6.0, np.NaN])
+ >>> idx
+ Float64Index([5.2, 6.0, nan], dtype='float64')
+ >>> idx.isna()
+ array([False, False, True], dtype=bool)
+
+ Empty strings are not considered NA values. None is considered an NA
+ value.
+
+ >>> idx = pd.Index(['black', '', 'red', None])
+ >>> idx
+ Index(['black', '', 'red', None], dtype='object')
+ >>> idx.isna()
+ array([False, False, False, True], dtype=bool)
+
+ For datetimes, `NaT` (Not a Time) is considered as an NA value.
+
+ >>> idx = pd.DatetimeIndex([pd.Timestamp('1940-04-25'),
+ ... pd.Timestamp(''), None, pd.NaT])
+ >>> idx
+ DatetimeIndex(['1940-04-25', 'NaT', 'NaT', 'NaT'],
+ dtype='datetime64[ns]', freq=None)
+ >>> idx.isna()
+ array([False, True, True, True], dtype=bool)
+ """
+ return self._isnan
+ isnull = isna
+
+ def notna(self):
+ """
+ Detect existing (non-missing) values.
+
+ Return a boolean same-sized object indicating if the values are not NA.
+ Non-missing values get mapped to ``True``. Characters such as empty
+ strings ``''`` or :attr:`numpy.inf` are not considered NA values
+ (unless you set ``pandas.options.mode.use_inf_as_na = True``).
+ NA values, such as None or :attr:`numpy.NaN`, get mapped to ``False``
+ values.
+
+ .. versionadded:: 0.20.0
+
+ Returns
+ -------
+ numpy.ndarray
+ Boolean array to indicate which entries are not NA.
+
+ See Also
+ --------
+ Index.notnull : Alias of notna.
+ Index.isna: Inverse of notna.
+ pandas.notna : Top-level notna.
+
+ Examples
+ --------
+ Show which entries in an Index are not NA. The result is an
+ array.
+
+ >>> idx = pd.Index([5.2, 6.0, np.NaN])
+ >>> idx
+ Float64Index([5.2, 6.0, nan], dtype='float64')
+ >>> idx.notna()
+ array([ True, True, False])
+
+ Empty strings are not considered NA values. None is considered an NA
+ value.
+
+ >>> idx = pd.Index(['black', '', 'red', None])
+ >>> idx
+ Index(['black', '', 'red', None], dtype='object')
+ >>> idx.notna()
+ array([ True, True, True, False])
+ """
+ return ~self.isna()
+ notnull = notna
+
+ _index_shared_docs['fillna'] = """
+ Fill NA/NaN values with the specified value
+
+ Parameters
+ ----------
+ value : scalar
+ Scalar value to use to fill holes (e.g. 0).
+ This value cannot be a list-like.
+ downcast : dict, default is None
+ A dict of item->dtype of what to downcast if possible,
+ or the string 'infer', which will try to downcast to an
+ appropriate equal type (e.g. float64 to int64 if possible).
+
+ Returns
+ -------
+ filled : Index
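+
+ Examples
+ --------
+ A minimal sketch on a float index (output shown for illustration
+ only).
+
+ >>> pd.Index([1.0, np.nan, 3.0]).fillna(2.0)  # doctest: +SKIP
+ Float64Index([1.0, 2.0, 3.0], dtype='float64')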
+ """
+
+ @Appender(_index_shared_docs['fillna'])
+ def fillna(self, value=None, downcast=None):
+ self._assert_can_do_op(value)
+ if self.hasnans:
+ result = self.putmask(self._isnan, value)
+ if downcast is None:
+ # no need to care about metadata other than name, because
+ # the index can't have a freq if it contains NaNs
+ return Index(result, name=self.name)
+ return self._shallow_copy()
+
+ _index_shared_docs['dropna'] = """
+ Return Index without NA/NaN values
+
+ Parameters
+ ----------
+ how : {'any', 'all'}, default 'any'
+ If the Index is a MultiIndex, drop the value when any or all levels
+ are NaN.
+
+ Returns
+ -------
+ valid : Index
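+
+ Examples
+ --------
+ A minimal sketch on a float index (output shown for illustration
+ only).
+
+ >>> pd.Index([1.0, np.nan, 3.0]).dropna()  # doctest: +SKIP
+ Float64Index([1.0, 3.0], dtype='float64')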
+ """
+
+ @Appender(_index_shared_docs['dropna'])
+ def dropna(self, how='any'):
+ if how not in ('any', 'all'):
+ raise ValueError("invalid how option: {0}".format(how))
+
+ if self.hasnans:
+ return self._shallow_copy(self.values[~self._isnan])
+ return self._shallow_copy()
+
+ # --------------------------------------------------------------------
+ # Uniqueness Methods
+
+ _index_shared_docs['index_unique'] = (
+ """
+ Return unique values in the index. Uniques are returned in order
+ of appearance; this does NOT sort.
+
+ Parameters
+ ----------
+ level : int or str, optional, default None
+ Only return values from specified level (for MultiIndex)
+
+ .. versionadded:: 0.23.0
+
+ Returns
+ -------
+ Index without duplicates
+
+ See Also
+ --------
+ unique
+ Series.unique
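+
+ Examples
+ --------
+ A minimal sketch; duplicates are dropped but the order of
+ appearance is kept (output shown for illustration only).
+
+ >>> pd.Index([3, 1, 2, 1]).unique()  # doctest: +SKIP
+ Int64Index([3, 1, 2], dtype='int64')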
+ """)
+
+ @Appender(_index_shared_docs['index_unique'] % _index_doc_kwargs)
+ def unique(self, level=None):
+ if level is not None:
+ self._validate_index_level(level)
+ result = super(Index, self).unique()
+ return self._shallow_copy(result)
+
+ def drop_duplicates(self, keep='first'):
+ """
+ Return Index with duplicate values removed.
+
+ Parameters
+ ----------
+ keep : {'first', 'last', ``False``}, default 'first'
+ - 'first' : Drop duplicates except for the first occurrence.
+ - 'last' : Drop duplicates except for the last occurrence.
+ - ``False`` : Drop all duplicates.
+
+ Returns
+ -------
+ deduplicated : Index
+
+ See Also
+ --------
+ Series.drop_duplicates : Equivalent method on Series.
+ DataFrame.drop_duplicates : Equivalent method on DataFrame.
+ Index.duplicated : Related method on Index, indicating duplicate
+ Index values.
+
+ Examples
+ --------
+ Generate a pandas.Index with duplicate values.
+
+ >>> idx = pd.Index(['lama', 'cow', 'lama', 'beetle', 'lama', 'hippo'])
+
+ The `keep` parameter controls which duplicate values are removed.
+ The value 'first' keeps the first occurrence for each
+ set of duplicated entries. The default value of keep is 'first'.
+
+ >>> idx.drop_duplicates(keep='first')
+ Index(['lama', 'cow', 'beetle', 'hippo'], dtype='object')
+
+ The value 'last' keeps the last occurrence for each set of duplicated
+ entries.
+
+ >>> idx.drop_duplicates(keep='last')
+ Index(['cow', 'beetle', 'lama', 'hippo'], dtype='object')
+
+ The value ``False`` discards all sets of duplicated entries.
+
+ >>> idx.drop_duplicates(keep=False)
+ Index(['cow', 'beetle', 'hippo'], dtype='object')
+ """
+ return super(Index, self).drop_duplicates(keep=keep)
+
+ def duplicated(self, keep='first'):
+ """
+ Indicate duplicate index values.
+
+ Duplicated values are indicated as ``True`` values in the resulting
+ array. Either all duplicates, all except the first, or all except the
+ last occurrence of duplicates can be indicated.
+
+ Parameters
+ ----------
+ keep : {'first', 'last', False}, default 'first'
+ The value or values in a set of duplicates to mark as missing.
+
+ - 'first' : Mark duplicates as ``True`` except for the first
+ occurrence.
+ - 'last' : Mark duplicates as ``True`` except for the last
+ occurrence.
+ - ``False`` : Mark all duplicates as ``True``.
+
+ Returns
+ -------
+ numpy.ndarray
+
+ See Also
+ --------
+ pandas.Series.duplicated : Equivalent method on pandas.Series.
+ pandas.DataFrame.duplicated : Equivalent method on pandas.DataFrame.
+ pandas.Index.drop_duplicates : Remove duplicate values from Index.
+
+ Examples
+ --------
+ By default, for each set of duplicated values, the first occurrence is
+ set to False and all others to True:
+
+ >>> idx = pd.Index(['lama', 'cow', 'lama', 'beetle', 'lama'])
+ >>> idx.duplicated()
+ array([False, False, True, False, True])
+
+ which is equivalent to
+
+ >>> idx.duplicated(keep='first')
+ array([False, False, True, False, True])
+
+ By using 'last', the last occurrence of each set of duplicated values
+ is set to False and all others to True:
+
+ >>> idx.duplicated(keep='last')
+ array([ True, False, True, False, False])
+
+ By setting keep to ``False``, all duplicates are True:
+
+ >>> idx.duplicated(keep=False)
+ array([ True, False, True, False, True])
+ """
+ return super(Index, self).duplicated(keep=keep)
+
+ def get_duplicates(self):
+ """
+ Extract duplicated index elements.
+
+ .. deprecated:: 0.23.0
+ Use idx[idx.duplicated()].unique() instead
+
+ Returns a sorted list of index elements which appear more than once in
+ the index.
+
+ Returns
+ -------
+ array-like
+ List of duplicated indexes.
+
+ See Also
+ --------
+ Index.duplicated : Return boolean array denoting duplicates.
+ Index.drop_duplicates : Return Index with duplicates removed.
+
+ Examples
+ --------
+
+ Works on different types of Index.
+
+ >>> pd.Index([1, 2, 2, 3, 3, 3, 4]).get_duplicates() # doctest: +SKIP
+ [2, 3]
+
+ Note that for a DatetimeIndex, it does not return a list but a new
+ DatetimeIndex:
+
+ >>> dates = pd.to_datetime(['2018-01-01', '2018-01-02', '2018-01-03',
+ ... '2018-01-03', '2018-01-04', '2018-01-04'],
+ ... format='%Y-%m-%d')
+ >>> pd.Index(dates).get_duplicates() # doctest: +SKIP
+ DatetimeIndex(['2018-01-03', '2018-01-04'],
+ dtype='datetime64[ns]', freq=None)
+
+ Sorts duplicated elements even when indexes are unordered.
+
+ >>> pd.Index([1, 2, 3, 2, 3, 4, 3]).get_duplicates() # doctest: +SKIP
+ [2, 3]
+
+ Return empty array-like structure when all elements are unique.
+
+ >>> pd.Index([1, 2, 3, 4]).get_duplicates() # doctest: +SKIP
+ []
+ >>> dates = pd.to_datetime(['2018-01-01', '2018-01-02', '2018-01-03'],
+ ... format='%Y-%m-%d')
+ >>> pd.Index(dates).get_duplicates() # doctest: +SKIP
+ DatetimeIndex([], dtype='datetime64[ns]', freq=None)
+ """
+ warnings.warn("'get_duplicates' is deprecated and will be removed in "
+ "a future release. You can use "
+ "idx[idx.duplicated()].unique() instead",
+ FutureWarning, stacklevel=2)
+
+ return self[self.duplicated()].unique()
+
+ def _get_unique_index(self, dropna=False):
+ """
+ Returns an index containing unique values.
+
+ Parameters
+ ----------
+ dropna : bool
+ If True, NaN values are dropped.
+
+ Returns
+ -------
+ uniques : index
+ """
+ if self.is_unique and not dropna:
+ return self
+
+ values = self.values
+
+ if not self.is_unique:
+ values = self.unique()
+
+ if dropna:
+ try:
+ if self.hasnans:
+ values = values[~isna(values)]
+ except NotImplementedError:
+ pass
+
+ return self._shallow_copy(values)
+
+ # --------------------------------------------------------------------
+ # Arithmetic & Logical Methods
+
+ def __add__(self, other):
+ if isinstance(other, (ABCSeries, ABCDataFrame)):
+ return NotImplemented
+ return Index(np.array(self) + other)
+
+ def __radd__(self, other):
+ return Index(other + np.array(self))
+
+ def __iadd__(self, other):
+ # alias for __add__
+ return self + other
+
+ def __sub__(self, other):
+ return Index(np.array(self) - other)
+
+ def __rsub__(self, other):
+ return Index(other - np.array(self))
+
+ def __and__(self, other):
+ return self.intersection(other)
+
+ def __or__(self, other):
+ return self.union(other)
+
+ def __xor__(self, other):
+ return self.symmetric_difference(other)
+
+ def __nonzero__(self):
+ raise ValueError("The truth value of a {0} is ambiguous. "
+ "Use a.empty, a.bool(), a.item(), a.any() or a.all()."
+ .format(self.__class__.__name__))
+
+ __bool__ = __nonzero__
+
+ # --------------------------------------------------------------------
+ # Set Operation Methods
+
+ def _get_reconciled_name_object(self, other):
+ """
+ If the result of a set operation will be self,
+ return self, unless the name changes, in which
+ case make a shallow copy of self.
+ """
+ name = get_op_result_name(self, other)
+ if self.name != name:
+ return self._shallow_copy(name=name)
+ return self
+
+ def _validate_sort_keyword(self, sort):
+ if sort not in [None, False]:
+ raise ValueError("The 'sort' keyword only takes the values of "
+ "None or False; {0} was passed.".format(sort))
+
+ def union(self, other, sort=None):
+ """
+ Form the union of two Index objects.
+
+ Parameters
+ ----------
+ other : Index or array-like
+ sort : bool or None, default None
+ Whether to sort the resulting Index.
+
+ * None : Sort the result, except when
+
+ 1. `self` and `other` are equal.
+ 2. `self` or `other` has length 0.
+ 3. Some values in `self` or `other` cannot be compared.
+ A RuntimeWarning is issued in this case.
+
+ * False : do not sort the result.
+
+ .. versionadded:: 0.24.0
+
+ .. versionchanged:: 0.24.1
+
+ Changed the default value from ``True`` to ``None``
+ (without change in behaviour).
+
+ Returns
+ -------
+ union : Index
+
+ Examples
+ --------
+
+ >>> idx1 = pd.Index([1, 2, 3, 4])
+ >>> idx2 = pd.Index([3, 4, 5, 6])
+ >>> idx1.union(idx2)
+ Int64Index([1, 2, 3, 4, 5, 6], dtype='int64')
+ """
+ self._validate_sort_keyword(sort)
+ self._assert_can_do_setop(other)
+ other = ensure_index(other)
+
+ if len(other) == 0 or self.equals(other):
+ return self._get_reconciled_name_object(other)
+
+ if len(self) == 0:
+ return other._get_reconciled_name_object(self)
+
+ # TODO: is_dtype_union_equal is a hack around
+ # 1. buggy set ops with duplicates (GH #13432)
+ # 2. CategoricalIndex lacking setops (GH #10186)
+ # Once those are fixed, this workaround can be removed
+ if not is_dtype_union_equal(self.dtype, other.dtype):
+ this = self.astype('O')
+ other = other.astype('O')
+ return this.union(other, sort=sort)
+
+ # TODO(EA): setops-refactor, clean all this up
+ if is_period_dtype(self) or is_datetime64tz_dtype(self):
+ lvals = self._ndarray_values
+ else:
+ lvals = self._values
+ if is_period_dtype(other) or is_datetime64tz_dtype(other):
+ rvals = other._ndarray_values
+ else:
+ rvals = other._values
+
+ if self.is_monotonic and other.is_monotonic:
+ try:
+ result = self._outer_indexer(lvals, rvals)[0]
+ except TypeError:
+ # incomparable objects
+ result = list(lvals)
+
+ # worth making this faster? a very unusual case
+ value_set = set(lvals)
+ result.extend([x for x in rvals if x not in value_set])
+ else:
+ indexer = self.get_indexer(other)
+ indexer, = (indexer == -1).nonzero()
+
+ if len(indexer) > 0:
+ other_diff = algos.take_nd(rvals, indexer,
+ allow_fill=False)
+ result = _concat._concat_compat((lvals, other_diff))
+
+ else:
+ result = lvals
+
+ if sort is None:
+ try:
+ result = sorting.safe_sort(result)
+ except TypeError as e:
+ warnings.warn("{}, sort order is undefined for "
+ "incomparable objects".format(e),
+ RuntimeWarning, stacklevel=3)
+
+ # for subclasses
+ return self._wrap_setop_result(other, result)
+
+ def _wrap_setop_result(self, other, result):
+ return self._constructor(result, name=get_op_result_name(self, other))
+
+ def intersection(self, other, sort=False):
+ """
+ Form the intersection of two Index objects.
+
+ This returns a new Index with elements common to the index and `other`.
+
+ Parameters
+ ----------
+ other : Index or array-like
+ sort : False or None, default False
+ Whether to sort the resulting index.
+
+ * False : do not sort the result.
+ * None : sort the result, except when `self` and `other` are equal
+ or when the values cannot be compared.
+
+ .. versionadded:: 0.24.0
+
+ .. versionchanged:: 0.24.1
+
+ Changed the default from ``True`` to ``False``, to match
+ the behaviour of 0.23.4 and earlier.
+
+ Returns
+ -------
+ intersection : Index
+
+ Examples
+ --------
+
+ >>> idx1 = pd.Index([1, 2, 3, 4])
+ >>> idx2 = pd.Index([3, 4, 5, 6])
+ >>> idx1.intersection(idx2)
+ Int64Index([3, 4], dtype='int64')
+ """
+ self._validate_sort_keyword(sort)
+ self._assert_can_do_setop(other)
+ other = ensure_index(other)
+
+ if self.equals(other):
+ return self._get_reconciled_name_object(other)
+
+ if not is_dtype_equal(self.dtype, other.dtype):
+ this = self.astype('O')
+ other = other.astype('O')
+ return this.intersection(other, sort=sort)
+
+ # TODO(EA): setops-refactor, clean all this up
+ if is_period_dtype(self):
+ lvals = self._ndarray_values
+ else:
+ lvals = self._values
+ if is_period_dtype(other):
+ rvals = other._ndarray_values
+ else:
+ rvals = other._values
+
+ if self.is_monotonic and other.is_monotonic:
+ try:
+ result = self._inner_indexer(lvals, rvals)[0]
+ return self._wrap_setop_result(other, result)
+ except TypeError:
+ pass
+
+ try:
+ indexer = Index(rvals).get_indexer(lvals)
+ indexer = indexer.take((indexer != -1).nonzero()[0])
+ except Exception:
+ # duplicates
+ indexer = algos.unique1d(
+ Index(rvals).get_indexer_non_unique(lvals)[0])
+ indexer = indexer[indexer != -1]
+
+ taken = other.take(indexer)
+
+ if sort is None:
+ taken = sorting.safe_sort(taken.values)
+ if self.name != other.name:
+ name = None
+ else:
+ name = self.name
+ return self._shallow_copy(taken, name=name)
+
+ if self.name != other.name:
+ taken.name = None
+
+ return taken
+
+ def difference(self, other, sort=None):
+ """
+ Return a new Index with elements from the index that are not in
+ `other`.
+
+ This is the set difference of two Index objects.
+
+ Parameters
+ ----------
+ other : Index or array-like
+ sort : False or None, default None
+ Whether to sort the resulting index. By default, pandas
+ attempts to sort the values, but any TypeError raised by
+ incomparable elements is caught.
+
+ * None : Attempt to sort the result, but catch any TypeErrors
+ from comparing incomparable elements.
+ * False : Do not sort the result.
+
+ .. versionadded:: 0.24.0
+
+ .. versionchanged:: 0.24.1
+
+ Changed the default value from ``True`` to ``None``
+ (without change in behaviour).
+
+ Returns
+ -------
+ difference : Index
+
+ Examples
+ --------
+
+ >>> idx1 = pd.Index([2, 1, 3, 4])
+ >>> idx2 = pd.Index([3, 4, 5, 6])
+ >>> idx1.difference(idx2)
+ Int64Index([1, 2], dtype='int64')
+ >>> idx1.difference(idx2, sort=False)
+ Int64Index([2, 1], dtype='int64')
+ """
+ self._validate_sort_keyword(sort)
+ self._assert_can_do_setop(other)
+
+ if self.equals(other):
+ # pass an empty np.ndarray with the appropriate dtype
+ return self._shallow_copy(self._data[:0])
+
+ other, result_name = self._convert_can_do_setop(other)
+
+ this = self._get_unique_index()
+
+ indexer = this.get_indexer(other)
+ indexer = indexer.take((indexer != -1).nonzero()[0])
+
+ label_diff = np.setdiff1d(np.arange(this.size), indexer,
+ assume_unique=True)
+ the_diff = this.values.take(label_diff)
+ if sort is None:
+ try:
+ the_diff = sorting.safe_sort(the_diff)
+ except TypeError:
+ pass
+
+ return this._shallow_copy(the_diff, name=result_name, freq=None)
+
+ def symmetric_difference(self, other, result_name=None, sort=None):
+ """
+ Compute the symmetric difference of two Index objects.
+
+ Parameters
+ ----------
+ other : Index or array-like
+ result_name : str
+ sort : False or None, default None
+ Whether to sort the resulting index. By default, pandas
+ attempts to sort the values, but any TypeError raised by
+ incomparable elements is caught.
+
+ * None : Attempt to sort the result, but catch any TypeErrors
+ from comparing incomparable elements.
+ * False : Do not sort the result.
+
+ .. versionadded:: 0.24.0
+
+ .. versionchanged:: 0.24.1
+
+ Changed the default value from ``True`` to ``None``
+ (without change in behaviour).
+
+ Returns
+ -------
+ symmetric_difference : Index
+
+ Notes
+ -----
+ ``symmetric_difference`` contains elements that appear in either
+ ``idx1`` or ``idx2`` but not both. Equivalent to the Index created by
+ ``idx1.difference(idx2) | idx2.difference(idx1)`` with duplicates
+ dropped.
+
+ Examples
+ --------
+ >>> idx1 = pd.Index([1, 2, 3, 4])
+ >>> idx2 = pd.Index([2, 3, 4, 5])
+ >>> idx1.symmetric_difference(idx2)
+ Int64Index([1, 5], dtype='int64')
+
+ You can also use the ``^`` operator:
+
+ >>> idx1 ^ idx2
+ Int64Index([1, 5], dtype='int64')
+ """
+ self._validate_sort_keyword(sort)
+ self._assert_can_do_setop(other)
+ other, result_name_update = self._convert_can_do_setop(other)
+ if result_name is None:
+ result_name = result_name_update
+
+ this = self._get_unique_index()
+ other = other._get_unique_index()
+ indexer = this.get_indexer(other)
+
+ # {this} minus {other}
+ common_indexer = indexer.take((indexer != -1).nonzero()[0])
+ left_indexer = np.setdiff1d(np.arange(this.size), common_indexer,
+ assume_unique=True)
+ left_diff = this.values.take(left_indexer)
+
+ # {other} minus {this}
+ right_indexer = (indexer == -1).nonzero()[0]
+ right_diff = other.values.take(right_indexer)
+
+ the_diff = _concat._concat_compat([left_diff, right_diff])
+ if sort is None:
+ try:
+ the_diff = sorting.safe_sort(the_diff)
+ except TypeError:
+ pass
+
+ attribs = self._get_attributes_dict()
+ attribs['name'] = result_name
+ if 'freq' in attribs:
+ attribs['freq'] = None
+ return self._shallow_copy_with_infer(the_diff, **attribs)
+
+ def _assert_can_do_setop(self, other):
+ if not is_list_like(other):
+ raise TypeError('Input must be Index or array-like')
+ return True
+
+ def _convert_can_do_setop(self, other):
+ if not isinstance(other, Index):
+ other = Index(other, name=self.name)
+ result_name = self.name
+ else:
+ result_name = get_op_result_name(self, other)
+ return other, result_name
+
+ # --------------------------------------------------------------------
+ # Indexing Methods
+
+ _index_shared_docs['get_loc'] = """
+ Get integer location, slice or boolean mask for requested label.
+
+ Parameters
+ ----------
+ key : label
+ method : {None, 'pad'/'ffill', 'backfill'/'bfill', 'nearest'}, optional
+ * default: exact matches only.
+ * pad / ffill: find the PREVIOUS index value if no exact match.
+ * backfill / bfill: use NEXT index value if no exact match
+ * nearest: use the NEAREST index value if no exact match. Tied
+ distances are broken by preferring the larger index value.
+ tolerance : optional
+ Maximum distance from index value for inexact matches. The value of
+ the index at the matching location must satisfy the equation
+ ``abs(index[loc] - key) <= tolerance``.
+
+ Tolerance may be a scalar
+ value, which applies the same tolerance to all values, or
+ list-like, which applies variable tolerance per element. List-like
+ includes list, tuple, array, Series, and must be the same size as
+ the index and its dtype must exactly match the index's type.
+
+ .. versionadded:: 0.21.0 (list-like tolerance)
+
+ Returns
+ -------
+ loc : int if unique index, slice if monotonic index, else mask
+
+ Examples
+ --------
+ >>> unique_index = pd.Index(list('abc'))
+ >>> unique_index.get_loc('b')
+ 1
+
+ >>> monotonic_index = pd.Index(list('abbc'))
+ >>> monotonic_index.get_loc('b')
+ slice(1, 3, None)
+
+ >>> non_monotonic_index = pd.Index(list('abcb'))
+ >>> non_monotonic_index.get_loc('b')
+ array([False, True, False, True], dtype=bool)
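+
+ A sketch of inexact matching with ``method='nearest'``; tied
+ distances are broken by preferring the larger index value:
+
+ >>> numeric_index = pd.Index([1, 3, 5])
+ >>> numeric_index.get_loc(4, method='nearest')
+ 2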
+ """
+
+ @Appender(_index_shared_docs['get_loc'])
+ def get_loc(self, key, method=None, tolerance=None):
+ if method is None:
+ if tolerance is not None:
+ raise ValueError('tolerance argument only valid if using pad, '
+ 'backfill or nearest lookups')
+ try:
+ return self._engine.get_loc(key)
+ except KeyError:
+ return self._engine.get_loc(self._maybe_cast_indexer(key))
+ indexer = self.get_indexer([key], method=method, tolerance=tolerance)
+ if indexer.ndim > 1 or indexer.size > 1:
+ raise TypeError('get_loc requires scalar valued input')
+ loc = indexer.item()
+ if loc == -1:
+ raise KeyError(key)
+ return loc
+
+ _index_shared_docs['get_indexer'] = """
+ Compute indexer and mask for new index given the current index. The
+ indexer should then be used as an input to ndarray.take to align the
+ current data to the new index.
+
+ Parameters
+ ----------
+ target : %(target_klass)s
+ method : {None, 'pad'/'ffill', 'backfill'/'bfill', 'nearest'}, optional
+ * default: exact matches only.
+ * pad / ffill: find the PREVIOUS index value if no exact match.
+ * backfill / bfill: use NEXT index value if no exact match
+ * nearest: use the NEAREST index value if no exact match. Tied
+ distances are broken by preferring the larger index value.
+ limit : int, optional
+ Maximum number of consecutive labels in ``target`` to match for
+ inexact matches.
+ tolerance : optional
+ Maximum distance between original and new labels for inexact
+ matches. The values of the index at the matching locations must
+ satisfy the equation ``abs(index[indexer] - target) <= tolerance``.
+
+ Tolerance may be a scalar value, which applies the same tolerance
+ to all values, or list-like, which applies variable tolerance per
+ element. List-like includes list, tuple, array, Series, and must be
+ the same size as the index and its dtype must exactly match the
+ index's type.
+
+ .. versionadded:: 0.21.0 (list-like tolerance)
+
+ Returns
+ -------
+ indexer : ndarray of int
+ Integers from 0 to n - 1 indicating that the index at these
+ positions matches the corresponding target values. Missing values
+ in the target are marked by -1.
+
+ Examples
+ --------
+ >>> index = pd.Index(['c', 'a', 'b'])
+ >>> index.get_indexer(['a', 'b', 'x'])
+ array([ 1, 2, -1])
+
+ Notice that the return value is an array of locations in ``index``
+ and ``x`` is marked by -1, as it is not in ``index``.
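+
+ A sketch of forward filling with ``method='pad'``; each unmatched
+ label is matched to the previous index value (output shown for
+ illustration only).
+
+ >>> numeric_index = pd.Index([1, 3, 5])
+ >>> numeric_index.get_indexer([2, 4, 6], method='pad')  # doctest: +SKIP
+ array([0, 1, 2])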
+ """
+
+ @Appender(_index_shared_docs['get_indexer'] % _index_doc_kwargs)
+ def get_indexer(self, target, method=None, limit=None, tolerance=None):
+ method = missing.clean_reindex_fill_method(method)
+ target = ensure_index(target)
+ if tolerance is not None:
+ tolerance = self._convert_tolerance(tolerance, target)
+
+ # Treat boolean labels passed to a numeric index as not found. Without
+ # this fix False and True would be treated as 0 and 1 respectively.
+ # (GH #16877)
+ if target.is_boolean() and self.is_numeric():
+ return ensure_platform_int(np.repeat(-1, target.size))
+
+ pself, ptarget = self._maybe_promote(target)
+ if pself is not self or ptarget is not target:
+ return pself.get_indexer(ptarget, method=method, limit=limit,
+ tolerance=tolerance)
+
+ if not is_dtype_equal(self.dtype, target.dtype):
+ this = self.astype(object)
+ target = target.astype(object)
+ return this.get_indexer(target, method=method, limit=limit,
+ tolerance=tolerance)
+
+ if not self.is_unique:
+ raise InvalidIndexError('Reindexing only valid with uniquely'
+ ' valued Index objects')
+
+ if method == 'pad' or method == 'backfill':
+ indexer = self._get_fill_indexer(target, method, limit, tolerance)
+ elif method == 'nearest':
+ indexer = self._get_nearest_indexer(target, limit, tolerance)
+ else:
+ if tolerance is not None:
+ raise ValueError('tolerance argument only valid if doing pad, '
+ 'backfill or nearest reindexing')
+ if limit is not None:
+ raise ValueError('limit argument only valid if doing pad, '
+ 'backfill or nearest reindexing')
+
+ indexer = self._engine.get_indexer(target._ndarray_values)
+
+ return ensure_platform_int(indexer)
+
+ def _convert_tolerance(self, tolerance, target):
+ # override this method on subclasses
+ tolerance = np.asarray(tolerance)
+ if target.size != tolerance.size and tolerance.size > 1:
+ raise ValueError('list-like tolerance size must match '
+ 'target index size')
+ return tolerance
+
+ def _get_fill_indexer(self, target, method, limit=None, tolerance=None):
+ if self.is_monotonic_increasing and target.is_monotonic_increasing:
+ method = (self._engine.get_pad_indexer if method == 'pad' else
+ self._engine.get_backfill_indexer)
+ indexer = method(target._ndarray_values, limit)
+ else:
+ indexer = self._get_fill_indexer_searchsorted(target, method,
+ limit)
+ if tolerance is not None:
+ indexer = self._filter_indexer_tolerance(target._ndarray_values,
+ indexer,
+ tolerance)
+ return indexer
+
+ def _get_fill_indexer_searchsorted(self, target, method, limit=None):
+ """
+ Fallback pad/backfill get_indexer that works for monotonic decreasing
+ indexes and non-monotonic targets.
+ """
+ if limit is not None:
+ raise ValueError('limit argument for %r method only well-defined '
+ 'if index and target are monotonic' % method)
+
+ side = 'left' if method == 'pad' else 'right'
+
+ # find exact matches first (this simplifies the algorithm)
+ indexer = self.get_indexer(target)
+ nonexact = (indexer == -1)
+ indexer[nonexact] = self._searchsorted_monotonic(target[nonexact],
+ side)
+ if side == 'left':
+ # searchsorted returns "indices into a sorted array such that,
+ # if the corresponding elements in v were inserted before the
+ # indices, the order of a would be preserved".
+ # Thus, we need to subtract 1 to find values to the left.
+ indexer[nonexact] -= 1
+ # This also maps not-found values (values of 0 from
+ # np.searchsorted) to -1, which conveniently is also our
+ # sentinel for missing values
+ else:
+ # Mark indices to the right of the largest value as not found
+ indexer[indexer == len(self)] = -1
+ return indexer
+
+ def _get_nearest_indexer(self, target, limit, tolerance):
+ """
+ Get the indexer for the nearest index labels; requires an index with
+ values that can be subtracted from each other (e.g., not strings or
+ tuples).
+ """
+ left_indexer = self.get_indexer(target, 'pad', limit=limit)
+ right_indexer = self.get_indexer(target, 'backfill', limit=limit)
+
+ target = np.asarray(target)
+ left_distances = abs(self.values[left_indexer] - target)
+ right_distances = abs(self.values[right_indexer] - target)
+
+ op = operator.lt if self.is_monotonic_increasing else operator.le
+ indexer = np.where(op(left_distances, right_distances) |
+ (right_indexer == -1), left_indexer, right_indexer)
+ if tolerance is not None:
+ indexer = self._filter_indexer_tolerance(target, indexer,
+ tolerance)
+ return indexer
+
+ def _filter_indexer_tolerance(self, target, indexer, tolerance):
+ distance = abs(self.values[indexer] - target)
+ indexer = np.where(distance <= tolerance, indexer, -1)
+ return indexer
+
+ # --------------------------------------------------------------------
+ # Indexer Conversion Methods
+
+ _index_shared_docs['_convert_scalar_indexer'] = """
+ Convert a scalar indexer.
+
+ Parameters
+ ----------
+ key : label of the slice bound
+ kind : {'ix', 'loc', 'getitem', 'iloc'} or None
+ """
+
+ @Appender(_index_shared_docs['_convert_scalar_indexer'])
+ def _convert_scalar_indexer(self, key, kind=None):
+ assert kind in ['ix', 'loc', 'getitem', 'iloc', None]
+
+ if kind == 'iloc':
+ return self._validate_indexer('positional', key, kind)
+
+ if len(self) and not isinstance(self, ABCMultiIndex):
+
+ # we can raise here if we are definitive that this
+ # is positional indexing (e.g. .ix with a float)
+ # or label indexing if we are using a type able
+ # to be represented in the index
+
+ if kind in ['getitem', 'ix'] and is_float(key):
+ if not self.is_floating():
+ return self._invalid_indexer('label', key)
+
+ elif kind in ['loc'] and is_float(key):
+
+ # we want to raise KeyError on string/mixed here
+ # technically we *could* raise a TypeError
+ # on anything but mixed though
+ if self.inferred_type not in ['floating',
+ 'mixed-integer-float',
+ 'string',
+ 'unicode',
+ 'mixed']:
+ return self._invalid_indexer('label', key)
+
+ elif kind in ['loc'] and is_integer(key):
+ if not self.holds_integer():
+ return self._invalid_indexer('label', key)
+
+ return key
+
+ _index_shared_docs['_convert_slice_indexer'] = """
+ Convert a slice indexer.
+
+ By definition, these are labels unless 'iloc' is passed in.
+ Floats are not allowed as the start, step, or stop of the slice.
+
+ Parameters
+ ----------
+ key : label of the slice bound
+ kind : {'ix', 'loc', 'getitem', 'iloc'} or None
+ """
+
+ @Appender(_index_shared_docs['_convert_slice_indexer'])
+ def _convert_slice_indexer(self, key, kind=None):
+ assert kind in ['ix', 'loc', 'getitem', 'iloc', None]
+
+ # if we are not a slice, then we are done
+ if not isinstance(key, slice):
+ return key
+
+ # validate iloc
+ if kind == 'iloc':
+ return slice(self._validate_indexer('slice', key.start, kind),
+ self._validate_indexer('slice', key.stop, kind),
+ self._validate_indexer('slice', key.step, kind))
+
+ # potentially cast the bounds to integers
+ start, stop, step = key.start, key.stop, key.step
+
+ # figure out if this is a positional indexer
+ def is_int(v):
+ return v is None or is_integer(v)
+
+ is_null_slicer = start is None and stop is None
+ is_index_slice = is_int(start) and is_int(stop)
+ is_positional = is_index_slice and not self.is_integer()
+
+ if kind == 'getitem':
+ """
+ called from the getitem slicers, validate that we are in fact
+ integers
+ """
+ if self.is_integer() or is_index_slice:
+ return slice(self._validate_indexer('slice', key.start, kind),
+ self._validate_indexer('slice', key.stop, kind),
+ self._validate_indexer('slice', key.step, kind))
+
+ # convert the slice to an indexer here
+
+ # if we are mixed and have integers
+ try:
+ if is_positional and self.is_mixed():
+ # Validate start & stop
+ if start is not None:
+ self.get_loc(start)
+ if stop is not None:
+ self.get_loc(stop)
+ is_positional = False
+ except KeyError:
+ if self.inferred_type == 'mixed-integer-float':
+ raise
+
+ if is_null_slicer:
+ indexer = key
+ elif is_positional:
+ indexer = key
+ else:
+ try:
+ indexer = self.slice_indexer(start, stop, step, kind=kind)
+ except Exception:
+ if is_index_slice:
+ if self.is_integer():
+ raise
+ else:
+ indexer = key
+ else:
+ raise
+
+ return indexer
+
+ def _convert_listlike_indexer(self, keyarr, kind=None):
+ """
+ Parameters
+ ----------
+ keyarr : list-like
+ Indexer to convert.
+
+ Returns
+ -------
+ tuple (indexer, keyarr)
+ indexer is an ndarray, or None if the keys cannot be converted
+ keyarr is a tuple-safe version of the keys
+ """
+ if isinstance(keyarr, Index):
+ keyarr = self._convert_index_indexer(keyarr)
+ else:
+ keyarr = self._convert_arr_indexer(keyarr)
+
+ indexer = self._convert_list_indexer(keyarr, kind=kind)
+ return indexer, keyarr
+
+ _index_shared_docs['_convert_arr_indexer'] = """
+ Convert an array-like indexer to the appropriate dtype.
+
+ Parameters
+ ----------
+ keyarr : array-like
+ Indexer to convert.
+
+ Returns
+ -------
+ converted_keyarr : array-like
+ """
+
+ @Appender(_index_shared_docs['_convert_arr_indexer'])
+ def _convert_arr_indexer(self, keyarr):
+ keyarr = com.asarray_tuplesafe(keyarr)
+ return keyarr
+
+ _index_shared_docs['_convert_index_indexer'] = """
+ Convert an Index indexer to the appropriate dtype.
+
+ Parameters
+ ----------
+ keyarr : Index (or sub-class)
+ Indexer to convert.
+
+ Returns
+ -------
+ converted_keyarr : Index (or sub-class)
+ """
+
+ @Appender(_index_shared_docs['_convert_index_indexer'])
+ def _convert_index_indexer(self, keyarr):
+ return keyarr
+
+ _index_shared_docs['_convert_list_indexer'] = """
+ Convert a list-like indexer to the appropriate dtype.
+
+ Parameters
+ ----------
+ keyarr : Index (or sub-class)
+ Indexer to convert.
+ kind : {'iloc', 'ix', 'loc'}, optional
+
+ Returns
+ -------
+ positional indexer or None
+ """
+
+ @Appender(_index_shared_docs['_convert_list_indexer'])
+ def _convert_list_indexer(self, keyarr, kind=None):
+ if (kind in [None, 'iloc', 'ix'] and
+ is_integer_dtype(keyarr) and not self.is_floating() and
+ not isinstance(keyarr, ABCPeriodIndex)):
+
+ if self.inferred_type == 'mixed-integer':
+ indexer = self.get_indexer(keyarr)
+ if (indexer >= 0).all():
+ return indexer
+ # missing values are flagged as -1 by get_indexer and negative
+ # indices are already converted to positive indices in the
+ # above if-statement, so the negative flags are changed to
+ # values outside the range of indices so as to trigger an
+ # IndexError in maybe_convert_indices
+ indexer[indexer < 0] = len(self)
+ from pandas.core.indexing import maybe_convert_indices
+ return maybe_convert_indices(indexer, len(self))
+
+ elif not self.inferred_type == 'integer':
+ keyarr = np.where(keyarr < 0, len(self) + keyarr, keyarr)
+ return keyarr
+
+ return None
+
+ def _invalid_indexer(self, form, key):
+ """
+ Consistent invalid indexer message.
+ """
+ raise TypeError("cannot do {form} indexing on {klass} with these "
+ "indexers [{key}] of {kind}".format(
+ form=form, klass=type(self), key=key,
+ kind=type(key)))
+
+ # --------------------------------------------------------------------
+ # Reindex Methods
+
+ def _can_reindex(self, indexer):
+ """
+ Check if we are allowing reindexing with this particular indexer.
+
+ Parameters
+ ----------
+ indexer : an integer indexer
+
+ Raises
+ ------
+ ValueError if it is a duplicate axis
+ """
+
+ # trying to reindex on an axis with duplicates
+ if not self.is_unique and len(indexer):
+ raise ValueError("cannot reindex from a duplicate axis")
+
+ def reindex(self, target, method=None, level=None, limit=None,
+ tolerance=None):
+ """
+ Create index with target's values (move/add/delete values
+ as necessary).
+
+ Parameters
+ ----------
+ target : an iterable
+
+ Returns
+ -------
+ new_index : pd.Index
+ Resulting index
+ indexer : np.ndarray or None
+ Indices of output values in original index
+
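+ Examples
+ --------
+ A minimal sketch; labels missing from the original index get an
+ indexer entry of -1 (output shown for illustration only).
+
+ >>> idx = pd.Index(['car', 'bike', 'train'])
+ >>> idx.reindex(['car', 'plane'])  # doctest: +SKIP
+ (Index(['car', 'plane'], dtype='object'), array([ 0, -1]))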
+ """
+ # GH6552: preserve names when reindexing to non-named target
+ # (i.e. neither Index nor Series).
+ preserve_names = not hasattr(target, 'name')
+
+ # GH7774: preserve dtype/tz if target is empty and not an Index.
+ target = _ensure_has_len(target) # target may be an iterator
+
+ if not isinstance(target, Index) and len(target) == 0:
+ attrs = self._get_attributes_dict()
+ attrs.pop('freq', None) # don't preserve freq
+ values = self._data[:0] # appropriately-dtyped empty array
+ target = self._simple_new(values, dtype=self.dtype, **attrs)
+ else:
+ target = ensure_index(target)
+
+ if level is not None:
+ if method is not None:
+ raise TypeError('Fill method not supported if level passed')
+ _, indexer, _ = self._join_level(target, level, how='right',
+ return_indexers=True)
+ else:
+ if self.equals(target):
+ indexer = None
+ else:
+
+ if self.is_unique:
+ indexer = self.get_indexer(target, method=method,
+ limit=limit,
+ tolerance=tolerance)
+ else:
+ if method is not None or limit is not None:
+ raise ValueError("cannot reindex a non-unique index "
+ "with a method or limit")
+ indexer, missing = self.get_indexer_non_unique(target)
+
+ if preserve_names and target.nlevels == 1 and target.name != self.name:
+ target = target.copy()
+ target.name = self.name
+
+ return target, indexer
+
+ def _reindex_non_unique(self, target):
+ """
+ Create a new index with target's values (move/add/delete values as
+ necessary); for use with a non-unique Index and a possibly
+ non-unique target.
+
+ Parameters
+ ----------
+ target : an iterable
+
+ Returns
+ -------
+ new_index : pd.Index
+ Resulting index
+ indexer : np.ndarray or None
+ Indices of output values in original index
+
+ """
+
+ target = ensure_index(target)
+ indexer, missing = self.get_indexer_non_unique(target)
+ check = indexer != -1
+ new_labels = self.take(indexer[check])
+ new_indexer = None
+
+ if len(missing):
+ length = np.arange(len(indexer))
+
+ missing = ensure_platform_int(missing)
+ missing_labels = target.take(missing)
+ missing_indexer = ensure_int64(length[~check])
+ cur_labels = self.take(indexer[check]).values
+ cur_indexer = ensure_int64(length[check])
+
+ new_labels = np.empty(tuple([len(indexer)]), dtype=object)
+ new_labels[cur_indexer] = cur_labels
+ new_labels[missing_indexer] = missing_labels
+
+ # a unique indexer
+ if target.is_unique:
+
+ # see GH5553, make sure we use the right indexer
+ new_indexer = np.arange(len(indexer))
+ new_indexer[cur_indexer] = np.arange(len(cur_labels))
+ new_indexer[missing_indexer] = -1
+
+ # we have a non_unique selector, need to use the original
+ # indexer here
+ else:
+
+ # need to retake to have the same size as the indexer
+ indexer[~check] = -1
+
+ # reset the new indexer to account for the new size
+ new_indexer = np.arange(len(self.take(indexer)))
+ new_indexer[~check] = -1
+
+ new_index = self._shallow_copy_with_infer(new_labels, freq=None)
+ return new_index, indexer, new_indexer
+
+ # --------------------------------------------------------------------
+ # Join Methods
+
+ _index_shared_docs['join'] = """
+ Compute join_index and indexers to conform data
+ structures to the new index.
+
+ Parameters
+ ----------
+ other : Index
+ how : {'left', 'right', 'inner', 'outer'}
+ level : int or level name, default None
+ return_indexers : boolean, default False
+ sort : boolean, default False
+ Sort the join keys lexicographically in the result Index. If False,
+ the order of the join keys depends on the join type (how keyword)
+
+ .. versionadded:: 0.20.0
+
+ Returns
+ -------
+ join_index, (left_indexer, right_indexer)
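+
+ Examples
+ --------
+ A minimal sketch of an outer join of two plain indexes (output
+ shown for illustration only).
+
+ >>> idx1 = pd.Index(['a', 'b', 'c'])
+ >>> idx2 = pd.Index(['b', 'c', 'd'])
+ >>> idx1.join(idx2, how='outer')  # doctest: +SKIP
+ Index(['a', 'b', 'c', 'd'], dtype='object')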
+ """
+
+ @Appender(_index_shared_docs['join'])
+ def join(self, other, how='left', level=None, return_indexers=False,
+ sort=False):
+ self_is_mi = isinstance(self, ABCMultiIndex)
+ other_is_mi = isinstance(other, ABCMultiIndex)
+
+ # try to figure out the join level
+ # GH3662
+ if level is None and (self_is_mi or other_is_mi):
+
+ # have the same levels/names so a simple join
+ if self.names == other.names:
+ pass
+ else:
+ return self._join_multi(other, how=how,
+ return_indexers=return_indexers)
+
+ # join on the level
+ if level is not None and (self_is_mi or other_is_mi):
+ return self._join_level(other, level, how=how,
+ return_indexers=return_indexers)
+
+ other = ensure_index(other)
+
+ if len(other) == 0 and how in ('left', 'outer'):
+ join_index = self._shallow_copy()
+ if return_indexers:
+ rindexer = np.repeat(-1, len(join_index))
+ return join_index, None, rindexer
+ else:
+ return join_index
+
+ if len(self) == 0 and how in ('right', 'outer'):
+ join_index = other._shallow_copy()
+ if return_indexers:
+ lindexer = np.repeat(-1, len(join_index))
+ return join_index, lindexer, None
+ else:
+ return join_index
+
+ if self._join_precedence < other._join_precedence:
+ how = {'right': 'left', 'left': 'right'}.get(how, how)
+ result = other.join(self, how=how, level=level,
+ return_indexers=return_indexers)
+ if return_indexers:
+ x, y, z = result
+ result = x, z, y
+ return result
+
+ if not is_dtype_equal(self.dtype, other.dtype):
+ this = self.astype('O')
+ other = other.astype('O')
+ return this.join(other, how=how, return_indexers=return_indexers)
+
+ _validate_join_method(how)
+
+ if not self.is_unique and not other.is_unique:
+ return self._join_non_unique(other, how=how,
+ return_indexers=return_indexers)
+ elif not self.is_unique or not other.is_unique:
+ if self.is_monotonic and other.is_monotonic:
+ return self._join_monotonic(other, how=how,
+ return_indexers=return_indexers)
+ else:
+ return self._join_non_unique(other, how=how,
+ return_indexers=return_indexers)
+ elif self.is_monotonic and other.is_monotonic:
+ try:
+ return self._join_monotonic(other, how=how,
+ return_indexers=return_indexers)
+ except TypeError:
+ pass
+
+ if how == 'left':
+ join_index = self
+ elif how == 'right':
+ join_index = other
+ elif how == 'inner':
+ # TODO: sort=False here for backwards compat. It may
+ # be better to use the sort parameter passed into join
+ join_index = self.intersection(other, sort=False)
+ elif how == 'outer':
+ # TODO: sort=True here for backwards compat. It may
+ # be better to use the sort parameter passed into join
+ join_index = self.union(other)
+
+ if sort:
+ join_index = join_index.sort_values()
+
+ if return_indexers:
+ if join_index is self:
+ lindexer = None
+ else:
+ lindexer = self.get_indexer(join_index)
+ if join_index is other:
+ rindexer = None
+ else:
+ rindexer = other.get_indexer(join_index)
+ return join_index, lindexer, rindexer
+ else:
+ return join_index
+
+ def _join_multi(self, other, how, return_indexers=True):
+ from .multi import MultiIndex
+ from pandas.core.reshape.merge import _restore_dropped_levels_multijoin
+
+ # figure out join names
+ self_names = set(com._not_none(*self.names))
+ other_names = set(com._not_none(*other.names))
+ overlap = self_names & other_names
+
+ # need at least 1 in common
+ if not overlap:
+ raise ValueError("cannot join with no overlapping index names")
+
+ self_is_mi = isinstance(self, MultiIndex)
+ other_is_mi = isinstance(other, MultiIndex)
+
+ if self_is_mi and other_is_mi:
+
+ # Drop the non-matching levels from left and right respectively
+ ldrop_names = list(self_names - overlap)
+ rdrop_names = list(other_names - overlap)
+
+ self_jnlevels = self.droplevel(ldrop_names)
+ other_jnlevels = other.droplevel(rdrop_names)
+
+ # Join left and right
+ # Join on same leveled multi-index frames is supported
+ join_idx, lidx, ridx = self_jnlevels.join(other_jnlevels, how,
+ return_indexers=True)
+
+ # Restore the dropped levels
+ # Returned index level order is
+ # common levels, ldrop_names, rdrop_names
+ dropped_names = ldrop_names + rdrop_names
+
+ levels, codes, names = (
+ _restore_dropped_levels_multijoin(self, other,
+ dropped_names,
+ join_idx,
+ lidx, ridx))
+
+ # Re-create the multi-index
+ multi_join_idx = MultiIndex(levels=levels, codes=codes,
+ names=names, verify_integrity=False)
+
+ multi_join_idx = multi_join_idx.remove_unused_levels()
+
+ return multi_join_idx, lidx, ridx
+
+ jl = list(overlap)[0]
+
+ # Case where only one index is multi
+ # make the indices into mi's that match
+ flip_order = False
+ if self_is_mi:
+ self, other = other, self
+ flip_order = True
+ # flip if join method is right or left
+ how = {'right': 'left', 'left': 'right'}.get(how, how)
+
+ level = other.names.index(jl)
+ result = self._join_level(other, level, how=how,
+ return_indexers=return_indexers)
+
+ if flip_order:
+ if isinstance(result, tuple):
+ return result[0], result[2], result[1]
+ return result
+
+ def _join_non_unique(self, other, how='left', return_indexers=False):
+ from pandas.core.reshape.merge import _get_join_indexers
+
+ left_idx, right_idx = _get_join_indexers([self._ndarray_values],
+ [other._ndarray_values],
+ how=how,
+ sort=True)
+
+ left_idx = ensure_platform_int(left_idx)
+ right_idx = ensure_platform_int(right_idx)
+
+ join_index = np.asarray(self._ndarray_values.take(left_idx))
+ mask = left_idx == -1
+ np.putmask(join_index, mask, other._ndarray_values.take(right_idx))
+
+ join_index = self._wrap_joined_index(join_index, other)
+
+ if return_indexers:
+ return join_index, left_idx, right_idx
+ else:
+ return join_index
+
+ def _join_level(self, other, level, how='left', return_indexers=False,
+ keep_order=True):
+ """
+ The join method *only* affects the level of the resulting
+ MultiIndex. Otherwise it just exactly aligns the Index data to the
+ labels of the level in the MultiIndex.
+
+ If ``keep_order == True``, the order of the data indexed by the
+ MultiIndex will not be changed; otherwise, it will tie out
+ with `other`.
+ """
+ from .multi import MultiIndex
+
+ def _get_leaf_sorter(labels):
+ """
+ Returns sorter for the innermost level while preserving the
+ order of higher levels.
+ """
+ if labels[0].size == 0:
+ return np.empty(0, dtype='int64')
+
+ if len(labels) == 1:
+ lab = ensure_int64(labels[0])
+ sorter, _ = libalgos.groupsort_indexer(lab, 1 + lab.max())
+ return sorter
+
+ # find indexers of beginning of each set of
+ # same-key labels w.r.t all but last level
+ tic = labels[0][:-1] != labels[0][1:]
+ for lab in labels[1:-1]:
+ tic |= lab[:-1] != lab[1:]
+
+ starts = np.hstack(([True], tic, [True])).nonzero()[0]
+ lab = ensure_int64(labels[-1])
+ return lib.get_level_sorter(lab, ensure_int64(starts))
+
+ if isinstance(self, MultiIndex) and isinstance(other, MultiIndex):
+ raise TypeError('Join on level between two MultiIndex objects '
+ 'is ambiguous')
+
+ left, right = self, other
+
+ flip_order = not isinstance(self, MultiIndex)
+ if flip_order:
+ left, right = right, left
+ how = {'right': 'left', 'left': 'right'}.get(how, how)
+
+ level = left._get_level_number(level)
+ old_level = left.levels[level]
+
+ if not right.is_unique:
+ raise NotImplementedError('Index._join_level on non-unique index '
+ 'is not implemented')
+
+ new_level, left_lev_indexer, right_lev_indexer = \
+ old_level.join(right, how=how, return_indexers=True)
+
+ if left_lev_indexer is None:
+ if keep_order or len(left) == 0:
+ left_indexer = None
+ join_index = left
+ else: # sort the leaves
+ left_indexer = _get_leaf_sorter(left.codes[:level + 1])
+ join_index = left[left_indexer]
+
+ else:
+ left_lev_indexer = ensure_int64(left_lev_indexer)
+ rev_indexer = lib.get_reverse_indexer(left_lev_indexer,
+ len(old_level))
+
+ new_lev_codes = algos.take_nd(rev_indexer, left.codes[level],
+ allow_fill=False)
+
+ new_codes = list(left.codes)
+ new_codes[level] = new_lev_codes
+
+ new_levels = list(left.levels)
+ new_levels[level] = new_level
+
+ if keep_order: # just drop missing values; otherwise keep order
+ left_indexer = np.arange(len(left), dtype=np.intp)
+ mask = new_lev_codes != -1
+ if not mask.all():
+ new_codes = [lab[mask] for lab in new_codes]
+ left_indexer = left_indexer[mask]
+
+ else: # tie out the order with other
+ if level == 0: # outermost level, take the fast route
+ ngroups = 1 + new_lev_codes.max()
+ left_indexer, counts = libalgos.groupsort_indexer(
+ new_lev_codes, ngroups)
+
+ # missing values are placed first; drop them!
+ left_indexer = left_indexer[counts[0]:]
+ new_codes = [lab[left_indexer] for lab in new_codes]
+
+ else: # sort the leaves
+ mask = new_lev_codes != -1
+ mask_all = mask.all()
+ if not mask_all:
+ new_codes = [lab[mask] for lab in new_codes]
+
+ left_indexer = _get_leaf_sorter(new_codes[:level + 1])
+ new_codes = [lab[left_indexer] for lab in new_codes]
+
+ # left_indexers are w.r.t masked frame.
+ # reverse to original frame!
+ if not mask_all:
+ left_indexer = mask.nonzero()[0][left_indexer]
+
+ join_index = MultiIndex(levels=new_levels, codes=new_codes,
+ names=left.names, verify_integrity=False)
+
+ if right_lev_indexer is not None:
+ right_indexer = algos.take_nd(right_lev_indexer,
+ join_index.codes[level],
+ allow_fill=False)
+ else:
+ right_indexer = join_index.codes[level]
+
+ if flip_order:
+ left_indexer, right_indexer = right_indexer, left_indexer
+
+ if return_indexers:
+ left_indexer = (None if left_indexer is None
+ else ensure_platform_int(left_indexer))
+ right_indexer = (None if right_indexer is None
+ else ensure_platform_int(right_indexer))
+ return join_index, left_indexer, right_indexer
+ else:
+ return join_index
+
+ def _join_monotonic(self, other, how='left', return_indexers=False):
+ if self.equals(other):
+ ret_index = other if how == 'right' else self
+ if return_indexers:
+ return ret_index, None, None
+ else:
+ return ret_index
+
+ sv = self._ndarray_values
+ ov = other._ndarray_values
+
+ if self.is_unique and other.is_unique:
+ # We can perform much better than the general case
+ if how == 'left':
+ join_index = self
+ lidx = None
+ ridx = self._left_indexer_unique(sv, ov)
+ elif how == 'right':
+ join_index = other
+ lidx = self._left_indexer_unique(ov, sv)
+ ridx = None
+ elif how == 'inner':
+ join_index, lidx, ridx = self._inner_indexer(sv, ov)
+ join_index = self._wrap_joined_index(join_index, other)
+ elif how == 'outer':
+ join_index, lidx, ridx = self._outer_indexer(sv, ov)
+ join_index = self._wrap_joined_index(join_index, other)
+ else:
+ if how == 'left':
+ join_index, lidx, ridx = self._left_indexer(sv, ov)
+ elif how == 'right':
+ join_index, ridx, lidx = self._left_indexer(ov, sv)
+ elif how == 'inner':
+ join_index, lidx, ridx = self._inner_indexer(sv, ov)
+ elif how == 'outer':
+ join_index, lidx, ridx = self._outer_indexer(sv, ov)
+ join_index = self._wrap_joined_index(join_index, other)
+
+ if return_indexers:
+ lidx = None if lidx is None else ensure_platform_int(lidx)
+ ridx = None if ridx is None else ensure_platform_int(ridx)
+ return join_index, lidx, ridx
+ else:
+ return join_index
+
+ def _wrap_joined_index(self, joined, other):
+ name = get_op_result_name(self, other)
+ return Index(joined, name=name)
+
+ # --------------------------------------------------------------------
+ # Uncategorized Methods
+
+ @property
+ def values(self):
+ """
+ Return an array representing the data in the Index.
+
+ .. warning::
+
+ We recommend using :attr:`Index.array` or
+ :meth:`Index.to_numpy`, depending on whether you need
+ a reference to the underlying data or a NumPy array.
+
+ Returns
+ -------
+ array : numpy.ndarray or ExtensionArray
+ The underlying data as an ndarray.
+
+ See Also
+ --------
+ Index.array : Reference to the underlying data.
+ Index.to_numpy : A NumPy array representing the underlying data.
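+
+ Examples
+ --------
+ An illustrative sketch only; the exact array dtype depends on the
+ index type.
+
+ >>> pd.Index([1, 2, 3]).values  # doctest: +SKIP
+ array([1, 2, 3])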
+ """
+ return self._data.view(np.ndarray)
+
+ @property
+ def _values(self):
+ # type: () -> Union[ExtensionArray, Index, np.ndarray]
+ # TODO(EA): remove index types as they become extension arrays
+ """
+ The best array representation.
+
+ This is an ndarray, ExtensionArray, or Index subclass. This differs
+ from ``_ndarray_values``, which always returns an ndarray.
+
+ Both ``_values`` and ``_ndarray_values`` are consistent between
+ ``Series`` and ``Index``.
+
+ It may differ from the public '.values' method.
+
+ index | values | _values | _ndarray_values |
+ ----------------- | --------------- | ------------- | --------------- |
+ Index | ndarray | ndarray | ndarray |
+ CategoricalIndex | Categorical | Categorical | ndarray[int] |
+ DatetimeIndex | ndarray[M8ns] | ndarray[M8ns] | ndarray[M8ns] |
+ DatetimeIndex[tz] | ndarray[M8ns] | DTI[tz] | ndarray[M8ns] |
+ PeriodIndex | ndarray[object] | PeriodArray | ndarray[int] |
+ IntervalIndex | IntervalArray | IntervalArray | ndarray[object] |
+
+ See Also
+ --------
+ values
+ _ndarray_values
+ """
+ return self._data
+
+ def get_values(self):
+ """
+ Return `Index` data as an `numpy.ndarray`.
+
+ Returns
+ -------
+ numpy.ndarray
+ A one-dimensional numpy array of the `Index` values.
+
+ See Also
+ --------
+ Index.values : The attribute that get_values wraps.
+
+ Examples
+ --------
+ Getting the `Index` values of a `DataFrame`:
+
+ >>> df = pd.DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]],
+ ... index=['a', 'b', 'c'], columns=['A', 'B', 'C'])
+ >>> df
+ A B C
+ a 1 2 3
+ b 4 5 6
+ c 7 8 9
+ >>> df.index.get_values()
+ array(['a', 'b', 'c'], dtype=object)
+
+ Standalone `Index` values:
+
+ >>> idx = pd.Index(['1', '2', '3'])
+ >>> idx.get_values()
+ array(['1', '2', '3'], dtype=object)
+
+ `MultiIndex` arrays also have only one dimension:
+
+ >>> midx = pd.MultiIndex.from_arrays([[1, 2, 3], ['a', 'b', 'c']],
+ ... names=('number', 'letter'))
+ >>> midx.get_values()
+ array([(1, 'a'), (2, 'b'), (3, 'c')], dtype=object)
+ >>> midx.get_values().ndim
+ 1
+ """
+ return self.values
+
+ @Appender(IndexOpsMixin.memory_usage.__doc__)
+ def memory_usage(self, deep=False):
+ result = super(Index, self).memory_usage(deep=deep)
+
+ # include our engine hashtable
+ result += self._engine.sizeof(deep=deep)
+ return result
+
+ _index_shared_docs['where'] = """
+ Return an Index of the same shape as self whose corresponding
+ entries are from self where cond is True and otherwise are from
+ other.
+
+ .. versionadded:: 0.19.0
+
+ Parameters
+ ----------
+ cond : boolean array-like with the same length as self
+ other : scalar, or array-like
+ """
+
+ @Appender(_index_shared_docs['where'])
+ def where(self, cond, other=None):
+ if other is None:
+ other = self._na_value
+
+ dtype = self.dtype
+ values = self.values
+
+ if is_bool(other) or is_bool_dtype(other):
+
+ # bools force casting
+ values = values.astype(object)
+ dtype = None
+
+ values = np.where(cond, values, other)
+
+ if self._is_numeric_dtype and np.any(isna(values)):
+ # We can't coerce to the numeric dtype of "self" (unless
+ # it's float) if there are NaN values in our output.
+ dtype = None
+
+ return self._shallow_copy_with_infer(values, dtype=dtype)
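+
+ # Illustrative usage (a minimal sketch): entries where ``cond`` is True
+ # are kept and the remaining positions are filled from ``other``, e.g.
+ #
+ # >>> idx = pd.Index(['a', 'b', 'c', 'd'])
+ # >>> idx.where([True, False, True, False], 'z')
+ # Index(['a', 'z', 'c', 'z'], dtype='object')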
+
+ # construction helpers
+ @classmethod
+ def _try_convert_to_int_index(cls, data, copy, name, dtype):
+ """
+ Attempt to convert an array of data into an integer index.
+
+ Parameters
+ ----------
+ data : The data to convert.
+ copy : Whether to copy the data or not.
+ name : The name of the index returned.
+
+ Returns
+ -------
+ int_index : data converted to either an Int64Index or a
+ UInt64Index
+
+ Raises
+ ------
+ ValueError if the conversion was not successful.
+ """
+
+ from .numeric import Int64Index, UInt64Index
+ if not is_unsigned_integer_dtype(dtype):
+ # skip int64 conversion attempt if uint-like dtype is passed, as
+ # this could return Int64Index when UInt64Index is what's desired
+ try:
+ res = data.astype('i8', copy=False)
+ if (res == data).all():
+ return Int64Index(res, copy=copy, name=name)
+ except (OverflowError, TypeError, ValueError):
+ pass
+
+ # Conversion to int64 failed (possibly due to overflow) or was skipped,
+ # so let's try now with uint64.
+ try:
+ res = data.astype('u8', copy=False)
+ if (res == data).all():
+ return UInt64Index(res, copy=copy, name=name)
+ except (OverflowError, TypeError, ValueError):
+ pass
+
+ raise ValueError
+
+ @classmethod
+ def _scalar_data_error(cls, data):
+ raise TypeError('{0}(...) must be called with a collection of some '
+ 'kind, {1} was passed'.format(cls.__name__,
+ repr(data)))
+
+ @classmethod
+ def _string_data_error(cls, data):
+ raise TypeError('String dtype not supported, you may need '
+ 'to explicitly cast to a numeric type')
+
+ @classmethod
+ def _coerce_to_ndarray(cls, data):
+ """
+ Coerces data to ndarray.
+
+ Converts other iterables to list first and then to array.
+ Does not touch ndarrays.
+
+ Raises
+ ------
+ TypeError
+ When the data passed in is a scalar.
+ """
+
+ if not isinstance(data, (np.ndarray, Index)):
+ if data is None or is_scalar(data):
+ cls._scalar_data_error(data)
+
+ # other iterable of some kind
+ if not isinstance(data, (ABCSeries, list, tuple)):
+ data = list(data)
+ data = np.asarray(data)
+ return data
+
+ def _coerce_scalar_to_index(self, item):
+ """
+ We need to coerce a scalar into something compatible with our index type.
+
+ Parameters
+ ----------
+ item : scalar item to coerce
+ """
+ dtype = self.dtype
+
+ if self._is_numeric_dtype and isna(item):
+ # We can't coerce to the numeric dtype of "self" (unless
+ # it's float) if there are NaN values in our output.
+ dtype = None
+
+ return Index([item], dtype=dtype, **self._get_attributes_dict())
+
+ def _to_safe_for_reshape(self):
+ """
+ Convert to object if we are a categorical.
+ """
+ return self
+
+ def _convert_for_op(self, value):
+ """
+ Convert value to be insertable to ndarray.
+ """
+ return value
+
+ def _assert_can_do_op(self, value):
+ """
+ Check value is valid for scalar op.
+ """
+ if not is_scalar(value):
+ msg = "'value' must be a scalar, passed: {0}"
+ raise TypeError(msg.format(type(value).__name__))
+
+ @property
+ def _has_complex_internals(self):
+ # to disable groupby tricks in MultiIndex
+ return False
+
+ def _is_memory_usage_qualified(self):
+ """
+ Return a boolean if we need a qualified .info display.
+ """
+ return self.is_object()
+
+ def is_type_compatible(self, kind):
+ return kind == self.inferred_type
+
+ _index_shared_docs['contains'] = """
+ Return a boolean indicating whether the provided key is in the index.
+
+ Parameters
+ ----------
+ key : label
+ The key to check if it is present in the index.
+
+ Returns
+ -------
+ bool
+ Whether the key is found in the index.
+
+ See Also
+ --------
+ Index.isin : Returns an ndarray of boolean dtype indicating whether the
+ list-like key is in the index.
+
+ Examples
+ --------
+ >>> idx = pd.Index([1, 2, 3, 4])
+ >>> idx
+ Int64Index([1, 2, 3, 4], dtype='int64')
+
+ >>> idx.contains(2)
+ True
+ >>> idx.contains(6)
+ False
+
+ This is equivalent to:
+
+ >>> 2 in idx
+ True
+ >>> 6 in idx
+ False
+ """
+
+ @Appender(_index_shared_docs['contains'] % _index_doc_kwargs)
+ def __contains__(self, key):
+ hash(key)
+ try:
+ return key in self._engine
+ except (OverflowError, TypeError, ValueError):
+ return False
+
+ @Appender(_index_shared_docs['contains'] % _index_doc_kwargs)
+ def contains(self, key):
+ hash(key)
+ try:
+ return key in self._engine
+ except (TypeError, ValueError):
+ return False
+
+ def __hash__(self):
+ raise TypeError("unhashable type: %r" % type(self).__name__)
+
+ def __setitem__(self, key, value):
+ raise TypeError("Index does not support mutable operations")
+
+ def __getitem__(self, key):
+ """
+ Override numpy.ndarray's __getitem__ method to work as desired.
+
+ This function adds lists and Series as valid boolean indexers
+ (ndarray indexing only supports ndarrays with dtype=bool).
+
+ If resulting ndim != 1, plain ndarray is returned instead of
+ corresponding `Index` subclass.
+
+ """
+ # There's no custom logic to be implemented in __getslice__, so it's
+ # not overloaded intentionally.
+ getitem = self._data.__getitem__
+ promote = self._shallow_copy
+
+ if is_scalar(key):
+ key = com.cast_scalar_indexer(key)
+ return getitem(key)
+
+ if isinstance(key, slice):
+ # This case is separated from the conditional above to avoid
+ # pessimization of basic indexing.
+ return promote(getitem(key))
+
+ if com.is_bool_indexer(key):
+ key = np.asarray(key, dtype=bool)
+
+ key = com.values_from_object(key)
+ result = getitem(key)
+ if not is_scalar(result):
+ return promote(result)
+ else:
+ return result
+
+ def _can_hold_identifiers_and_holds_name(self, name):
+ """
+ Faster check for ``name in self`` when we know `name` is a Python
+ identifier (e.g. in NDFrame.__getattr__, which hits this to support
+ . key lookup). For indexes that can't hold identifiers (everything
+ but object & categorical) we just return False.
+
+ https://github.com/pandas-dev/pandas/issues/19764
+ """
+ if self.is_object() or self.is_categorical():
+ return name in self
+ return False
+
+ def append(self, other):
+ """
+ Append a collection of Index objects together.
+
+ Parameters
+ ----------
+ other : Index or list/tuple of indices
+
+ Returns
+ -------
+ appended : Index
+ """
+
+ to_concat = [self]
+
+ if isinstance(other, (list, tuple)):
+ to_concat = to_concat + list(other)
+ else:
+ to_concat.append(other)
+
+ for obj in to_concat:
+ if not isinstance(obj, Index):
+ raise TypeError('all inputs must be Index')
+
+ names = {obj.name for obj in to_concat}
+ name = None if len(names) > 1 else self.name
+
+ return self._concat(to_concat, name)
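+
+ # Illustrative usage (a minimal sketch): appending another Index, or a
+ # list of Index objects, concatenates the labels into a new Index, e.g.
+ #
+ # >>> pd.Index([1, 2, 3]).append(pd.Index([4, 5]))
+ # Int64Index([1, 2, 3, 4, 5], dtype='int64')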
+
+ def _concat(self, to_concat, name):
+
+ typs = _concat.get_dtype_kinds(to_concat)
+
+ if len(typs) == 1:
+ return self._concat_same_dtype(to_concat, name=name)
+ return _concat._concat_index_asobject(to_concat, name=name)
+
+ def _concat_same_dtype(self, to_concat, name):
+ """
+ Concatenate to_concat which has the same class.
+ """
+ # must be overridden in specific classes
+ return _concat._concat_index_asobject(to_concat, name)
+
+ def putmask(self, mask, value):
+ """
+ Return a new Index of the values set with the mask.
+
+ See Also
+ --------
+ numpy.ndarray.putmask
+ """
+ values = self.values.copy()
+ try:
+ np.putmask(values, mask, self._convert_for_op(value))
+ return self._shallow_copy(values)
+ except (ValueError, TypeError) as err:
+ if is_object_dtype(self):
+ raise err
+
+ # coerces to object
+ return self.astype(object).putmask(mask, value)
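+
+ # Illustrative usage (a minimal sketch): positions where ``mask`` is True
+ # are replaced by ``value`` in a new Index, e.g.
+ #
+ # >>> pd.Index([1, 2, 3]).putmask([True, False, False], 9)
+ # Int64Index([9, 2, 3], dtype='int64')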
+
+ def equals(self, other):
+ """
+ Determines if two Index objects contain the same elements.
+ """
+ if self.is_(other):
+ return True
+
+ if not isinstance(other, Index):
+ return False
+
+ if is_object_dtype(self) and not is_object_dtype(other):
+ # if other is not object, use other's logic for coercion
+ return other.equals(self)
+
+ try:
+ return array_equivalent(com.values_from_object(self),
+ com.values_from_object(other))
+ except Exception:
+ return False
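+
+ # Illustrative usage (a minimal sketch): ``equals`` compares elements and
+ # order, but not the name or the exact Index subclass, e.g.
+ #
+ # >>> pd.Index([1, 2, 3]).equals(pd.Index([1, 2, 3], name='x'))
+ # True
+ # >>> pd.Index([1, 2, 3]).equals(pd.Index([3, 2, 1]))
+ # False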
+
+ def identical(self, other):
+ """
+ Similar to equals, but check that other comparable attributes are
+ also equal.
+ """
+ return (self.equals(other) and
+ all((getattr(self, c, None) == getattr(other, c, None)
+ for c in self._comparables)) and
+ type(self) == type(other))
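+
+ # Illustrative usage (a minimal sketch): unlike ``equals``, differing
+ # comparable attributes such as the name make ``identical`` return
+ # False, e.g.
+ #
+ # >>> pd.Index([1, 2, 3], name='x').identical(pd.Index([1, 2, 3], name='y'))
+ # False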
+
+ def asof(self, label):
+ """
+ Return the label from the index, or, if not present, the previous one.
+
+ Assuming that the index is sorted, return the passed index label if it
+ is in the index, or return the previous index label if the passed one
+ is not in the index.
+
+ Parameters
+ ----------
+ label : object
+ The label up to which the method returns the latest index label.
+
+ Returns
+ -------
+ object
+ The passed label if it is in the index. The previous label if the
+ passed label is not in the sorted index or `NaN` if there is no
+ such label.
+
+ See Also
+ --------
+ Series.asof : Return the latest value in a Series up to the
+ passed index.
+ merge_asof : Perform an asof merge (similar to left join but it
+ matches on nearest key rather than equal key).
+ Index.get_loc : An `asof` is a thin wrapper around `get_loc`
+ with method='pad'.
+
+ Examples
+ --------
+ `Index.asof` returns the latest index label up to the passed label.
+
+ >>> idx = pd.Index(['2013-12-31', '2014-01-02', '2014-01-03'])
+ >>> idx.asof('2014-01-01')
+ '2013-12-31'
+
+ If the label is in the index, the method returns the passed label.
+
+ >>> idx.asof('2014-01-02')
+ '2014-01-02'
+
+ If all of the labels in the index are later than the passed label,
+ NaN is returned.
+
+ >>> idx.asof('1999-01-02')
+ nan
+
+ If the index is not sorted, an error is raised.
+
+ >>> idx_not_sorted = pd.Index(['2013-12-31', '2015-01-02',
+ ... '2014-01-03'])
+ >>> idx_not_sorted.asof('2013-12-31')
+ Traceback (most recent call last):
+ ValueError: index must be monotonic increasing or decreasing
+ """
+ try:
+ loc = self.get_loc(label, method='pad')
+ except KeyError:
+ return self._na_value
+ else:
+ if isinstance(loc, slice):
+ loc = loc.indices(len(self))[-1]
+ return self[loc]
+
+ def asof_locs(self, where, mask):
+ """
+ Finds the locations (indices) of the labels from the index for
+ every entry in the `where` argument.
+
+ As in the `asof` function, if the label (a particular entry in
+ `where`) is not in the index, the latest index label up to the
+ passed label is chosen and its index returned.
+
+ If all of the labels in the index are later than a label in `where`,
+ -1 is returned.
+
+ `mask` is used to ignore NA values in the index during calculation.
+
+ Parameters
+ ----------
+ where : Index
+ An Index consisting of an array of timestamps.
+ mask : array-like
+ Array of booleans denoting where values in the original
+ data are not NA.
+
+ Returns
+ -------
+ numpy.ndarray
+ An array of locations (indices) of the labels from the Index
+ which correspond to the return values of the `asof` function
+ for every element in `where`.
+ """
+ locs = self.values[mask].searchsorted(where.values, side='right')
+ locs = np.where(locs > 0, locs - 1, 0)
+
+ result = np.arange(len(self))[mask].take(locs)
+
+ first = mask.argmax()
+ result[(locs == 0) & (where.values < self.values[first])] = -1
+
+ return result
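+
+ # Illustrative usage (a hedged sketch; assumes ``import numpy as np``):
+ # each entry of ``where`` is mapped to the position of the last index
+ # label not exceeding it, with -1 where no such label exists, e.g.
+ #
+ # >>> idx = pd.Index([10, 20, 30])
+ # >>> idx.asof_locs(pd.Index([15, 5, 30]), np.array([True, True, True]))
+ # array([ 0, -1,  2])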
+
+ def sort_values(self, return_indexer=False, ascending=True):
+ """
+ Return a sorted copy of the index.
+
+ Return a sorted copy of the index, and optionally return the indices
+ that sorted the index itself.
+
+ Parameters
+ ----------
+ return_indexer : bool, default False
+ Should the indices that would sort the index be returned.
+ ascending : bool, default True
+ Should the index values be sorted in an ascending order.
+
+ Returns
+ -------
+ sorted_index : pandas.Index
+ Sorted copy of the index.
+ indexer : numpy.ndarray, optional
+ The indices that the index itself was sorted by.
+
+ See Also
+ --------
+ pandas.Series.sort_values : Sort values of a Series.
+ pandas.DataFrame.sort_values : Sort values in a DataFrame.
+
+ Examples
+ --------
+ >>> idx = pd.Index([10, 100, 1, 1000])
+ >>> idx
+ Int64Index([10, 100, 1, 1000], dtype='int64')
+
+ Sort values in ascending order (default behavior).
+
+ >>> idx.sort_values()
+ Int64Index([1, 10, 100, 1000], dtype='int64')
+
+ Sort values in descending order, and also get the indices `idx` was
+ sorted by.
+
+ >>> idx.sort_values(ascending=False, return_indexer=True)
+ (Int64Index([1000, 100, 10, 1], dtype='int64'), array([3, 1, 0, 2]))
+ """
+ _as = self.argsort()
+ if not ascending:
+ _as = _as[::-1]
+
+ sorted_index = self.take(_as)
+
+ if return_indexer:
+ return sorted_index, _as
+ else:
+ return sorted_index
+
+ def sort(self, *args, **kwargs):
+ raise TypeError("cannot sort an Index object in-place, use "
+ "sort_values instead")
+
+ def shift(self, periods=1, freq=None):
+ """
+ Shift index by desired number of time frequency increments.
+
+ This method is for shifting the values of datetime-like indexes
+ by a specified time increment a given number of times.
+
+ Parameters
+ ----------
+ periods : int, default 1
+ Number of periods (or increments) to shift by,
+ can be positive or negative.
+ freq : pandas.DateOffset, pandas.Timedelta or string, optional
+ Frequency increment to shift by.
+ If None, the index is shifted by its own `freq` attribute.
+ Offset aliases are valid strings, e.g., 'D', 'W', 'M' etc.
+
+ Returns
+ -------
+ pandas.Index
+ shifted index
+
+ See Also
+ --------
+ Series.shift : Shift values of Series.
+
+ Notes
+ -----
+ This method is only implemented for datetime-like index classes,
+ i.e., DatetimeIndex, PeriodIndex and TimedeltaIndex.
+
+ Examples
+ --------
+ Put the first 5 month starts of 2011 into an index.
+
+ >>> month_starts = pd.date_range('1/1/2011', periods=5, freq='MS')
+ >>> month_starts
+ DatetimeIndex(['2011-01-01', '2011-02-01', '2011-03-01', '2011-04-01',
+ '2011-05-01'],
+ dtype='datetime64[ns]', freq='MS')
+
+ Shift the index by 10 days.
+
+ >>> month_starts.shift(10, freq='D')
+ DatetimeIndex(['2011-01-11', '2011-02-11', '2011-03-11', '2011-04-11',
+ '2011-05-11'],
+ dtype='datetime64[ns]', freq=None)
+
+ The default value of `freq` is the `freq` attribute of the index,
+ which is 'MS' (month start) in this example.
+
+ >>> month_starts.shift(10)
+ DatetimeIndex(['2011-11-01', '2011-12-01', '2012-01-01', '2012-02-01',
+ '2012-03-01'],
+ dtype='datetime64[ns]', freq='MS')
+ """
+ raise NotImplementedError("Not supported for type %s" %
+ type(self).__name__)
+
+ def argsort(self, *args, **kwargs):
+ """
+ Return the integer indices that would sort the index.
+
+ Parameters
+ ----------
+ *args
+ Passed to `numpy.ndarray.argsort`.
+ **kwargs
+ Passed to `numpy.ndarray.argsort`.
+
+ Returns
+ -------
+ numpy.ndarray
+ Integer indices that would sort the index if used as
+ an indexer.
+
+ See Also
+ --------
+ numpy.argsort : Similar method for NumPy arrays.
+ Index.sort_values : Return sorted copy of Index.
+
+ Examples
+ --------
+ >>> idx = pd.Index(['b', 'a', 'd', 'c'])
+ >>> idx
+ Index(['b', 'a', 'd', 'c'], dtype='object')
+
+ >>> order = idx.argsort()
+ >>> order
+ array([1, 0, 3, 2])
+
+ >>> idx[order]
+ Index(['a', 'b', 'c', 'd'], dtype='object')
+ """
+ result = self.asi8
+ if result is None:
+ result = np.array(self)
+ return result.argsort(*args, **kwargs)
+
+ def get_value(self, series, key):
+ """
+ Fast lookup of value from 1-dimensional ndarray. Only use this if you
+ know what you're doing.
+ """
+
+ # if we have something that is Index-like, then
+ # use this, e.g. DatetimeIndex
+ # Things like `Series._get_value` (via .at) pass the EA directly here.
+ s = getattr(series, '_values', series)
+ if isinstance(s, (ExtensionArray, Index)) and is_scalar(key):
+ # GH 20882, 21257
+ # Unify Index and ExtensionArray treatment
+ # First try to convert the key to a location
+ # If that fails, raise a KeyError if an integer
+ # index, otherwise, see if key is an integer, and
+ # try that
+ try:
+ iloc = self.get_loc(key)
+ return s[iloc]
+ except KeyError:
+ if (len(self) > 0 and
+ (self.holds_integer() or self.is_boolean())):
+ raise
+ elif is_integer(key):
+ return s[key]
+
+ s = com.values_from_object(series)
+ k = com.values_from_object(key)
+
+ k = self._convert_scalar_indexer(k, kind='getitem')
+ try:
+ return self._engine.get_value(s, k,
+ tz=getattr(series.dtype, 'tz', None))
+ except KeyError as e1:
+ if len(self) > 0 and (self.holds_integer() or self.is_boolean()):
+ raise
+
+ try:
+ return libindex.get_value_box(s, key)
+ except IndexError:
+ raise
+ except TypeError:
+ # generator/iterator-like
+ if is_iterator(key):
+ raise InvalidIndexError(key)
+ else:
+ raise e1
+ except Exception: # pragma: no cover
+ raise e1
+ except TypeError:
+ # python 3
+ if is_scalar(key): # pragma: no cover
+ raise IndexError(key)
+ raise InvalidIndexError(key)
+
+ def set_value(self, arr, key, value):
+ """
+ Fast lookup of value from 1-dimensional ndarray.
+
+ Notes
+ -----
+ Only use this if you know what you're doing.
+ """
+ self._engine.set_value(com.values_from_object(arr),
+ com.values_from_object(key), value)
+
+ _index_shared_docs['get_indexer_non_unique'] = """
+ Compute indexer and mask for new index given the current index. The
+ indexer should then be used as an input to ndarray.take to align the
+ current data to the new index.
+
+ Parameters
+ ----------
+ target : %(target_klass)s
+
+ Returns
+ -------
+ indexer : ndarray of int
+ Integers from 0 to n - 1 indicating that the index at these
+ positions matches the corresponding target values. Missing values
+ in the target are marked by -1.
+ missing : ndarray of int
+ An indexer into the target of the values not found.
+ These correspond to the -1 in the indexer array.
+ """
+
+ @Appender(_index_shared_docs['get_indexer_non_unique'] % _index_doc_kwargs)
+ def get_indexer_non_unique(self, target):
+ target = ensure_index(target)
+ if is_categorical(target):
+ target = target.astype(target.dtype.categories.dtype)
+ pself, ptarget = self._maybe_promote(target)
+ if pself is not self or ptarget is not target:
+ return pself.get_indexer_non_unique(ptarget)
+
+ if self.is_all_dates:
+ self = Index(self.asi8)
+ tgt_values = target.asi8
+ else:
+ tgt_values = target._ndarray_values
+
+ indexer, missing = self._engine.get_indexer_non_unique(tgt_values)
+ return ensure_platform_int(indexer), missing
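+
+ # Illustrative usage (a hedged sketch): every position of each target
+ # label is reported; labels absent from the index show up as -1 in the
+ # indexer and are listed in ``missing``, roughly
+ #
+ # >>> pd.Index(['a', 'b', 'b']).get_indexer_non_unique(['b', 'z'])
+ # (array([ 1,  2, -1]), array([1]))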
+
+ def get_indexer_for(self, target, **kwargs):
+ """
+ Guaranteed return of an indexer even when non-unique.
+
+ This dispatches to get_indexer or get_indexer_nonunique
+ as appropriate.
+ """
+ if self.is_unique:
+ return self.get_indexer(target, **kwargs)
+ indexer, _ = self.get_indexer_non_unique(target, **kwargs)
+ return indexer
+
+ def _maybe_promote(self, other):
+ # A hack, but it works
+ from pandas import DatetimeIndex
+ if self.inferred_type == 'date' and isinstance(other, DatetimeIndex):
+ return DatetimeIndex(self), other
+ elif self.inferred_type == 'boolean':
+ if not is_object_dtype(self.dtype):
+ return self.astype('object'), other.astype('object')
+ return self, other
+
+ def groupby(self, values):
+ """
+ Group the index labels by a given array of values.
+
+ Parameters
+ ----------
+ values : array
+ Values used to determine the groups.
+
+ Returns
+ -------
+ groups : dict
+ {group name -> group labels}
+ """
+
+ # TODO: if we are a MultiIndex, we can do better
+ # than converting to tuples
+ if isinstance(values, ABCMultiIndex):
+ values = values.values
+ values = ensure_categorical(values)
+ result = values._reverse_indexer()
+
+ # map to the label
+ result = {k: self.take(v) for k, v in compat.iteritems(result)}
+
+ return result
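+
+ # Illustrative usage (a minimal sketch): labels are bucketed by the
+ # grouping values, giving a dict of group key -> Index of labels,
+ # roughly {1: Index(['a', 'b'], ...), 2: Index(['c', 'd'], ...)} for
+ #
+ # >>> pd.Index(['a', 'b', 'c', 'd']).groupby([1, 1, 2, 2])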
+
+ def map(self, mapper, na_action=None):
+ """
+ Map values using input correspondence (a dict, Series, or function).
+
+ Parameters
+ ----------
+ mapper : function, dict, or Series
+ Mapping correspondence.
+ na_action : {None, 'ignore'}
+ If 'ignore', propagate NA values, without passing them to the
+ mapping correspondence.
+
+ Returns
+ -------
+ applied : Union[Index, MultiIndex], inferred
+ The output of the mapping function applied to the index.
+ If the function returns a tuple with more than one element
+ a MultiIndex will be returned.
+ """
+
+ from .multi import MultiIndex
+ new_values = super(Index, self)._map_values(
+ mapper, na_action=na_action)
+
+ attributes = self._get_attributes_dict()
+
+ # we can return a MultiIndex
+ if new_values.size and isinstance(new_values[0], tuple):
+ if isinstance(self, MultiIndex):
+ names = self.names
+ elif attributes.get('name'):
+ names = [attributes.get('name')] * len(new_values[0])
+ else:
+ names = None
+ return MultiIndex.from_tuples(new_values,
+ names=names)
+
+ attributes['copy'] = False
+ if not new_values.size:
+ # empty
+ attributes['dtype'] = self.dtype
+
+ return Index(new_values, **attributes)
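+
+ # Illustrative usage (a minimal sketch): a scalar-returning mapper yields
+ # a new Index, while a tuple-returning mapper yields a MultiIndex, e.g.
+ #
+ # >>> pd.Index([1, 2, 3]).map(lambda x: x * 10)
+ # Int64Index([10, 20, 30], dtype='int64')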
+
+ def isin(self, values, level=None):
+ """
+ Return a boolean array where the index values are in `values`.
+
+ Compute boolean array of whether each index value is found in the
+ passed set of values. The length of the returned boolean array matches
+ the length of the index.
+
+ Parameters
+ ----------
+ values : set or list-like
+ Sought values.
+
+ .. versionadded:: 0.18.1
+
+ Support for values as a set.
+
+ level : str or int, optional
+ Name or position of the index level to use (if the index is a
+ `MultiIndex`).
+
+ Returns
+ -------
+ is_contained : ndarray
+ NumPy array of boolean values.
+
+ See Also
+ --------
+ Series.isin : Same for Series.
+ DataFrame.isin : Same method for DataFrames.
+
+ Notes
+ -----
+ In the case of `MultiIndex` you must either specify `values` as a
+ list-like object containing tuples that are the same length as the
+ number of levels, or specify `level`. Otherwise it will raise a
+ ``ValueError``.
+
+ If `level` is specified:
+
+ - if it is the name of one *and only one* index level, use that level;
+ - otherwise it should be a number indicating level position.
+
+ Examples
+ --------
+ >>> idx = pd.Index([1,2,3])
+ >>> idx
+ Int64Index([1, 2, 3], dtype='int64')
+
+ Check whether each index value is in a list of values.
+
+ >>> idx.isin([1, 4])
+ array([ True, False, False])
+
+ >>> midx = pd.MultiIndex.from_arrays([[1,2,3],
+ ... ['red', 'blue', 'green']],
+ ... names=('number', 'color'))
+ >>> midx
+ MultiIndex(levels=[[1, 2, 3], ['blue', 'green', 'red']],
+ codes=[[0, 1, 2], [2, 0, 1]],
+ names=['number', 'color'])
+
+ Check whether the strings in the 'color' level of the MultiIndex
+ are in a list of colors.
+
+ >>> midx.isin(['red', 'orange', 'yellow'], level='color')
+ array([ True, False, False])
+
+ To check across the levels of a MultiIndex, pass a list of tuples:
+
+ >>> midx.isin([(1, 'red'), (3, 'red')])
+ array([ True, False, False])
+
+ For a DatetimeIndex, string values in `values` are converted to
+ Timestamps.
+
+ >>> dates = ['2000-03-11', '2000-03-12', '2000-03-13']
+ >>> dti = pd.to_datetime(dates)
+ >>> dti
+ DatetimeIndex(['2000-03-11', '2000-03-12', '2000-03-13'],
+ dtype='datetime64[ns]', freq=None)
+
+ >>> dti.isin(['2000-03-11'])
+ array([ True, False, False])
+ """
+ if level is not None:
+ self._validate_index_level(level)
+ return algos.isin(self, values)
+
+ def _get_string_slice(self, key, use_lhs=True, use_rhs=True):
+ # this is for partial string indexing,
+ # overridden in DatetimeIndex, TimedeltaIndex and PeriodIndex
+ raise NotImplementedError
+
+ def slice_indexer(self, start=None, end=None, step=None, kind=None):
+ """
+ For an ordered or unique index, compute the slice indexer for input
+ labels and step.
+
+ Parameters
+ ----------
+ start : label, default None
+ If None, defaults to the beginning
+ end : label, default None
+ If None, defaults to the end
+ step : int, default None
+ kind : string, default None
+
+ Returns
+ -------
+ indexer : slice
+
+ Raises
+ ------
+ KeyError : If key does not exist, or key is not unique and index is
+ not ordered.
+
+ Notes
+ -----
+ This function assumes that the data is sorted, so use at your own peril.
+
+ Examples
+ --------
+ This is a method on all index types. For example you can do:
+
+ >>> idx = pd.Index(list('abcd'))
+ >>> idx.slice_indexer(start='b', end='c')
+ slice(1, 3)
+
+ >>> idx = pd.MultiIndex.from_arrays([list('abcd'), list('efgh')])
+ >>> idx.slice_indexer(start='b', end=('c', 'g'))
+ slice(1, 3)
+ """
+ start_slice, end_slice = self.slice_locs(start, end, step=step,
+ kind=kind)
+
+ # return a slice
+ if not is_scalar(start_slice):
+ raise AssertionError("Start slice bound is non-scalar")
+ if not is_scalar(end_slice):
+ raise AssertionError("End slice bound is non-scalar")
+
+ return slice(start_slice, end_slice, step)
+
+ def _maybe_cast_indexer(self, key):
+ """
+ If we have a float key and are not a floating index, then try to cast
+ to an int if equivalent.
+ """
+
+ if is_float(key) and not self.is_floating():
+ try:
+ ckey = int(key)
+ if ckey == key:
+ key = ckey
+ except (OverflowError, ValueError, TypeError):
+ pass
+ return key
+
+ def _validate_indexer(self, form, key, kind):
+ """
+ If we are a positional indexer, validate that we have appropriately
+ typed bounds (must be an integer).
+ """
+ assert kind in ['ix', 'loc', 'getitem', 'iloc']
+
+ if key is None:
+ pass
+ elif is_integer(key):
+ pass
+ elif kind in ['iloc', 'getitem']:
+ self._invalid_indexer(form, key)
+ return key
+
+ _index_shared_docs['_maybe_cast_slice_bound'] = """
+ This function should be overloaded in subclasses that allow non-trivial
+ casting on label-slice bounds, e.g. datetime-like indices allowing
+ strings containing formatted datetimes.
+
+ Parameters
+ ----------
+ label : object
+ side : {'left', 'right'}
+ kind : {'ix', 'loc', 'getitem'}
+
+ Returns
+ -------
+ label : object
+
+ Notes
+ -----
+ Value of `side` parameter should be validated in caller.
+
+ """
+
+ @Appender(_index_shared_docs['_maybe_cast_slice_bound'])
+ def _maybe_cast_slice_bound(self, label, side, kind):
+ assert kind in ['ix', 'loc', 'getitem', None]
+
+ # We are a plain index here (sub-class override this method if they
+ # wish to have special treatment for floats/ints, e.g. Float64Index and
+ # datetimelike Indexes
+ # reject them
+ if is_float(label):
+ if not (kind in ['ix'] and (self.holds_integer() or
+ self.is_floating())):
+ self._invalid_indexer('slice', label)
+
+ # we are trying to find integer bounds on a non-integer based index
+ # this is rejected (generally .loc gets you here)
+ elif is_integer(label):
+ self._invalid_indexer('slice', label)
+
+ return label
+
+ def _searchsorted_monotonic(self, label, side='left'):
+ if self.is_monotonic_increasing:
+ return self.searchsorted(label, side=side)
+ elif self.is_monotonic_decreasing:
+ # np.searchsorted expects ascending sort order, have to reverse
+ # everything for it to work (element ordering, search side and
+ # resulting value).
+ pos = self[::-1].searchsorted(label, side='right' if side == 'left'
+ else 'left')
+ return len(self) - pos
+
+ raise ValueError('index must be monotonic increasing or decreasing')
+
+ def _get_loc_only_exact_matches(self, key):
+ """
+ This is overridden on subclasses (namely, IntervalIndex) to control
+ get_slice_bound.
+ """
+ return self.get_loc(key)
+
+ def get_slice_bound(self, label, side, kind):
+ """
+ Calculate slice bound that corresponds to given label.
+
+ Returns leftmost (one-past-the-rightmost if ``side=='right'``) position
+ of given label.
+
+ Parameters
+ ----------
+ label : object
+ side : {'left', 'right'}
+ kind : {'ix', 'loc', 'getitem'}
+ """
+ assert kind in ['ix', 'loc', 'getitem', None]
+
+ if side not in ('left', 'right'):
+ raise ValueError("Invalid value for side kwarg,"
+ " must be either 'left' or 'right': %s" %
+ (side, ))
+
+ original_label = label
+
+ # For datetime indices label may be a string that has to be converted
+ # to datetime boundary according to its resolution.
+ label = self._maybe_cast_slice_bound(label, side, kind)
+
+ # we need to look up the label
+ try:
+ slc = self._get_loc_only_exact_matches(label)
+ except KeyError as err:
+ try:
+ return self._searchsorted_monotonic(label, side)
+ except ValueError:
+ # raise the original KeyError
+ raise err
+
+ if isinstance(slc, np.ndarray):
+ # get_loc may return a boolean array or an array of indices, which
+ # is OK as long as they are representable by a slice.
+ if is_bool_dtype(slc):
+ slc = lib.maybe_booleans_to_slice(slc.view('u1'))
+ else:
+ slc = lib.maybe_indices_to_slice(slc.astype('i8'), len(self))
+ if isinstance(slc, np.ndarray):
+ raise KeyError("Cannot get %s slice bound for non-unique "
+ "label: %r" % (side, original_label))
+
+ if isinstance(slc, slice):
+ if side == 'left':
+ return slc.start
+ else:
+ return slc.stop
+ else:
+ if side == 'right':
+ return slc + 1
+ else:
+ return slc
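+
+ # Illustrative usage (a minimal sketch): for a monotonic index the bound
+ # is the first position of ``label`` ('left') or one past its last
+ # position ('right'), e.g.
+ #
+ # >>> idx = pd.Index(['a', 'b', 'b', 'd'])
+ # >>> idx.get_slice_bound('b', 'left', 'loc')
+ # 1
+ # >>> idx.get_slice_bound('b', 'right', 'loc')
+ # 3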
+
+ def slice_locs(self, start=None, end=None, step=None, kind=None):
+ """
+ Compute slice locations for input labels.
+
+ Parameters
+ ----------
+ start : label, default None
+ If None, defaults to the beginning
+ end : label, default None
+ If None, defaults to the end
+ step : int, defaults None
+ If None, defaults to 1
+ kind : {'ix', 'loc', 'getitem'} or None
+
+ Returns
+ -------
+ start, end : int
+
+ See Also
+ --------
+ Index.get_loc : Get location for a single label.
+
+ Notes
+ -----
+ This method only works if the index is monotonic or unique.
+
+ Examples
+ ---------
+ >>> idx = pd.Index(list('abcd'))
+ >>> idx.slice_locs(start='b', end='c')
+ (1, 3)
+ """
+ inc = (step is None or step >= 0)
+
+ if not inc:
+ # If it's a reverse slice, temporarily swap bounds.
+ start, end = end, start
+
+ start_slice = None
+ if start is not None:
+ start_slice = self.get_slice_bound(start, 'left', kind)
+ if start_slice is None:
+ start_slice = 0
+
+ end_slice = None
+ if end is not None:
+ end_slice = self.get_slice_bound(end, 'right', kind)
+ if end_slice is None:
+ end_slice = len(self)
+
+ if not inc:
+ # Bounds at this moment are swapped, swap them back and shift by 1.
+ #
+ # slice_locs('B', 'A', step=-1): s='B', e='A'
+ #
+ # s='A' e='B'
+ # AFTER SWAP: | |
+ # v ------------------> V
+ # -----------------------------------
+ # | | |A|A|A|A| | | | | |B|B| | | | |
+ # -----------------------------------
+ # ^ <------------------ ^
+ # SHOULD BE: | |
+ # end=s-1 start=e-1
+ #
+ end_slice, start_slice = start_slice - 1, end_slice - 1
+
+ # i == -1 triggers ``len(self) + i`` selection that points to the
+ # last element, not before-the-first one, subtracting len(self)
+ # compensates that.
+ if end_slice == -1:
+ end_slice -= len(self)
+ if start_slice == -1:
+ start_slice -= len(self)
+
+ return start_slice, end_slice
+
+ def delete(self, loc):
+ """
+ Make new Index with passed location(-s) deleted.
+
+ Returns
+ -------
+ new_index : Index
+ """
+ return self._shallow_copy(np.delete(self._data, loc))
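+
+ # Illustrative usage (a minimal sketch): ``loc`` may be a single position
+ # or a list of positions, e.g.
+ #
+ # >>> pd.Index(['a', 'b', 'c']).delete(1)
+ # Index(['a', 'c'], dtype='object')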
+
+ def insert(self, loc, item):
+ """
+ Make new Index inserting new item at location.
+
+ Follows Python list.insert semantics for negative values.
+
+ Parameters
+ ----------
+ loc : int
+ item : object
+
+ Returns
+ -------
+ new_index : Index
+ """
+ _self = np.asarray(self)
+ item = self._coerce_scalar_to_index(item)._ndarray_values
+ idx = np.concatenate((_self[:loc], item, _self[loc:]))
+ return self._shallow_copy_with_infer(idx)
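+
+ # Illustrative usage (a minimal sketch):
+ #
+ # >>> pd.Index(['a', 'c']).insert(1, 'b')
+ # Index(['a', 'b', 'c'], dtype='object')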
+
+ def drop(self, labels, errors='raise'):
+ """
+ Make new Index with passed list of labels deleted.
+
+ Parameters
+ ----------
+ labels : array-like
+ errors : {'ignore', 'raise'}, default 'raise'
+ If 'ignore', suppress error and existing labels are dropped.
+
+ Returns
+ -------
+ dropped : Index
+
+ Raises
+ ------
+ KeyError
+ If not all of the labels are found in the selected axis
+ """
+ arr_dtype = 'object' if self.dtype == 'object' else None
+ labels = com.index_labels_to_array(labels, dtype=arr_dtype)
+ indexer = self.get_indexer(labels)
+ mask = indexer == -1
+ if mask.any():
+ if errors != 'ignore':
+ raise KeyError(
+ '{} not found in axis'.format(labels[mask]))
+ indexer = indexer[~mask]
+ return self.delete(indexer)
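+
+ # Illustrative usage (a minimal sketch): unknown labels raise KeyError
+ # unless ``errors='ignore'`` is passed, e.g.
+ #
+ # >>> pd.Index(['a', 'b', 'c']).drop(['b'])
+ # Index(['a', 'c'], dtype='object')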
+
+ # --------------------------------------------------------------------
+ # Generated Arithmetic, Comparison, and Unary Methods
+
+ def _evaluate_with_timedelta_like(self, other, op):
+ # Timedelta knows how to operate with np.array, so dispatch to that
+ # operation and then wrap the results
+ if self._is_numeric_dtype and op.__name__ in ['add', 'sub',
+ 'radd', 'rsub']:
+ raise TypeError("Operation {opname} between {cls} and {other} "
+ "is invalid".format(opname=op.__name__,
+ cls=self.dtype,
+ other=type(other).__name__))
+
+ other = Timedelta(other)
+ values = self.values
+
+ with np.errstate(all='ignore'):
+ result = op(values, other)
+
+ attrs = self._get_attributes_dict()
+ attrs = self._maybe_update_attributes(attrs)
+ if op == divmod:
+ return Index(result[0], **attrs), Index(result[1], **attrs)
+ return Index(result, **attrs)
+
+ def _evaluate_with_datetime_like(self, other, op):
+ raise TypeError("can only perform ops with datetime like values")
+
+ @classmethod
+ def _add_comparison_methods(cls):
+ """
+ Add in comparison methods.
+ """
+ cls.__eq__ = _make_comparison_op(operator.eq, cls)
+ cls.__ne__ = _make_comparison_op(operator.ne, cls)
+ cls.__lt__ = _make_comparison_op(operator.lt, cls)
+ cls.__gt__ = _make_comparison_op(operator.gt, cls)
+ cls.__le__ = _make_comparison_op(operator.le, cls)
+ cls.__ge__ = _make_comparison_op(operator.ge, cls)
+
+ @classmethod
+ def _add_numeric_methods_add_sub_disabled(cls):
+ """
+ Add in the numeric add/sub methods to disable.
+ """
+ cls.__add__ = make_invalid_op('__add__')
+ cls.__radd__ = make_invalid_op('__radd__')
+ cls.__iadd__ = make_invalid_op('__iadd__')
+ cls.__sub__ = make_invalid_op('__sub__')
+ cls.__rsub__ = make_invalid_op('__rsub__')
+ cls.__isub__ = make_invalid_op('__isub__')
+
+ @classmethod
+ def _add_numeric_methods_disabled(cls):
+ """
+ Add in numeric methods to disable other than add/sub.
+ """
+ cls.__pow__ = make_invalid_op('__pow__')
+ cls.__rpow__ = make_invalid_op('__rpow__')
+ cls.__mul__ = make_invalid_op('__mul__')
+ cls.__rmul__ = make_invalid_op('__rmul__')
+ cls.__floordiv__ = make_invalid_op('__floordiv__')
+ cls.__rfloordiv__ = make_invalid_op('__rfloordiv__')
+ cls.__truediv__ = make_invalid_op('__truediv__')
+ cls.__rtruediv__ = make_invalid_op('__rtruediv__')
+ if not compat.PY3:
+ cls.__div__ = make_invalid_op('__div__')
+ cls.__rdiv__ = make_invalid_op('__rdiv__')
+ cls.__mod__ = make_invalid_op('__mod__')
+ cls.__divmod__ = make_invalid_op('__divmod__')
+ cls.__neg__ = make_invalid_op('__neg__')
+ cls.__pos__ = make_invalid_op('__pos__')
+ cls.__abs__ = make_invalid_op('__abs__')
+ cls.__inv__ = make_invalid_op('__inv__')
+
+ def _maybe_update_attributes(self, attrs):
+ """
+ Update Index attributes (e.g. freq) depending on op.
+ """
+ return attrs
+
+ def _validate_for_numeric_unaryop(self, op, opstr):
+ """
+ Validate if we can perform a numeric unary operation.
+ """
+ if not self._is_numeric_dtype:
+ raise TypeError("cannot evaluate a numeric op "
+ "{opstr} for type: {typ}"
+ .format(opstr=opstr, typ=type(self).__name__))
+
+ def _validate_for_numeric_binop(self, other, op):
+ """
+ Return valid other; evaluate or raise TypeError if we are not of
+ the appropriate type.
+
+ Notes
+ -----
+ This is an internal method called by ops.
+ """
+ opstr = '__{opname}__'.format(opname=op.__name__)
+ # if we are an inheritor of numeric,
+ # but not actually numeric (e.g. DatetimeIndex/PeriodIndex)
+ if not self._is_numeric_dtype:
+ raise TypeError("cannot evaluate a numeric op {opstr} "
+ "for type: {typ}"
+ .format(opstr=opstr, typ=type(self).__name__))
+
+ if isinstance(other, Index):
+ if not other._is_numeric_dtype:
+ raise TypeError("cannot evaluate a numeric op "
+ "{opstr} with type: {typ}"
+ .format(opstr=opstr, typ=type(other)))
+ elif isinstance(other, np.ndarray) and not other.ndim:
+ other = other.item()
+
+ if isinstance(other, (Index, ABCSeries, np.ndarray)):
+ if len(self) != len(other):
+ raise ValueError("cannot evaluate a numeric op with "
+ "unequal lengths")
+ other = com.values_from_object(other)
+ if other.dtype.kind not in ['f', 'i', 'u']:
+ raise TypeError("cannot evaluate a numeric op "
+ "with a non-numeric dtype")
+ elif isinstance(other, (ABCDateOffset, np.timedelta64, timedelta)):
+ # higher up to handle
+ pass
+ elif isinstance(other, (datetime, np.datetime64)):
+ # higher up to handle
+ pass
+ else:
+ if not (is_float(other) or is_integer(other)):
+ raise TypeError("can only perform ops with scalar values")
+
+ return other
+
+ @classmethod
+ def _add_numeric_methods_binary(cls):
+ """
+ Add in numeric methods.
+ """
+ cls.__add__ = _make_arithmetic_op(operator.add, cls)
+ cls.__radd__ = _make_arithmetic_op(ops.radd, cls)
+ cls.__sub__ = _make_arithmetic_op(operator.sub, cls)
+ cls.__rsub__ = _make_arithmetic_op(ops.rsub, cls)
+ cls.__rpow__ = _make_arithmetic_op(ops.rpow, cls)
+ cls.__pow__ = _make_arithmetic_op(operator.pow, cls)
+
+ cls.__truediv__ = _make_arithmetic_op(operator.truediv, cls)
+ cls.__rtruediv__ = _make_arithmetic_op(ops.rtruediv, cls)
+ if not compat.PY3:
+ cls.__div__ = _make_arithmetic_op(operator.div, cls)
+ cls.__rdiv__ = _make_arithmetic_op(ops.rdiv, cls)
+
+ # TODO: rmod? rdivmod?
+ cls.__mod__ = _make_arithmetic_op(operator.mod, cls)
+ cls.__floordiv__ = _make_arithmetic_op(operator.floordiv, cls)
+ cls.__rfloordiv__ = _make_arithmetic_op(ops.rfloordiv, cls)
+ cls.__divmod__ = _make_arithmetic_op(divmod, cls)
+ cls.__mul__ = _make_arithmetic_op(operator.mul, cls)
+ cls.__rmul__ = _make_arithmetic_op(ops.rmul, cls)
+
+ @classmethod
+ def _add_numeric_methods_unary(cls):
+ """
+ Add in numeric unary methods.
+ """
+ def _make_evaluate_unary(op, opstr):
+
+ def _evaluate_numeric_unary(self):
+
+ self._validate_for_numeric_unaryop(op, opstr)
+ attrs = self._get_attributes_dict()
+ attrs = self._maybe_update_attributes(attrs)
+ return Index(op(self.values), **attrs)
+
+ _evaluate_numeric_unary.__name__ = opstr
+ return _evaluate_numeric_unary
+
+ cls.__neg__ = _make_evaluate_unary(operator.neg, '__neg__')
+ cls.__pos__ = _make_evaluate_unary(operator.pos, '__pos__')
+ cls.__abs__ = _make_evaluate_unary(np.abs, '__abs__')
+ cls.__inv__ = _make_evaluate_unary(lambda x: -x, '__inv__')
+
+ @classmethod
+ def _add_numeric_methods(cls):
+ cls._add_numeric_methods_unary()
+ cls._add_numeric_methods_binary()
+
+ @classmethod
+ def _add_logical_methods(cls):
+ """
+ Add in logical methods.
+ """
+ _doc = """
+ %(desc)s
+
+ Parameters
+ ----------
+ *args
+ These parameters will be passed to numpy.%(outname)s.
+ **kwargs
+ These parameters will be passed to numpy.%(outname)s.
+
+ Returns
+ -------
+ %(outname)s : bool or array_like (if axis is specified)
+ A single element array_like may be converted to bool."""
+
+ _index_shared_docs['index_all'] = dedent("""
+
+ See Also
+ --------
+ pandas.Index.any : Return whether any element in an Index is True.
+ pandas.Series.any : Return whether any element in a Series is True.
+ pandas.Series.all : Return whether all elements in a Series are True.
+
+ Notes
+ -----
+ Not a Number (NaN), positive infinity and negative infinity
+ evaluate to True because these are not equal to zero.
+
+ Examples
+ --------
+ **all**
+
+ True, because nonzero integers are considered True.
+
+ >>> pd.Index([1, 2, 3]).all()
+ True
+
+ False, because ``0`` is considered False.
+
+ >>> pd.Index([0, 1, 2]).all()
+ False
+
+ **any**
+
+ True, because ``1`` is considered True.
+
+ >>> pd.Index([0, 0, 1]).any()
+ True
+
+ False, because ``0`` is considered False.
+
+ >>> pd.Index([0, 0, 0]).any()
+ False
+ """)
+
+ _index_shared_docs['index_any'] = dedent("""
+
+ See Also
+ --------
+ pandas.Index.all : Return whether all elements are True.
+ pandas.Series.all : Return whether all elements are True.
+
+ Notes
+ -----
+ Not a Number (NaN), positive infinity and negative infinity
+ evaluate to True because these are not equal to zero.
+
+ Examples
+ --------
+ >>> index = pd.Index([0, 1, 2])
+ >>> index.any()
+ True
+
+ >>> index = pd.Index([0, 0, 0])
+ >>> index.any()
+ False
+ """)
+
+ def _make_logical_function(name, desc, f):
+ @Substitution(outname=name, desc=desc)
+ @Appender(_index_shared_docs['index_' + name])
+ @Appender(_doc)
+ def logical_func(self, *args, **kwargs):
+ result = f(self.values)
+ if (isinstance(result, (np.ndarray, ABCSeries, Index)) and
+ result.ndim == 0):
+ # return NumPy type
+ return result.dtype.type(result.item())
+ else: # pragma: no cover
+ return result
+
+ logical_func.__name__ = name
+ return logical_func
+
+ cls.all = _make_logical_function('all', 'Return whether all elements '
+ 'are True.',
+ np.all)
+ cls.any = _make_logical_function('any',
+ 'Return whether any element is True.',
+ np.any)
+
+ @classmethod
+ def _add_logical_methods_disabled(cls):
+ """
+ Add in logical methods to disable.
+ """
+ cls.all = make_invalid_op('all')
+ cls.any = make_invalid_op('any')
+
+
+Index._add_numeric_methods_disabled()
+Index._add_logical_methods()
+Index._add_comparison_methods()
+
+
+def ensure_index_from_sequences(sequences, names=None):
+ """
+ Construct an index from sequences of data.
+
+ A single sequence returns an Index. Many sequences returns a
+ MultiIndex.
+
+ Parameters
+ ----------
+ sequences : sequence of sequences
+ names : sequence of str
+
+ Returns
+ -------
+ index : Index or MultiIndex
+
+ Examples
+ --------
+ >>> ensure_index_from_sequences([[1, 2, 3]], names=['name'])
+ Int64Index([1, 2, 3], dtype='int64', name='name')
+
+ >>> ensure_index_from_sequences([['a', 'a'], ['a', 'b']],
+ names=['L1', 'L2'])
+ MultiIndex(levels=[['a'], ['a', 'b']],
+ codes=[[0, 0], [0, 1]],
+ names=['L1', 'L2'])
+
+ See Also
+ --------
+ ensure_index
+ """
+ from .multi import MultiIndex
+
+ if len(sequences) == 1:
+ if names is not None:
+ names = names[0]
+ return Index(sequences[0], name=names)
+ else:
+ return MultiIndex.from_arrays(sequences, names=names)
+
+
+def ensure_index(index_like, copy=False):
+ """
+ Ensure that we have an index from some index-like object.
+
+ Parameters
+ ----------
+ index : sequence
+ An Index or other sequence
+ copy : bool
+
+ Returns
+ -------
+ index : Index or MultiIndex
+
+ Examples
+ --------
+ >>> ensure_index(['a', 'b'])
+ Index(['a', 'b'], dtype='object')
+
+ >>> ensure_index([('a', 'a'), ('b', 'c')])
+ Index([('a', 'a'), ('b', 'c')], dtype='object')
+
+ >>> ensure_index([['a', 'a'], ['b', 'c']])
+ MultiIndex(levels=[['a'], ['b', 'c']],
+ codes=[[0, 0], [0, 1]])
+
+ See Also
+ --------
+ ensure_index_from_sequences
+ """
+ if isinstance(index_like, Index):
+ if copy:
+ index_like = index_like.copy()
+ return index_like
+ if hasattr(index_like, 'name'):
+ return Index(index_like, name=index_like.name, copy=copy)
+
+ if is_iterator(index_like):
+ index_like = list(index_like)
+
+ # must check for exactly list here because of strict type
+ # check in clean_index_list
+ if isinstance(index_like, list):
+ if type(index_like) != list:
+ index_like = list(index_like)
+
+ converted, all_arrays = lib.clean_index_list(index_like)
+
+ if len(converted) > 0 and all_arrays:
+ from .multi import MultiIndex
+ return MultiIndex.from_arrays(converted)
+ else:
+ index_like = converted
+ else:
+ # clean_index_list does the equivalent of copying
+ # so only need to do this if not list instance
+ if copy:
+ from copy import copy
+ index_like = copy(index_like)
+
+ return Index(index_like)
+
+
+def _ensure_has_len(seq):
+ """
+ If seq is an iterator, put its values into a list.
+ """
+ try:
+ len(seq)
+ except TypeError:
+ return list(seq)
+ else:
+ return seq
+
+
+def _trim_front(strings):
+ """
+ Trims leading spaces that are common to all strings.
+ """
+ trimmed = strings
+ while len(strings) > 0 and all(x[0] == ' ' for x in trimmed):
+ trimmed = [x[1:] for x in trimmed]
+ return trimmed
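+
+# Illustrative usage of ``_trim_front`` (a minimal sketch): only the leading
+# spaces shared by every string are removed, e.g.
+#
+# >>> _trim_front(['  a', '  b'])
+# ['a', 'b']
+# >>> _trim_front([' a', '  b'])
+# ['a', ' b']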
+
+
+def _validate_join_method(method):
+ if method not in ['left', 'right', 'inner', 'outer']:
+ raise ValueError('do not recognize join method %s' % method)
+
+
+def default_index(n):
+ from pandas.core.index import RangeIndex
+ return RangeIndex(0, n, name=None)
diff --git a/contrib/python/pandas/py2/pandas/core/indexes/category.py b/contrib/python/pandas/py2/pandas/core/indexes/category.py
new file mode 100644
index 00000000000..e43b64827d0
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/core/indexes/category.py
@@ -0,0 +1,852 @@
+import operator
+import warnings
+
+import numpy as np
+
+from pandas._libs import index as libindex
+import pandas.compat as compat
+from pandas.compat.numpy import function as nv
+from pandas.util._decorators import Appender, cache_readonly
+
+from pandas.core.dtypes.common import (
+ ensure_platform_int, is_categorical_dtype, is_interval_dtype, is_list_like,
+ is_scalar)
+from pandas.core.dtypes.dtypes import CategoricalDtype
+from pandas.core.dtypes.generic import ABCCategorical, ABCSeries
+from pandas.core.dtypes.missing import isna
+
+from pandas.core import accessor
+from pandas.core.algorithms import take_1d
+from pandas.core.arrays.categorical import Categorical, contains
+import pandas.core.common as com
+from pandas.core.config import get_option
+import pandas.core.indexes.base as ibase
+from pandas.core.indexes.base import Index, _index_shared_docs
+import pandas.core.missing as missing
+from pandas.core.ops import get_op_result_name
+
+_index_doc_kwargs = dict(ibase._index_doc_kwargs)
+_index_doc_kwargs.update(dict(target_klass='CategoricalIndex'))
+
+
+@accessor.delegate_names(
+ delegate=Categorical,
+ accessors=["rename_categories",
+ "reorder_categories",
+ "add_categories",
+ "remove_categories",
+ "remove_unused_categories",
+ "set_categories",
+ "as_ordered", "as_unordered",
+ "min", "max"],
+ typ='method', overwrite=True)
+class CategoricalIndex(Index, accessor.PandasDelegate):
+ """
+ Immutable Index implementing an ordered, sliceable set. CategoricalIndex
+ represents a sparsely populated Index with an underlying Categorical.
+
+ Parameters
+ ----------
+ data : array-like or Categorical, (1-dimensional)
+ categories : optional, array-like
+ categories for the CategoricalIndex
+ ordered : boolean,
+ designating if the categories are ordered
+ copy : bool
+ Make a copy of input ndarray
+ name : object
+ Name to be stored in the index
+
+ Attributes
+ ----------
+ codes
+ categories
+ ordered
+
+ Methods
+ -------
+ rename_categories
+ reorder_categories
+ add_categories
+ remove_categories
+ remove_unused_categories
+ set_categories
+ as_ordered
+ as_unordered
+ map
+
+ See Also
+ --------
+ Categorical, Index
+ """
+
+ _typ = 'categoricalindex'
+
+ @property
+ def _engine_type(self):
+ # self.codes can have dtype int8, int16, int32 or int64, so we need
+ # to return the corresponding engine type (libindex.Int8Engine, etc.).
+ return {np.int8: libindex.Int8Engine,
+ np.int16: libindex.Int16Engine,
+ np.int32: libindex.Int32Engine,
+ np.int64: libindex.Int64Engine,
+ }[self.codes.dtype.type]
+
+ _attributes = ['name']
+
+ # --------------------------------------------------------------------
+ # Constructors
+
+ def __new__(cls, data=None, categories=None, ordered=None, dtype=None,
+ copy=False, name=None, fastpath=None):
+
+ if fastpath is not None:
+ warnings.warn("The 'fastpath' keyword is deprecated, and will be "
+ "removed in a future version.",
+ FutureWarning, stacklevel=2)
+ if fastpath:
+ return cls._simple_new(data, name=name, dtype=dtype)
+
+ dtype = CategoricalDtype._from_values_or_dtype(data, categories,
+ ordered, dtype)
+
+ if name is None and hasattr(data, 'name'):
+ name = data.name
+
+ if not is_categorical_dtype(data):
+ # don't allow scalars
+ # if data is None, then categories must be provided
+ if is_scalar(data):
+ if data is not None or categories is None:
+ cls._scalar_data_error(data)
+ data = []
+
+ data = cls._create_categorical(data, dtype=dtype)
+
+ data = data.copy() if copy else data
+
+ return cls._simple_new(data, name=name)
+
+ def _create_from_codes(self, codes, dtype=None, name=None):
+ """
+ *this is an internal non-public method*
+
+ create the correct categorical from codes
+
+ Parameters
+ ----------
+ codes : new codes
+ dtype: CategoricalDtype, defaults to existing
+ name : optional name attribute, defaults to existing
+
+ Returns
+ -------
+ CategoricalIndex
+ """
+
+ if dtype is None:
+ dtype = self.dtype
+ if name is None:
+ name = self.name
+ cat = Categorical.from_codes(codes, dtype=dtype)
+ return CategoricalIndex(cat, name=name)
+
+ @classmethod
+ def _create_categorical(cls, data, dtype=None):
+ """
+ *this is an internal non-public method*
+
+ create the correct categorical from data and the properties
+
+ Parameters
+ ----------
+ data : data for new Categorical
+ dtype : CategoricalDtype, defaults to existing
+
+ Returns
+ -------
+ Categorical
+ """
+ if (isinstance(data, (cls, ABCSeries)) and
+ is_categorical_dtype(data)):
+ data = data.values
+
+ if not isinstance(data, ABCCategorical):
+ return Categorical(data, dtype=dtype)
+
+ if isinstance(dtype, CategoricalDtype) and dtype != data.dtype:
+ # we want to silently ignore dtype='category'
+ data = data._set_dtype(dtype)
+ return data
+
+ @classmethod
+ def _simple_new(cls, values, name=None, dtype=None, **kwargs):
+ result = object.__new__(cls)
+
+ values = cls._create_categorical(values, dtype=dtype)
+ result._data = values
+ result.name = name
+ for k, v in compat.iteritems(kwargs):
+ setattr(result, k, v)
+
+ result._reset_identity()
+ return result
+
+ # --------------------------------------------------------------------
+
+ @Appender(_index_shared_docs['_shallow_copy'])
+ def _shallow_copy(self, values=None, dtype=None, **kwargs):
+ if dtype is None:
+ dtype = self.dtype
+ return super(CategoricalIndex, self)._shallow_copy(
+ values=values, dtype=dtype, **kwargs)
+
+ def _is_dtype_compat(self, other):
+ """
+ *this is an internal non-public method*
+
+ provide a comparison between the dtype of self and other (coercing if
+ needed)
+
+ Raises
+ ------
+ TypeError if the dtypes are not compatible
+ """
+ if is_categorical_dtype(other):
+ if isinstance(other, CategoricalIndex):
+ other = other._values
+ if not other.is_dtype_equal(self):
+ raise TypeError("categories must match existing categories "
+ "when appending")
+ else:
+ values = other
+ if not is_list_like(values):
+ values = [values]
+ other = CategoricalIndex(self._create_categorical(
+ other, dtype=self.dtype))
+ if not other.isin(values).all():
+ raise TypeError("cannot append a non-category item to a "
+ "CategoricalIndex")
+
+ return other
+
+ def equals(self, other):
+ """
+ Determines if two CategoricalIndex objects contain the same elements.
+ """
+ if self.is_(other):
+ return True
+
+ if not isinstance(other, Index):
+ return False
+
+ try:
+ other = self._is_dtype_compat(other)
+ if isinstance(other, type(self)):
+ other = other._data
+ return self._data.equals(other)
+ except (TypeError, ValueError):
+ pass
+
+ return False
+
+ # --------------------------------------------------------------------
+ # Rendering Methods
+
+ @property
+ def _formatter_func(self):
+ return self.categories._formatter_func
+
+ def _format_attrs(self):
+ """
+ Return a list of tuples of the (attr,formatted_value)
+ """
+ max_categories = (10 if get_option("display.max_categories") == 0 else
+ get_option("display.max_categories"))
+ attrs = [
+ ('categories',
+ ibase.default_pprint(self.categories,
+ max_seq_items=max_categories)),
+ ('ordered', self.ordered)]
+ if self.name is not None:
+ attrs.append(('name', ibase.default_pprint(self.name)))
+ attrs.append(('dtype', "'%s'" % self.dtype.name))
+ max_seq_items = get_option('display.max_seq_items') or len(self)
+ if len(self) > max_seq_items:
+ attrs.append(('length', len(self)))
+ return attrs
+
+ # --------------------------------------------------------------------
+
+ @property
+ def inferred_type(self):
+ return 'categorical'
+
+ @property
+ def values(self):
+ """ return the underlying data, which is a Categorical """
+ return self._data
+
+ @property
+ def itemsize(self):
+ # Size of the items in categories, not codes.
+ return self.values.itemsize
+
+ def _wrap_setop_result(self, other, result):
+ name = get_op_result_name(self, other)
+ return self._shallow_copy(result, name=name)
+
+ def get_values(self):
+ """ return the underlying data as an ndarray """
+ return self._data.get_values()
+
+ def tolist(self):
+ return self._data.tolist()
+
+ @property
+ def codes(self):
+ return self._data.codes
+
+ @property
+ def categories(self):
+ return self._data.categories
+
+ @property
+ def ordered(self):
+ return self._data.ordered
+
+ def _reverse_indexer(self):
+ return self._data._reverse_indexer()
+
+ @Appender(_index_shared_docs['contains'] % _index_doc_kwargs)
+ def __contains__(self, key):
+ # if key is a NaN, check if any NaN is in self.
+ if isna(key):
+ return self.hasnans
+
+ return contains(self, key, container=self._engine)
+
+ @Appender(_index_shared_docs['contains'] % _index_doc_kwargs)
+ def contains(self, key):
+ return key in self
+
+ def __array__(self, dtype=None):
+ """ the array interface, return my values """
+ return np.array(self._data, dtype=dtype)
+
+ @Appender(_index_shared_docs['astype'])
+ def astype(self, dtype, copy=True):
+ if is_interval_dtype(dtype):
+ from pandas import IntervalIndex
+ return IntervalIndex(np.array(self))
+ elif is_categorical_dtype(dtype):
+ # GH 18630
+ dtype = self.dtype.update_dtype(dtype)
+ if dtype == self.dtype:
+ return self.copy() if copy else self
+
+ return super(CategoricalIndex, self).astype(dtype=dtype, copy=copy)
+
+ @cache_readonly
+ def _isnan(self):
+ """ return if each value is nan"""
+ return self._data.codes == -1
+
+ @Appender(ibase._index_shared_docs['fillna'])
+ def fillna(self, value, downcast=None):
+ self._assert_can_do_op(value)
+ return CategoricalIndex(self._data.fillna(value), name=self.name)
+
+ def argsort(self, *args, **kwargs):
+ return self.values.argsort(*args, **kwargs)
+
+ @cache_readonly
+ def _engine(self):
+
+ # we are going to look things up with the codes themselves
+ return self._engine_type(lambda: self.codes, len(self))
+
+ # introspection
+ @cache_readonly
+ def is_unique(self):
+ return self._engine.is_unique
+
+ @property
+ def is_monotonic_increasing(self):
+ return self._engine.is_monotonic_increasing
+
+ @property
+ def is_monotonic_decreasing(self):
+ return self._engine.is_monotonic_decreasing
+
+ @Appender(_index_shared_docs['index_unique'] % _index_doc_kwargs)
+ def unique(self, level=None):
+ if level is not None:
+ self._validate_index_level(level)
+ result = self.values.unique()
+ # CategoricalIndex._shallow_copy keeps original dtype
+ # if not otherwise specified
+ return self._shallow_copy(result, dtype=result.dtype)
+
+ @Appender(Index.duplicated.__doc__)
+ def duplicated(self, keep='first'):
+ from pandas._libs.hashtable import duplicated_int64
+ codes = self.codes.astype('i8')
+ return duplicated_int64(codes, keep)
+
+ def _to_safe_for_reshape(self):
+ """ convert to object if we are a categorical """
+ return self.astype('object')
+
+ def get_loc(self, key, method=None):
+ """
+ Get integer location, slice or boolean mask for requested label.
+
+ Parameters
+ ----------
+ key : label
+ method : {None}
+ * default: exact matches only.
+
+ Returns
+ -------
+ loc : int if unique index, slice if monotonic index, else mask
+
+ Raises
+ ------
+ KeyError : if the key is not in the index
+
+ Examples
+ --------
+ >>> unique_index = pd.CategoricalIndex(list('abc'))
+ >>> unique_index.get_loc('b')
+ 1
+
+ >>> monotonic_index = pd.CategoricalIndex(list('abbc'))
+ >>> monotonic_index.get_loc('b')
+ slice(1, 3, None)
+
+ >>> non_monotonic_index = pd.CategoricalIndex(list('abcb'))
+ >>> non_monotonic_index.get_loc('b')
+ array([False, True, False, True], dtype=bool)
+ """
+ code = self.categories.get_loc(key)
+ code = self.codes.dtype.type(code)
+ try:
+ return self._engine.get_loc(code)
+ except KeyError:
+ raise KeyError(key)
+
+ def get_value(self, series, key):
+ """
+ Fast lookup of value from 1-dimensional ndarray. Only use this if you
+ know what you're doing
+ """
+ try:
+ k = com.values_from_object(key)
+ k = self._convert_scalar_indexer(k, kind='getitem')
+ indexer = self.get_loc(k)
+ return series.iloc[indexer]
+ except (KeyError, TypeError):
+ pass
+
+ # we might be a positional indexer
+ return super(CategoricalIndex, self).get_value(series, key)
+
+ def _can_reindex(self, indexer):
+ """ always allow reindexing """
+ pass
+
+ @Appender(_index_shared_docs['where'])
+ def where(self, cond, other=None):
+ # TODO: Investigate an alternative implementation with
+ # 1. copy the underlying Categorical
+ # 2. setitem with `cond` and `other`
+ # 3. Rebuild CategoricalIndex.
+ if other is None:
+ other = self._na_value
+ values = np.where(cond, self.values, other)
+ cat = Categorical(values, dtype=self.dtype)
+ return self._shallow_copy(cat, **self._get_attributes_dict())
+
+ def reindex(self, target, method=None, level=None, limit=None,
+ tolerance=None):
+ """
+ Create index with target's values (move/add/delete values as necessary)
+
+ Returns
+ -------
+ new_index : pd.Index
+ Resulting index
+ indexer : np.ndarray or None
+ Indices of output values in original index
+
+ """
+
+ if method is not None:
+ raise NotImplementedError("argument method is not implemented for "
+ "CategoricalIndex.reindex")
+ if level is not None:
+ raise NotImplementedError("argument level is not implemented for "
+ "CategoricalIndex.reindex")
+ if limit is not None:
+ raise NotImplementedError("argument limit is not implemented for "
+ "CategoricalIndex.reindex")
+
+ target = ibase.ensure_index(target)
+
+ if self.equals(target):
+ indexer = None
+ missing = []
+ else:
+ if not target.is_unique:
+ raise ValueError("cannot reindex with a non-unique indexer")
+
+ indexer, missing = self.get_indexer_non_unique(np.array(target))
+
+ if len(self.codes) and indexer is not None:
+ new_target = self.take(indexer)
+ else:
+ new_target = target
+
+ # filling in missing if needed
+ if len(missing):
+ cats = self.categories.get_indexer(target)
+
+ if (cats == -1).any():
+ # coerce to a regular index here!
+ result = Index(np.array(self), name=self.name)
+ new_target, indexer, _ = result._reindex_non_unique(
+ np.array(target))
+ else:
+
+ codes = new_target.codes.copy()
+ codes[indexer == -1] = cats[missing]
+ new_target = self._create_from_codes(codes)
+
+ # we always want to return an Index type here
+ # to be consistent with .reindex for other index types (e.g. they don't
+ # coerce based on the actual values, only on the dtype)
+ # unless we had an initial Categorical to begin with
+ # in which case we are going to conform to the passed Categorical
+ new_target = np.asarray(new_target)
+ if is_categorical_dtype(target):
+ new_target = target._shallow_copy(new_target, name=self.name)
+ else:
+ new_target = Index(new_target, name=self.name)
+
+ return new_target, indexer
+
+ def _reindex_non_unique(self, target):
+ """ reindex from a non-unique; which CategoricalIndex's are almost
+ always
+ """
+ new_target, indexer = self.reindex(target)
+ new_indexer = None
+
+ check = indexer == -1
+ if check.any():
+ new_indexer = np.arange(len(self.take(indexer)))
+ new_indexer[check] = -1
+
+ cats = self.categories.get_indexer(target)
+ if not (cats == -1).any():
+ # .reindex returns normal Index. Revert to CategoricalIndex if
+ # all targets are included in my categories
+ new_target = self._shallow_copy(new_target)
+
+ return new_target, indexer, new_indexer
+
+ @Appender(_index_shared_docs['get_indexer'] % _index_doc_kwargs)
+ def get_indexer(self, target, method=None, limit=None, tolerance=None):
+ from pandas.core.arrays.categorical import _recode_for_categories
+
+ method = missing.clean_reindex_fill_method(method)
+ target = ibase.ensure_index(target)
+
+ if self.is_unique and self.equals(target):
+ return np.arange(len(self), dtype='intp')
+
+ if method == 'pad' or method == 'backfill':
+ raise NotImplementedError("method='pad' and method='backfill' not "
+ "implemented yet for CategoricalIndex")
+ elif method == 'nearest':
+ raise NotImplementedError("method='nearest' not implemented yet "
+ 'for CategoricalIndex')
+
+ if (isinstance(target, CategoricalIndex) and
+ self.values.is_dtype_equal(target)):
+ if self.values.equals(target.values):
+ # we have the same codes
+ codes = target.codes
+ else:
+ codes = _recode_for_categories(target.codes,
+ target.categories,
+ self.values.categories)
+ else:
+ if isinstance(target, CategoricalIndex):
+ code_indexer = self.categories.get_indexer(target.categories)
+ codes = take_1d(code_indexer, target.codes, fill_value=-1)
+ else:
+ codes = self.categories.get_indexer(target)
+
+ indexer, _ = self._engine.get_indexer_non_unique(codes)
+ return ensure_platform_int(indexer)
+
+ @Appender(_index_shared_docs['get_indexer_non_unique'] % _index_doc_kwargs)
+ def get_indexer_non_unique(self, target):
+ target = ibase.ensure_index(target)
+
+ if isinstance(target, CategoricalIndex):
+ # Indexing on codes is more efficient if categories are the same:
+ if target.categories is self.categories:
+ target = target.codes
+ indexer, missing = self._engine.get_indexer_non_unique(target)
+ return ensure_platform_int(indexer), missing
+ target = target.values
+
+ codes = self.categories.get_indexer(target)
+ indexer, missing = self._engine.get_indexer_non_unique(codes)
+ return ensure_platform_int(indexer), missing
+
+ @Appender(_index_shared_docs['_convert_scalar_indexer'])
+ def _convert_scalar_indexer(self, key, kind=None):
+ if self.categories._defer_to_indexing:
+ return self.categories._convert_scalar_indexer(key, kind=kind)
+
+ return super(CategoricalIndex, self)._convert_scalar_indexer(
+ key, kind=kind)
+
+ @Appender(_index_shared_docs['_convert_list_indexer'])
+ def _convert_list_indexer(self, keyarr, kind=None):
+ # Return our indexer or raise if all of the values are not included in
+ # the categories
+
+ if self.categories._defer_to_indexing:
+ indexer = self.categories._convert_list_indexer(keyarr, kind=kind)
+ return Index(self.codes).get_indexer_for(indexer)
+
+ indexer = self.categories.get_indexer(np.asarray(keyarr))
+ if (indexer == -1).any():
+ raise KeyError(
+ "a list-indexer must only "
+ "include values that are "
+ "in the categories")
+
+ return self.get_indexer(keyarr)
+
+ @Appender(_index_shared_docs['_convert_arr_indexer'])
+ def _convert_arr_indexer(self, keyarr):
+ keyarr = com.asarray_tuplesafe(keyarr)
+
+ if self.categories._defer_to_indexing:
+ return keyarr
+
+ return self._shallow_copy(keyarr)
+
+ @Appender(_index_shared_docs['_convert_index_indexer'])
+ def _convert_index_indexer(self, keyarr):
+ return self._shallow_copy(keyarr)
+
+ @Appender(_index_shared_docs['take'] % _index_doc_kwargs)
+ def take(self, indices, axis=0, allow_fill=True,
+ fill_value=None, **kwargs):
+ nv.validate_take(tuple(), kwargs)
+ indices = ensure_platform_int(indices)
+ taken = self._assert_take_fillable(self.codes, indices,
+ allow_fill=allow_fill,
+ fill_value=fill_value,
+ na_value=-1)
+ return self._create_from_codes(taken)
+
+ def is_dtype_equal(self, other):
+ return self._data.is_dtype_equal(other)
+
+ take_nd = take
+
+ def map(self, mapper):
+ """
+ Map values using input correspondence (a dict, Series, or function).
+
+ Maps the values (their categories, not the codes) of the index to new
+ categories. If the mapping correspondence is one-to-one the result is a
+ :class:`~pandas.CategoricalIndex` which has the same order property as
+ the original, otherwise an :class:`~pandas.Index` is returned.
+
+ If a `dict` or :class:`~pandas.Series` is used any unmapped category is
+ mapped to `NaN`. Note that if this happens an :class:`~pandas.Index`
+ will be returned.
+
+ Parameters
+ ----------
+ mapper : function, dict, or Series
+ Mapping correspondence.
+
+ Returns
+ -------
+ pandas.CategoricalIndex or pandas.Index
+ Mapped index.
+
+ See Also
+ --------
+ Index.map : Apply a mapping correspondence on an
+ :class:`~pandas.Index`.
+ Series.map : Apply a mapping correspondence on a
+ :class:`~pandas.Series`.
+ Series.apply : Apply more complex functions on a
+ :class:`~pandas.Series`.
+
+ Examples
+ --------
+ >>> idx = pd.CategoricalIndex(['a', 'b', 'c'])
+ >>> idx
+ CategoricalIndex(['a', 'b', 'c'], categories=['a', 'b', 'c'],
+ ordered=False, dtype='category')
+ >>> idx.map(lambda x: x.upper())
+ CategoricalIndex(['A', 'B', 'C'], categories=['A', 'B', 'C'],
+ ordered=False, dtype='category')
+ >>> idx.map({'a': 'first', 'b': 'second', 'c': 'third'})
+ CategoricalIndex(['first', 'second', 'third'], categories=['first',
+ 'second', 'third'], ordered=False, dtype='category')
+
+ If the mapping is one-to-one the ordering of the categories is
+ preserved:
+
+ >>> idx = pd.CategoricalIndex(['a', 'b', 'c'], ordered=True)
+ >>> idx
+ CategoricalIndex(['a', 'b', 'c'], categories=['a', 'b', 'c'],
+ ordered=True, dtype='category')
+ >>> idx.map({'a': 3, 'b': 2, 'c': 1})
+ CategoricalIndex([3, 2, 1], categories=[3, 2, 1], ordered=True,
+ dtype='category')
+
+ If the mapping is not one-to-one an :class:`~pandas.Index` is returned:
+
+ >>> idx.map({'a': 'first', 'b': 'second', 'c': 'first'})
+ Index(['first', 'second', 'first'], dtype='object')
+
+ If a `dict` is used, all unmapped categories are mapped to `NaN` and
+ the result is an :class:`~pandas.Index`:
+
+ >>> idx.map({'a': 'first', 'b': 'second'})
+ Index(['first', 'second', nan], dtype='object')
+ """
+ return self._shallow_copy_with_infer(self.values.map(mapper))
+
+ def delete(self, loc):
+ """
+ Make new Index with passed location(-s) deleted
+
+ Returns
+ -------
+ new_index : Index
+ """
+ return self._create_from_codes(np.delete(self.codes, loc))
+
+ def insert(self, loc, item):
+ """
+ Make new Index inserting new item at location. Follows
+ Python list.append semantics for negative values
+
+ Parameters
+ ----------
+ loc : int
+ item : object
+
+ Returns
+ -------
+ new_index : Index
+
+ Raises
+ ------
+ TypeError if the item is not in the categories
+
+ """
+ code = self.categories.get_indexer([item])
+ if (code == -1) and not (is_scalar(item) and isna(item)):
+ raise TypeError("cannot insert an item into a CategoricalIndex "
+ "that is not already an existing category")
+
+ codes = self.codes
+ codes = np.concatenate((codes[:loc], code, codes[loc:]))
+ return self._create_from_codes(codes)
+
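+ # Illustrative usage sketch for ``insert`` (hypothetical values, not taken
+ # from this file): inserting an existing category keeps the categorical
+ # dtype, while inserting an unknown value raises, per the check above.
+ #
+ # ci = pd.CategoricalIndex(list('abc'))
+ # ci.insert(1, 'a')   # CategoricalIndex with values ['a', 'a', 'b', 'c']
+ # ci.insert(1, 'z')   # expected to raise TypeError: not an existing category
+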
+ def _concat(self, to_concat, name):
+ # if calling index is category, don't check dtype of others
+ return CategoricalIndex._concat_same_dtype(self, to_concat, name)
+
+ def _concat_same_dtype(self, to_concat, name):
+ """
+ Concatenate to_concat, which has the same class.
+ Raises TypeError if the operands' categories are not compatible.
+ """
+ to_concat = [self._is_dtype_compat(c) for c in to_concat]
+ codes = np.concatenate([c.codes for c in to_concat])
+ result = self._create_from_codes(codes, name=name)
+ # if name is None, _create_from_codes sets self.name
+ result.name = name
+ return result
+
+ def _codes_for_groupby(self, sort, observed):
+ """ Return a Categorical adjusted for groupby """
+ return self.values._codes_for_groupby(sort, observed)
+
+ @classmethod
+ def _add_comparison_methods(cls):
+ """ add in comparison methods """
+
+ def _make_compare(op):
+ opname = '__{op}__'.format(op=op.__name__)
+
+ def _evaluate_compare(self, other):
+
+ # if we have a Categorical type, then must have the same
+ # categories
+ if isinstance(other, CategoricalIndex):
+ other = other._values
+ elif isinstance(other, Index):
+ other = self._create_categorical(
+ other._values, dtype=self.dtype)
+
+ if isinstance(other, (ABCCategorical, np.ndarray,
+ ABCSeries)):
+ if len(self.values) != len(other):
+ raise ValueError("Lengths must match to compare")
+
+ if isinstance(other, ABCCategorical):
+ if not self.values.is_dtype_equal(other):
+ raise TypeError("categorical index comparisons must "
+ "have the same categories and ordered "
+ "attributes")
+
+ result = op(self.values, other)
+ if isinstance(result, ABCSeries):
+ # Dispatch to pd.Categorical returned NotImplemented
+ # and we got a Series back; down-cast to ndarray
+ result = result.values
+ return result
+
+ return compat.set_function_name(_evaluate_compare, opname, cls)
+
+ cls.__eq__ = _make_compare(operator.eq)
+ cls.__ne__ = _make_compare(operator.ne)
+ cls.__lt__ = _make_compare(operator.lt)
+ cls.__gt__ = _make_compare(operator.gt)
+ cls.__le__ = _make_compare(operator.le)
+ cls.__ge__ = _make_compare(operator.ge)
+
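+ # Illustrative sketch (hypothetical values): the comparison methods wired up
+ # above require matching categorical dtypes. Comparing two CategoricalIndexes
+ # whose categories differ is expected to raise TypeError, while comparison
+ # against a plain same-length sequence is evaluated element-wise.
+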
+ def _delegate_method(self, name, *args, **kwargs):
+ """ method delegation to the ._values """
+ method = getattr(self._values, name)
+ if 'inplace' in kwargs:
+ raise ValueError("cannot use inplace with CategoricalIndex")
+ res = method(*args, **kwargs)
+ if is_scalar(res):
+ return res
+ return CategoricalIndex(res, name=self.name)
+
+
+CategoricalIndex._add_numeric_methods_add_sub_disabled()
+CategoricalIndex._add_numeric_methods_disabled()
+CategoricalIndex._add_logical_methods_disabled()
+CategoricalIndex._add_comparison_methods()
diff --git a/contrib/python/pandas/py2/pandas/core/indexes/datetimelike.py b/contrib/python/pandas/py2/pandas/core/indexes/datetimelike.py
new file mode 100644
index 00000000000..aa7332472fc
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/core/indexes/datetimelike.py
@@ -0,0 +1,724 @@
+# -*- coding: utf-8 -*-
+"""
+Base and utility classes for tseries type pandas objects.
+"""
+import operator
+import warnings
+
+import numpy as np
+
+from pandas._libs import NaT, iNaT, lib
+from pandas.compat.numpy import function as nv
+from pandas.errors import AbstractMethodError
+from pandas.util._decorators import Appender, cache_readonly, deprecate_kwarg
+
+from pandas.core.dtypes.common import (
+ ensure_int64, is_dtype_equal, is_float, is_integer, is_list_like,
+ is_period_dtype, is_scalar)
+from pandas.core.dtypes.generic import ABCIndex, ABCIndexClass, ABCSeries
+
+from pandas.core import algorithms, ops
+from pandas.core.accessor import PandasDelegate
+from pandas.core.arrays import ExtensionOpsMixin
+from pandas.core.arrays.datetimelike import (
+ DatetimeLikeArrayMixin, _ensure_datetimelike_to_i8)
+import pandas.core.indexes.base as ibase
+from pandas.core.indexes.base import Index, _index_shared_docs
+from pandas.core.tools.timedeltas import to_timedelta
+
+import pandas.io.formats.printing as printing
+
+_index_doc_kwargs = dict(ibase._index_doc_kwargs)
+
+
+def ea_passthrough(array_method):
+ """
+ Make an alias for a method of the underlying ExtensionArray.
+
+ Parameters
+ ----------
+ array_method : method on an Array class
+
+ Returns
+ -------
+ method
+ """
+
+ def method(self, *args, **kwargs):
+ return array_method(self._data, *args, **kwargs)
+
+ method.__name__ = array_method.__name__
+ method.__doc__ = array_method.__doc__
+ return method
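+
+ # Illustrative note (no new behaviour): ``ea_passthrough`` is used below,
+ # e.g. ``__iter__ = ea_passthrough(DatetimeLikeArrayMixin.__iter__)``, to
+ # forward an Index method straight to the underlying ExtensionArray stored
+ # in ``self._data`` without any re-boxing of the result.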
+
+
+class DatetimeIndexOpsMixin(ExtensionOpsMixin):
+ """
+ Common ops mixin to support a unified interface for datetime-like Index classes
+ """
+ _data = None # type: DatetimeLikeArrayMixin
+
+ # DatetimeLikeArrayMixin assumes subclasses are mutable, so these are
+ # properties there. They can be made into cache_readonly for Index
+ # subclasses bc they are immutable
+ inferred_freq = cache_readonly(DatetimeLikeArrayMixin.inferred_freq.fget)
+ _isnan = cache_readonly(DatetimeLikeArrayMixin._isnan.fget)
+ hasnans = cache_readonly(DatetimeLikeArrayMixin._hasnans.fget)
+ _hasnans = hasnans # for index / array -agnostic code
+ _resolution = cache_readonly(DatetimeLikeArrayMixin._resolution.fget)
+ resolution = cache_readonly(DatetimeLikeArrayMixin.resolution.fget)
+
+ _box_values = ea_passthrough(DatetimeLikeArrayMixin._box_values)
+ _maybe_mask_results = ea_passthrough(
+ DatetimeLikeArrayMixin._maybe_mask_results)
+ __iter__ = ea_passthrough(DatetimeLikeArrayMixin.__iter__)
+
+ @property
+ def freq(self):
+ """
+ Return the frequency object if it is set, otherwise None.
+ """
+ return self._data.freq
+
+ @freq.setter
+ def freq(self, value):
+ # validation is handled by _data setter
+ self._data.freq = value
+
+ @property
+ def freqstr(self):
+ """
+ Return the frequency object as a string if it is set, otherwise None.
+ """
+ return self._data.freqstr
+
+ def unique(self, level=None):
+ if level is not None:
+ self._validate_index_level(level)
+
+ result = self._data.unique()
+
+ # Note: if `self` is already unique, then self.unique() should share
+ # a `freq` with self. If not already unique, then self.freq must be
+ # None, so again sharing freq is correct.
+ return self._shallow_copy(result._data)
+
+ @classmethod
+ def _create_comparison_method(cls, op):
+ """
+ Create a comparison method that dispatches to ``cls.values``.
+ """
+ def wrapper(self, other):
+ if isinstance(other, ABCSeries):
+ # the arrays defer to Series for comparison ops but the indexes
+ # don't, so we have to unwrap here.
+ other = other._values
+
+ result = op(self._data, maybe_unwrap_index(other))
+ return result
+
+ wrapper.__doc__ = op.__doc__
+ wrapper.__name__ = '__{}__'.format(op.__name__)
+ return wrapper
+
+ @property
+ def _ndarray_values(self):
+ return self._data._ndarray_values
+
+ # ------------------------------------------------------------------------
+ # Abstract data attributes
+
+ @property
+ def values(self):
+ # type: () -> np.ndarray
+ # Note: PeriodArray overrides this to return an ndarray of objects.
+ return self._data._data
+
+ @property
+ @Appender(DatetimeLikeArrayMixin.asi8.__doc__)
+ def asi8(self):
+ return self._data.asi8
+
+ # ------------------------------------------------------------------------
+
+ def equals(self, other):
+ """
+ Determines if two Index objects contain the same elements.
+ """
+ if self.is_(other):
+ return True
+
+ if not isinstance(other, ABCIndexClass):
+ return False
+ elif not isinstance(other, type(self)):
+ try:
+ other = type(self)(other)
+ except Exception:
+ return False
+
+ if not is_dtype_equal(self.dtype, other.dtype):
+ # have different timezone
+ return False
+
+ elif is_period_dtype(self):
+ if not is_period_dtype(other):
+ return False
+ if self.freq != other.freq:
+ return False
+
+ return np.array_equal(self.asi8, other.asi8)
+
+ @staticmethod
+ def _join_i8_wrapper(joinf, dtype, with_indexers=True):
+ """
+ Create the join wrapper methods.
+ """
+ from pandas.core.arrays.datetimelike import DatetimeLikeArrayMixin
+
+ @staticmethod
+ def wrapper(left, right):
+ if isinstance(left, (np.ndarray, ABCIndex, ABCSeries,
+ DatetimeLikeArrayMixin)):
+ left = left.view('i8')
+ if isinstance(right, (np.ndarray, ABCIndex, ABCSeries,
+ DatetimeLikeArrayMixin)):
+ right = right.view('i8')
+ results = joinf(left, right)
+ if with_indexers:
+ join_index, left_indexer, right_indexer = results
+ join_index = join_index.view(dtype)
+ return join_index, left_indexer, right_indexer
+ return results
+
+ return wrapper
+
+ def _ensure_localized(self, arg, ambiguous='raise', nonexistent='raise',
+ from_utc=False):
+ # See DatetimeLikeArrayMixin._ensure_localized.__doc__
+ if getattr(self, 'tz', None):
+ # ensure_localized is only relevant for tz-aware DTI
+ result = self._data._ensure_localized(arg,
+ ambiguous=ambiguous,
+ nonexistent=nonexistent,
+ from_utc=from_utc)
+ return type(self)._simple_new(result, name=self.name)
+ return arg
+
+ def _box_values(self, values):
+ return self._data._box_values(values)
+
+ @Appender(_index_shared_docs['contains'] % _index_doc_kwargs)
+ def __contains__(self, key):
+ try:
+ res = self.get_loc(key)
+ return (is_scalar(res) or isinstance(res, slice) or
+ (is_list_like(res) and len(res)))
+ except (KeyError, TypeError, ValueError):
+ return False
+
+ contains = __contains__
+
+ # Try to run function on index first, and then on elements of index
+ # Especially important for group-by functionality
+ def map(self, f):
+ try:
+ result = f(self)
+
+ # Try to use this result if we can
+ if isinstance(result, np.ndarray):
+ result = Index(result)
+
+ if not isinstance(result, Index):
+ raise TypeError('The map function must return an Index object')
+ return result
+ except Exception:
+ return self.astype(object).map(f)
+
+ def sort_values(self, return_indexer=False, ascending=True):
+ """
+ Return sorted copy of Index.
+ """
+ if return_indexer:
+ _as = self.argsort()
+ if not ascending:
+ _as = _as[::-1]
+ sorted_index = self.take(_as)
+ return sorted_index, _as
+ else:
+ sorted_values = np.sort(self._ndarray_values)
+ attribs = self._get_attributes_dict()
+ freq = attribs['freq']
+
+ if freq is not None and not is_period_dtype(self):
+ if freq.n > 0 and not ascending:
+ freq = freq * -1
+ elif freq.n < 0 and ascending:
+ freq = freq * -1
+ attribs['freq'] = freq
+
+ if not ascending:
+ sorted_values = sorted_values[::-1]
+
+ return self._simple_new(sorted_values, **attribs)
+
+ @Appender(_index_shared_docs['take'] % _index_doc_kwargs)
+ def take(self, indices, axis=0, allow_fill=True,
+ fill_value=None, **kwargs):
+ nv.validate_take(tuple(), kwargs)
+ indices = ensure_int64(indices)
+
+ maybe_slice = lib.maybe_indices_to_slice(indices, len(self))
+ if isinstance(maybe_slice, slice):
+ return self[maybe_slice]
+
+ taken = self._assert_take_fillable(self.asi8, indices,
+ allow_fill=allow_fill,
+ fill_value=fill_value,
+ na_value=iNaT)
+
+ # keep freq in PeriodArray/Index, reset otherwise
+ freq = self.freq if is_period_dtype(self) else None
+ return self._shallow_copy(taken, freq=freq)
+
+ _can_hold_na = True
+
+ _na_value = NaT
+ """The expected NA value to use with this index."""
+
+ @property
+ def asobject(self):
+ """
+ Return object Index which contains boxed values.
+
+ .. deprecated:: 0.23.0
+ Use ``astype(object)`` instead.
+
+ *this is an internal non-public method*
+ """
+ warnings.warn("'asobject' is deprecated. Use 'astype(object)'"
+ " instead", FutureWarning, stacklevel=2)
+ return self.astype(object)
+
+ def _convert_tolerance(self, tolerance, target):
+ tolerance = np.asarray(to_timedelta(tolerance, box=False))
+ if target.size != tolerance.size and tolerance.size > 1:
+ raise ValueError('list-like tolerance size must match '
+ 'target index size')
+ return tolerance
+
+ def tolist(self):
+ """
+ Return a list of the underlying data.
+ """
+ return list(self.astype(object))
+
+ def min(self, axis=None, skipna=True, *args, **kwargs):
+ """
+ Return the minimum value of the Index or minimum along
+ an axis.
+
+ See Also
+ --------
+ numpy.ndarray.min
+ Series.min : Return the minimum value in a Series.
+ """
+ nv.validate_min(args, kwargs)
+ nv.validate_minmax_axis(axis)
+
+ if not len(self):
+ return self._na_value
+
+ i8 = self.asi8
+ try:
+ # quick check
+ if len(i8) and self.is_monotonic:
+ if i8[0] != iNaT:
+ return self._box_func(i8[0])
+
+ if self.hasnans:
+ if skipna:
+ min_stamp = self[~self._isnan].asi8.min()
+ else:
+ return self._na_value
+ else:
+ min_stamp = i8.min()
+ return self._box_func(min_stamp)
+ except ValueError:
+ return self._na_value
+
+ def argmin(self, axis=None, skipna=True, *args, **kwargs):
+ """
+ Returns the indices of the minimum values along an axis.
+
+ See `numpy.ndarray.argmin` for more information on the
+ `axis` parameter.
+
+ See Also
+ --------
+ numpy.ndarray.argmin
+ """
+ nv.validate_argmin(args, kwargs)
+ nv.validate_minmax_axis(axis)
+
+ i8 = self.asi8
+ if self.hasnans:
+ mask = self._isnan
+ if mask.all() or not skipna:
+ return -1
+ i8 = i8.copy()
+ i8[mask] = np.iinfo('int64').max
+ return i8.argmin()
+
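+ # Illustrative sketch (hypothetical values): per the NaT handling above,
+ # pd.DatetimeIndex(['NaT', 'NaT']).argmin() is expected to return -1 rather
+ # than raise, and with skipna=False a single NaT likewise yields -1.
+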
+ def max(self, axis=None, skipna=True, *args, **kwargs):
+ """
+ Return the maximum value of the Index or maximum along
+ an axis.
+
+ See Also
+ --------
+ numpy.ndarray.max
+ Series.max : Return the maximum value in a Series.
+ """
+ nv.validate_max(args, kwargs)
+ nv.validate_minmax_axis(axis)
+
+ if not len(self):
+ return self._na_value
+
+ i8 = self.asi8
+ try:
+ # quick check
+ if len(i8) and self.is_monotonic:
+ if i8[-1] != iNaT:
+ return self._box_func(i8[-1])
+
+ if self.hasnans:
+ if skipna:
+ max_stamp = self[~self._isnan].asi8.max()
+ else:
+ return self._na_value
+ else:
+ max_stamp = i8.max()
+ return self._box_func(max_stamp)
+ except ValueError:
+ return self._na_value
+
+ def argmax(self, axis=None, skipna=True, *args, **kwargs):
+ """
+ Returns the indices of the maximum values along an axis.
+
+ See `numpy.ndarray.argmax` for more information on the
+ `axis` parameter.
+
+ See Also
+ --------
+ numpy.ndarray.argmax
+ """
+ nv.validate_argmax(args, kwargs)
+ nv.validate_minmax_axis(axis)
+
+ i8 = self.asi8
+ if self.hasnans:
+ mask = self._isnan
+ if mask.all() or not skipna:
+ return -1
+ i8 = i8.copy()
+ i8[mask] = 0
+ return i8.argmax()
+
+ # --------------------------------------------------------------------
+ # Rendering Methods
+
+ def _format_with_header(self, header, **kwargs):
+ return header + list(self._format_native_types(**kwargs))
+
+ @property
+ def _formatter_func(self):
+ raise AbstractMethodError(self)
+
+ def _format_attrs(self):
+ """
+ Return a list of tuples of the (attr,formatted_value).
+ """
+ attrs = super(DatetimeIndexOpsMixin, self)._format_attrs()
+ for attrib in self._attributes:
+ if attrib == 'freq':
+ freq = self.freqstr
+ if freq is not None:
+ freq = "'%s'" % freq
+ attrs.append(('freq', freq))
+ return attrs
+
+ # --------------------------------------------------------------------
+
+ def _convert_scalar_indexer(self, key, kind=None):
+ """
+ We don't allow integer or float indexing on datetime-like when using
+ loc.
+
+ Parameters
+ ----------
+ key : label of the slice bound
+ kind : {'ix', 'loc', 'getitem', 'iloc'} or None
+ """
+
+ assert kind in ['ix', 'loc', 'getitem', 'iloc', None]
+
+ # we don't allow integer/float indexing for loc
+ # we don't allow float indexing for ix/getitem
+ if is_scalar(key):
+ is_int = is_integer(key)
+ is_flt = is_float(key)
+ if kind in ['loc'] and (is_int or is_flt):
+ self._invalid_indexer('index', key)
+ elif kind in ['ix', 'getitem'] and is_flt:
+ self._invalid_indexer('index', key)
+
+ return (super(DatetimeIndexOpsMixin, self)
+ ._convert_scalar_indexer(key, kind=kind))
+
+ @classmethod
+ def _add_datetimelike_methods(cls):
+ """
+ Add in the datetimelike methods (as we may have to override the
+ superclass).
+ """
+
+ def __add__(self, other):
+ # dispatch to ExtensionArray implementation
+ result = self._data.__add__(maybe_unwrap_index(other))
+ return wrap_arithmetic_op(self, other, result)
+
+ cls.__add__ = __add__
+
+ def __radd__(self, other):
+ # alias for __add__
+ return self.__add__(other)
+ cls.__radd__ = __radd__
+
+ def __sub__(self, other):
+ # dispatch to ExtensionArray implementation
+ result = self._data.__sub__(maybe_unwrap_index(other))
+ return wrap_arithmetic_op(self, other, result)
+
+ cls.__sub__ = __sub__
+
+ def __rsub__(self, other):
+ result = self._data.__rsub__(maybe_unwrap_index(other))
+ return wrap_arithmetic_op(self, other, result)
+
+ cls.__rsub__ = __rsub__
+
+ def isin(self, values):
+ """
+ Compute boolean array of whether each index value is found in the
+ passed set of values.
+
+ Parameters
+ ----------
+ values : set or sequence of values
+
+ Returns
+ -------
+ is_contained : ndarray (boolean dtype)
+ """
+ if not isinstance(values, type(self)):
+ try:
+ values = type(self)(values)
+ except ValueError:
+ return self.astype(object).isin(values)
+
+ return algorithms.isin(self.asi8, values.asi8)
+
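+ # Illustrative usage sketch (hypothetical values): the passed values are
+ # coerced to the same index type and compared on the i8 representation, e.g.
+ # pd.DatetimeIndex(['2019-01-01', '2019-01-02']).isin(['2019-01-01'])
+ # is expected to give array([True, False]).
+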
+ @Appender(_index_shared_docs['repeat'] % _index_doc_kwargs)
+ def repeat(self, repeats, axis=None):
+ nv.validate_repeat(tuple(), dict(axis=axis))
+ freq = self.freq if is_period_dtype(self) else None
+ return self._shallow_copy(self.asi8.repeat(repeats), freq=freq)
+
+ @Appender(_index_shared_docs['where'] % _index_doc_kwargs)
+ def where(self, cond, other=None):
+ other = _ensure_datetimelike_to_i8(other, to_utc=True)
+ values = _ensure_datetimelike_to_i8(self, to_utc=True)
+ result = np.where(cond, values, other).astype('i8')
+
+ result = self._ensure_localized(result, from_utc=True)
+ return self._shallow_copy(result)
+
+ def _summary(self, name=None):
+ """
+ Return a summarized representation.
+
+ Parameters
+ ----------
+ name : str
+ name to use in the summary representation
+
+ Returns
+ -------
+ String with a summarized representation of the index
+ """
+ formatter = self._formatter_func
+ if len(self) > 0:
+ index_summary = ', %s to %s' % (formatter(self[0]),
+ formatter(self[-1]))
+ else:
+ index_summary = ''
+
+ if name is None:
+ name = type(self).__name__
+ result = '%s: %s entries%s' % (printing.pprint_thing(name),
+ len(self), index_summary)
+ if self.freq:
+ result += '\nFreq: %s' % self.freqstr
+
+ # display as values, not quoted
+ result = result.replace("'", "")
+ return result
+
+ def _concat_same_dtype(self, to_concat, name):
+ """
+ Concatenate to_concat which has the same class.
+ """
+ attribs = self._get_attributes_dict()
+ attribs['name'] = name
+ # do not pass tz to set because tzlocal cannot be hashed
+ if len({str(x.dtype) for x in to_concat}) != 1:
+ raise ValueError('to_concat must have the same tz')
+
+ if not is_period_dtype(self):
+ # reset freq
+ attribs['freq'] = None
+
+ new_data = type(self._values)._concat_same_type(to_concat).asi8
+ return self._simple_new(new_data, **attribs)
+
+ @Appender(_index_shared_docs['astype'])
+ def astype(self, dtype, copy=True):
+ if is_dtype_equal(self.dtype, dtype) and copy is False:
+ # Ensure that self.astype(self.dtype) is self
+ return self
+
+ new_values = self._data.astype(dtype, copy=copy)
+
+ # pass copy=False because any copying will be done in the
+ # _data.astype call above
+ return Index(new_values,
+ dtype=new_values.dtype, name=self.name, copy=False)
+
+ @deprecate_kwarg(old_arg_name='n', new_arg_name='periods')
+ def shift(self, periods, freq=None):
+ """
+ Shift index by desired number of time frequency increments.
+
+ This method is for shifting the values of datetime-like indexes
+ by a specified time increment a given number of times.
+
+ Parameters
+ ----------
+ periods : int
+ Number of periods (or increments) to shift by,
+ can be positive or negative.
+
+ .. versionchanged:: 0.24.0
+
+ freq : pandas.DateOffset, pandas.Timedelta or string, optional
+ Frequency increment to shift by.
+ If None, the index is shifted by its own `freq` attribute.
+ Offset aliases are valid strings, e.g., 'D', 'W', 'M' etc.
+
+ Returns
+ -------
+ pandas.DatetimeIndex
+ Shifted index.
+
+ See Also
+ --------
+ Index.shift : Shift values of Index.
+ PeriodIndex.shift : Shift values of PeriodIndex.
+ """
+ result = self._data._time_shift(periods, freq=freq)
+ return type(self)(result, name=self.name)
+
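+ # Illustrative usage sketch (hypothetical values): shifting by whole
+ # increments of the index's own freq, e.g.
+ # pd.date_range('2019-01-01', periods=3, freq='D').shift(2)
+ # is expected to start at 2019-01-03 and keep freq='D'.
+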
+
+def wrap_arithmetic_op(self, other, result):
+ if result is NotImplemented:
+ return NotImplemented
+
+ if isinstance(result, tuple):
+ # divmod, rdivmod
+ assert len(result) == 2
+ return (wrap_arithmetic_op(self, other, result[0]),
+ wrap_arithmetic_op(self, other, result[1]))
+
+ if not isinstance(result, Index):
+ # Index.__new__ will choose appropriate subclass for dtype
+ result = Index(result)
+
+ res_name = ops.get_op_result_name(self, other)
+ result.name = res_name
+ return result
+
+
+def maybe_unwrap_index(obj):
+ """
+ If operating against another Index object, we need to unwrap the underlying
+ data before deferring to the DatetimeArray/TimedeltaArray/PeriodArray
+ implementation, otherwise we will incorrectly return NotImplemented.
+
+ Parameters
+ ----------
+ obj : object
+
+ Returns
+ -------
+ unwrapped object
+ """
+ if isinstance(obj, ABCIndexClass):
+ return obj._data
+ return obj
+
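+ # Illustrative note: without this unwrapping, an arithmetic op would pass the
+ # other *Index* object straight to the array implementation, which only
+ # recognises arrays and scalars and would return NotImplemented, as described
+ # in the docstring above.
+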
+
+class DatetimelikeDelegateMixin(PandasDelegate):
+ """
+ Delegation mechanism, specific for Datetime, Timedelta, and Period types.
+
+ Functionality is delegated from the Index class to an Array class. A
+ few things can be customized
+
+ * _delegate_class : type
+ The class being delegated to.
+ * _delegated_methods, _delegated_properties : List
+ The list of property / method names being delegated.
+ * raw_methods : Set
+ The set of methods whose results should *not* be
+ boxed in an index, after being returned from the array
+ * raw_properties : Set
+ The set of properties whose results should *not* be
+ boxed in an index, after being returned from the array
+ """
+ # raw_methods : dispatch methods that shouldn't be boxed in an Index
+ _raw_methods = set()
+ # raw_properties : dispatch properties that shouldn't be boxed in an Index
+ _raw_properties = set()
+ name = None
+ _data = None
+
+ @property
+ def _delegate_class(self):
+ raise AbstractMethodError
+
+ def _delegate_property_get(self, name, *args, **kwargs):
+ result = getattr(self._data, name)
+ if name not in self._raw_properties:
+ result = Index(result, name=self.name)
+ return result
+
+ def _delegate_property_set(self, name, value, *args, **kwargs):
+ setattr(self._data, name, value)
+
+ def _delegate_method(self, name, *args, **kwargs):
+ result = operator.methodcaller(name, *args, **kwargs)(self._data)
+ if name not in self._raw_methods:
+ result = Index(result, name=self.name)
+ return result
diff --git a/contrib/python/pandas/py2/pandas/core/indexes/datetimes.py b/contrib/python/pandas/py2/pandas/core/indexes/datetimes.py
new file mode 100644
index 00000000000..9c46860eb49
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/core/indexes/datetimes.py
@@ -0,0 +1,1679 @@
+# pylint: disable=E1101
+from __future__ import division
+
+from datetime import datetime, time, timedelta
+import operator
+import warnings
+
+import numpy as np
+
+from pandas._libs import (
+ Timestamp, index as libindex, join as libjoin, lib, tslib as libts)
+from pandas._libs.tslibs import ccalendar, fields, parsing, timezones
+import pandas.compat as compat
+from pandas.util._decorators import Appender, Substitution, cache_readonly
+
+from pandas.core.dtypes.common import (
+ _NS_DTYPE, ensure_int64, is_float, is_integer, is_list_like, is_scalar,
+ is_string_like)
+import pandas.core.dtypes.concat as _concat
+from pandas.core.dtypes.dtypes import DatetimeTZDtype
+from pandas.core.dtypes.missing import isna
+
+from pandas.core.accessor import delegate_names
+from pandas.core.arrays.datetimes import (
+ DatetimeArray, _to_M8, tz_to_dtype, validate_tz_from_dtype)
+from pandas.core.base import _shared_docs
+import pandas.core.common as com
+from pandas.core.indexes.base import Index
+from pandas.core.indexes.datetimelike import (
+ DatetimeIndexOpsMixin, DatetimelikeDelegateMixin, ea_passthrough)
+from pandas.core.indexes.numeric import Int64Index
+from pandas.core.ops import get_op_result_name
+import pandas.core.tools.datetimes as tools
+
+from pandas.tseries import offsets
+from pandas.tseries.frequencies import Resolution, to_offset
+from pandas.tseries.offsets import CDay, prefix_mapping
+
+
+def _new_DatetimeIndex(cls, d):
+ """ This is called upon unpickling, rather than the default which doesn't
+ have arguments and breaks __new__ """
+
+ if "data" in d and not isinstance(d["data"], DatetimeIndex):
+ # Avoid need to verify integrity by calling simple_new directly
+ data = d.pop("data")
+ result = cls._simple_new(data, **d)
+ else:
+ with warnings.catch_warnings():
+ # we ignore warnings from passing verify_integrity=False
+ # TODO: If we knew what was going in to **d, we might be able to
+ # go through _simple_new instead
+ warnings.simplefilter("ignore")
+ result = cls.__new__(cls, verify_integrity=False, **d)
+
+ return result
+
+
+class DatetimeDelegateMixin(DatetimelikeDelegateMixin):
+ # Most attrs are dispatched via datetimelike_{ops,methods}
+ # Some are "raw" methods, the result is not not re-boxed in an Index
+ # We also have a few "extra" attrs, which may or may not be raw,
+ # which we we dont' want to expose in the .dt accessor.
+ _extra_methods = [
+ 'to_period',
+ 'to_perioddelta',
+ 'to_julian_date',
+ ]
+ _extra_raw_methods = [
+ 'to_pydatetime',
+ '_local_timestamps',
+ '_has_same_tz',
+ ]
+ _extra_raw_properties = [
+ '_box_func',
+ 'tz', 'tzinfo',
+ ]
+ _delegated_properties = (
+ DatetimeArray._datetimelike_ops + _extra_raw_properties
+ )
+ _delegated_methods = (
+ DatetimeArray._datetimelike_methods + _extra_methods +
+ _extra_raw_methods
+ )
+ _raw_properties = {
+ 'date',
+ 'time',
+ 'timetz',
+ } | set(DatetimeArray._bool_ops) | set(_extra_raw_properties)
+ _raw_methods = set(_extra_raw_methods)
+ _delegate_class = DatetimeArray
+
+
+@delegate_names(DatetimeArray,
+ DatetimeDelegateMixin._delegated_properties,
+ typ="property")
+@delegate_names(DatetimeArray,
+ DatetimeDelegateMixin._delegated_methods,
+ typ="method", overwrite=False)
+class DatetimeIndex(DatetimeIndexOpsMixin, Int64Index, DatetimeDelegateMixin):
+ """
+ Immutable ndarray of datetime64 data, represented internally as int64, and
+ which can be boxed to Timestamp objects that are subclasses of datetime and
+ carry metadata such as frequency information.
+
+ Parameters
+ ----------
+ data : array-like (1-dimensional), optional
+ Optional datetime-like data to construct index with
+ copy : bool
+ Make a copy of input ndarray
+ freq : string or pandas offset object, optional
+ One of pandas date offset strings or corresponding objects. The string
+ 'infer' can be passed in order to set the frequency of the index as the
+ inferred frequency upon creation
+
+ start : starting value, datetime-like, optional
+ If data is None, start is used as the start point in generating regular
+ timestamp data.
+
+ .. deprecated:: 0.24.0
+
+ periods : int, optional, > 0
+ Number of periods to generate, if generating index. Takes precedence
+ over end argument
+
+ .. deprecated:: 0.24.0
+
+ end : end time, datetime-like, optional
+ If periods is None, the generated index will extend to the first
+ conforming time on or just past the end argument
+
+ .. deprecated:: 0.24.0
+
+ closed : string or None, default None
+ Make the interval closed with respect to the given frequency to
+ the 'left', 'right', or both sides (None)
+
+ .. deprecated:: 0.24.0
+
+ tz : pytz.timezone or dateutil.tz.tzfile
+ ambiguous : 'infer', bool-ndarray, 'NaT', default 'raise'
+ When clocks moved backward due to DST, ambiguous times may arise.
+ For example in Central European Time (UTC+01), when going from 03:00
+ DST to 02:00 non-DST, 02:30:00 local time occurs both at 00:30:00 UTC
+ and at 01:30:00 UTC. In such a situation, the `ambiguous` parameter
+ dictates how ambiguous times should be handled.
+
+ - 'infer' will attempt to infer fall dst-transition hours based on
+ order
+ - bool-ndarray where True signifies a DST time, False signifies a
+ non-DST time (note that this flag is only applicable for ambiguous
+ times)
+ - 'NaT' will return NaT where there are ambiguous times
+ - 'raise' will raise an AmbiguousTimeError if there are ambiguous times
+ name : object
+ Name to be stored in the index
+ dayfirst : bool, default False
+ If True, parse dates in `data` with the day first order
+ yearfirst : bool, default False
+ If True parse dates in `data` with the year first order
+
+ Attributes
+ ----------
+ year
+ month
+ day
+ hour
+ minute
+ second
+ microsecond
+ nanosecond
+ date
+ time
+ timetz
+ dayofyear
+ weekofyear
+ week
+ dayofweek
+ weekday
+ quarter
+ tz
+ freq
+ freqstr
+ is_month_start
+ is_month_end
+ is_quarter_start
+ is_quarter_end
+ is_year_start
+ is_year_end
+ is_leap_year
+ inferred_freq
+
+ Methods
+ -------
+ normalize
+ strftime
+ snap
+ tz_convert
+ tz_localize
+ round
+ floor
+ ceil
+ to_period
+ to_perioddelta
+ to_pydatetime
+ to_series
+ to_frame
+ month_name
+ day_name
+
+ Notes
+ -----
+ To learn more about the frequency strings, please see `this link
+ <http://pandas.pydata.org/pandas-docs/stable/timeseries.html#offset-aliases>`__.
+
+ Creating a DatetimeIndex based on `start`, `periods`, and `end` has
+ been deprecated in favor of :func:`date_range`.
+
+ See Also
+ --------
+ Index : The base pandas Index type.
+ TimedeltaIndex : Index of timedelta64 data.
+ PeriodIndex : Index of Period data.
+ to_datetime : Convert argument to datetime.
+ date_range : Create a fixed-frequency DatetimeIndex.
+ """
+ _typ = 'datetimeindex'
+ _join_precedence = 10
+
+ def _join_i8_wrapper(joinf, **kwargs):
+ return DatetimeIndexOpsMixin._join_i8_wrapper(joinf, dtype='M8[ns]',
+ **kwargs)
+
+ _inner_indexer = _join_i8_wrapper(libjoin.inner_join_indexer_int64)
+ _outer_indexer = _join_i8_wrapper(libjoin.outer_join_indexer_int64)
+ _left_indexer = _join_i8_wrapper(libjoin.left_join_indexer_int64)
+ _left_indexer_unique = _join_i8_wrapper(
+ libjoin.left_join_indexer_unique_int64, with_indexers=False)
+
+ _engine_type = libindex.DatetimeEngine
+
+ _tz = None
+ _freq = None
+ _comparables = ['name', 'freqstr', 'tz']
+ _attributes = ['name', 'tz', 'freq']
+
+ # dummy attribute so that datetime.__eq__(DatetimeArray) defers
+ # by returning NotImplemented
+ timetuple = None
+
+ _is_numeric_dtype = False
+ _infer_as_myclass = True
+
+ # some things like freq inference make use of these attributes.
+ _bool_ops = DatetimeArray._bool_ops
+ _object_ops = DatetimeArray._object_ops
+ _field_ops = DatetimeArray._field_ops
+ _datetimelike_ops = DatetimeArray._datetimelike_ops
+ _datetimelike_methods = DatetimeArray._datetimelike_methods
+
+ # --------------------------------------------------------------------
+ # Constructors
+
+ def __new__(cls, data=None,
+ freq=None, start=None, end=None, periods=None, tz=None,
+ normalize=False, closed=None, ambiguous='raise',
+ dayfirst=False, yearfirst=False, dtype=None,
+ copy=False, name=None, verify_integrity=None):
+
+ if verify_integrity is not None:
+ warnings.warn("The 'verify_integrity' argument is deprecated, "
+ "will be removed in a future version.",
+ FutureWarning, stacklevel=2)
+ else:
+ verify_integrity = True
+
+ if data is None:
+ dtarr = DatetimeArray._generate_range(
+ start, end, periods,
+ freq=freq, tz=tz, normalize=normalize,
+ closed=closed, ambiguous=ambiguous)
+ warnings.warn("Creating a DatetimeIndex by passing range "
+ "endpoints is deprecated. Use "
+ "`pandas.date_range` instead.",
+ FutureWarning, stacklevel=2)
+ return cls._simple_new(
+ dtarr._data, freq=dtarr.freq, tz=dtarr.tz, name=name)
+
+ if is_scalar(data):
+ raise TypeError("{cls}() must be called with a "
+ "collection of some kind, {data} was passed"
+ .format(cls=cls.__name__, data=repr(data)))
+
+ # - Cases checked above all return/raise before reaching here - #
+
+ if name is None and hasattr(data, 'name'):
+ name = data.name
+
+ dtarr = DatetimeArray._from_sequence(
+ data, dtype=dtype, copy=copy, tz=tz, freq=freq,
+ dayfirst=dayfirst, yearfirst=yearfirst, ambiguous=ambiguous,
+ int_as_wall_time=True)
+
+ subarr = cls._simple_new(dtarr, name=name,
+ freq=dtarr.freq, tz=dtarr.tz)
+ return subarr
+
+ @classmethod
+ def _simple_new(cls, values, name=None, freq=None, tz=None, dtype=None):
+ """
+ we require that we have a dtype compatible with the values;
+ if we are passed a non-compatible dtype, coerce using the constructor
+ """
+ if isinstance(values, DatetimeArray):
+ if tz:
+ tz = validate_tz_from_dtype(dtype, tz)
+ dtype = DatetimeTZDtype(tz=tz)
+ elif dtype is None:
+ dtype = _NS_DTYPE
+
+ values = DatetimeArray(values, freq=freq, dtype=dtype)
+ tz = values.tz
+ freq = values.freq
+ values = values._data
+
+ # DatetimeArray._simple_new will accept either i8 or M8[ns] dtypes
+ if isinstance(values, DatetimeIndex):
+ values = values._data
+
+ dtype = tz_to_dtype(tz)
+ dtarr = DatetimeArray._simple_new(values, freq=freq, dtype=dtype)
+ assert isinstance(dtarr, DatetimeArray)
+
+ result = object.__new__(cls)
+ result._data = dtarr
+ result.name = name
+ # For groupby perf. See note in indexes/base about _index_data
+ result._index_data = dtarr._data
+ result._reset_identity()
+ return result
+
+ # --------------------------------------------------------------------
+
+ def __array__(self, dtype=None):
+ if (dtype is None and isinstance(self._data, DatetimeArray)
+ and getattr(self.dtype, 'tz', None)):
+ msg = (
+ "Converting timezone-aware DatetimeArray to timezone-naive "
+ "ndarray with 'datetime64[ns]' dtype. In the future, this "
+ "will return an ndarray with 'object' dtype where each "
+ "element is a 'pandas.Timestamp' with the correct 'tz'.\n\t"
+ "To accept the future behavior, pass 'dtype=object'.\n\t"
+ "To keep the old behavior, pass 'dtype=\"datetime64[ns]\"'."
+ )
+ warnings.warn(msg, FutureWarning, stacklevel=3)
+ dtype = 'M8[ns]'
+ return np.asarray(self._data, dtype=dtype)
+
+ @property
+ def dtype(self):
+ return self._data.dtype
+
+ @property
+ def tz(self):
+ # GH 18595
+ return self._data.tz
+
+ @tz.setter
+ def tz(self, value):
+ # GH 3746: Prevent localizing or converting the index by setting tz
+ raise AttributeError("Cannot directly set timezone. Use tz_localize() "
+ "or tz_convert() as appropriate")
+
+ tzinfo = tz
+
+ @cache_readonly
+ def _is_dates_only(self):
+ """Return a boolean if we are only dates (and don't have a timezone)"""
+ from pandas.io.formats.format import _is_dates_only
+ return _is_dates_only(self.values) and self.tz is None
+
+ def __reduce__(self):
+
+ # we use a special reduce here because we need
+ # to simply set the .tz (and not reinterpret it)
+
+ d = dict(data=self._data)
+ d.update(self._get_attributes_dict())
+ return _new_DatetimeIndex, (self.__class__, d), None
+
+ def __setstate__(self, state):
+ """Necessary for making this object picklable"""
+ if isinstance(state, dict):
+ super(DatetimeIndex, self).__setstate__(state)
+
+ elif isinstance(state, tuple):
+
+ # < 0.15 compat
+ if len(state) == 2:
+ nd_state, own_state = state
+ data = np.empty(nd_state[1], dtype=nd_state[2])
+ np.ndarray.__setstate__(data, nd_state)
+
+ freq = own_state[1]
+ tz = timezones.tz_standardize(own_state[2])
+ dtype = tz_to_dtype(tz)
+ dtarr = DatetimeArray._simple_new(data, freq=freq, dtype=dtype)
+
+ self.name = own_state[0]
+
+ else: # pragma: no cover
+ data = np.empty(state)
+ np.ndarray.__setstate__(data, state)
+ dtarr = DatetimeArray(data)
+
+ self._data = dtarr
+ self._reset_identity()
+
+ else:
+ raise Exception("invalid pickle state")
+ _unpickle_compat = __setstate__
+
+ def _convert_for_op(self, value):
+ """ Convert value to be insertable to ndarray """
+ if self._has_same_tz(value):
+ return _to_M8(value)
+ raise ValueError('Passed item and index have different timezone')
+
+ def _maybe_update_attributes(self, attrs):
+ """ Update Index attributes (e.g. freq) depending on op """
+ freq = attrs.get('freq', None)
+ if freq is not None:
+ # no need to infer if freq is None
+ attrs['freq'] = 'infer'
+ return attrs
+
+ # --------------------------------------------------------------------
+ # Rendering Methods
+
+ def _mpl_repr(self):
+ # how to represent ourselves to matplotlib
+ return libts.ints_to_pydatetime(self.asi8, self.tz)
+
+ def _format_native_types(self, na_rep='NaT', date_format=None, **kwargs):
+ from pandas.io.formats.format import _get_format_datetime64_from_values
+ fmt = _get_format_datetime64_from_values(self, date_format)
+
+ return libts.format_array_from_datetime(self.asi8,
+ tz=self.tz,
+ format=fmt,
+ na_rep=na_rep)
+
+ @property
+ def _formatter_func(self):
+ from pandas.io.formats.format import _get_format_datetime64
+ formatter = _get_format_datetime64(is_dates_only=self._is_dates_only)
+ return lambda x: "'%s'" % formatter(x, tz=self.tz)
+
+ # --------------------------------------------------------------------
+ # Set Operation Methods
+
+ def union(self, other):
+ """
+ Specialized union for DatetimeIndex objects. If combining
+ overlapping ranges with the same DateOffset, this will be much
+ faster than Index.union
+
+ Parameters
+ ----------
+ other : DatetimeIndex or array-like
+
+ Returns
+ -------
+ y : Index or DatetimeIndex
+ """
+ self._assert_can_do_setop(other)
+
+ if len(other) == 0 or self.equals(other) or len(self) == 0:
+ return super(DatetimeIndex, self).union(other)
+
+ if not isinstance(other, DatetimeIndex):
+ try:
+ other = DatetimeIndex(other)
+ except TypeError:
+ pass
+
+ this, other = self._maybe_utc_convert(other)
+
+ if this._can_fast_union(other):
+ return this._fast_union(other)
+ else:
+ result = Index.union(this, other)
+ if isinstance(result, DatetimeIndex):
+ # TODO: we shouldn't be setting attributes like this;
+ # in all the tests this equality already holds
+ result._data._dtype = this.dtype
+ if (result.freq is None and
+ (this.freq is not None or other.freq is not None)):
+ result.freq = to_offset(result.inferred_freq)
+ return result
+
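+ # Illustrative usage sketch (hypothetical values): two overlapping daily
+ # ranges with the same freq take the fast-union path above, e.g.
+ # pd.date_range('2019-01-01', periods=5).union(
+ # pd.date_range('2019-01-03', periods=5))
+ # is expected to give a single 7-element daily DatetimeIndex.
+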
+ def union_many(self, others):
+ """
+ A bit of a hack to accelerate unioning a collection of indexes
+ """
+ this = self
+
+ for other in others:
+ if not isinstance(this, DatetimeIndex):
+ this = Index.union(this, other)
+ continue
+
+ if not isinstance(other, DatetimeIndex):
+ try:
+ other = DatetimeIndex(other)
+ except TypeError:
+ pass
+
+ this, other = this._maybe_utc_convert(other)
+
+ if this._can_fast_union(other):
+ this = this._fast_union(other)
+ else:
+ dtype = this.dtype
+ this = Index.union(this, other)
+ if isinstance(this, DatetimeIndex):
+ # TODO: we shouldn't be setting attributes like this;
+ # in all the tests this equality already holds
+ this._data._dtype = dtype
+ return this
+
+ def _can_fast_union(self, other):
+ if not isinstance(other, DatetimeIndex):
+ return False
+
+ freq = self.freq
+
+ if freq is None or freq != other.freq:
+ return False
+
+ if not self.is_monotonic or not other.is_monotonic:
+ return False
+
+ if len(self) == 0 or len(other) == 0:
+ return True
+
+ # to make our life easier, "sort" the two ranges
+ if self[0] <= other[0]:
+ left, right = self, other
+ else:
+ left, right = other, self
+
+ right_start = right[0]
+ left_end = left[-1]
+
+ # Only need to "adjoin", not overlap
+ try:
+ return (right_start == left_end + freq) or right_start in left
+ except (ValueError):
+
+ # if we are comparing a freq that does not propagate timezones
+ # this will raise
+ return False
+
+ def _fast_union(self, other):
+ if len(other) == 0:
+ return self.view(type(self))
+
+ if len(self) == 0:
+ return other.view(type(self))
+
+ # to make our life easier, "sort" the two ranges
+ if self[0] <= other[0]:
+ left, right = self, other
+ else:
+ left, right = other, self
+
+ left_end = left[-1]
+ right_end = right[-1]
+
+ # TODO: consider re-implementing freq._should_cache for fastpath
+
+ # concatenate dates
+ if left_end < right_end:
+ loc = right.searchsorted(left_end, side='right')
+ right_chunk = right.values[loc:]
+ dates = _concat._concat_compat((left.values, right_chunk))
+ return self._shallow_copy(dates)
+ else:
+ return left
+
+ def _wrap_setop_result(self, other, result):
+ name = get_op_result_name(self, other)
+ return self._shallow_copy(result, name=name, freq=None, tz=self.tz)
+
+ def intersection(self, other, sort=False):
+ """
+ Specialized intersection for DatetimeIndex objects. May be much faster
+ than Index.intersection
+
+ Parameters
+ ----------
+ other : DatetimeIndex or array-like
+ sort : False or None, default False
+ Sort the resulting index if possible.
+
+ .. versionadded:: 0.24.0
+
+ .. versionchanged:: 0.24.1
+
+ Changed the default to ``False`` to match the behaviour
+ from before 0.24.0.
+
+ Returns
+ -------
+ y : Index or DatetimeIndex
+ """
+ self._validate_sort_keyword(sort)
+ self._assert_can_do_setop(other)
+
+ if self.equals(other):
+ return self._get_reconciled_name_object(other)
+
+ if not isinstance(other, DatetimeIndex):
+ try:
+ other = DatetimeIndex(other)
+ except (TypeError, ValueError):
+ pass
+ result = Index.intersection(self, other, sort=sort)
+ if isinstance(result, DatetimeIndex):
+ if result.freq is None:
+ result.freq = to_offset(result.inferred_freq)
+ return result
+
+ elif (other.freq is None or self.freq is None or
+ other.freq != self.freq or
+ not other.freq.isAnchored() or
+ (not self.is_monotonic or not other.is_monotonic)):
+ result = Index.intersection(self, other, sort=sort)
+ # Invalidate the freq of `result`, which may not be correct at
+ # this point, depending on the values.
+ result.freq = None
+ result = self._shallow_copy(result._values, name=result.name,
+ tz=result.tz, freq=None)
+ if result.freq is None:
+ result.freq = to_offset(result.inferred_freq)
+ return result
+
+ if len(self) == 0:
+ return self
+ if len(other) == 0:
+ return other
+ # to make our life easier, "sort" the two ranges
+ if self[0] <= other[0]:
+ left, right = self, other
+ else:
+ left, right = other, self
+
+ end = min(left[-1], right[-1])
+ start = right[0]
+
+ if end < start:
+ return type(self)(data=[])
+ else:
+ lslice = slice(*left.slice_locs(start, end))
+ left_chunk = left.values[lslice]
+ return self._shallow_copy(left_chunk)
+
+ # --------------------------------------------------------------------
+
+ def _get_time_micros(self):
+ values = self.asi8
+ if self.tz is not None and not timezones.is_utc(self.tz):
+ values = self._data._local_timestamps()
+ return fields.get_time_micros(values)
+
+ def to_series(self, keep_tz=None, index=None, name=None):
+ """
+ Create a Series with both index and values equal to the index keys,
+ useful with map for returning an indexer based on an index.
+
+ Parameters
+ ----------
+ keep_tz : optional, defaults False
+ Return the data keeping the timezone.
+
+ If keep_tz is True:
+
+ If the timezone is not set, the resulting
+ Series will have a datetime64[ns] dtype.
+
+ Otherwise the Series will have a datetime64[ns, tz] dtype; the
+ tz will be preserved.
+
+ If keep_tz is False:
+
+ Series will have a datetime64[ns] dtype. TZ aware
+ objects will have the tz removed.
+
+ .. versionchanged:: 0.24
+ The default value will change to True in a future release.
+ You can set ``keep_tz=True`` to already obtain the future
+ behaviour and silence the warning.
+
+ index : Index, optional
+ index of resulting Series. If None, defaults to original index
+ name : string, optional
+ name of resulting Series. If None, defaults to name of original
+ index
+
+ Returns
+ -------
+ Series
+ """
+ from pandas import Series
+
+ if index is None:
+ index = self._shallow_copy()
+ if name is None:
+ name = self.name
+
+ if keep_tz is None and self.tz is not None:
+ warnings.warn("The default of the 'keep_tz' keyword will change "
+ "to True in a future release. You can set "
+ "'keep_tz=True' to obtain the future behaviour and "
+ "silence this warning.", FutureWarning, stacklevel=2)
+ keep_tz = False
+ elif keep_tz is False:
+ warnings.warn("Specifying 'keep_tz=False' is deprecated and this "
+ "option will be removed in a future release. If "
+ "you want to remove the timezone information, you "
+ "can do 'idx.tz_convert(None)' before calling "
+ "'to_series'.", FutureWarning, stacklevel=2)
+
+ if keep_tz and self.tz is not None:
+ # preserve the tz & copy
+ values = self.copy(deep=True)
+ else:
+ values = self.values.copy()
+
+ return Series(values, index=index, name=name)
+
+ def snap(self, freq='S'):
+ """
+ Snap time stamps to nearest occurring frequency
+ """
+ # Superdumb, punting on any optimizing
+ freq = to_offset(freq)
+
+ snapped = np.empty(len(self), dtype=_NS_DTYPE)
+
+ for i, v in enumerate(self):
+ s = v
+ if not freq.onOffset(s):
+ t0 = freq.rollback(s)
+ t1 = freq.rollforward(s)
+ if abs(s - t0) < abs(t1 - s):
+ s = t0
+ else:
+ s = t1
+ snapped[i] = s
+
+ # we know it conforms; skip check
+ return DatetimeIndex._simple_new(snapped, freq=freq)
+ # TODO: what about self.name? tz? if so, use shallow_copy?
+
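+ # Illustrative usage sketch (hypothetical values): snapping minute-level
+ # stamps to the hour, e.g.
+ # pd.DatetimeIndex(['2019-01-01 00:29', '2019-01-01 00:31']).snap('H')
+ # is expected to round to ['2019-01-01 00:00', '2019-01-01 01:00'].
+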
+ def join(self, other, how='left', level=None, return_indexers=False,
+ sort=False):
+ """
+ See Index.join
+ """
+ if (not isinstance(other, DatetimeIndex) and len(other) > 0 and
+ other.inferred_type not in ('floating', 'integer', 'mixed-integer',
+ 'mixed-integer-float', 'mixed')):
+ try:
+ other = DatetimeIndex(other)
+ except (TypeError, ValueError):
+ pass
+
+ this, other = self._maybe_utc_convert(other)
+ return Index.join(this, other, how=how, level=level,
+ return_indexers=return_indexers, sort=sort)
+
+ def _maybe_utc_convert(self, other):
+ this = self
+ if isinstance(other, DatetimeIndex):
+ if self.tz is not None:
+ if other.tz is None:
+ raise TypeError('Cannot join tz-naive with tz-aware '
+ 'DatetimeIndex')
+ elif other.tz is not None:
+ raise TypeError('Cannot join tz-naive with tz-aware '
+ 'DatetimeIndex')
+
+ if not timezones.tz_compare(self.tz, other.tz):
+ this = self.tz_convert('UTC')
+ other = other.tz_convert('UTC')
+ return this, other
+
+ def _wrap_joined_index(self, joined, other):
+ name = get_op_result_name(self, other)
+ if (isinstance(other, DatetimeIndex) and
+ self.freq == other.freq and
+ self._can_fast_union(other)):
+ joined = self._shallow_copy(joined)
+ joined.name = name
+ return joined
+ else:
+ tz = getattr(other, 'tz', None)
+ return self._simple_new(joined, name, tz=tz)
+
+ def _parsed_string_to_bounds(self, reso, parsed):
+ """
+ Calculate datetime bounds for parsed time string and its resolution.
+
+ Parameters
+ ----------
+ reso : Resolution
+ Resolution provided by parsed string.
+ parsed : datetime
+ Datetime from parsed string.
+
+ Returns
+ -------
+        lower, upper : pd.Timestamp
+
+ """
+ if reso == 'year':
+ return (Timestamp(datetime(parsed.year, 1, 1), tz=self.tz),
+ Timestamp(datetime(parsed.year, 12, 31, 23,
+ 59, 59, 999999), tz=self.tz))
+ elif reso == 'month':
+ d = ccalendar.get_days_in_month(parsed.year, parsed.month)
+ return (Timestamp(datetime(parsed.year, parsed.month, 1),
+ tz=self.tz),
+ Timestamp(datetime(parsed.year, parsed.month, d, 23,
+ 59, 59, 999999), tz=self.tz))
+ elif reso == 'quarter':
+ qe = (((parsed.month - 1) + 2) % 12) + 1 # two months ahead
+ d = ccalendar.get_days_in_month(parsed.year, qe) # at end of month
+ return (Timestamp(datetime(parsed.year, parsed.month, 1),
+ tz=self.tz),
+ Timestamp(datetime(parsed.year, qe, d, 23, 59,
+ 59, 999999), tz=self.tz))
+ elif reso == 'day':
+ st = datetime(parsed.year, parsed.month, parsed.day)
+ return (Timestamp(st, tz=self.tz),
+ Timestamp(Timestamp(st + offsets.Day(),
+ tz=self.tz).value - 1))
+ elif reso == 'hour':
+ st = datetime(parsed.year, parsed.month, parsed.day,
+ hour=parsed.hour)
+ return (Timestamp(st, tz=self.tz),
+ Timestamp(Timestamp(st + offsets.Hour(),
+ tz=self.tz).value - 1))
+ elif reso == 'minute':
+ st = datetime(parsed.year, parsed.month, parsed.day,
+ hour=parsed.hour, minute=parsed.minute)
+ return (Timestamp(st, tz=self.tz),
+ Timestamp(Timestamp(st + offsets.Minute(),
+ tz=self.tz).value - 1))
+ elif reso == 'second':
+ st = datetime(parsed.year, parsed.month, parsed.day,
+ hour=parsed.hour, minute=parsed.minute,
+ second=parsed.second)
+ return (Timestamp(st, tz=self.tz),
+ Timestamp(Timestamp(st + offsets.Second(),
+ tz=self.tz).value - 1))
+ elif reso == 'microsecond':
+ st = datetime(parsed.year, parsed.month, parsed.day,
+ parsed.hour, parsed.minute, parsed.second,
+ parsed.microsecond)
+ return (Timestamp(st, tz=self.tz), Timestamp(st, tz=self.tz))
+ else:
+ raise KeyError
+
+ def _partial_date_slice(self, reso, parsed, use_lhs=True, use_rhs=True):
+ is_monotonic = self.is_monotonic
+ if (is_monotonic and reso in ['day', 'hour', 'minute', 'second'] and
+ self._resolution >= Resolution.get_reso(reso)):
+ # These resolution/monotonicity validations came from GH3931,
+ # GH3452 and GH2369.
+
+ # See also GH14826
+ raise KeyError
+
+ if reso == 'microsecond':
+ # _partial_date_slice doesn't allow microsecond resolution, but
+ # _parsed_string_to_bounds allows it.
+ raise KeyError
+
+ t1, t2 = self._parsed_string_to_bounds(reso, parsed)
+ stamps = self.asi8
+
+ if is_monotonic:
+
+ # we are out of range
+ if (len(stamps) and ((use_lhs and t1.value < stamps[0] and
+ t2.value < stamps[0]) or
+ ((use_rhs and t1.value > stamps[-1] and
+ t2.value > stamps[-1])))):
+ raise KeyError
+
+ # a monotonic (sorted) series can be sliced
+ left = stamps.searchsorted(
+ t1.value, side='left') if use_lhs else None
+ right = stamps.searchsorted(
+ t2.value, side='right') if use_rhs else None
+
+ return slice(left, right)
+
+ lhs_mask = (stamps >= t1.value) if use_lhs else True
+ rhs_mask = (stamps <= t2.value) if use_rhs else True
+
+        # try to find the dates
+ return (lhs_mask & rhs_mask).nonzero()[0]
+
+ def _maybe_promote(self, other):
+ if other.inferred_type == 'date':
+ other = DatetimeIndex(other)
+ return self, other
+
+ def get_value(self, series, key):
+ """
+ Fast lookup of value from 1-dimensional ndarray. Only use this if you
+ know what you're doing
+ """
+
+ if isinstance(key, datetime):
+
+ # needed to localize naive datetimes
+ if self.tz is not None:
+ if key.tzinfo is not None:
+ key = Timestamp(key).tz_convert(self.tz)
+ else:
+ key = Timestamp(key).tz_localize(self.tz)
+
+ return self.get_value_maybe_box(series, key)
+
+ if isinstance(key, time):
+ locs = self.indexer_at_time(key)
+ return series.take(locs)
+
+ try:
+ return com.maybe_box(self, Index.get_value(self, series, key),
+ series, key)
+ except KeyError:
+ try:
+ loc = self._get_string_slice(key)
+ return series[loc]
+ except (TypeError, ValueError, KeyError):
+ pass
+
+ try:
+ return self.get_value_maybe_box(series, key)
+ except (TypeError, ValueError, KeyError):
+ raise KeyError(key)
+
+ def get_value_maybe_box(self, series, key):
+ # needed to localize naive datetimes
+ if self.tz is not None:
+ key = Timestamp(key)
+ if key.tzinfo is not None:
+ key = key.tz_convert(self.tz)
+ else:
+ key = key.tz_localize(self.tz)
+ elif not isinstance(key, Timestamp):
+ key = Timestamp(key)
+ values = self._engine.get_value(com.values_from_object(series),
+ key, tz=self.tz)
+ return com.maybe_box(self, values, series, key)
+
+ def get_loc(self, key, method=None, tolerance=None):
+ """
+ Get integer location for requested label
+
+ Returns
+ -------
+ loc : int
+ """
+
+ if tolerance is not None:
+ # try converting tolerance now, so errors don't get swallowed by
+ # the try/except clauses below
+ tolerance = self._convert_tolerance(tolerance, np.asarray(key))
+
+ if isinstance(key, datetime):
+ # needed to localize naive datetimes
+ if key.tzinfo is None:
+ key = Timestamp(key, tz=self.tz)
+ else:
+ key = Timestamp(key).tz_convert(self.tz)
+ return Index.get_loc(self, key, method, tolerance)
+
+ elif isinstance(key, timedelta):
+ # GH#20464
+ raise TypeError("Cannot index {cls} with {other}"
+ .format(cls=type(self).__name__,
+ other=type(key).__name__))
+
+ if isinstance(key, time):
+ if method is not None:
+ raise NotImplementedError('cannot yet lookup inexact labels '
+ 'when key is a time object')
+ return self.indexer_at_time(key)
+
+ try:
+ return Index.get_loc(self, key, method, tolerance)
+ except (KeyError, ValueError, TypeError):
+ try:
+ return self._get_string_slice(key)
+ except (TypeError, KeyError, ValueError):
+ pass
+
+ try:
+ stamp = Timestamp(key)
+ if stamp.tzinfo is not None and self.tz is not None:
+ stamp = stamp.tz_convert(self.tz)
+ else:
+ stamp = stamp.tz_localize(self.tz)
+ return Index.get_loc(self, stamp, method, tolerance)
+ except KeyError:
+ raise KeyError(key)
+ except ValueError as e:
+ # list-like tolerance size must match target index size
+ if 'list-like' in str(e):
+ raise e
+ raise KeyError(key)
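+    # Illustrative sketch (editorial note, not part of the upstream pandas
+    # source): ``get_loc`` above accepts Timestamps, datetimes, strings and
+    # datetime.time objects.
+    #
+    #   idx = pd.date_range('2018-01-01', periods=3, freq='D')
+    #   idx.get_loc('2018-01-02')            # -> 1
+    #   idx.get_loc(datetime(2018, 1, 2))    # -> 1 (naive keys localized to
+    #                                        #    the index tz)
+    #   idx.get_loc('2018-01')               # partial string -> slice over
+    #                                        #    all of January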
+
+ def _maybe_cast_slice_bound(self, label, side, kind):
+ """
+ If label is a string, cast it to datetime according to resolution.
+
+ Parameters
+ ----------
+ label : object
+ side : {'left', 'right'}
+ kind : {'ix', 'loc', 'getitem'}
+
+ Returns
+ -------
+ label : object
+
+ Notes
+ -----
+ Value of `side` parameter should be validated in caller.
+
+ """
+ assert kind in ['ix', 'loc', 'getitem', None]
+
+ if is_float(label) or isinstance(label, time) or is_integer(label):
+ self._invalid_indexer('slice', label)
+
+ if isinstance(label, compat.string_types):
+ freq = getattr(self, 'freqstr',
+ getattr(self, 'inferred_freq', None))
+ _, parsed, reso = parsing.parse_time_string(label, freq)
+ lower, upper = self._parsed_string_to_bounds(reso, parsed)
+ # lower, upper form the half-open interval:
+ # [parsed, parsed + 1 freq)
+            # because label may be passed to searchsorted
+            # the bounds need to be swapped if the index is reverse sorted
+            # and has length > 1 (is_monotonic_decreasing gives True for
+            # empty and length-1 indexes)
+ if self._is_strictly_monotonic_decreasing and len(self) > 1:
+ return upper if side == 'left' else lower
+ return lower if side == 'left' else upper
+ else:
+ return label
+
+ def _get_string_slice(self, key, use_lhs=True, use_rhs=True):
+ freq = getattr(self, 'freqstr',
+ getattr(self, 'inferred_freq', None))
+ _, parsed, reso = parsing.parse_time_string(key, freq)
+ loc = self._partial_date_slice(reso, parsed, use_lhs=use_lhs,
+ use_rhs=use_rhs)
+ return loc
+
+ def slice_indexer(self, start=None, end=None, step=None, kind=None):
+ """
+ Return indexer for specified label slice.
+ Index.slice_indexer, customized to handle time slicing.
+
+ In addition to functionality provided by Index.slice_indexer, does the
+ following:
+
+ - if both `start` and `end` are instances of `datetime.time`, it
+ invokes `indexer_between_time`
+ - if `start` and `end` are both either string or None perform
+ value-based selection in non-monotonic cases.
+
+ """
+ # For historical reasons DatetimeIndex supports slices between two
+ # instances of datetime.time as if it were applying a slice mask to
+ # an array of (self.hour, self.minute, self.seconds, self.microsecond).
+ if isinstance(start, time) and isinstance(end, time):
+ if step is not None and step != 1:
+ raise ValueError('Must have step size of 1 with time slices')
+ return self.indexer_between_time(start, end)
+
+ if isinstance(start, time) or isinstance(end, time):
+ raise KeyError('Cannot mix time and non-time slice keys')
+
+ try:
+ return Index.slice_indexer(self, start, end, step, kind=kind)
+ except KeyError:
+ # For historical reasons DatetimeIndex by default supports
+ # value-based partial (aka string) slices on non-monotonic arrays,
+ # let's try that.
+ if ((start is None or isinstance(start, compat.string_types)) and
+ (end is None or isinstance(end, compat.string_types))):
+ mask = True
+ if start is not None:
+ start_casted = self._maybe_cast_slice_bound(
+ start, 'left', kind)
+ mask = start_casted <= self
+
+ if end is not None:
+ end_casted = self._maybe_cast_slice_bound(
+ end, 'right', kind)
+ mask = (self <= end_casted) & mask
+
+ indexer = mask.nonzero()[0][::step]
+ if len(indexer) == len(self):
+ return slice(None)
+ else:
+ return indexer
+ else:
+ raise
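+    # Illustrative sketch (editorial note, not part of the upstream pandas
+    # source): the two special cases handled above in action.
+    #
+    #   idx = pd.date_range('2018-01-01', periods=48, freq='H')
+    #   ser = pd.Series(range(48), index=idx)
+    #   ser['2018-01-01':'2018-01-01']                  # string-label slice
+    #   ser[datetime.time(9, 0):datetime.time(10, 0)]   # 09:00 and 10:00 rows
+    #                                                   #   of each day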
+
+ # --------------------------------------------------------------------
+ # Wrapping DatetimeArray
+
+ # Compat for frequency inference, see GH#23789
+ _is_monotonic_increasing = Index.is_monotonic_increasing
+ _is_monotonic_decreasing = Index.is_monotonic_decreasing
+ _is_unique = Index.is_unique
+
+ _timezone = cache_readonly(DatetimeArray._timezone.fget)
+ is_normalized = cache_readonly(DatetimeArray.is_normalized.fget)
+ _resolution = cache_readonly(DatetimeArray._resolution.fget)
+
+ strftime = ea_passthrough(DatetimeArray.strftime)
+ _has_same_tz = ea_passthrough(DatetimeArray._has_same_tz)
+
+ @property
+ def offset(self):
+ """
+ get/set the frequency of the instance
+ """
+ msg = ('{cls}.offset has been deprecated and will be removed '
+ 'in a future version; use {cls}.freq instead.'
+ .format(cls=type(self).__name__))
+ warnings.warn(msg, FutureWarning, stacklevel=2)
+ return self.freq
+
+ @offset.setter
+ def offset(self, value):
+ """
+ get/set the frequency of the instance
+ """
+ msg = ('{cls}.offset has been deprecated and will be removed '
+ 'in a future version; use {cls}.freq instead.'
+ .format(cls=type(self).__name__))
+ warnings.warn(msg, FutureWarning, stacklevel=2)
+ self.freq = value
+
+ def __getitem__(self, key):
+ result = self._data.__getitem__(key)
+ if is_scalar(result):
+ return result
+ elif result.ndim > 1:
+ # To support MPL which performs slicing with 2 dim
+ # even though it only has 1 dim by definition
+ assert isinstance(result, np.ndarray), result
+ return result
+ return type(self)(result, name=self.name)
+
+ @property
+ def _box_func(self):
+ return lambda x: Timestamp(x, tz=self.tz)
+
+ # --------------------------------------------------------------------
+
+ @Substitution(klass='DatetimeIndex')
+ @Appender(_shared_docs['searchsorted'])
+ def searchsorted(self, value, side='left', sorter=None):
+ if isinstance(value, (np.ndarray, Index)):
+ value = np.array(value, dtype=_NS_DTYPE, copy=False)
+ else:
+ value = _to_M8(value, tz=self.tz)
+
+ return self.values.searchsorted(value, side=side)
+
+ def is_type_compatible(self, typ):
+ return typ == self.inferred_type or typ == 'datetime'
+
+ @property
+ def inferred_type(self):
+        # b/c datetime is represented as nanoseconds since the epoch, make
+ # sure we can't have ambiguous indexing
+ return 'datetime64'
+
+ @property
+ def is_all_dates(self):
+ return True
+
+ def insert(self, loc, item):
+ """
+ Make new Index inserting new item at location
+
+ Parameters
+ ----------
+ loc : int
+ item : object
+ if not either a Python datetime or a numpy integer-like, returned
+ Index dtype will be object rather than datetime.
+
+ Returns
+ -------
+ new_index : Index
+ """
+ if is_scalar(item) and isna(item):
+ # GH 18295
+ item = self._na_value
+
+ freq = None
+
+ if isinstance(item, (datetime, np.datetime64)):
+ self._assert_can_do_op(item)
+ if not self._has_same_tz(item) and not isna(item):
+ raise ValueError(
+ 'Passed item and index have different timezone')
+ # check freq can be preserved on edge cases
+ if self.size and self.freq is not None:
+ if ((loc == 0 or loc == -len(self)) and
+ item + self.freq == self[0]):
+ freq = self.freq
+ elif (loc == len(self)) and item - self.freq == self[-1]:
+ freq = self.freq
+ item = _to_M8(item, tz=self.tz)
+
+ try:
+ new_dates = np.concatenate((self[:loc].asi8, [item.view(np.int64)],
+ self[loc:].asi8))
+ return self._shallow_copy(new_dates, freq=freq)
+ except (AttributeError, TypeError):
+
+ # fall back to object index
+ if isinstance(item, compat.string_types):
+ return self.astype(object).insert(loc, item)
+ raise TypeError(
+ "cannot insert DatetimeIndex with incompatible label")
+
+ def delete(self, loc):
+ """
+ Make a new DatetimeIndex with passed location(s) deleted.
+
+ Parameters
+ ----------
+ loc: int, slice or array of ints
+ Indicate which sub-arrays to remove.
+
+ Returns
+ -------
+ new_index : DatetimeIndex
+ """
+ new_dates = np.delete(self.asi8, loc)
+
+ freq = None
+ if is_integer(loc):
+ if loc in (0, -len(self), -1, len(self) - 1):
+ freq = self.freq
+ else:
+ if is_list_like(loc):
+ loc = lib.maybe_indices_to_slice(
+ ensure_int64(np.array(loc)), len(self))
+ if isinstance(loc, slice) and loc.step in (1, None):
+ if (loc.start in (0, None) or loc.stop in (len(self), None)):
+ freq = self.freq
+
+ return self._shallow_copy(new_dates, freq=freq)
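+    # Illustrative sketch (editorial note, not part of the upstream pandas
+    # source): ``delete`` above keeps ``freq`` only when the removed
+    # location(s) leave the remaining values evenly spaced.
+    #
+    #   idx = pd.date_range('2018-01-01', periods=3, freq='D')
+    #   idx.delete(0)   # endpoint removed -> freq 'D' kept
+    #   idx.delete(1)   # middle removed   -> freq becomes None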
+
+ def indexer_at_time(self, time, asof=False):
+ """
+ Returns index locations of index values at particular time of day
+ (e.g. 9:30AM).
+
+ Parameters
+ ----------
+ time : datetime.time or string
+ datetime.time or string in appropriate format ("%H:%M", "%H%M",
+ "%I:%M%p", "%I%M%p", "%H:%M:%S", "%H%M%S", "%I:%M:%S%p",
+ "%I%M%S%p").
+
+ Returns
+ -------
+ values_at_time : array of integers
+
+ See Also
+ --------
+ indexer_between_time, DataFrame.at_time
+ """
+ from dateutil.parser import parse
+
+ if asof:
+ raise NotImplementedError("'asof' argument is not supported")
+
+ if isinstance(time, compat.string_types):
+ time = parse(time).time()
+
+ if time.tzinfo:
+ # TODO
+ raise NotImplementedError("argument 'time' with timezone info is "
+ "not supported")
+
+ time_micros = self._get_time_micros()
+ micros = _time_to_micros(time)
+ return (micros == time_micros).nonzero()[0]
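+    # Illustrative sketch (editorial note, not part of the upstream pandas
+    # source): ``indexer_at_time`` above returns the positions whose
+    # time-of-day matches exactly.
+    #
+    #   idx = pd.date_range('2018-01-01', periods=6, freq='12H')
+    #   idx.indexer_at_time('12:00')   # -> array([1, 3, 5])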
+
+ def indexer_between_time(self, start_time, end_time, include_start=True,
+ include_end=True):
+ """
+ Return index locations of values between particular times of day
+ (e.g., 9:00-9:30AM).
+
+ Parameters
+ ----------
+ start_time, end_time : datetime.time, str
+ datetime.time or string in appropriate format ("%H:%M", "%H%M",
+ "%I:%M%p", "%I%M%p", "%H:%M:%S", "%H%M%S", "%I:%M:%S%p",
+ "%I%M%S%p").
+ include_start : boolean, default True
+ include_end : boolean, default True
+
+ Returns
+ -------
+ values_between_time : array of integers
+
+ See Also
+ --------
+ indexer_at_time, DataFrame.between_time
+ """
+ start_time = tools.to_time(start_time)
+ end_time = tools.to_time(end_time)
+ time_micros = self._get_time_micros()
+ start_micros = _time_to_micros(start_time)
+ end_micros = _time_to_micros(end_time)
+
+ if include_start and include_end:
+ lop = rop = operator.le
+ elif include_start:
+ lop = operator.le
+ rop = operator.lt
+ elif include_end:
+ lop = operator.lt
+ rop = operator.le
+ else:
+ lop = rop = operator.lt
+
+ if start_time <= end_time:
+ join_op = operator.and_
+ else:
+ join_op = operator.or_
+
+ mask = join_op(lop(start_micros, time_micros),
+ rop(time_micros, end_micros))
+
+ return mask.nonzero()[0]
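+    # Illustrative sketch (editorial note, not part of the upstream pandas
+    # source): when ``start_time`` <= ``end_time`` the two masks are AND-ed;
+    # otherwise they are OR-ed, so the window wraps around midnight.
+    #
+    #   idx = pd.date_range('2018-01-01', periods=24, freq='H')
+    #   idx.indexer_between_time('09:00', '11:00')   # -> array([9, 10, 11])
+    #   idx.indexer_between_time('23:00', '01:00')   # -> array([0, 1, 23])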
+
+
+DatetimeIndex._add_comparison_ops()
+DatetimeIndex._add_numeric_methods_disabled()
+DatetimeIndex._add_logical_methods_disabled()
+DatetimeIndex._add_datetimelike_methods()
+
+
+def date_range(start=None, end=None, periods=None, freq=None, tz=None,
+ normalize=False, name=None, closed=None, **kwargs):
+ """
+ Return a fixed frequency DatetimeIndex.
+
+ Parameters
+ ----------
+ start : str or datetime-like, optional
+ Left bound for generating dates.
+ end : str or datetime-like, optional
+ Right bound for generating dates.
+ periods : integer, optional
+ Number of periods to generate.
+ freq : str or DateOffset, default 'D'
+ Frequency strings can have multiples, e.g. '5H'. See
+ :ref:`here <timeseries.offset_aliases>` for a list of
+ frequency aliases.
+ tz : str or tzinfo, optional
+ Time zone name for returning localized DatetimeIndex, for example
+ 'Asia/Hong_Kong'. By default, the resulting DatetimeIndex is
+ timezone-naive.
+ normalize : bool, default False
+ Normalize start/end dates to midnight before generating date range.
+ name : str, default None
+ Name of the resulting DatetimeIndex.
+ closed : {None, 'left', 'right'}, optional
+ Make the interval closed with respect to the given frequency to
+ the 'left', 'right', or both sides (None, the default).
+ **kwargs
+ For compatibility. Has no effect on the result.
+
+ Returns
+ -------
+ rng : DatetimeIndex
+
+ See Also
+ --------
+ pandas.DatetimeIndex : An immutable container for datetimes.
+ pandas.timedelta_range : Return a fixed frequency TimedeltaIndex.
+ pandas.period_range : Return a fixed frequency PeriodIndex.
+ pandas.interval_range : Return a fixed frequency IntervalIndex.
+
+ Notes
+ -----
+ Of the four parameters ``start``, ``end``, ``periods``, and ``freq``,
+ exactly three must be specified. If ``freq`` is omitted, the resulting
+ ``DatetimeIndex`` will have ``periods`` linearly spaced elements between
+ ``start`` and ``end`` (closed on both sides).
+
+ To learn more about the frequency strings, please see `this link
+ <http://pandas.pydata.org/pandas-docs/stable/timeseries.html#offset-aliases>`__.
+
+ Examples
+ --------
+ **Specifying the values**
+
+ The next four examples generate the same `DatetimeIndex`, but vary
+ the combination of `start`, `end` and `periods`.
+
+ Specify `start` and `end`, with the default daily frequency.
+
+ >>> pd.date_range(start='1/1/2018', end='1/08/2018')
+ DatetimeIndex(['2018-01-01', '2018-01-02', '2018-01-03', '2018-01-04',
+ '2018-01-05', '2018-01-06', '2018-01-07', '2018-01-08'],
+ dtype='datetime64[ns]', freq='D')
+
+ Specify `start` and `periods`, the number of periods (days).
+
+ >>> pd.date_range(start='1/1/2018', periods=8)
+ DatetimeIndex(['2018-01-01', '2018-01-02', '2018-01-03', '2018-01-04',
+ '2018-01-05', '2018-01-06', '2018-01-07', '2018-01-08'],
+ dtype='datetime64[ns]', freq='D')
+
+ Specify `end` and `periods`, the number of periods (days).
+
+ >>> pd.date_range(end='1/1/2018', periods=8)
+ DatetimeIndex(['2017-12-25', '2017-12-26', '2017-12-27', '2017-12-28',
+ '2017-12-29', '2017-12-30', '2017-12-31', '2018-01-01'],
+ dtype='datetime64[ns]', freq='D')
+
+ Specify `start`, `end`, and `periods`; the frequency is generated
+ automatically (linearly spaced).
+
+ >>> pd.date_range(start='2018-04-24', end='2018-04-27', periods=3)
+ DatetimeIndex(['2018-04-24 00:00:00', '2018-04-25 12:00:00',
+ '2018-04-27 00:00:00'],
+ dtype='datetime64[ns]', freq=None)
+
+ **Other Parameters**
+
+ Changed the `freq` (frequency) to ``'M'`` (month end frequency).
+
+ >>> pd.date_range(start='1/1/2018', periods=5, freq='M')
+ DatetimeIndex(['2018-01-31', '2018-02-28', '2018-03-31', '2018-04-30',
+ '2018-05-31'],
+ dtype='datetime64[ns]', freq='M')
+
+ Multiples are allowed
+
+ >>> pd.date_range(start='1/1/2018', periods=5, freq='3M')
+ DatetimeIndex(['2018-01-31', '2018-04-30', '2018-07-31', '2018-10-31',
+ '2019-01-31'],
+ dtype='datetime64[ns]', freq='3M')
+
+ `freq` can also be specified as an Offset object.
+
+ >>> pd.date_range(start='1/1/2018', periods=5, freq=pd.offsets.MonthEnd(3))
+ DatetimeIndex(['2018-01-31', '2018-04-30', '2018-07-31', '2018-10-31',
+ '2019-01-31'],
+ dtype='datetime64[ns]', freq='3M')
+
+ Specify `tz` to set the timezone.
+
+ >>> pd.date_range(start='1/1/2018', periods=5, tz='Asia/Tokyo')
+ DatetimeIndex(['2018-01-01 00:00:00+09:00', '2018-01-02 00:00:00+09:00',
+ '2018-01-03 00:00:00+09:00', '2018-01-04 00:00:00+09:00',
+ '2018-01-05 00:00:00+09:00'],
+ dtype='datetime64[ns, Asia/Tokyo]', freq='D')
+
+ `closed` controls whether to include `start` and `end` that are on the
+ boundary. The default includes boundary points on either end.
+
+ >>> pd.date_range(start='2017-01-01', end='2017-01-04', closed=None)
+ DatetimeIndex(['2017-01-01', '2017-01-02', '2017-01-03', '2017-01-04'],
+ dtype='datetime64[ns]', freq='D')
+
+ Use ``closed='left'`` to exclude `end` if it falls on the boundary.
+
+ >>> pd.date_range(start='2017-01-01', end='2017-01-04', closed='left')
+ DatetimeIndex(['2017-01-01', '2017-01-02', '2017-01-03'],
+ dtype='datetime64[ns]', freq='D')
+
+ Use ``closed='right'`` to exclude `start` if it falls on the boundary.
+
+ >>> pd.date_range(start='2017-01-01', end='2017-01-04', closed='right')
+ DatetimeIndex(['2017-01-02', '2017-01-03', '2017-01-04'],
+ dtype='datetime64[ns]', freq='D')
+ """
+
+ if freq is None and com._any_none(periods, start, end):
+ freq = 'D'
+
+ dtarr = DatetimeArray._generate_range(
+ start=start, end=end, periods=periods,
+ freq=freq, tz=tz, normalize=normalize,
+ closed=closed, **kwargs)
+ return DatetimeIndex._simple_new(
+ dtarr, tz=dtarr.tz, freq=dtarr.freq, name=name)
+
+
+def bdate_range(start=None, end=None, periods=None, freq='B', tz=None,
+ normalize=True, name=None, weekmask=None, holidays=None,
+ closed=None, **kwargs):
+ """
+    Return a fixed frequency DatetimeIndex, with business day as the default
+    frequency.
+
+ Parameters
+ ----------
+ start : string or datetime-like, default None
+ Left bound for generating dates.
+ end : string or datetime-like, default None
+ Right bound for generating dates.
+ periods : integer, default None
+ Number of periods to generate.
+ freq : string or DateOffset, default 'B' (business daily)
+ Frequency strings can have multiples, e.g. '5H'.
+ tz : string or None
+ Time zone name for returning localized DatetimeIndex, for example
+ Asia/Beijing.
+    normalize : bool, default True
+ Normalize start/end dates to midnight before generating date range.
+ name : string, default None
+ Name of the resulting DatetimeIndex.
+ weekmask : string or None, default None
+ Weekmask of valid business days, passed to ``numpy.busdaycalendar``,
+ only used when custom frequency strings are passed. The default
+ value None is equivalent to 'Mon Tue Wed Thu Fri'.
+
+ .. versionadded:: 0.21.0
+
+ holidays : list-like or None, default None
+ Dates to exclude from the set of valid business days, passed to
+ ``numpy.busdaycalendar``, only used when custom frequency strings
+ are passed.
+
+ .. versionadded:: 0.21.0
+
+ closed : string, default None
+ Make the interval closed with respect to the given frequency to
+ the 'left', 'right', or both sides (None).
+ **kwargs
+ For compatibility. Has no effect on the result.
+
+ Returns
+ -------
+ DatetimeIndex
+
+ Notes
+ -----
+ Of the four parameters: ``start``, ``end``, ``periods``, and ``freq``,
+ exactly three must be specified. Specifying ``freq`` is a requirement
+ for ``bdate_range``. Use ``date_range`` if specifying ``freq`` is not
+ desired.
+
+ To learn more about the frequency strings, please see `this link
+ <http://pandas.pydata.org/pandas-docs/stable/timeseries.html#offset-aliases>`__.
+
+ Examples
+ --------
+ Note how the two weekend days are skipped in the result.
+
+ >>> pd.bdate_range(start='1/1/2018', end='1/08/2018')
+ DatetimeIndex(['2018-01-01', '2018-01-02', '2018-01-03', '2018-01-04',
+ '2018-01-05', '2018-01-08'],
+ dtype='datetime64[ns]', freq='B')
+ """
+ if freq is None:
+ msg = 'freq must be specified for bdate_range; use date_range instead'
+ raise TypeError(msg)
+
+ if is_string_like(freq) and freq.startswith('C'):
+ try:
+ weekmask = weekmask or 'Mon Tue Wed Thu Fri'
+ freq = prefix_mapping[freq](holidays=holidays, weekmask=weekmask)
+ except (KeyError, TypeError):
+ msg = 'invalid custom frequency string: {freq}'.format(freq=freq)
+ raise ValueError(msg)
+ elif holidays or weekmask:
+ msg = ('a custom frequency string is required when holidays or '
+ 'weekmask are passed, got frequency {freq}').format(freq=freq)
+ raise ValueError(msg)
+
+ return date_range(start=start, end=end, periods=periods,
+ freq=freq, tz=tz, normalize=normalize, name=name,
+ closed=closed, **kwargs)
+
+
+def cdate_range(start=None, end=None, periods=None, freq='C', tz=None,
+ normalize=True, name=None, closed=None, **kwargs):
+ """
+    Return a fixed frequency DatetimeIndex, with CustomBusinessDay as the
+    default frequency.
+
+ .. deprecated:: 0.21.0
+
+ Parameters
+ ----------
+ start : string or datetime-like, default None
+ Left bound for generating dates
+ end : string or datetime-like, default None
+ Right bound for generating dates
+ periods : integer, default None
+ Number of periods to generate
+ freq : string or DateOffset, default 'C' (CustomBusinessDay)
+ Frequency strings can have multiples, e.g. '5H'
+ tz : string, default None
+ Time zone name for returning localized DatetimeIndex, for example
+ Asia/Beijing
+    normalize : bool, default True
+ Normalize start/end dates to midnight before generating date range
+ name : string, default None
+ Name of the resulting DatetimeIndex
+    weekmask : string, default 'Mon Tue Wed Thu Fri'
+ weekmask of valid business days, passed to ``numpy.busdaycalendar``
+ holidays : list
+ list/array of dates to exclude from the set of valid business days,
+ passed to ``numpy.busdaycalendar``
+ closed : string, default None
+ Make the interval closed with respect to the given frequency to
+ the 'left', 'right', or both sides (None)
+
+ Notes
+ -----
+ Of the three parameters: ``start``, ``end``, and ``periods``, exactly two
+ must be specified.
+
+ To learn more about the frequency strings, please see `this link
+ <http://pandas.pydata.org/pandas-docs/stable/timeseries.html#offset-aliases>`__.
+
+ Returns
+ -------
+ rng : DatetimeIndex
+ """
+ warnings.warn("cdate_range is deprecated and will be removed in a future "
+ "version, instead use pd.bdate_range(..., freq='{freq}')"
+ .format(freq=freq), FutureWarning, stacklevel=2)
+
+ if freq == 'C':
+ holidays = kwargs.pop('holidays', [])
+ weekmask = kwargs.pop('weekmask', 'Mon Tue Wed Thu Fri')
+ freq = CDay(holidays=holidays, weekmask=weekmask)
+
+ return date_range(start=start, end=end, periods=periods, freq=freq,
+ tz=tz, normalize=normalize, name=name,
+ closed=closed, **kwargs)
+
+
+def _time_to_micros(time):
+ seconds = time.hour * 60 * 60 + 60 * time.minute + time.second
+ return 1000000 * seconds + time.microsecond
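+# Editorial note (not part of the upstream pandas source): _time_to_micros
+# folds a datetime.time into microseconds since midnight, e.g.
+#
+#   _time_to_micros(datetime.time(9, 30))
+#   # == (9 * 3600 + 30 * 60) * 1000000 == 34200000000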
diff --git a/contrib/python/pandas/py2/pandas/core/indexes/frozen.py b/contrib/python/pandas/py2/pandas/core/indexes/frozen.py
new file mode 100644
index 00000000000..982645ebd51
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/core/indexes/frozen.py
@@ -0,0 +1,196 @@
+"""
+frozen (immutable) data structures to support MultiIndexing
+
+These are used for:
+
+- .names (FrozenList)
+- .levels & .codes (FrozenNDArray)
+
+"""
+
+import warnings
+
+import numpy as np
+
+from pandas.util._decorators import deprecate_kwarg
+
+from pandas.core.dtypes.cast import coerce_indexer_dtype
+
+from pandas.core.base import PandasObject
+
+from pandas.io.formats.printing import pprint_thing
+
+
+class FrozenList(PandasObject, list):
+
+ """
+    Container that doesn't allow setting items *but*,
+    because it's technically hashable, will be used
+    for lookups, appropriately, etc.
+ """
+ # Side note: This has to be of type list. Otherwise,
+ # it messes up PyTables type checks.
+
+ def union(self, other):
+ """
+ Returns a FrozenList with other concatenated to the end of self.
+
+ Parameters
+ ----------
+ other : array-like
+ The array-like whose elements we are concatenating.
+
+ Returns
+ -------
+        union : FrozenList
+            The collection of self concatenated with other.
+ """
+ if isinstance(other, tuple):
+ other = list(other)
+ return type(self)(super(FrozenList, self).__add__(other))
+
+ def difference(self, other):
+ """
+ Returns a FrozenList with elements from other removed from self.
+
+ Parameters
+ ----------
+ other : array-like
+            The array-like whose elements we are removing from self.
+
+ Returns
+ -------
+ diff : FrozenList
+ The collection difference between self and other.
+ """
+ other = set(other)
+ temp = [x for x in self if x not in other]
+ return type(self)(temp)
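+    # Illustrative sketch (editorial note, not part of the upstream pandas
+    # source): FrozenList supports set-like combination while rejecting
+    # in-place mutation.
+    #
+    #   fl = FrozenList(['a', 'b'])
+    #   fl.union(['c'])        # -> FrozenList(['a', 'b', 'c'])
+    #   fl.difference(['b'])   # -> FrozenList(['a'])
+    #   fl.append('c')         # raises TypeError (mutation is disabled)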
+
+ # TODO: Consider deprecating these in favor of `union` (xref gh-15506)
+ __add__ = __iadd__ = union
+
+ # Python 2 compat
+ def __getslice__(self, i, j):
+ return self.__class__(super(FrozenList, self).__getslice__(i, j))
+
+ def __getitem__(self, n):
+ # Python 3 compat
+ if isinstance(n, slice):
+ return self.__class__(super(FrozenList, self).__getitem__(n))
+ return super(FrozenList, self).__getitem__(n)
+
+ def __radd__(self, other):
+ if isinstance(other, tuple):
+ other = list(other)
+ return self.__class__(other + list(self))
+
+ def __eq__(self, other):
+ if isinstance(other, (tuple, FrozenList)):
+ other = list(other)
+ return super(FrozenList, self).__eq__(other)
+
+ __req__ = __eq__
+
+ def __mul__(self, other):
+ return self.__class__(super(FrozenList, self).__mul__(other))
+
+ __imul__ = __mul__
+
+ def __reduce__(self):
+ return self.__class__, (list(self),)
+
+ def __hash__(self):
+ return hash(tuple(self))
+
+ def _disabled(self, *args, **kwargs):
+ """This method will not function because object is immutable."""
+ raise TypeError("'%s' does not support mutable operations." %
+ self.__class__.__name__)
+
+ def __unicode__(self):
+ return pprint_thing(self, quote_strings=True,
+ escape_chars=('\t', '\r', '\n'))
+
+ def __repr__(self):
+ return "%s(%s)" % (self.__class__.__name__,
+ str(self))
+
+ __setitem__ = __setslice__ = __delitem__ = __delslice__ = _disabled
+ pop = append = extend = remove = sort = insert = _disabled
+
+
+class FrozenNDArray(PandasObject, np.ndarray):
+
+ # no __array_finalize__ for now because no metadata
+ def __new__(cls, data, dtype=None, copy=False):
+ warnings.warn("\nFrozenNDArray is deprecated and will be removed in a "
+ "future version.\nPlease use `numpy.ndarray` instead.\n",
+ FutureWarning, stacklevel=2)
+
+ if copy is None:
+ copy = not isinstance(data, FrozenNDArray)
+ res = np.array(data, dtype=dtype, copy=copy).view(cls)
+ return res
+
+ def _disabled(self, *args, **kwargs):
+ """This method will not function because object is immutable."""
+ raise TypeError("'%s' does not support mutable operations." %
+ self.__class__)
+
+ __setitem__ = __setslice__ = __delitem__ = __delslice__ = _disabled
+ put = itemset = fill = _disabled
+
+ def _shallow_copy(self):
+ return self.view()
+
+ def values(self):
+ """returns *copy* of underlying array"""
+ arr = self.view(np.ndarray).copy()
+ return arr
+
+ def __unicode__(self):
+ """
+ Return a string representation for this object.
+
+ Invoked by unicode(df) in py2 only. Yields a Unicode String in both
+ py2/py3.
+ """
+ prepr = pprint_thing(self, escape_chars=('\t', '\r', '\n'),
+ quote_strings=True)
+ return "%s(%s, dtype='%s')" % (type(self).__name__, prepr, self.dtype)
+
+ @deprecate_kwarg(old_arg_name="v", new_arg_name="value")
+ def searchsorted(self, value, side="left", sorter=None):
+ """
+ Find indices to insert `value` so as to maintain order.
+
+ For full documentation, see `numpy.searchsorted`
+
+ See Also
+ --------
+ numpy.searchsorted : Equivalent function.
+ """
+
+ # We are much more performant if the searched
+ # indexer is the same type as the array.
+ #
+ # This doesn't matter for int64, but DOES
+ # matter for smaller int dtypes.
+ #
+ # xref: https://github.com/numpy/numpy/issues/5370
+ try:
+ value = self.dtype.type(value)
+ except ValueError:
+ pass
+
+ return super(FrozenNDArray, self).searchsorted(
+ value, side=side, sorter=sorter)
+
+
+def _ensure_frozen(array_like, categories, copy=False):
+ array_like = coerce_indexer_dtype(array_like, categories)
+ array_like = array_like.view(FrozenNDArray)
+ if copy:
+ array_like = array_like.copy()
+ return array_like
diff --git a/contrib/python/pandas/py2/pandas/core/indexes/interval.py b/contrib/python/pandas/py2/pandas/core/indexes/interval.py
new file mode 100644
index 00000000000..2c63fe33c57
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/core/indexes/interval.py
@@ -0,0 +1,1315 @@
+""" define the IntervalIndex """
+import textwrap
+import warnings
+
+import numpy as np
+
+from pandas._libs import Timedelta, Timestamp
+from pandas._libs.interval import Interval, IntervalMixin, IntervalTree
+from pandas.compat import add_metaclass
+from pandas.util._decorators import Appender, cache_readonly
+from pandas.util._doctools import _WritableDoc
+from pandas.util._exceptions import rewrite_exception
+
+from pandas.core.dtypes.cast import (
+ find_common_type, infer_dtype_from_scalar, maybe_downcast_to_dtype)
+from pandas.core.dtypes.common import (
+ ensure_platform_int, is_datetime64tz_dtype, is_datetime_or_timedelta_dtype,
+ is_dtype_equal, is_float, is_float_dtype, is_integer, is_integer_dtype,
+ is_interval_dtype, is_list_like, is_number, is_object_dtype, is_scalar)
+from pandas.core.dtypes.missing import isna
+
+from pandas.core.arrays.interval import IntervalArray, _interval_shared_docs
+import pandas.core.common as com
+from pandas.core.config import get_option
+import pandas.core.indexes.base as ibase
+from pandas.core.indexes.base import (
+ Index, _index_shared_docs, default_pprint, ensure_index)
+from pandas.core.indexes.datetimes import DatetimeIndex, date_range
+from pandas.core.indexes.multi import MultiIndex
+from pandas.core.indexes.timedeltas import TimedeltaIndex, timedelta_range
+from pandas.core.ops import get_op_result_name
+
+from pandas.tseries.frequencies import to_offset
+from pandas.tseries.offsets import DateOffset
+
+_VALID_CLOSED = {'left', 'right', 'both', 'neither'}
+_index_doc_kwargs = dict(ibase._index_doc_kwargs)
+
+_index_doc_kwargs.update(
+ dict(klass='IntervalIndex',
+ qualname="IntervalIndex",
+ target_klass='IntervalIndex or list of Intervals',
+ name=textwrap.dedent("""\
+ name : object, optional
+ Name to be stored in the index.
+ """),
+ ))
+
+
+def _get_next_label(label):
+ dtype = getattr(label, 'dtype', type(label))
+ if isinstance(label, (Timestamp, Timedelta)):
+ dtype = 'datetime64'
+ if is_datetime_or_timedelta_dtype(dtype) or is_datetime64tz_dtype(dtype):
+ return label + np.timedelta64(1, 'ns')
+ elif is_integer_dtype(dtype):
+ return label + 1
+ elif is_float_dtype(dtype):
+ return np.nextafter(label, np.infty)
+ else:
+ raise TypeError('cannot determine next label for type {typ!r}'
+ .format(typ=type(label)))
+
+
+def _get_prev_label(label):
+ dtype = getattr(label, 'dtype', type(label))
+ if isinstance(label, (Timestamp, Timedelta)):
+ dtype = 'datetime64'
+ if is_datetime_or_timedelta_dtype(dtype) or is_datetime64tz_dtype(dtype):
+ return label - np.timedelta64(1, 'ns')
+ elif is_integer_dtype(dtype):
+ return label - 1
+ elif is_float_dtype(dtype):
+ return np.nextafter(label, -np.infty)
+ else:
+        raise TypeError('cannot determine previous label for type {typ!r}'
+ .format(typ=type(label)))
+
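+# Editorial note (not part of the upstream pandas source): the two helpers
+# above step a label by the smallest representable amount for its type, e.g.
+#
+#   _get_next_label(5)                      # -> 6
+#   _get_next_label(5.0)                    # -> np.nextafter(5.0, np.inf)
+#   _get_next_label(pd.Timestamp('2018'))   # -> 2018-01-01 00:00:00.000000001
+#
+# _get_prev_label is the mirror image, stepping downwards.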
+
+def _get_interval_closed_bounds(interval):
+ """
+ Given an Interval or IntervalIndex, return the corresponding interval with
+ closed bounds.
+ """
+ left, right = interval.left, interval.right
+ if interval.open_left:
+ left = _get_next_label(left)
+ if interval.open_right:
+ right = _get_prev_label(right)
+ return left, right
+
+
+def _new_IntervalIndex(cls, d):
+ """
+ This is called upon unpickling, rather than the default which doesn't have
+ arguments and breaks __new__
+ """
+ return cls.from_arrays(**d)
+
+
+@Appender(_interval_shared_docs['class'] % dict(
+ klass="IntervalIndex",
+ summary="Immutable index of intervals that are closed on the same side.",
+ name=_index_doc_kwargs['name'],
+ versionadded="0.20.0",
+ extra_attributes="is_overlapping\nvalues\n",
+ extra_methods="contains\n",
+ examples=textwrap.dedent("""\
+ Examples
+ --------
+ A new ``IntervalIndex`` is typically constructed using
+ :func:`interval_range`:
+
+ >>> pd.interval_range(start=0, end=5)
+ IntervalIndex([(0, 1], (1, 2], (2, 3], (3, 4], (4, 5]],
+ closed='right',
+ dtype='interval[int64]')
+
+ It may also be constructed using one of the constructor
+ methods: :meth:`IntervalIndex.from_arrays`,
+ :meth:`IntervalIndex.from_breaks`, and :meth:`IntervalIndex.from_tuples`.
+
+ See further examples in the doc strings of ``interval_range`` and the
+ mentioned constructor methods.
+ """),
+
+))
+@add_metaclass(_WritableDoc)
+class IntervalIndex(IntervalMixin, Index):
+ _typ = 'intervalindex'
+ _comparables = ['name']
+ _attributes = ['name', 'closed']
+
+ # we would like our indexing holder to defer to us
+ _defer_to_indexing = True
+
+ # Immutable, so we are able to cache computations like isna in '_mask'
+ _mask = None
+
+ # --------------------------------------------------------------------
+ # Constructors
+
+ def __new__(cls, data, closed=None, dtype=None, copy=False,
+ name=None, verify_integrity=True):
+
+ if name is None and hasattr(data, 'name'):
+ name = data.name
+
+ with rewrite_exception("IntervalArray", cls.__name__):
+ array = IntervalArray(data, closed=closed, copy=copy, dtype=dtype,
+ verify_integrity=verify_integrity)
+
+ return cls._simple_new(array, name)
+
+ @classmethod
+ def _simple_new(cls, array, name, closed=None):
+ """
+ Construct from an IntervalArray
+
+ Parameters
+ ----------
+ array : IntervalArray
+ name : str
+ Attached as result.name
+ closed : Any
+ Ignored.
+ """
+ result = IntervalMixin.__new__(cls)
+ result._data = array
+ result.name = name
+ result._reset_identity()
+ return result
+
+ @classmethod
+ @Appender(_interval_shared_docs['from_breaks'] % _index_doc_kwargs)
+ def from_breaks(cls, breaks, closed='right', name=None, copy=False,
+ dtype=None):
+ with rewrite_exception("IntervalArray", cls.__name__):
+ array = IntervalArray.from_breaks(breaks, closed=closed, copy=copy,
+ dtype=dtype)
+ return cls._simple_new(array, name=name)
+
+ @classmethod
+ @Appender(_interval_shared_docs['from_arrays'] % _index_doc_kwargs)
+ def from_arrays(cls, left, right, closed='right', name=None, copy=False,
+ dtype=None):
+ with rewrite_exception("IntervalArray", cls.__name__):
+ array = IntervalArray.from_arrays(left, right, closed, copy=copy,
+ dtype=dtype)
+ return cls._simple_new(array, name=name)
+
+ @classmethod
+ @Appender(_interval_shared_docs['from_intervals'] % _index_doc_kwargs)
+ def from_intervals(cls, data, closed=None, name=None, copy=False,
+ dtype=None):
+ msg = ('IntervalIndex.from_intervals is deprecated and will be '
+ 'removed in a future version; Use IntervalIndex(...) instead')
+ warnings.warn(msg, FutureWarning, stacklevel=2)
+ with rewrite_exception("IntervalArray", cls.__name__):
+ array = IntervalArray(data, closed=closed, copy=copy, dtype=dtype)
+
+ if name is None and isinstance(data, cls):
+ name = data.name
+
+ return cls._simple_new(array, name=name)
+
+ @classmethod
+ @Appender(_interval_shared_docs['from_tuples'] % _index_doc_kwargs)
+ def from_tuples(cls, data, closed='right', name=None, copy=False,
+ dtype=None):
+ with rewrite_exception("IntervalArray", cls.__name__):
+ arr = IntervalArray.from_tuples(data, closed=closed, copy=copy,
+ dtype=dtype)
+ return cls._simple_new(arr, name=name)
+
+ # --------------------------------------------------------------------
+
+ @Appender(_index_shared_docs['_shallow_copy'])
+ def _shallow_copy(self, left=None, right=None, **kwargs):
+ result = self._data._shallow_copy(left=left, right=right)
+ attributes = self._get_attributes_dict()
+ attributes.update(kwargs)
+ return self._simple_new(result, **attributes)
+
+ @cache_readonly
+ def _isnan(self):
+ """Return a mask indicating if each value is NA"""
+ if self._mask is None:
+ self._mask = isna(self.left)
+ return self._mask
+
+ @cache_readonly
+ def _engine(self):
+ left = self._maybe_convert_i8(self.left)
+ right = self._maybe_convert_i8(self.right)
+ return IntervalTree(left, right, closed=self.closed)
+
+ def __contains__(self, key):
+ """
+        Return a boolean indicating whether this key is in the index.
+        We *only* accept an Interval object as the key.
+
+ Parameters
+ ----------
+ key : Interval
+
+ Returns
+ -------
+ boolean
+ """
+ if not isinstance(key, Interval):
+ return False
+
+ try:
+ self.get_loc(key)
+ return True
+ except KeyError:
+ return False
+
+ def contains(self, key):
+ """
+        Return a boolean indicating whether the key is in the index.
+
+        Unlike ``__contains__``, the key does not have to be an Interval;
+        scalar keys that fall within an interval are also accepted.
+
+ Parameters
+ ----------
+ key : int, float, Interval
+
+ Returns
+ -------
+ boolean
+ """
+ try:
+ self.get_loc(key)
+ return True
+ except KeyError:
+ return False
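+    # Illustrative sketch (editorial note, not part of the upstream pandas
+    # source): ``__contains__`` above only matches Interval objects, while
+    # ``contains`` also accepts points that fall inside an interval.
+    #
+    #   idx = pd.IntervalIndex.from_tuples([(0, 1), (1, 2)])
+    #   pd.Interval(0, 1) in idx   # -> True
+    #   0.5 in idx                 # -> False (not an Interval)
+    #   idx.contains(0.5)          # -> True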
+
+ @Appender(_interval_shared_docs['to_tuples'] % dict(
+ return_type="Index",
+ examples="""
+ Examples
+ --------
+ >>> idx = pd.IntervalIndex.from_arrays([0, np.nan, 2], [1, np.nan, 3])
+ >>> idx.to_tuples()
+ Index([(0.0, 1.0), (nan, nan), (2.0, 3.0)], dtype='object')
+ >>> idx.to_tuples(na_tuple=False)
+ Index([(0.0, 1.0), nan, (2.0, 3.0)], dtype='object')""",
+ ))
+ def to_tuples(self, na_tuple=True):
+ tuples = self._data.to_tuples(na_tuple=na_tuple)
+ return Index(tuples)
+
+ @cache_readonly
+ def _multiindex(self):
+ return MultiIndex.from_arrays([self.left, self.right],
+ names=['left', 'right'])
+
+ @property
+ def left(self):
+ """
+ Return the left endpoints of each Interval in the IntervalIndex as
+ an Index
+ """
+ return self._data._left
+
+ @property
+ def right(self):
+ """
+ Return the right endpoints of each Interval in the IntervalIndex as
+ an Index
+ """
+ return self._data._right
+
+ @property
+ def closed(self):
+ """
+ Whether the intervals are closed on the left-side, right-side, both or
+ neither
+ """
+ return self._data._closed
+
+ @Appender(_interval_shared_docs['set_closed'] % _index_doc_kwargs)
+ def set_closed(self, closed):
+ if closed not in _VALID_CLOSED:
+ msg = "invalid option for 'closed': {closed}"
+ raise ValueError(msg.format(closed=closed))
+
+ # return self._shallow_copy(closed=closed)
+ array = self._data.set_closed(closed)
+ return self._simple_new(array, self.name)
+
+ @property
+ def length(self):
+ """
+ Return an Index with entries denoting the length of each Interval in
+ the IntervalIndex
+ """
+ return self._data.length
+
+ @property
+ def size(self):
+ # Avoid materializing ndarray[Interval]
+ return self._data.size
+
+ @property
+ def shape(self):
+ # Avoid materializing ndarray[Interval]
+ return self._data.shape
+
+ @property
+ def itemsize(self):
+ msg = ('IntervalIndex.itemsize is deprecated and will be removed in '
+ 'a future version')
+ warnings.warn(msg, FutureWarning, stacklevel=2)
+
+        # suppress the warning from the underlying left/right itemsize
+ with warnings.catch_warnings():
+ warnings.simplefilter('ignore')
+ return self.left.itemsize + self.right.itemsize
+
+ def __len__(self):
+ return len(self.left)
+
+ @cache_readonly
+ def values(self):
+ """
+ Return the IntervalIndex's data as an IntervalArray.
+ """
+ return self._data
+
+ @cache_readonly
+ def _values(self):
+ return self._data
+
+ @cache_readonly
+ def _ndarray_values(self):
+ return np.array(self._data)
+
+ def __array__(self, result=None):
+ """ the array interface, return my values """
+ return self._ndarray_values
+
+ def __array_wrap__(self, result, context=None):
+ # we don't want the superclass implementation
+ return result
+
+ def __reduce__(self):
+ d = dict(left=self.left,
+ right=self.right)
+ d.update(self._get_attributes_dict())
+ return _new_IntervalIndex, (self.__class__, d), None
+
+ @Appender(_index_shared_docs['copy'])
+ def copy(self, deep=False, name=None):
+ array = self._data.copy(deep=deep)
+ attributes = self._get_attributes_dict()
+ if name is not None:
+ attributes.update(name=name)
+
+ return self._simple_new(array, **attributes)
+
+ @Appender(_index_shared_docs['astype'])
+ def astype(self, dtype, copy=True):
+ with rewrite_exception('IntervalArray', self.__class__.__name__):
+ new_values = self.values.astype(dtype, copy=copy)
+ if is_interval_dtype(new_values):
+ return self._shallow_copy(new_values.left, new_values.right)
+ return super(IntervalIndex, self).astype(dtype, copy=copy)
+
+ @cache_readonly
+ def dtype(self):
+ """Return the dtype object of the underlying data"""
+ return self._data.dtype
+
+ @property
+ def inferred_type(self):
+ """Return a string of the type inferred from the values"""
+ return 'interval'
+
+ @Appender(Index.memory_usage.__doc__)
+ def memory_usage(self, deep=False):
+ # we don't use an explicit engine
+ # so return the bytes here
+ return (self.left.memory_usage(deep=deep) +
+ self.right.memory_usage(deep=deep))
+
+ @cache_readonly
+ def mid(self):
+ """
+ Return the midpoint of each Interval in the IntervalIndex as an Index
+ """
+ return self._data.mid
+
+ @cache_readonly
+ def is_monotonic(self):
+ """
+ Return True if the IntervalIndex is monotonic increasing (only equal or
+ increasing values), else False
+ """
+ return self._multiindex.is_monotonic
+
+ @cache_readonly
+ def is_monotonic_increasing(self):
+ """
+ Return True if the IntervalIndex is monotonic increasing (only equal or
+ increasing values), else False
+ """
+ return self._multiindex.is_monotonic_increasing
+
+ @cache_readonly
+ def is_monotonic_decreasing(self):
+ """
+ Return True if the IntervalIndex is monotonic decreasing (only equal or
+ decreasing values), else False
+ """
+ return self._multiindex.is_monotonic_decreasing
+
+ @cache_readonly
+ def is_unique(self):
+ """
+ Return True if the IntervalIndex contains unique elements, else False
+ """
+ return self._multiindex.is_unique
+
+ @cache_readonly
+ @Appender(_interval_shared_docs['is_non_overlapping_monotonic']
+ % _index_doc_kwargs)
+ def is_non_overlapping_monotonic(self):
+ return self._data.is_non_overlapping_monotonic
+
+ @property
+ def is_overlapping(self):
+ """
+ Return True if the IntervalIndex has overlapping intervals, else False.
+
+ Two intervals overlap if they share a common point, including closed
+ endpoints. Intervals that only have an open endpoint in common do not
+ overlap.
+
+ .. versionadded:: 0.24.0
+
+ Returns
+ -------
+ bool
+ Boolean indicating if the IntervalIndex has overlapping intervals.
+
+ See Also
+ --------
+ Interval.overlaps : Check whether two Interval objects overlap.
+ IntervalIndex.overlaps : Check an IntervalIndex elementwise for
+ overlaps.
+
+ Examples
+ --------
+ >>> index = pd.IntervalIndex.from_tuples([(0, 2), (1, 3), (4, 5)])
+ >>> index
+ IntervalIndex([(0, 2], (1, 3], (4, 5]],
+ closed='right',
+ dtype='interval[int64]')
+ >>> index.is_overlapping
+ True
+
+ Intervals that share closed endpoints overlap:
+
+ >>> index = pd.interval_range(0, 3, closed='both')
+ >>> index
+ IntervalIndex([[0, 1], [1, 2], [2, 3]],
+ closed='both',
+ dtype='interval[int64]')
+ >>> index.is_overlapping
+ True
+
+ Intervals that only have an open endpoint in common do not overlap:
+
+ >>> index = pd.interval_range(0, 3, closed='left')
+ >>> index
+ IntervalIndex([[0, 1), [1, 2), [2, 3)],
+ closed='left',
+ dtype='interval[int64]')
+ >>> index.is_overlapping
+ False
+ """
+ # GH 23309
+ return self._engine.is_overlapping
+
+ @Appender(_index_shared_docs['_convert_scalar_indexer'])
+ def _convert_scalar_indexer(self, key, kind=None):
+ if kind == 'iloc':
+ return super(IntervalIndex, self)._convert_scalar_indexer(
+ key, kind=kind)
+ return key
+
+ def _maybe_cast_slice_bound(self, label, side, kind):
+ return getattr(self, side)._maybe_cast_slice_bound(label, side, kind)
+
+ @Appender(_index_shared_docs['_convert_list_indexer'])
+ def _convert_list_indexer(self, keyarr, kind=None):
+ """
+ we are passed a list-like indexer. Return the
+ indexer for matching intervals.
+ """
+ locs = self.get_indexer_for(keyarr)
+
+ # we have missing values
+ if (locs == -1).any():
+ raise KeyError
+
+ return locs
+
+ def _maybe_cast_indexed(self, key):
+ """
+        We need to cast the key, which could be a scalar or an array-like,
+        to the type of our subtype.
+ """
+ if isinstance(key, IntervalIndex):
+ return key
+
+ subtype = self.dtype.subtype
+ if is_float_dtype(subtype):
+ if is_integer(key):
+ key = float(key)
+ elif isinstance(key, (np.ndarray, Index)):
+ key = key.astype('float64')
+ elif is_integer_dtype(subtype):
+ if is_integer(key):
+ key = int(key)
+
+ return key
+
+ def _needs_i8_conversion(self, key):
+ """
+ Check if a given key needs i8 conversion. Conversion is necessary for
+ Timestamp, Timedelta, DatetimeIndex, and TimedeltaIndex keys. An
+        Interval-like requires conversion if its endpoints are one of the
+ aforementioned types.
+
+ Assumes that any list-like data has already been cast to an Index.
+
+ Parameters
+ ----------
+ key : scalar or Index-like
+ The key that should be checked for i8 conversion
+
+ Returns
+ -------
+ boolean
+ """
+ if is_interval_dtype(key) or isinstance(key, Interval):
+ return self._needs_i8_conversion(key.left)
+
+ i8_types = (Timestamp, Timedelta, DatetimeIndex, TimedeltaIndex)
+ return isinstance(key, i8_types)
+
+ def _maybe_convert_i8(self, key):
+ """
+        Maybe convert a given key to its equivalent i8 value(s). Used as a
+ preprocessing step prior to IntervalTree queries (self._engine), which
+ expects numeric data.
+
+ Parameters
+ ----------
+ key : scalar or list-like
+ The key that should maybe be converted to i8.
+
+ Returns
+ -------
+ key: scalar or list-like
+            The original key if no conversion occurred, int if converted scalar,
+ Int64Index if converted list-like.
+ """
+ original = key
+ if is_list_like(key):
+ key = ensure_index(key)
+
+ if not self._needs_i8_conversion(key):
+ return original
+
+ scalar = is_scalar(key)
+ if is_interval_dtype(key) or isinstance(key, Interval):
+ # convert left/right and reconstruct
+ left = self._maybe_convert_i8(key.left)
+ right = self._maybe_convert_i8(key.right)
+ constructor = Interval if scalar else IntervalIndex.from_arrays
+ return constructor(left, right, closed=self.closed)
+
+ if scalar:
+ # Timestamp/Timedelta
+ key_dtype, key_i8 = infer_dtype_from_scalar(key, pandas_dtype=True)
+ else:
+ # DatetimeIndex/TimedeltaIndex
+ key_dtype, key_i8 = key.dtype, Index(key.asi8)
+ if key.hasnans:
+                # convert NaT from its i8 value to np.nan so it's not viewed
+ # as a valid value, maybe causing errors (e.g. is_overlapping)
+ key_i8 = key_i8.where(~key._isnan)
+
+ # ensure consistency with IntervalIndex subtype
+ subtype = self.dtype.subtype
+ msg = ('Cannot index an IntervalIndex of subtype {subtype} with '
+ 'values of dtype {other}')
+ if not is_dtype_equal(subtype, key_dtype):
+ raise ValueError(msg.format(subtype=subtype, other=key_dtype))
+
+ return key_i8
+
+ def _check_method(self, method):
+ if method is None:
+ return
+
+ if method in ['bfill', 'backfill', 'pad', 'ffill', 'nearest']:
+ msg = 'method {method} not yet implemented for IntervalIndex'
+ raise NotImplementedError(msg.format(method=method))
+
+ raise ValueError("Invalid fill method")
+
+ def _searchsorted_monotonic(self, label, side, exclude_label=False):
+ if not self.is_non_overlapping_monotonic:
+ raise KeyError('can only get slices from an IntervalIndex if '
+ 'bounds are non-overlapping and all monotonic '
+ 'increasing or decreasing')
+
+ if isinstance(label, IntervalMixin):
+ raise NotImplementedError
+
+ # GH 20921: "not is_monotonic_increasing" for the second condition
+ # instead of "is_monotonic_decreasing" to account for single element
+ # indexes being both increasing and decreasing
+ if ((side == 'left' and self.left.is_monotonic_increasing) or
+ (side == 'right' and not self.left.is_monotonic_increasing)):
+ sub_idx = self.right
+ if self.open_right or exclude_label:
+ label = _get_next_label(label)
+ else:
+ sub_idx = self.left
+ if self.open_left or exclude_label:
+ label = _get_prev_label(label)
+
+ return sub_idx._searchsorted_monotonic(label, side)
+
+ def _get_loc_only_exact_matches(self, key):
+ if isinstance(key, Interval):
+
+ if not self.is_unique:
+ raise ValueError("cannot index with a slice Interval"
+ " and a non-unique index")
+
+ # TODO: this expands to a tuple index, see if we can
+ # do better
+ return Index(self._multiindex.values).get_loc(key)
+ raise KeyError
+
+ def _find_non_overlapping_monotonic_bounds(self, key):
+ if isinstance(key, IntervalMixin):
+ start = self._searchsorted_monotonic(
+ key.left, 'left', exclude_label=key.open_left)
+ stop = self._searchsorted_monotonic(
+ key.right, 'right', exclude_label=key.open_right)
+ elif isinstance(key, slice):
+ # slice
+ start, stop = key.start, key.stop
+ if (key.step or 1) != 1:
+ raise NotImplementedError("cannot slice with a slice step")
+ if start is None:
+ start = 0
+ else:
+ start = self._searchsorted_monotonic(start, 'left')
+ if stop is None:
+ stop = len(self)
+ else:
+ stop = self._searchsorted_monotonic(stop, 'right')
+ else:
+ # scalar or index-like
+
+ start = self._searchsorted_monotonic(key, 'left')
+ stop = self._searchsorted_monotonic(key, 'right')
+ return start, stop
+
+ def get_loc(self, key, method=None):
+ """Get integer location, slice or boolean mask for requested label.
+
+ Parameters
+ ----------
+ key : label
+ method : {None}, optional
+ * default: matches where the label is within an interval only.
+
+ Returns
+ -------
+ loc : int if unique index, slice if monotonic index, else mask
+
+ Examples
+        --------
+ >>> i1, i2 = pd.Interval(0, 1), pd.Interval(1, 2)
+ >>> index = pd.IntervalIndex([i1, i2])
+ >>> index.get_loc(1)
+ 0
+
+        You can also supply an interval or a location of a point inside an
+        interval.
+
+ >>> index.get_loc(pd.Interval(0, 2))
+ array([0, 1], dtype=int64)
+ >>> index.get_loc(1.5)
+ 1
+
+ If a label is in several intervals, you get the locations of all the
+ relevant intervals.
+
+ >>> i3 = pd.Interval(0, 2)
+ >>> overlapping_index = pd.IntervalIndex([i2, i3])
+ >>> overlapping_index.get_loc(1.5)
+ array([0, 1], dtype=int64)
+ """
+ self._check_method(method)
+
+ original_key = key
+ key = self._maybe_cast_indexed(key)
+
+ if self.is_non_overlapping_monotonic:
+ if isinstance(key, Interval):
+ left = self._maybe_cast_slice_bound(key.left, 'left', None)
+ right = self._maybe_cast_slice_bound(key.right, 'right', None)
+ key = Interval(left, right, key.closed)
+ else:
+ key = self._maybe_cast_slice_bound(key, 'left', None)
+
+ start, stop = self._find_non_overlapping_monotonic_bounds(key)
+
+ if start is None or stop is None:
+ return slice(start, stop)
+ elif start + 1 == stop:
+ return start
+ elif start < stop:
+ return slice(start, stop)
+ else:
+ raise KeyError(original_key)
+
+ else:
+ # use the interval tree
+ key = self._maybe_convert_i8(key)
+ if isinstance(key, Interval):
+ left, right = _get_interval_closed_bounds(key)
+ return self._engine.get_loc_interval(left, right)
+ else:
+ return self._engine.get_loc(key)
+
+ def get_value(self, series, key):
+ if com.is_bool_indexer(key):
+ loc = key
+ elif is_list_like(key):
+ loc = self.get_indexer(key)
+ elif isinstance(key, slice):
+
+ if not (key.step is None or key.step == 1):
+ raise ValueError("cannot support not-default step in a slice")
+
+ try:
+ loc = self.get_loc(key)
+ except TypeError:
+ # we didn't find exact intervals or are non-unique
+ msg = "unable to slice with this key: {key}".format(key=key)
+ raise ValueError(msg)
+
+ else:
+ loc = self.get_loc(key)
+ return series.iloc[loc]
+
+ @Appender(_index_shared_docs['get_indexer'] % _index_doc_kwargs)
+ def get_indexer(self, target, method=None, limit=None, tolerance=None):
+
+ self._check_method(method)
+ target = ensure_index(target)
+ target = self._maybe_cast_indexed(target)
+
+ if self.equals(target):
+ return np.arange(len(self), dtype='intp')
+
+ if self.is_non_overlapping_monotonic:
+ start, stop = self._find_non_overlapping_monotonic_bounds(target)
+
+ start_plus_one = start + 1
+ if not ((start_plus_one < stop).any()):
+ return np.where(start_plus_one == stop, start, -1)
+
+ if not self.is_unique:
+ raise ValueError("cannot handle non-unique indices")
+
+ # IntervalIndex
+ if isinstance(target, IntervalIndex):
+ indexer = self._get_reindexer(target)
+
+ # non IntervalIndex
+ else:
+ indexer = np.concatenate([self.get_loc(i) for i in target])
+
+ return ensure_platform_int(indexer)
+
+ def _get_reindexer(self, target):
+ """
+        Return an indexer aligning a target IntervalIndex with self
+ """
+
+ # find the left and right indexers
+ left = self._maybe_convert_i8(target.left)
+ right = self._maybe_convert_i8(target.right)
+ lindexer = self._engine.get_indexer(left.values)
+ rindexer = self._engine.get_indexer(right.values)
+
+ # we want to return an indexer on the intervals
+ # however, our keys could provide overlapping of multiple
+ # intervals, so we iterate thru the indexers and construct
+ # a set of indexers
+
+ indexer = []
+ n = len(self)
+
+ for i, (lhs, rhs) in enumerate(zip(lindexer, rindexer)):
+
+ target_value = target[i]
+
+ # matching on the lhs bound
+ if (lhs != -1 and
+ self.closed == 'right' and
+ target_value.left == self[lhs].right):
+ lhs += 1
+
+            # matching on the rhs bound
+ if (rhs != -1 and
+ self.closed == 'left' and
+ target_value.right == self[rhs].left):
+ rhs -= 1
+
+ # not found
+ if lhs == -1 and rhs == -1:
+ indexer.append(np.array([-1]))
+
+ elif rhs == -1:
+
+ indexer.append(np.arange(lhs, n))
+
+ elif lhs == -1:
+
+ # care about left/right closed here
+ value = self[i]
+
+ # target.closed same as self.closed
+ if self.closed == target.closed:
+ if target_value.left < value.left:
+ indexer.append(np.array([-1]))
+ continue
+
+ # target.closed == 'left'
+ elif self.closed == 'right':
+ if target_value.left <= value.left:
+ indexer.append(np.array([-1]))
+ continue
+
+ # target.closed == 'right'
+ elif self.closed == 'left':
+ if target_value.left <= value.left:
+ indexer.append(np.array([-1]))
+ continue
+
+ indexer.append(np.arange(0, rhs + 1))
+
+ else:
+ indexer.append(np.arange(lhs, rhs + 1))
+
+ return np.concatenate(indexer)
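+
+    # Illustrative sketch (not part of pandas): reindexing against another
+    # IntervalIndex appends, per target interval, every position in self
+    # that the interval covers, e.g.
+    #
+    #   idx = pd.IntervalIndex.from_breaks([0, 1, 2, 3])    # (0,1], (1,2], (2,3]
+    #   target = pd.IntervalIndex.from_breaks([0, 2])       # (0,2]
+    #   idx.get_indexer(target)                             # -> array([0, 1])
+    #
+    # i.e. (0,2] overlaps both (0,1] and (1,2], hence the concatenation of
+    # per-target position arrays above.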
+
+ @Appender(_index_shared_docs['get_indexer_non_unique'] % _index_doc_kwargs)
+ def get_indexer_non_unique(self, target):
+ target = self._maybe_cast_indexed(ensure_index(target))
+ return super(IntervalIndex, self).get_indexer_non_unique(target)
+
+ @Appender(_index_shared_docs['where'])
+ def where(self, cond, other=None):
+ if other is None:
+ other = self._na_value
+ values = np.where(cond, self.values, other)
+ return self._shallow_copy(values)
+
+ def delete(self, loc):
+ """
+        Return a new IntervalIndex with passed location(s) deleted
+
+ Returns
+ -------
+ new_index : IntervalIndex
+ """
+ new_left = self.left.delete(loc)
+ new_right = self.right.delete(loc)
+ return self._shallow_copy(new_left, new_right)
+
+ def insert(self, loc, item):
+ """
+ Return a new IntervalIndex inserting new item at location. Follows
+ Python list.append semantics for negative values. Only Interval
+ objects and NA can be inserted into an IntervalIndex
+
+ Parameters
+ ----------
+ loc : int
+ item : object
+
+ Returns
+ -------
+ new_index : IntervalIndex
+ """
+ if isinstance(item, Interval):
+ if item.closed != self.closed:
+ raise ValueError('inserted item must be closed on the same '
+ 'side as the index')
+ left_insert = item.left
+ right_insert = item.right
+ elif is_scalar(item) and isna(item):
+ # GH 18295
+ left_insert = right_insert = item
+ else:
+ raise ValueError('can only insert Interval objects and NA into '
+ 'an IntervalIndex')
+
+ new_left = self.left.insert(loc, left_insert)
+ new_right = self.right.insert(loc, right_insert)
+ return self._shallow_copy(new_left, new_right)
+
+ def _as_like_interval_index(self, other):
+ self._assert_can_do_setop(other)
+ other = ensure_index(other)
+ if not isinstance(other, IntervalIndex):
+ msg = ('the other index needs to be an IntervalIndex too, but '
+ 'was type {}').format(other.__class__.__name__)
+ raise TypeError(msg)
+ elif self.closed != other.closed:
+ msg = ('can only do set operations between two IntervalIndex '
+ 'objects that are closed on the same side')
+ raise ValueError(msg)
+ return other
+
+ def _concat_same_dtype(self, to_concat, name):
+ """
+ assert that we all have the same .closed
+ we allow a 0-len index here as well
+ """
+ if not len({i.closed for i in to_concat if len(i)}) == 1:
+ msg = ('can only append two IntervalIndex objects '
+ 'that are closed on the same side')
+ raise ValueError(msg)
+ return super(IntervalIndex, self)._concat_same_dtype(to_concat, name)
+
+ @Appender(_index_shared_docs['take'] % _index_doc_kwargs)
+ def take(self, indices, axis=0, allow_fill=True,
+ fill_value=None, **kwargs):
+ result = self._data.take(indices, axis=axis, allow_fill=allow_fill,
+ fill_value=fill_value, **kwargs)
+ attributes = self._get_attributes_dict()
+ return self._simple_new(result, **attributes)
+
+ def __getitem__(self, value):
+ result = self._data[value]
+ if isinstance(result, IntervalArray):
+ return self._shallow_copy(result)
+ else:
+ # scalar
+ return result
+
+ # --------------------------------------------------------------------
+ # Rendering Methods
+ # __repr__ associated methods are based on MultiIndex
+
+ def _format_with_header(self, header, **kwargs):
+ return header + list(self._format_native_types(**kwargs))
+
+ def _format_native_types(self, na_rep='', quoting=None, **kwargs):
+ """ actually format my specific types """
+ from pandas.io.formats.format import ExtensionArrayFormatter
+ return ExtensionArrayFormatter(values=self,
+ na_rep=na_rep,
+ justify='all',
+ leading_space=False).get_result()
+
+ def _format_data(self, name=None):
+
+ # TODO: integrate with categorical and make generic
+ # name argument is unused here; just for compat with base / categorical
+ n = len(self)
+ max_seq_items = min((get_option(
+ 'display.max_seq_items') or n) // 10, 10)
+
+ formatter = str
+
+ if n == 0:
+ summary = '[]'
+ elif n == 1:
+ first = formatter(self[0])
+ summary = '[{first}]'.format(first=first)
+ elif n == 2:
+ first = formatter(self[0])
+ last = formatter(self[-1])
+ summary = '[{first}, {last}]'.format(first=first, last=last)
+ else:
+
+ if n > max_seq_items:
+ n = min(max_seq_items // 2, 10)
+ head = [formatter(x) for x in self[:n]]
+ tail = [formatter(x) for x in self[-n:]]
+ summary = '[{head} ... {tail}]'.format(
+ head=', '.join(head), tail=', '.join(tail))
+ else:
+ tail = [formatter(x) for x in self]
+ summary = '[{tail}]'.format(tail=', '.join(tail))
+
+ return summary + ',' + self._format_space()
+
+ def _format_attrs(self):
+ attrs = [('closed', repr(self.closed))]
+ if self.name is not None:
+ attrs.append(('name', default_pprint(self.name)))
+ attrs.append(('dtype', "'{dtype}'".format(dtype=self.dtype)))
+ return attrs
+
+ def _format_space(self):
+ space = ' ' * (len(self.__class__.__name__) + 1)
+ return "\n{space}".format(space=space)
+
+ # --------------------------------------------------------------------
+
+ def argsort(self, *args, **kwargs):
+ return np.lexsort((self.right, self.left))
+
+ def equals(self, other):
+ """
+ Determines if two IntervalIndex objects contain the same elements
+ """
+ if self.is_(other):
+ return True
+
+ # if we can coerce to an II
+ # then we can compare
+ if not isinstance(other, IntervalIndex):
+ if not is_interval_dtype(other):
+ return False
+            other = Index(getattr(other, 'values', other))
+
+ return (self.left.equals(other.left) and
+ self.right.equals(other.right) and
+ self.closed == other.closed)
+
+ @Appender(_interval_shared_docs['overlaps'] % _index_doc_kwargs)
+ def overlaps(self, other):
+ return self._data.overlaps(other)
+
+ def _setop(op_name, sort=None):
+ def func(self, other, sort=sort):
+ other = self._as_like_interval_index(other)
+
+ # GH 19016: ensure set op will not return a prohibited dtype
+ subtypes = [self.dtype.subtype, other.dtype.subtype]
+ common_subtype = find_common_type(subtypes)
+ if is_object_dtype(common_subtype):
+ msg = ('can only do {op} between two IntervalIndex '
+ 'objects that have compatible dtypes')
+ raise TypeError(msg.format(op=op_name))
+
+ result = getattr(self._multiindex, op_name)(other._multiindex,
+ sort=sort)
+ result_name = get_op_result_name(self, other)
+
+ # GH 19101: ensure empty results have correct dtype
+ if result.empty:
+ result = result.values.astype(self.dtype.subtype)
+ else:
+ result = result.values
+
+ return type(self).from_tuples(result, closed=self.closed,
+ name=result_name)
+ return func
+
+ @property
+ def is_all_dates(self):
+ """
+ This is False even when left/right contain datetime-like objects,
+ as the check is done on the Interval itself
+ """
+ return False
+
+ union = _setop('union')
+ intersection = _setop('intersection', sort=False)
+ difference = _setop('difference')
+ symmetric_difference = _setop('symmetric_difference')
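+
+    # Illustrative sketch (not part of pandas): the _setop factory above
+    # routes set operations through the (left, right) MultiIndex
+    # representation, e.g.
+    #
+    #   a = pd.IntervalIndex.from_breaks([0, 1, 2])    # (0,1], (1,2]
+    #   b = pd.IntervalIndex.from_breaks([1, 2, 3])    # (1,2], (2,3]
+    #   a.intersection(b)    # -> IntervalIndex([(1, 2]], closed='right')
+    #   a.union(b)           # -> (0,1], (1,2], (2,3]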
+
+ # TODO: arithmetic operations
+
+
+IntervalIndex._add_logical_methods_disabled()
+
+
+def _is_valid_endpoint(endpoint):
+ """helper for interval_range to check if start/end are valid types"""
+ return any([is_number(endpoint),
+ isinstance(endpoint, Timestamp),
+ isinstance(endpoint, Timedelta),
+ endpoint is None])
+
+
+def _is_type_compatible(a, b):
+ """helper for interval_range to check type compat of start/end/freq"""
+ is_ts_compat = lambda x: isinstance(x, (Timestamp, DateOffset))
+ is_td_compat = lambda x: isinstance(x, (Timedelta, DateOffset))
+ return ((is_number(a) and is_number(b)) or
+ (is_ts_compat(a) and is_ts_compat(b)) or
+ (is_td_compat(a) and is_td_compat(b)) or
+ com._any_none(a, b))
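+
+
+# Illustrative sketch (not part of pandas): the two helpers above gate which
+# start/end/freq combinations interval_range will accept, e.g.
+#
+#   _is_valid_endpoint(5)                          # True  (numeric)
+#   _is_valid_endpoint(pd.Timestamp('2017'))       # True  (datetime-like)
+#   _is_valid_endpoint('a')                        # False
+#   _is_type_compatible(0, pd.Timestamp('2017'))   # False -> TypeError below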
+
+
+def interval_range(start=None, end=None, periods=None, freq=None,
+ name=None, closed='right'):
+ """
+ Return a fixed frequency IntervalIndex
+
+ Parameters
+ ----------
+ start : numeric or datetime-like, default None
+ Left bound for generating intervals
+ end : numeric or datetime-like, default None
+ Right bound for generating intervals
+ periods : integer, default None
+ Number of periods to generate
+ freq : numeric, string, or DateOffset, default None
+ The length of each interval. Must be consistent with the type of start
+ and end, e.g. 2 for numeric, or '5H' for datetime-like. Default is 1
+ for numeric and 'D' for datetime-like.
+ name : string, default None
+ Name of the resulting IntervalIndex
+ closed : {'left', 'right', 'both', 'neither'}, default 'right'
+ Whether the intervals are closed on the left-side, right-side, both
+ or neither.
+
+ Returns
+ -------
+ rng : IntervalIndex
+
+ See Also
+ --------
+ IntervalIndex : An Index of intervals that are all closed on the same side.
+
+ Notes
+ -----
+ Of the four parameters ``start``, ``end``, ``periods``, and ``freq``,
+ exactly three must be specified. If ``freq`` is omitted, the resulting
+ ``IntervalIndex`` will have ``periods`` linearly spaced elements between
+ ``start`` and ``end``, inclusively.
+
+ To learn more about datetime-like frequency strings, please see `this link
+ <http://pandas.pydata.org/pandas-docs/stable/timeseries.html#offset-aliases>`__.
+
+ Examples
+ --------
+ Numeric ``start`` and ``end`` is supported.
+
+ >>> pd.interval_range(start=0, end=5)
+ IntervalIndex([(0, 1], (1, 2], (2, 3], (3, 4], (4, 5]],
+ closed='right', dtype='interval[int64]')
+
+ Additionally, datetime-like input is also supported.
+
+ >>> pd.interval_range(start=pd.Timestamp('2017-01-01'),
+ ... end=pd.Timestamp('2017-01-04'))
+ IntervalIndex([(2017-01-01, 2017-01-02], (2017-01-02, 2017-01-03],
+ (2017-01-03, 2017-01-04]],
+ closed='right', dtype='interval[datetime64[ns]]')
+
+    The ``freq`` parameter specifies the frequency between the left and right
+ endpoints of the individual intervals within the ``IntervalIndex``. For
+ numeric ``start`` and ``end``, the frequency must also be numeric.
+
+ >>> pd.interval_range(start=0, periods=4, freq=1.5)
+ IntervalIndex([(0.0, 1.5], (1.5, 3.0], (3.0, 4.5], (4.5, 6.0]],
+ closed='right', dtype='interval[float64]')
+
+ Similarly, for datetime-like ``start`` and ``end``, the frequency must be
+ convertible to a DateOffset.
+
+ >>> pd.interval_range(start=pd.Timestamp('2017-01-01'),
+ ... periods=3, freq='MS')
+ IntervalIndex([(2017-01-01, 2017-02-01], (2017-02-01, 2017-03-01],
+ (2017-03-01, 2017-04-01]],
+ closed='right', dtype='interval[datetime64[ns]]')
+
+ Specify ``start``, ``end``, and ``periods``; the frequency is generated
+ automatically (linearly spaced).
+
+ >>> pd.interval_range(start=0, end=6, periods=4)
+ IntervalIndex([(0.0, 1.5], (1.5, 3.0], (3.0, 4.5], (4.5, 6.0]],
+ closed='right',
+ dtype='interval[float64]')
+
+ The ``closed`` parameter specifies which endpoints of the individual
+ intervals within the ``IntervalIndex`` are closed.
+
+ >>> pd.interval_range(end=5, periods=4, closed='both')
+ IntervalIndex([[1, 2], [2, 3], [3, 4], [4, 5]],
+ closed='both', dtype='interval[int64]')
+ """
+ start = com.maybe_box_datetimelike(start)
+ end = com.maybe_box_datetimelike(end)
+ endpoint = start if start is not None else end
+
+ if freq is None and com._any_none(periods, start, end):
+ freq = 1 if is_number(endpoint) else 'D'
+
+ if com.count_not_none(start, end, periods, freq) != 3:
+ raise ValueError('Of the four parameters: start, end, periods, and '
+ 'freq, exactly three must be specified')
+
+ if not _is_valid_endpoint(start):
+ msg = 'start must be numeric or datetime-like, got {start}'
+ raise ValueError(msg.format(start=start))
+ elif not _is_valid_endpoint(end):
+ msg = 'end must be numeric or datetime-like, got {end}'
+ raise ValueError(msg.format(end=end))
+
+ if is_float(periods):
+ periods = int(periods)
+ elif not is_integer(periods) and periods is not None:
+ msg = 'periods must be a number, got {periods}'
+ raise TypeError(msg.format(periods=periods))
+
+ if freq is not None and not is_number(freq):
+ try:
+ freq = to_offset(freq)
+ except ValueError:
+ raise ValueError('freq must be numeric or convertible to '
+ 'DateOffset, got {freq}'.format(freq=freq))
+
+ # verify type compatibility
+ if not all([_is_type_compatible(start, end),
+ _is_type_compatible(start, freq),
+ _is_type_compatible(end, freq)]):
+ raise TypeError("start, end, freq need to be type compatible")
+
+ # +1 to convert interval count to breaks count (n breaks = n-1 intervals)
+ if periods is not None:
+ periods += 1
+
+ if is_number(endpoint):
+ # force consistency between start/end/freq (lower end if freq skips it)
+ if com._all_not_none(start, end, freq):
+ end -= (end - start) % freq
+
+ # compute the period/start/end if unspecified (at most one)
+ if periods is None:
+ periods = int((end - start) // freq) + 1
+ elif start is None:
+ start = end - (periods - 1) * freq
+ elif end is None:
+ end = start + (periods - 1) * freq
+
+ breaks = np.linspace(start, end, periods)
+ if all(is_integer(x) for x in com._not_none(start, end, freq)):
+ # np.linspace always produces float output
+ breaks = maybe_downcast_to_dtype(breaks, 'int64')
+ else:
+ # delegate to the appropriate range function
+ if isinstance(endpoint, Timestamp):
+ range_func = date_range
+ else:
+ range_func = timedelta_range
+
+ breaks = range_func(start=start, end=end, periods=periods, freq=freq)
+
+ return IntervalIndex.from_breaks(breaks, name=name, closed=closed)
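+
+# Illustrative sketch (not part of pandas): in the numeric branch above,
+# interval_range(start=0, end=6, periods=4) first bumps periods to 5
+# (n breaks for n - 1 intervals) and then takes
+#
+#   breaks = np.linspace(0, 6, 5)    # [0. , 1.5, 3. , 4.5, 6. ]
+#
+# maybe_downcast_to_dtype leaves them as floats here, since 1.5 and 4.5 are
+# not exactly representable as int64, giving the interval[float64] result
+# shown in the docstring.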
diff --git a/contrib/python/pandas/py2/pandas/core/indexes/multi.py b/contrib/python/pandas/py2/pandas/core/indexes/multi.py
new file mode 100644
index 00000000000..14975dbbefa
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/core/indexes/multi.py
@@ -0,0 +1,3166 @@
+# pylint: disable=E1101,E1103,W0232
+from collections import OrderedDict
+import datetime
+from sys import getsizeof
+import warnings
+
+import numpy as np
+
+from pandas._libs import (
+ Timestamp, algos as libalgos, index as libindex, lib, tslibs)
+import pandas.compat as compat
+from pandas.compat import lrange, lzip, map, range, zip
+from pandas.compat.numpy import function as nv
+from pandas.errors import PerformanceWarning, UnsortedIndexError
+from pandas.util._decorators import Appender, cache_readonly, deprecate_kwarg
+
+from pandas.core.dtypes.common import (
+ ensure_int64, ensure_platform_int, is_categorical_dtype, is_hashable,
+ is_integer, is_iterator, is_list_like, is_object_dtype, is_scalar,
+ pandas_dtype)
+from pandas.core.dtypes.dtypes import ExtensionDtype, PandasExtensionDtype
+from pandas.core.dtypes.generic import ABCDataFrame
+from pandas.core.dtypes.missing import array_equivalent, isna
+
+import pandas.core.algorithms as algos
+import pandas.core.common as com
+from pandas.core.config import get_option
+import pandas.core.indexes.base as ibase
+from pandas.core.indexes.base import (
+ Index, InvalidIndexError, _index_shared_docs, ensure_index)
+from pandas.core.indexes.frozen import FrozenList, _ensure_frozen
+import pandas.core.missing as missing
+
+from pandas.io.formats.printing import pprint_thing
+
+_index_doc_kwargs = dict(ibase._index_doc_kwargs)
+_index_doc_kwargs.update(
+ dict(klass='MultiIndex',
+ target_klass='MultiIndex or list of tuples'))
+
+
+class MultiIndexUIntEngine(libindex.BaseMultiIndexCodesEngine,
+ libindex.UInt64Engine):
+ """
+ This class manages a MultiIndex by mapping label combinations to positive
+ integers.
+ """
+ _base = libindex.UInt64Engine
+
+ def _codes_to_ints(self, codes):
+ """
+        Transform combination(s) of uint64 into one uint64 (each), in a strictly
+ monotonic way (i.e. respecting the lexicographic order of integer
+ combinations): see BaseMultiIndexCodesEngine documentation.
+
+ Parameters
+ ----------
+ codes : 1- or 2-dimensional array of dtype uint64
+ Combinations of integers (one per row)
+
+ Returns
+        -------
+ int_keys : scalar or 1-dimensional array, of dtype uint64
+ Integer(s) representing one combination (each)
+ """
+ # Shift the representation of each level by the pre-calculated number
+ # of bits:
+ codes <<= self.offsets
+
+ # Now sum and OR are in fact interchangeable. This is a simple
+ # composition of the (disjunct) significant bits of each level (i.e.
+ # each column in "codes") in a single positive integer:
+ if codes.ndim == 1:
+ # Single key
+ return np.bitwise_or.reduce(codes)
+
+ # Multiple keys
+ return np.bitwise_or.reduce(codes, axis=1)
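+
+    # Illustrative sketch (not part of pandas): with offsets = [3, 0]
+    # (as computed in MultiIndex._engine for two levels needing 2 and 3
+    # bits respectively), the single row of codes [2, 1] packs into
+    #
+    #   (2 << 3) | (1 << 0) == 17
+    #
+    # and row-wise packing preserves the lexicographic order of the codes.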
+
+
+class MultiIndexPyIntEngine(libindex.BaseMultiIndexCodesEngine,
+ libindex.ObjectEngine):
+ """
+ This class manages those (extreme) cases in which the number of possible
+    label combinations overflows 64-bit integers, and uses an ObjectEngine
+ containing Python integers.
+ """
+ _base = libindex.ObjectEngine
+
+ def _codes_to_ints(self, codes):
+ """
+        Transform combination(s) of uint64 into one Python integer (each), in a
+ strictly monotonic way (i.e. respecting the lexicographic order of
+ integer combinations): see BaseMultiIndexCodesEngine documentation.
+
+ Parameters
+ ----------
+ codes : 1- or 2-dimensional array of dtype uint64
+ Combinations of integers (one per row)
+
+ Returns
+        -------
+ int_keys : int, or 1-dimensional array of dtype object
+ Integer(s) representing one combination (each)
+ """
+
+ # Shift the representation of each level by the pre-calculated number
+ # of bits. Since this can overflow uint64, first make sure we are
+ # working with Python integers:
+ codes = codes.astype('object') << self.offsets
+
+ # Now sum and OR are in fact interchangeable. This is a simple
+ # composition of the (disjunct) significant bits of each level (i.e.
+ # each column in "codes") in a single positive integer (per row):
+ if codes.ndim == 1:
+ # Single key
+ return np.bitwise_or.reduce(codes)
+
+ # Multiple keys
+ return np.bitwise_or.reduce(codes, axis=1)
+
+
+class MultiIndex(Index):
+ """
+ A multi-level, or hierarchical, index object for pandas objects.
+
+ Parameters
+ ----------
+ levels : sequence of arrays
+ The unique labels for each level.
+ codes : sequence of arrays
+ Integers for each level designating which label at each location.
+
+ .. versionadded:: 0.24.0
+ labels : sequence of arrays
+ Integers for each level designating which label at each location.
+
+ .. deprecated:: 0.24.0
+ Use ``codes`` instead
+ sortorder : optional int
+ Level of sortedness (must be lexicographically sorted by that
+ level).
+ names : optional sequence of objects
+ Names for each of the index levels. (name is accepted for compat).
+ copy : bool, default False
+ Copy the meta-data.
+ verify_integrity : bool, default True
+ Check that the levels/codes are consistent and valid.
+
+ Attributes
+ ----------
+ names
+ levels
+ codes
+ nlevels
+ levshape
+
+ Methods
+ -------
+ from_arrays
+ from_tuples
+ from_product
+ from_frame
+ set_levels
+ set_codes
+ to_frame
+ to_flat_index
+ is_lexsorted
+ sortlevel
+ droplevel
+ swaplevel
+ reorder_levels
+ remove_unused_levels
+
+ See Also
+ --------
+ MultiIndex.from_arrays : Convert list of arrays to MultiIndex.
+ MultiIndex.from_product : Create a MultiIndex from the cartesian product
+ of iterables.
+ MultiIndex.from_tuples : Convert list of tuples to a MultiIndex.
+ MultiIndex.from_frame : Make a MultiIndex from a DataFrame.
+ Index : The base pandas Index type.
+
+ Examples
+    --------
+ A new ``MultiIndex`` is typically constructed using one of the helper
+ methods :meth:`MultiIndex.from_arrays`, :meth:`MultiIndex.from_product`
+ and :meth:`MultiIndex.from_tuples`. For example (using ``.from_arrays``):
+
+ >>> arrays = [[1, 1, 2, 2], ['red', 'blue', 'red', 'blue']]
+ >>> pd.MultiIndex.from_arrays(arrays, names=('number', 'color'))
+ MultiIndex(levels=[[1, 2], ['blue', 'red']],
+ codes=[[0, 0, 1, 1], [1, 0, 1, 0]],
+ names=['number', 'color'])
+
+ See further examples for how to construct a MultiIndex in the doc strings
+ of the mentioned helper methods.
+
+ Notes
+ -----
+ See the `user guide
+ <http://pandas.pydata.org/pandas-docs/stable/advanced.html>`_ for more.
+ """
+
+ # initialize to zero-length tuples to make everything work
+ _typ = 'multiindex'
+ _names = FrozenList()
+ _levels = FrozenList()
+ _codes = FrozenList()
+ _comparables = ['names']
+ rename = Index.set_names
+
+ # --------------------------------------------------------------------
+ # Constructors
+
+ @deprecate_kwarg(old_arg_name='labels', new_arg_name='codes')
+ def __new__(cls, levels=None, codes=None, sortorder=None, names=None,
+ dtype=None, copy=False, name=None,
+ verify_integrity=True, _set_identity=True):
+
+ # compat with Index
+ if name is not None:
+ names = name
+ if levels is None or codes is None:
+ raise TypeError("Must pass both levels and codes")
+ if len(levels) != len(codes):
+ raise ValueError('Length of levels and codes must be the same.')
+ if len(levels) == 0:
+ raise ValueError('Must pass non-zero number of levels/codes')
+
+ result = object.__new__(MultiIndex)
+
+ # we've already validated levels and codes, so shortcut here
+ result._set_levels(levels, copy=copy, validate=False)
+ result._set_codes(codes, copy=copy, validate=False)
+
+ if names is not None:
+ # handles name validation
+ result._set_names(names)
+
+ if sortorder is not None:
+ result.sortorder = int(sortorder)
+ else:
+ result.sortorder = sortorder
+
+ if verify_integrity:
+ result._verify_integrity()
+ if _set_identity:
+ result._reset_identity()
+ return result
+
+ def _verify_integrity(self, codes=None, levels=None):
+ """
+
+ Parameters
+ ----------
+ codes : optional list
+ Codes to check for validity. Defaults to current codes.
+ levels : optional list
+ Levels to check for validity. Defaults to current levels.
+
+ Raises
+ ------
+ ValueError
+ If length of levels and codes don't match, if the codes for any
+ level would exceed level bounds, or there are any duplicate levels.
+ """
+        # NOTE: Currently does not check, among other things, that the cached
+        # nlevels matches, nor that sortorder matches the actual sort order.
+ codes = codes or self.codes
+ levels = levels or self.levels
+
+ if len(levels) != len(codes):
+ raise ValueError("Length of levels and codes must match. NOTE:"
+ " this index is in an inconsistent state.")
+ codes_length = len(self.codes[0])
+ for i, (level, level_codes) in enumerate(zip(levels, codes)):
+ if len(level_codes) != codes_length:
+ raise ValueError("Unequal code lengths: %s" %
+ ([len(code_) for code_ in codes]))
+ if len(level_codes) and level_codes.max() >= len(level):
+ raise ValueError("On level %d, code max (%d) >= length of"
+ " level (%d). NOTE: this index is in an"
+ " inconsistent state" % (i, level_codes.max(),
+ len(level)))
+ if not level.is_unique:
+ raise ValueError("Level values must be unique: {values} on "
+ "level {level}".format(
+ values=[value for value in level],
+ level=i))
+
+ @classmethod
+ def from_arrays(cls, arrays, sortorder=None, names=None):
+ """
+ Convert arrays to MultiIndex.
+
+ Parameters
+ ----------
+ arrays : list / sequence of array-likes
+ Each array-like gives one level's value for each data point.
+ len(arrays) is the number of levels.
+ sortorder : int or None
+ Level of sortedness (must be lexicographically sorted by that
+ level).
+ names : list / sequence of str, optional
+ Names for the levels in the index.
+
+ Returns
+ -------
+ index : MultiIndex
+
+ See Also
+ --------
+ MultiIndex.from_tuples : Convert list of tuples to MultiIndex.
+ MultiIndex.from_product : Make a MultiIndex from cartesian product
+ of iterables.
+ MultiIndex.from_frame : Make a MultiIndex from a DataFrame.
+
+ Examples
+ --------
+ >>> arrays = [[1, 1, 2, 2], ['red', 'blue', 'red', 'blue']]
+ >>> pd.MultiIndex.from_arrays(arrays, names=('number', 'color'))
+ MultiIndex(levels=[[1, 2], ['blue', 'red']],
+ codes=[[0, 0, 1, 1], [1, 0, 1, 0]],
+ names=['number', 'color'])
+ """
+ if not is_list_like(arrays):
+ raise TypeError("Input must be a list / sequence of array-likes.")
+ elif is_iterator(arrays):
+ arrays = list(arrays)
+
+        # Check that all arrays have equal length; raise ValueError if not
+ for i in range(1, len(arrays)):
+ if len(arrays[i]) != len(arrays[i - 1]):
+ raise ValueError('all arrays must be same length')
+
+ from pandas.core.arrays.categorical import _factorize_from_iterables
+
+ codes, levels = _factorize_from_iterables(arrays)
+ if names is None:
+ names = [getattr(arr, "name", None) for arr in arrays]
+
+ return MultiIndex(levels=levels, codes=codes, sortorder=sortorder,
+ names=names, verify_integrity=False)
+
+ @classmethod
+ def from_tuples(cls, tuples, sortorder=None, names=None):
+ """
+ Convert list of tuples to MultiIndex.
+
+ Parameters
+ ----------
+ tuples : list / sequence of tuple-likes
+ Each tuple is the index of one row/column.
+ sortorder : int or None
+ Level of sortedness (must be lexicographically sorted by that
+ level).
+ names : list / sequence of str, optional
+ Names for the levels in the index.
+
+ Returns
+ -------
+ index : MultiIndex
+
+ See Also
+ --------
+ MultiIndex.from_arrays : Convert list of arrays to MultiIndex.
+ MultiIndex.from_product : Make a MultiIndex from cartesian product
+ of iterables.
+ MultiIndex.from_frame : Make a MultiIndex from a DataFrame.
+
+ Examples
+ --------
+ >>> tuples = [(1, u'red'), (1, u'blue'),
+ ... (2, u'red'), (2, u'blue')]
+ >>> pd.MultiIndex.from_tuples(tuples, names=('number', 'color'))
+ MultiIndex(levels=[[1, 2], ['blue', 'red']],
+ codes=[[0, 0, 1, 1], [1, 0, 1, 0]],
+ names=['number', 'color'])
+ """
+ if not is_list_like(tuples):
+ raise TypeError('Input must be a list / sequence of tuple-likes.')
+ elif is_iterator(tuples):
+ tuples = list(tuples)
+
+ if len(tuples) == 0:
+ if names is None:
+ msg = 'Cannot infer number of levels from empty list'
+ raise TypeError(msg)
+ arrays = [[]] * len(names)
+ elif isinstance(tuples, (np.ndarray, Index)):
+ if isinstance(tuples, Index):
+ tuples = tuples._values
+
+ arrays = list(lib.tuples_to_object_array(tuples).T)
+ elif isinstance(tuples, list):
+ arrays = list(lib.to_object_array_tuples(tuples).T)
+ else:
+ arrays = lzip(*tuples)
+
+ return MultiIndex.from_arrays(arrays, sortorder=sortorder, names=names)
+
+ @classmethod
+ def from_product(cls, iterables, sortorder=None, names=None):
+ """
+ Make a MultiIndex from the cartesian product of multiple iterables.
+
+ Parameters
+ ----------
+ iterables : list / sequence of iterables
+ Each iterable has unique labels for each level of the index.
+ sortorder : int or None
+ Level of sortedness (must be lexicographically sorted by that
+ level).
+ names : list / sequence of str, optional
+ Names for the levels in the index.
+
+ Returns
+ -------
+ index : MultiIndex
+
+ See Also
+ --------
+ MultiIndex.from_arrays : Convert list of arrays to MultiIndex.
+ MultiIndex.from_tuples : Convert list of tuples to MultiIndex.
+ MultiIndex.from_frame : Make a MultiIndex from a DataFrame.
+
+ Examples
+ --------
+ >>> numbers = [0, 1, 2]
+ >>> colors = ['green', 'purple']
+ >>> pd.MultiIndex.from_product([numbers, colors],
+ ... names=['number', 'color'])
+ MultiIndex(levels=[[0, 1, 2], ['green', 'purple']],
+ codes=[[0, 0, 1, 1, 2, 2], [0, 1, 0, 1, 0, 1]],
+ names=['number', 'color'])
+ """
+ from pandas.core.arrays.categorical import _factorize_from_iterables
+ from pandas.core.reshape.util import cartesian_product
+
+ if not is_list_like(iterables):
+ raise TypeError("Input must be a list / sequence of iterables.")
+ elif is_iterator(iterables):
+ iterables = list(iterables)
+
+ codes, levels = _factorize_from_iterables(iterables)
+ codes = cartesian_product(codes)
+ return MultiIndex(levels, codes, sortorder=sortorder, names=names)
+
+ @classmethod
+ def from_frame(cls, df, sortorder=None, names=None):
+ """
+ Make a MultiIndex from a DataFrame.
+
+ .. versionadded:: 0.24.0
+
+ Parameters
+ ----------
+ df : DataFrame
+ DataFrame to be converted to MultiIndex.
+ sortorder : int, optional
+ Level of sortedness (must be lexicographically sorted by that
+ level).
+ names : list-like, optional
+ If no names are provided, use the column names, or tuple of column
+            names if the columns form a MultiIndex. If a sequence, overwrite
+ names with the given sequence.
+
+ Returns
+ -------
+ MultiIndex
+ The MultiIndex representation of the given DataFrame.
+
+ See Also
+ --------
+ MultiIndex.from_arrays : Convert list of arrays to MultiIndex.
+ MultiIndex.from_tuples : Convert list of tuples to MultiIndex.
+ MultiIndex.from_product : Make a MultiIndex from cartesian product
+ of iterables.
+
+ Examples
+ --------
+ >>> df = pd.DataFrame([['HI', 'Temp'], ['HI', 'Precip'],
+ ... ['NJ', 'Temp'], ['NJ', 'Precip']],
+ ... columns=['a', 'b'])
+ >>> df
+ a b
+ 0 HI Temp
+ 1 HI Precip
+ 2 NJ Temp
+ 3 NJ Precip
+
+ >>> pd.MultiIndex.from_frame(df)
+ MultiIndex(levels=[['HI', 'NJ'], ['Precip', 'Temp']],
+ codes=[[0, 0, 1, 1], [1, 0, 1, 0]],
+ names=['a', 'b'])
+
+ Using explicit names, instead of the column names
+
+ >>> pd.MultiIndex.from_frame(df, names=['state', 'observation'])
+ MultiIndex(levels=[['HI', 'NJ'], ['Precip', 'Temp']],
+ codes=[[0, 0, 1, 1], [1, 0, 1, 0]],
+ names=['state', 'observation'])
+ """
+ if not isinstance(df, ABCDataFrame):
+ raise TypeError("Input must be a DataFrame")
+
+ column_names, columns = lzip(*df.iteritems())
+ names = column_names if names is None else names
+ return cls.from_arrays(columns, sortorder=sortorder, names=names)
+
+ # --------------------------------------------------------------------
+
+ @property
+ def levels(self):
+ return self._levels
+
+ @property
+ def _values(self):
+        # We override here, since our parent uses _data, which we don't use.
+ return self.values
+
+ @property
+ def array(self):
+ """
+ Raises a ValueError for `MultiIndex` because there's no single
+ array backing a MultiIndex.
+
+ Raises
+ ------
+ ValueError
+ """
+ msg = ("MultiIndex has no single backing array. Use "
+ "'MultiIndex.to_numpy()' to get a NumPy array of tuples.")
+ raise ValueError(msg)
+
+ @property
+ def _is_homogeneous_type(self):
+ """Whether the levels of a MultiIndex all have the same dtype.
+
+ This looks at the dtypes of the levels.
+
+ See Also
+ --------
+ Index._is_homogeneous_type
+ DataFrame._is_homogeneous_type
+
+ Examples
+ --------
+ >>> MultiIndex.from_tuples([
+ ... ('a', 'b'), ('a', 'c')])._is_homogeneous_type
+ True
+ >>> MultiIndex.from_tuples([
+ ... ('a', 1), ('a', 2)])._is_homogeneous_type
+ False
+ """
+ return len({x.dtype for x in self.levels}) <= 1
+
+ def _set_levels(self, levels, level=None, copy=False, validate=True,
+ verify_integrity=False):
+        # This is NOT part of the levels property because it should not be
+        # possible to set levels externally. User beware if you change
+ # _levels directly
+ if validate and len(levels) == 0:
+ raise ValueError('Must set non-zero number of levels.')
+ if validate and level is None and len(levels) != self.nlevels:
+ raise ValueError('Length of levels must match number of levels.')
+ if validate and level is not None and len(levels) != len(level):
+ raise ValueError('Length of levels must match length of level.')
+
+ if level is None:
+ new_levels = FrozenList(
+ ensure_index(lev, copy=copy)._shallow_copy()
+ for lev in levels)
+ else:
+ level = [self._get_level_number(l) for l in level]
+ new_levels = list(self._levels)
+ for l, v in zip(level, levels):
+ new_levels[l] = ensure_index(v, copy=copy)._shallow_copy()
+ new_levels = FrozenList(new_levels)
+
+ if verify_integrity:
+ self._verify_integrity(levels=new_levels)
+
+ names = self.names
+ self._levels = new_levels
+ if any(names):
+ self._set_names(names)
+
+ self._tuples = None
+ self._reset_cache()
+
+ def set_levels(self, levels, level=None, inplace=False,
+ verify_integrity=True):
+ """
+ Set new levels on MultiIndex. Defaults to returning
+ new index.
+
+ Parameters
+ ----------
+ levels : sequence or list of sequence
+ new level(s) to apply
+ level : int, level name, or sequence of int/level names (default None)
+ level(s) to set (None for all levels)
+ inplace : bool
+ if True, mutates in place
+ verify_integrity : bool (default True)
+ if True, checks that levels and codes are compatible
+
+ Returns
+ -------
+ new index (of same type and class...etc)
+
+ Examples
+ --------
+ >>> idx = pd.MultiIndex.from_tuples([(1, u'one'), (1, u'two'),
+ (2, u'one'), (2, u'two')],
+ names=['foo', 'bar'])
+ >>> idx.set_levels([['a','b'], [1,2]])
+ MultiIndex(levels=[[u'a', u'b'], [1, 2]],
+ codes=[[0, 0, 1, 1], [0, 1, 0, 1]],
+ names=[u'foo', u'bar'])
+ >>> idx.set_levels(['a','b'], level=0)
+ MultiIndex(levels=[[u'a', u'b'], [u'one', u'two']],
+ codes=[[0, 0, 1, 1], [0, 1, 0, 1]],
+ names=[u'foo', u'bar'])
+ >>> idx.set_levels(['a','b'], level='bar')
+ MultiIndex(levels=[[1, 2], [u'a', u'b']],
+ codes=[[0, 0, 1, 1], [0, 1, 0, 1]],
+ names=[u'foo', u'bar'])
+ >>> idx.set_levels([['a','b'], [1,2]], level=[0,1])
+ MultiIndex(levels=[[u'a', u'b'], [1, 2]],
+ codes=[[0, 0, 1, 1], [0, 1, 0, 1]],
+ names=[u'foo', u'bar'])
+ """
+ if is_list_like(levels) and not isinstance(levels, Index):
+ levels = list(levels)
+
+ if level is not None and not is_list_like(level):
+ if not is_list_like(levels):
+ raise TypeError("Levels must be list-like")
+ if is_list_like(levels[0]):
+ raise TypeError("Levels must be list-like")
+ level = [level]
+ levels = [levels]
+ elif level is None or is_list_like(level):
+ if not is_list_like(levels) or not is_list_like(levels[0]):
+ raise TypeError("Levels must be list of lists-like")
+
+ if inplace:
+ idx = self
+ else:
+ idx = self._shallow_copy()
+ idx._reset_identity()
+ idx._set_levels(levels, level=level, validate=True,
+ verify_integrity=verify_integrity)
+ if not inplace:
+ return idx
+
+ @property
+ def codes(self):
+ return self._codes
+
+ @property
+ def labels(self):
+ warnings.warn((".labels was deprecated in version 0.24.0. "
+ "Use .codes instead."),
+ FutureWarning, stacklevel=2)
+ return self.codes
+
+ def _set_codes(self, codes, level=None, copy=False, validate=True,
+ verify_integrity=False):
+
+ if validate and level is None and len(codes) != self.nlevels:
+ raise ValueError("Length of codes must match number of levels")
+ if validate and level is not None and len(codes) != len(level):
+ raise ValueError('Length of codes must match length of levels.')
+
+ if level is None:
+ new_codes = FrozenList(
+ _ensure_frozen(level_codes, lev, copy=copy)._shallow_copy()
+ for lev, level_codes in zip(self.levels, codes))
+ else:
+ level = [self._get_level_number(l) for l in level]
+ new_codes = list(self._codes)
+ for lev_idx, level_codes in zip(level, codes):
+ lev = self.levels[lev_idx]
+ new_codes[lev_idx] = _ensure_frozen(
+ level_codes, lev, copy=copy)._shallow_copy()
+ new_codes = FrozenList(new_codes)
+
+ if verify_integrity:
+ self._verify_integrity(codes=new_codes)
+
+ self._codes = new_codes
+ self._tuples = None
+ self._reset_cache()
+
+ def set_labels(self, labels, level=None, inplace=False,
+ verify_integrity=True):
+ warnings.warn((".set_labels was deprecated in version 0.24.0. "
+ "Use .set_codes instead."),
+ FutureWarning, stacklevel=2)
+ return self.set_codes(codes=labels, level=level, inplace=inplace,
+ verify_integrity=verify_integrity)
+
+ @deprecate_kwarg(old_arg_name='labels', new_arg_name='codes')
+ def set_codes(self, codes, level=None, inplace=False,
+ verify_integrity=True):
+ """
+ Set new codes on MultiIndex. Defaults to returning
+ new index.
+
+ .. versionadded:: 0.24.0
+
+ New name for deprecated method `set_labels`.
+
+ Parameters
+ ----------
+ codes : sequence or list of sequence
+ new codes to apply
+ level : int, level name, or sequence of int/level names (default None)
+ level(s) to set (None for all levels)
+ inplace : bool
+ if True, mutates in place
+ verify_integrity : bool (default True)
+ if True, checks that levels and codes are compatible
+
+ Returns
+ -------
+ new index (of same type and class...etc)
+
+ Examples
+ --------
+ >>> idx = pd.MultiIndex.from_tuples([(1, u'one'), (1, u'two'),
+ (2, u'one'), (2, u'two')],
+ names=['foo', 'bar'])
+ >>> idx.set_codes([[1,0,1,0], [0,0,1,1]])
+ MultiIndex(levels=[[1, 2], [u'one', u'two']],
+ codes=[[1, 0, 1, 0], [0, 0, 1, 1]],
+ names=[u'foo', u'bar'])
+ >>> idx.set_codes([1,0,1,0], level=0)
+ MultiIndex(levels=[[1, 2], [u'one', u'two']],
+ codes=[[1, 0, 1, 0], [0, 1, 0, 1]],
+ names=[u'foo', u'bar'])
+ >>> idx.set_codes([0,0,1,1], level='bar')
+ MultiIndex(levels=[[1, 2], [u'one', u'two']],
+ codes=[[0, 0, 1, 1], [0, 0, 1, 1]],
+ names=[u'foo', u'bar'])
+ >>> idx.set_codes([[1,0,1,0], [0,0,1,1]], level=[0,1])
+ MultiIndex(levels=[[1, 2], [u'one', u'two']],
+ codes=[[1, 0, 1, 0], [0, 0, 1, 1]],
+ names=[u'foo', u'bar'])
+ """
+ if level is not None and not is_list_like(level):
+ if not is_list_like(codes):
+ raise TypeError("Codes must be list-like")
+ if is_list_like(codes[0]):
+ raise TypeError("Codes must be list-like")
+ level = [level]
+ codes = [codes]
+ elif level is None or is_list_like(level):
+ if not is_list_like(codes) or not is_list_like(codes[0]):
+ raise TypeError("Codes must be list of lists-like")
+
+ if inplace:
+ idx = self
+ else:
+ idx = self._shallow_copy()
+ idx._reset_identity()
+ idx._set_codes(codes, level=level, verify_integrity=verify_integrity)
+ if not inplace:
+ return idx
+
+ @deprecate_kwarg(old_arg_name='labels', new_arg_name='codes')
+ def copy(self, names=None, dtype=None, levels=None, codes=None,
+ deep=False, _set_identity=False, **kwargs):
+ """
+ Make a copy of this object. Names, dtype, levels and codes can be
+ passed and will be set on new copy.
+
+ Parameters
+ ----------
+ names : sequence, optional
+ dtype : numpy dtype or pandas type, optional
+ levels : sequence, optional
+ codes : sequence, optional
+
+ Returns
+ -------
+ copy : MultiIndex
+
+ Notes
+ -----
+ In most cases, there should be no functional difference from using
+ ``deep``, but if ``deep`` is passed it will attempt to deepcopy.
+ This could be potentially expensive on large MultiIndex objects.
+ """
+ name = kwargs.get('name')
+ names = self._validate_names(name=name, names=names, deep=deep)
+
+ if deep:
+ from copy import deepcopy
+ if levels is None:
+ levels = deepcopy(self.levels)
+ if codes is None:
+ codes = deepcopy(self.codes)
+ else:
+ if levels is None:
+ levels = self.levels
+ if codes is None:
+ codes = self.codes
+ return MultiIndex(levels=levels, codes=codes, names=names,
+ sortorder=self.sortorder, verify_integrity=False,
+ _set_identity=_set_identity)
+
+ def __array__(self, dtype=None):
+ """ the array interface, return my values """
+ return self.values
+
+ def view(self, cls=None):
+ """ this is defined as a copy with the same identity """
+ result = self.copy()
+ result._id = self._id
+ return result
+
+ def _shallow_copy_with_infer(self, values, **kwargs):
+ # On equal MultiIndexes the difference is empty.
+ # Therefore, an empty MultiIndex is returned GH13490
+ if len(values) == 0:
+ return MultiIndex(levels=[[] for _ in range(self.nlevels)],
+ codes=[[] for _ in range(self.nlevels)],
+ **kwargs)
+ return self._shallow_copy(values, **kwargs)
+
+ @Appender(_index_shared_docs['contains'] % _index_doc_kwargs)
+ def __contains__(self, key):
+ hash(key)
+ try:
+ self.get_loc(key)
+ return True
+ except (LookupError, TypeError):
+ return False
+
+ contains = __contains__
+
+ @Appender(_index_shared_docs['_shallow_copy'])
+ def _shallow_copy(self, values=None, **kwargs):
+ if values is not None:
+ names = kwargs.pop('names', kwargs.pop('name', self.names))
+ # discards freq
+ kwargs.pop('freq', None)
+ return MultiIndex.from_tuples(values, names=names, **kwargs)
+ return self.view()
+
+ @cache_readonly
+ def dtype(self):
+ return np.dtype('O')
+
+ def _is_memory_usage_qualified(self):
+ """ return a boolean if we need a qualified .info display """
+ def f(l):
+ return 'mixed' in l or 'string' in l or 'unicode' in l
+ return any(f(l) for l in self._inferred_type_levels)
+
+ @Appender(Index.memory_usage.__doc__)
+ def memory_usage(self, deep=False):
+ # we are overwriting our base class to avoid
+ # computing .values here which could materialize
+        # a tuple representation unnecessarily
+ return self._nbytes(deep)
+
+ @cache_readonly
+ def nbytes(self):
+ """ return the number of bytes in the underlying data """
+ return self._nbytes(False)
+
+ def _nbytes(self, deep=False):
+ """
+ return the number of bytes in the underlying data
+ deeply introspect the level data if deep=True
+
+ include the engine hashtable
+
+        *this is an internal routine*
+
+ """
+
+ # for implementations with no useful getsizeof (PyPy)
+ objsize = 24
+
+ level_nbytes = sum(i.memory_usage(deep=deep) for i in self.levels)
+ label_nbytes = sum(i.nbytes for i in self.codes)
+ names_nbytes = sum(getsizeof(i, objsize) for i in self.names)
+ result = level_nbytes + label_nbytes + names_nbytes
+
+ # include our engine hashtable
+ result += self._engine.sizeof(deep=deep)
+ return result
+
+ # --------------------------------------------------------------------
+ # Rendering Methods
+
+ def _format_attrs(self):
+ """
+ Return a list of tuples of the (attr,formatted_value)
+ """
+ attrs = [
+ ('levels', ibase.default_pprint(self._levels,
+ max_seq_items=False)),
+ ('codes', ibase.default_pprint(self._codes,
+ max_seq_items=False))]
+ if com._any_not_none(*self.names):
+ attrs.append(('names', ibase.default_pprint(self.names)))
+ if self.sortorder is not None:
+ attrs.append(('sortorder', ibase.default_pprint(self.sortorder)))
+ return attrs
+
+ def _format_space(self):
+ return "\n%s" % (' ' * (len(self.__class__.__name__) + 1))
+
+ def _format_data(self, name=None):
+ # we are formatting thru the attributes
+ return None
+
+ def _format_native_types(self, na_rep='nan', **kwargs):
+ new_levels = []
+ new_codes = []
+
+ # go through the levels and format them
+ for level, level_codes in zip(self.levels, self.codes):
+ level = level._format_native_types(na_rep=na_rep, **kwargs)
+ # add nan values, if there are any
+ mask = (level_codes == -1)
+ if mask.any():
+ nan_index = len(level)
+ level = np.append(level, na_rep)
+ level_codes = level_codes.values()
+ level_codes[mask] = nan_index
+ new_levels.append(level)
+ new_codes.append(level_codes)
+
+ if len(new_levels) == 1:
+ return Index(new_levels[0])._format_native_types()
+ else:
+ # reconstruct the multi-index
+ mi = MultiIndex(levels=new_levels, codes=new_codes,
+ names=self.names, sortorder=self.sortorder,
+ verify_integrity=False)
+ return mi.values
+
+ def format(self, space=2, sparsify=None, adjoin=True, names=False,
+ na_rep=None, formatter=None):
+ if len(self) == 0:
+ return []
+
+ stringified_levels = []
+ for lev, level_codes in zip(self.levels, self.codes):
+ na = na_rep if na_rep is not None else _get_na_rep(lev.dtype.type)
+
+ if len(lev) > 0:
+
+ formatted = lev.take(level_codes).format(formatter=formatter)
+
+ # we have some NA
+ mask = level_codes == -1
+ if mask.any():
+ formatted = np.array(formatted, dtype=object)
+ formatted[mask] = na
+ formatted = formatted.tolist()
+
+ else:
+ # weird all NA case
+ formatted = [pprint_thing(na if isna(x) else x,
+ escape_chars=('\t', '\r', '\n'))
+ for x in algos.take_1d(lev._values, level_codes)]
+ stringified_levels.append(formatted)
+
+ result_levels = []
+ for lev, name in zip(stringified_levels, self.names):
+ level = []
+
+ if names:
+ level.append(pprint_thing(name,
+ escape_chars=('\t', '\r', '\n'))
+ if name is not None else '')
+
+ level.extend(np.array(lev, dtype=object))
+ result_levels.append(level)
+
+ if sparsify is None:
+ sparsify = get_option("display.multi_sparse")
+
+ if sparsify:
+ sentinel = ''
+ # GH3547
+ # use value of sparsify as sentinel, unless it's an obvious
+ # "Truthey" value
+ if sparsify not in [True, 1]:
+ sentinel = sparsify
+ # little bit of a kludge job for #1217
+ result_levels = _sparsify(result_levels, start=int(names),
+ sentinel=sentinel)
+
+ if adjoin:
+ from pandas.io.formats.format import _get_adjustment
+ adj = _get_adjustment()
+ return adj.adjoin(space, *result_levels).split('\n')
+ else:
+ return result_levels
+
+ # --------------------------------------------------------------------
+
+ def __len__(self):
+ return len(self.codes[0])
+
+ def _get_names(self):
+ return FrozenList(level.name for level in self.levels)
+
+ def _set_names(self, names, level=None, validate=True):
+ """
+ Set new names on index. Each name has to be a hashable type.
+
+ Parameters
+ ----------
+ values : str or sequence
+ name(s) to set
+ level : int, level name, or sequence of int/level names (default None)
+ If the index is a MultiIndex (hierarchical), level(s) to set (None
+ for all levels). Otherwise level must be None
+ validate : boolean, default True
+ validate that the names match level lengths
+
+ Raises
+ ------
+ TypeError if each name is not hashable.
+
+ Notes
+ -----
+ sets names on levels. WARNING: mutates!
+
+ Note that you generally want to set this *after* changing levels, so
+ that it only acts on copies
+ """
+ # GH 15110
+ # Don't allow a single string for names in a MultiIndex
+ if names is not None and not is_list_like(names):
+ raise ValueError('Names should be list-like for a MultiIndex')
+ names = list(names)
+
+ if validate and level is not None and len(names) != len(level):
+ raise ValueError('Length of names must match length of level.')
+ if validate and level is None and len(names) != self.nlevels:
+ raise ValueError('Length of names must match number of levels in '
+ 'MultiIndex.')
+
+ if level is None:
+ level = range(self.nlevels)
+ else:
+ level = [self._get_level_number(l) for l in level]
+
+ # set the name
+ for l, name in zip(level, names):
+ if name is not None:
+ # GH 20527
+ # All items in 'names' need to be hashable:
+ if not is_hashable(name):
+ raise TypeError('{}.name must be a hashable type'
+ .format(self.__class__.__name__))
+ self.levels[l].rename(name, inplace=True)
+
+ names = property(fset=_set_names, fget=_get_names,
+ doc="Names of levels in MultiIndex")
+
+ @Appender(_index_shared_docs['_get_grouper_for_level'])
+ def _get_grouper_for_level(self, mapper, level):
+ indexer = self.codes[level]
+ level_index = self.levels[level]
+
+ if mapper is not None:
+ # Handle group mapping function and return
+ level_values = self.levels[level].take(indexer)
+ grouper = level_values.map(mapper)
+ return grouper, None, None
+
+ codes, uniques = algos.factorize(indexer, sort=True)
+
+ if len(uniques) > 0 and uniques[0] == -1:
+ # Handle NAs
+ mask = indexer != -1
+ ok_codes, uniques = algos.factorize(indexer[mask], sort=True)
+
+ codes = np.empty(len(indexer), dtype=indexer.dtype)
+ codes[mask] = ok_codes
+ codes[~mask] = -1
+
+ if len(uniques) < len(level_index):
+ # Remove unobserved levels from level_index
+ level_index = level_index.take(uniques)
+
+ grouper = level_index.take(codes)
+
+ return grouper, codes, level_index
+
+ @property
+ def _constructor(self):
+ return MultiIndex.from_tuples
+
+ @cache_readonly
+ def inferred_type(self):
+ return 'mixed'
+
+ def _get_level_number(self, level):
+ count = self.names.count(level)
+ if (count > 1) and not is_integer(level):
+ raise ValueError('The name %s occurs multiple times, use a '
+ 'level number' % level)
+ try:
+ level = self.names.index(level)
+ except ValueError:
+ if not is_integer(level):
+ raise KeyError('Level %s not found' % str(level))
+ elif level < 0:
+ level += self.nlevels
+ if level < 0:
+ orig_level = level - self.nlevels
+ raise IndexError('Too many levels: Index has only %d '
+ 'levels, %d is not a valid level number' %
+ (self.nlevels, orig_level))
+ # Note: levels are zero-based
+ elif level >= self.nlevels:
+ raise IndexError('Too many levels: Index has only %d levels, '
+ 'not %d' % (self.nlevels, level + 1))
+ return level
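+
+    # Illustrative sketch (not part of pandas): level lookup accepts names,
+    # non-negative ints and negative ints, e.g.
+    #
+    #   mi = pd.MultiIndex.from_arrays([[1], ['x']], names=['a', 'b'])
+    #   mi._get_level_number('b')    # -> 1
+    #   mi._get_level_number(-1)     # -> 1  (negative levels wrap around)
+    #   mi._get_level_number(5)      # -> IndexError: Too many levels ...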
+
+ _tuples = None
+
+ @cache_readonly
+ def _engine(self):
+ # Calculate the number of bits needed to represent labels in each
+ # level, as log2 of their sizes (including -1 for NaN):
+ sizes = np.ceil(np.log2([len(l) + 1 for l in self.levels]))
+
+ # Sum bit counts, starting from the _right_....
+ lev_bits = np.cumsum(sizes[::-1])[::-1]
+
+ # ... in order to obtain offsets such that sorting the combination of
+ # shifted codes (one for each level, resulting in a unique integer) is
+ # equivalent to sorting lexicographically the codes themselves. Notice
+ # that each level needs to be shifted by the number of bits needed to
+ # represent the _previous_ ones:
+ offsets = np.concatenate([lev_bits[1:], [0]]).astype('uint64')
+
+ # Check the total number of bits needed for our representation:
+ if lev_bits[0] > 64:
+ # The levels would overflow a 64 bit uint - use Python integers:
+ return MultiIndexPyIntEngine(self.levels, self.codes, offsets)
+ return MultiIndexUIntEngine(self.levels, self.codes, offsets)
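+
+    # Illustrative sketch (not part of pandas): for two levels of sizes
+    # 3 and 4,
+    #
+    #   sizes    = ceil(log2([3 + 1, 4 + 1]))  # [2., 3.]  bits per level
+    #   lev_bits = cumsum(sizes[::-1])[::-1]   # [5., 3.]  cumulative bits
+    #   offsets  = [lev_bits[1], 0]            # [3, 0]    shift per level
+    #
+    # 5 <= 64, so the uint64-backed MultiIndexUIntEngine is used.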
+
+ @property
+ def values(self):
+ if self._tuples is not None:
+ return self._tuples
+
+ values = []
+
+ for i in range(self.nlevels):
+ vals = self._get_level_values(i)
+ if is_categorical_dtype(vals):
+ vals = vals.get_values()
+ if (isinstance(vals.dtype, (PandasExtensionDtype, ExtensionDtype))
+ or hasattr(vals, '_box_values')):
+ vals = vals.astype(object)
+ vals = np.array(vals, copy=False)
+ values.append(vals)
+
+ self._tuples = lib.fast_zip(values)
+ return self._tuples
+
+ @property
+ def _has_complex_internals(self):
+ # to disable groupby tricks
+ return True
+
+ @cache_readonly
+ def is_monotonic_increasing(self):
+ """
+        Return True if the index is monotonic increasing (values are equal
+        or increasing).
+ """
+
+ # reversed() because lexsort() wants the most significant key last.
+ values = [self._get_level_values(i).values
+ for i in reversed(range(len(self.levels)))]
+ try:
+ sort_order = np.lexsort(values)
+ return Index(sort_order).is_monotonic
+ except TypeError:
+
+ # we have mixed types and np.lexsort is not happy
+ return Index(self.values).is_monotonic
+
+ @cache_readonly
+ def is_monotonic_decreasing(self):
+ """
+        Return True if the index is monotonic decreasing (values are equal
+        or decreasing).
+ """
+ # monotonic decreasing if and only if reverse is monotonic increasing
+ return self[::-1].is_monotonic_increasing
+
+ @cache_readonly
+ def _have_mixed_levels(self):
+        """ return a boolean list indicating whether we have mixed levels """
+ return ['mixed' in l for l in self._inferred_type_levels]
+
+ @cache_readonly
+ def _inferred_type_levels(self):
+ """ return a list of the inferred types, one for each level """
+ return [i.inferred_type for i in self.levels]
+
+ @cache_readonly
+ def _hashed_values(self):
+ """ return a uint64 ndarray of my hashed values """
+ from pandas.core.util.hashing import hash_tuples
+ return hash_tuples(self)
+
+ def _hashed_indexing_key(self, key):
+ """
+ validate and return the hash for the provided key
+
+ *this is internal for use for the cython routines*
+
+ Parameters
+ ----------
+ key : string or tuple
+
+ Returns
+ -------
+ np.uint64
+
+ Notes
+ -----
+ we need to stringify if we have mixed levels
+
+ """
+ from pandas.core.util.hashing import hash_tuples, hash_tuple
+
+ if not isinstance(key, tuple):
+ return hash_tuples(key)
+
+ if not len(key) == self.nlevels:
+ raise KeyError
+
+ def f(k, stringify):
+ if stringify and not isinstance(k, compat.string_types):
+ k = str(k)
+ return k
+ key = tuple(f(k, stringify)
+ for k, stringify in zip(key, self._have_mixed_levels))
+ return hash_tuple(key)
+
+ @Appender(Index.duplicated.__doc__)
+ def duplicated(self, keep='first'):
+ from pandas.core.sorting import get_group_index
+ from pandas._libs.hashtable import duplicated_int64
+
+ shape = map(len, self.levels)
+ ids = get_group_index(self.codes, shape, sort=False, xnull=False)
+
+ return duplicated_int64(ids, keep)
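+
+    # Illustrative sketch (not part of pandas): the level codes of each row
+    # are collapsed into one group id, so duplicate detection reduces to the
+    # flat int64 case, e.g.
+    #
+    #   mi = pd.MultiIndex.from_tuples([('a', 1), ('a', 1), ('b', 2)])
+    #   mi.duplicated()               # -> array([False,  True, False])
+    #   mi.duplicated(keep='last')    # -> array([ True, False, False])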
+
+ def fillna(self, value=None, downcast=None):
+ """
+ fillna is not implemented for MultiIndex
+ """
+ raise NotImplementedError('isna is not defined for MultiIndex')
+
+ @Appender(_index_shared_docs['dropna'])
+ def dropna(self, how='any'):
+ nans = [level_codes == -1 for level_codes in self.codes]
+ if how == 'any':
+ indexer = np.any(nans, axis=0)
+ elif how == 'all':
+ indexer = np.all(nans, axis=0)
+ else:
+ raise ValueError("invalid how option: {0}".format(how))
+
+ new_codes = [level_codes[~indexer] for level_codes in self.codes]
+ return self.copy(codes=new_codes, deep=True)
+
+ def get_value(self, series, key):
+ # somewhat broken encapsulation
+ from pandas.core.indexing import maybe_droplevels
+
+ # Label-based
+ s = com.values_from_object(series)
+ k = com.values_from_object(key)
+
+ def _try_mi(k):
+ # TODO: what if a level contains tuples??
+ loc = self.get_loc(k)
+ new_values = series._values[loc]
+ new_index = self[loc]
+ new_index = maybe_droplevels(new_index, k)
+ return series._constructor(new_values, index=new_index,
+ name=series.name).__finalize__(self)
+
+ try:
+ return self._engine.get_value(s, k)
+ except KeyError as e1:
+ try:
+ return _try_mi(key)
+ except KeyError:
+ pass
+
+ try:
+ return libindex.get_value_at(s, k)
+ except IndexError:
+ raise
+ except TypeError:
+ # generator/iterator-like
+ if is_iterator(key):
+ raise InvalidIndexError(key)
+ else:
+ raise e1
+ except Exception: # pragma: no cover
+ raise e1
+ except TypeError:
+
+ # a Timestamp will raise a TypeError in a multi-index
+ # rather than a KeyError, try it here
+ # note that a string that 'looks' like a Timestamp will raise
+ # a KeyError! (GH5725)
+ if (isinstance(key, (datetime.datetime, np.datetime64)) or
+ (compat.PY3 and isinstance(key, compat.string_types))):
+ try:
+ return _try_mi(key)
+ except KeyError:
+ raise
+ except (IndexError, ValueError, TypeError):
+ pass
+
+ try:
+ return _try_mi(Timestamp(key))
+ except (KeyError, TypeError,
+ IndexError, ValueError, tslibs.OutOfBoundsDatetime):
+ pass
+
+ raise InvalidIndexError(key)
+
+ def _get_level_values(self, level, unique=False):
+ """
+ Return vector of label values for requested level,
+ equal to the length of the index
+
+ **this is an internal method**
+
+ Parameters
+ ----------
+ level : int level
+ unique : bool, default False
+ if True, drop duplicated values
+
+ Returns
+ -------
+ values : ndarray
+ """
+
+ values = self.levels[level]
+ level_codes = self.codes[level]
+ if unique:
+ level_codes = algos.unique(level_codes)
+ filled = algos.take_1d(values._values, level_codes,
+ fill_value=values._na_value)
+ values = values._shallow_copy(filled)
+ return values
+
+ def get_level_values(self, level):
+ """
+ Return vector of label values for requested level,
+ equal to the length of the index.
+
+ Parameters
+ ----------
+ level : int or str
+ ``level`` is either the integer position of the level in the
+ MultiIndex, or the name of the level.
+
+ Returns
+ -------
+ values : Index
+ ``values`` is a level of this MultiIndex converted to
+ a single :class:`Index` (or subclass thereof).
+
+ Examples
+        --------
+
+ Create a MultiIndex:
+
+ >>> mi = pd.MultiIndex.from_arrays((list('abc'), list('def')))
+ >>> mi.names = ['level_1', 'level_2']
+
+ Get level values by supplying level as either integer or name:
+
+ >>> mi.get_level_values(0)
+ Index(['a', 'b', 'c'], dtype='object', name='level_1')
+ >>> mi.get_level_values('level_2')
+ Index(['d', 'e', 'f'], dtype='object', name='level_2')
+ """
+ level = self._get_level_number(level)
+ values = self._get_level_values(level)
+ return values
+
+ @Appender(_index_shared_docs['index_unique'] % _index_doc_kwargs)
+ def unique(self, level=None):
+
+ if level is None:
+ return super(MultiIndex, self).unique()
+ else:
+ level = self._get_level_number(level)
+ return self._get_level_values(level=level, unique=True)
+
+ def _to_safe_for_reshape(self):
+ """ convert to object if we are a categorical """
+ return self.set_levels([i._to_safe_for_reshape() for i in self.levels])
+
+ def to_frame(self, index=True, name=None):
+ """
+ Create a DataFrame with the levels of the MultiIndex as columns.
+
+ Column ordering is determined by the DataFrame constructor with data as
+ a dict.
+
+ .. versionadded:: 0.24.0
+
+ Parameters
+ ----------
+ index : boolean, default True
+ Set the index of the returned DataFrame as the original MultiIndex.
+
+ name : list / sequence of strings, optional
+ The passed names should substitute index level names.
+
+ Returns
+ -------
+ DataFrame : a DataFrame containing the original MultiIndex data.
+
+ See Also
+ --------
+ DataFrame
+ """
+
+ from pandas import DataFrame
+ if name is not None:
+ if not is_list_like(name):
+ raise TypeError("'name' must be a list / sequence "
+ "of column names.")
+
+ if len(name) != len(self.levels):
+ raise ValueError("'name' should have same length as "
+ "number of levels on index.")
+ idx_names = name
+ else:
+ idx_names = self.names
+
+ # Guarantee resulting column order
+ result = DataFrame(
+ OrderedDict([
+ ((level if lvlname is None else lvlname),
+ self._get_level_values(level))
+ for lvlname, level in zip(idx_names, range(len(self.levels)))
+ ]),
+ copy=False
+ )
+
+ if index:
+ result.index = self
+ return result
+
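+    # Editor's note -- illustrative sketch only, not part of the upstream
+    # source: the OrderedDict above is what pins the column order to the
+    # level order (a plain dict is unordered on Python 2), e.g.:
+    #
+    #   >>> mi = pd.MultiIndex.from_arrays([[1, 2], ['x', 'y']],
+    #   ...                                names=['a', 'b'])
+    #   >>> list(mi.to_frame(index=False).columns)
+    #   ['a', 'b']
+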
+ def to_hierarchical(self, n_repeat, n_shuffle=1):
+ """
+ Return a MultiIndex reshaped to conform to the
+ shapes given by n_repeat and n_shuffle.
+
+ .. deprecated:: 0.24.0
+
+ Useful to replicate and rearrange a MultiIndex for combination
+ with another Index with n_repeat items.
+
+ Parameters
+ ----------
+ n_repeat : int
+ Number of times to repeat the labels on self
+ n_shuffle : int
+ Controls the reordering of the labels. If the result is going
+ to be an inner level in a MultiIndex, n_shuffle will need to be
+            greater than one. The size of each label must be divisible by
+ n_shuffle.
+
+ Returns
+ -------
+ MultiIndex
+
+ Examples
+ --------
+ >>> idx = pd.MultiIndex.from_tuples([(1, u'one'), (1, u'two'),
+ (2, u'one'), (2, u'two')])
+ >>> idx.to_hierarchical(3)
+ MultiIndex(levels=[[1, 2], [u'one', u'two']],
+ codes=[[0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1],
+ [0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1]])
+ """
+ levels = self.levels
+ codes = [np.repeat(level_codes, n_repeat) for
+ level_codes in self.codes]
+ # Assumes that each level_codes is divisible by n_shuffle
+ codes = [x.reshape(n_shuffle, -1).ravel(order='F') for x in codes]
+ names = self.names
+ warnings.warn("Method .to_hierarchical is deprecated and will "
+ "be removed in a future version",
+ FutureWarning, stacklevel=2)
+ return MultiIndex(levels=levels, codes=codes, names=names)
+
+ def to_flat_index(self):
+ """
+ Convert a MultiIndex to an Index of Tuples containing the level values.
+
+ .. versionadded:: 0.24.0
+
+ Returns
+ -------
+ pd.Index
+ Index with the MultiIndex data represented in Tuples.
+
+ Notes
+ -----
+ This method will simply return the caller if called by anything other
+ than a MultiIndex.
+
+ Examples
+ --------
+ >>> index = pd.MultiIndex.from_product(
+ ... [['foo', 'bar'], ['baz', 'qux']],
+ ... names=['a', 'b'])
+ >>> index.to_flat_index()
+ Index([('foo', 'baz'), ('foo', 'qux'),
+ ('bar', 'baz'), ('bar', 'qux')],
+ dtype='object')
+ """
+ return Index(self.values, tupleize_cols=False)
+
+ @property
+ def is_all_dates(self):
+ return False
+
+ def is_lexsorted(self):
+ """
+ Return True if the codes are lexicographically sorted
+ """
+ return self.lexsort_depth == self.nlevels
+
+ @cache_readonly
+ def lexsort_depth(self):
+ if self.sortorder is not None:
+ if self.sortorder == 0:
+ return self.nlevels
+ else:
+ return 0
+
+ int64_codes = [ensure_int64(level_codes) for level_codes in self.codes]
+ for k in range(self.nlevels, 0, -1):
+ if libalgos.is_lexsorted(int64_codes[:k]):
+ return k
+
+ return 0
+
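+    # Editor's note -- illustrative sketch only, not part of the upstream
+    # source: lexsort_depth probes successively shorter prefixes of the
+    # codes, reporting how many leading levels can be used for slicing.
+    # Assuming a toy index:
+    #
+    #   >>> mi = pd.MultiIndex.from_arrays([[0, 0, 1, 1], [1, 0, 1, 0]])
+    #   >>> mi.lexsort_depth         # first level sorted, second is not
+    #   1
+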
+ def _sort_levels_monotonic(self):
+ """
+ .. versionadded:: 0.20.0
+
+ This is an *internal* function.
+
+        Create a new MultiIndex from the current one so that the items IN
+        each level are monotonically sorted. This does not actually make the
+        entire MultiIndex monotonic, JUST the levels.
+
+ The resulting MultiIndex will have the same outward
+ appearance, meaning the same .values and ordering. It will also
+ be .equals() to the original.
+
+ Returns
+ -------
+ MultiIndex
+
+ Examples
+ --------
+
+ >>> i = pd.MultiIndex(levels=[['a', 'b'], ['bb', 'aa']],
+ codes=[[0, 0, 1, 1], [0, 1, 0, 1]])
+ >>> i
+ MultiIndex(levels=[['a', 'b'], ['bb', 'aa']],
+ codes=[[0, 0, 1, 1], [0, 1, 0, 1]])
+
+        >>> i._sort_levels_monotonic()
+ MultiIndex(levels=[['a', 'b'], ['aa', 'bb']],
+ codes=[[0, 0, 1, 1], [1, 0, 1, 0]])
+
+ """
+
+ if self.is_lexsorted() and self.is_monotonic:
+ return self
+
+ new_levels = []
+ new_codes = []
+
+ for lev, level_codes in zip(self.levels, self.codes):
+
+ if not lev.is_monotonic:
+ try:
+ # indexer to reorder the levels
+ indexer = lev.argsort()
+ except TypeError:
+ pass
+ else:
+ lev = lev.take(indexer)
+
+ # indexer to reorder the level codes
+ indexer = ensure_int64(indexer)
+ ri = lib.get_reverse_indexer(indexer, len(indexer))
+ level_codes = algos.take_1d(ri, level_codes)
+
+ new_levels.append(lev)
+ new_codes.append(level_codes)
+
+ return MultiIndex(new_levels, new_codes,
+ names=self.names, sortorder=self.sortorder,
+ verify_integrity=False)
+
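+    # Editor's note -- illustrative sketch only, not part of the upstream
+    # source: the remapping above, written as plain numpy for a level
+    # ['b', 'a'] with codes [0, 1, 0]:
+    #
+    #   indexer = np.argsort(np.array(['b', 'a']))  # -> [1, 0]
+    #   ri = np.argsort(indexer)                    # old code -> new code
+    #   new_codes = ri.take([0, 1, 0])              # -> [1, 0, 1]
+    #
+    # i.e. the displayed tuples are unchanged; only the level is re-sorted.
+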
+ def remove_unused_levels(self):
+ """
+        Create a new MultiIndex from the current one, removing unused levels,
+        meaning level values that are not expressed in the codes.
+
+ The resulting MultiIndex will have the same outward
+ appearance, meaning the same .values and ordering. It will also
+ be .equals() to the original.
+
+ .. versionadded:: 0.20.0
+
+ Returns
+ -------
+ MultiIndex
+
+ Examples
+ --------
+        >>> i = pd.MultiIndex.from_product([range(2), list('ab')])
+        >>> i
+        MultiIndex(levels=[[0, 1], ['a', 'b']],
+                   codes=[[0, 0, 1, 1], [0, 1, 0, 1]])
+
+ >>> i[2:]
+ MultiIndex(levels=[[0, 1], ['a', 'b']],
+ codes=[[1, 1], [0, 1]])
+
+ The 0 from the first level is not represented
+ and can be removed
+
+ >>> i[2:].remove_unused_levels()
+ MultiIndex(levels=[[1], ['a', 'b']],
+ codes=[[0, 0], [0, 1]])
+ """
+
+ new_levels = []
+ new_codes = []
+
+ changed = False
+ for lev, level_codes in zip(self.levels, self.codes):
+
+ # Since few levels are typically unused, bincount() is more
+ # efficient than unique() - however it only accepts positive values
+ # (and drops order):
+ uniques = np.where(np.bincount(level_codes + 1) > 0)[0] - 1
+ has_na = int(len(uniques) and (uniques[0] == -1))
+
+ if len(uniques) != len(lev) + has_na:
+ # We have unused levels
+ changed = True
+
+ # Recalculate uniques, now preserving order.
+ # Can easily be cythonized by exploiting the already existing
+ # "uniques" and stop parsing "level_codes" when all items
+ # are found:
+ uniques = algos.unique(level_codes)
+ if has_na:
+ na_idx = np.where(uniques == -1)[0]
+ # Just ensure that -1 is in first position:
+ uniques[[0, na_idx[0]]] = uniques[[na_idx[0], 0]]
+
+ # codes get mapped from uniques to 0:len(uniques)
+ # -1 (if present) is mapped to last position
+ code_mapping = np.zeros(len(lev) + has_na)
+ # ... and reassigned value -1:
+ code_mapping[uniques] = np.arange(len(uniques)) - has_na
+
+ level_codes = code_mapping[level_codes]
+
+ # new levels are simple
+ lev = lev.take(uniques[has_na:])
+
+ new_levels.append(lev)
+ new_codes.append(level_codes)
+
+ result = self._shallow_copy()
+
+ if changed:
+ result._reset_identity()
+ result._set_levels(new_levels, validate=False)
+ result._set_codes(new_codes, validate=False)
+
+ return result
+
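+    # Editor's note -- illustrative sketch only, not part of the upstream
+    # source: the bincount trick above shifts the codes by +1 so that the
+    # NA code -1 lands in a countable bin.  For level codes [2, 2, 0]:
+    #
+    #   codes = np.array([2, 2, 0])
+    #   np.where(np.bincount(codes + 1) > 0)[0] - 1   # -> array([0, 2])
+    #
+    # so level value 1 is unused and is dropped, and the surviving codes are
+    # renumbered to 0..len(uniques)-1 via code_mapping.
+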
+ @property
+ def nlevels(self):
+ """Integer number of levels in this MultiIndex."""
+ return len(self.levels)
+
+ @property
+ def levshape(self):
+ """A tuple with the length of each level."""
+ return tuple(len(x) for x in self.levels)
+
+ def __reduce__(self):
+ """Necessary for making this object picklable"""
+ d = dict(levels=[lev for lev in self.levels],
+ codes=[level_codes for level_codes in self.codes],
+ sortorder=self.sortorder, names=list(self.names))
+ return ibase._new_Index, (self.__class__, d), None
+
+ def __setstate__(self, state):
+ """Necessary for making this object picklable"""
+
+ if isinstance(state, dict):
+ levels = state.get('levels')
+ codes = state.get('codes')
+ sortorder = state.get('sortorder')
+ names = state.get('names')
+
+ elif isinstance(state, tuple):
+
+ nd_state, own_state = state
+ levels, codes, sortorder, names = own_state
+
+ self._set_levels([Index(x) for x in levels], validate=False)
+ self._set_codes(codes)
+ self._set_names(names)
+ self.sortorder = sortorder
+ self._verify_integrity()
+ self._reset_identity()
+
+ def __getitem__(self, key):
+ if is_scalar(key):
+ key = com.cast_scalar_indexer(key)
+
+ retval = []
+ for lev, level_codes in zip(self.levels, self.codes):
+ if level_codes[key] == -1:
+ retval.append(np.nan)
+ else:
+ retval.append(lev[level_codes[key]])
+
+ return tuple(retval)
+ else:
+ if com.is_bool_indexer(key):
+ key = np.asarray(key, dtype=bool)
+ sortorder = self.sortorder
+ else:
+ # cannot be sure whether the result will be sorted
+ sortorder = None
+
+ if isinstance(key, Index):
+ key = np.asarray(key)
+
+ new_codes = [level_codes[key] for level_codes in self.codes]
+
+ return MultiIndex(levels=self.levels, codes=new_codes,
+ names=self.names, sortorder=sortorder,
+ verify_integrity=False)
+
+ @Appender(_index_shared_docs['take'] % _index_doc_kwargs)
+ def take(self, indices, axis=0, allow_fill=True,
+ fill_value=None, **kwargs):
+ nv.validate_take(tuple(), kwargs)
+ indices = ensure_platform_int(indices)
+ taken = self._assert_take_fillable(self.codes, indices,
+ allow_fill=allow_fill,
+ fill_value=fill_value,
+ na_value=-1)
+ return MultiIndex(levels=self.levels, codes=taken,
+ names=self.names, verify_integrity=False)
+
+ def _assert_take_fillable(self, values, indices, allow_fill=True,
+ fill_value=None, na_value=None):
+ """ Internal method to handle NA filling of take """
+ # only fill if we are passing a non-None fill_value
+ if allow_fill and fill_value is not None:
+ if (indices < -1).any():
+ msg = ('When allow_fill=True and fill_value is not None, '
+ 'all indices must be >= -1')
+ raise ValueError(msg)
+ taken = [lab.take(indices) for lab in self.codes]
+ mask = indices == -1
+ if mask.any():
+ masked = []
+ for new_label in taken:
+ label_values = new_label.values()
+ label_values[mask] = na_value
+ masked.append(np.asarray(label_values))
+ taken = masked
+ else:
+ taken = [lab.take(indices) for lab in self.codes]
+ return taken
+
+ def append(self, other):
+ """
+        Append a collection of Index objects together.
+
+ Parameters
+ ----------
+ other : Index or list/tuple of indices
+
+ Returns
+ -------
+ appended : Index
+ """
+ if not isinstance(other, (list, tuple)):
+ other = [other]
+
+ if all((isinstance(o, MultiIndex) and o.nlevels >= self.nlevels)
+ for o in other):
+ arrays = []
+ for i in range(self.nlevels):
+ label = self._get_level_values(i)
+ appended = [o._get_level_values(i) for o in other]
+ arrays.append(label.append(appended))
+ return MultiIndex.from_arrays(arrays, names=self.names)
+
+ to_concat = (self.values, ) + tuple(k._values for k in other)
+ new_tuples = np.concatenate(to_concat)
+
+ # if all(isinstance(x, MultiIndex) for x in other):
+ try:
+ return MultiIndex.from_tuples(new_tuples, names=self.names)
+ except (TypeError, IndexError):
+ return Index(new_tuples)
+
+ def argsort(self, *args, **kwargs):
+ return self.values.argsort(*args, **kwargs)
+
+ @Appender(_index_shared_docs['repeat'] % _index_doc_kwargs)
+ def repeat(self, repeats, axis=None):
+ nv.validate_repeat(tuple(), dict(axis=axis))
+ return MultiIndex(levels=self.levels,
+ codes=[level_codes.view(np.ndarray).repeat(repeats)
+ for level_codes in self.codes],
+ names=self.names, sortorder=self.sortorder,
+ verify_integrity=False)
+
+ def where(self, cond, other=None):
+ raise NotImplementedError(".where is not supported for "
+ "MultiIndex operations")
+
+ @deprecate_kwarg(old_arg_name='labels', new_arg_name='codes')
+ def drop(self, codes, level=None, errors='raise'):
+ """
+ Make new MultiIndex with passed list of codes deleted
+
+ Parameters
+ ----------
+ codes : array-like
+ Must be a list of tuples
+ level : int or level name, default None
+
+ Returns
+ -------
+ dropped : MultiIndex
+ """
+ if level is not None:
+ return self._drop_from_level(codes, level)
+
+ try:
+ if not isinstance(codes, (np.ndarray, Index)):
+ codes = com.index_labels_to_array(codes)
+ indexer = self.get_indexer(codes)
+ mask = indexer == -1
+ if mask.any():
+ if errors != 'ignore':
+ raise ValueError('codes %s not contained in axis' %
+ codes[mask])
+ except Exception:
+ pass
+
+ inds = []
+ for level_codes in codes:
+ try:
+ loc = self.get_loc(level_codes)
+ # get_loc returns either an integer, a slice, or a boolean
+ # mask
+ if isinstance(loc, int):
+ inds.append(loc)
+ elif isinstance(loc, slice):
+ inds.extend(lrange(loc.start, loc.stop))
+ elif com.is_bool_indexer(loc):
+ if self.lexsort_depth == 0:
+ warnings.warn('dropping on a non-lexsorted multi-index'
+ ' without a level parameter may impact '
+ 'performance.',
+ PerformanceWarning,
+ stacklevel=3)
+ loc = loc.nonzero()[0]
+ inds.extend(loc)
+ else:
+ msg = 'unsupported indexer of type {}'.format(type(loc))
+ raise AssertionError(msg)
+ except KeyError:
+ if errors != 'ignore':
+ raise
+
+ return self.delete(inds)
+
+ def _drop_from_level(self, codes, level):
+ codes = com.index_labels_to_array(codes)
+ i = self._get_level_number(level)
+ index = self.levels[i]
+ values = index.get_indexer(codes)
+
+ mask = ~algos.isin(self.codes[i], values)
+
+ return self[mask]
+
+ def swaplevel(self, i=-2, j=-1):
+ """
+ Swap level i with level j.
+
+ Calling this method does not change the ordering of the values.
+
+ Parameters
+ ----------
+ i : int, str, default -2
+ First level of index to be swapped. Can pass level name as string.
+ Type of parameters can be mixed.
+ j : int, str, default -1
+ Second level of index to be swapped. Can pass level name as string.
+ Type of parameters can be mixed.
+
+ Returns
+ -------
+ MultiIndex
+ A new MultiIndex
+
+ .. versionchanged:: 0.18.1
+
+ The indexes ``i`` and ``j`` are now optional, and default to
+ the two innermost levels of the index.
+
+ See Also
+ --------
+ Series.swaplevel : Swap levels i and j in a MultiIndex.
+        DataFrame.swaplevel : Swap levels i and j in a MultiIndex on a
+ particular axis.
+
+ Examples
+ --------
+ >>> mi = pd.MultiIndex(levels=[['a', 'b'], ['bb', 'aa']],
+ ... codes=[[0, 0, 1, 1], [0, 1, 0, 1]])
+ >>> mi
+ MultiIndex(levels=[['a', 'b'], ['bb', 'aa']],
+ codes=[[0, 0, 1, 1], [0, 1, 0, 1]])
+ >>> mi.swaplevel(0, 1)
+ MultiIndex(levels=[['bb', 'aa'], ['a', 'b']],
+ codes=[[0, 1, 0, 1], [0, 0, 1, 1]])
+ """
+ new_levels = list(self.levels)
+ new_codes = list(self.codes)
+ new_names = list(self.names)
+
+ i = self._get_level_number(i)
+ j = self._get_level_number(j)
+
+ new_levels[i], new_levels[j] = new_levels[j], new_levels[i]
+ new_codes[i], new_codes[j] = new_codes[j], new_codes[i]
+ new_names[i], new_names[j] = new_names[j], new_names[i]
+
+ return MultiIndex(levels=new_levels, codes=new_codes,
+ names=new_names, verify_integrity=False)
+
+ def reorder_levels(self, order):
+ """
+        Rearrange levels using input order. May not drop or duplicate levels.
+
+        Parameters
+        ----------
+        order : list of int or list of str
+            List representing new level order. Reference level by number
+            (position) or by key (label).
+        """
+ order = [self._get_level_number(i) for i in order]
+ if len(order) != self.nlevels:
+ raise AssertionError('Length of order must be same as '
+ 'number of levels (%d), got %d' %
+ (self.nlevels, len(order)))
+ new_levels = [self.levels[i] for i in order]
+ new_codes = [self.codes[i] for i in order]
+ new_names = [self.names[i] for i in order]
+
+ return MultiIndex(levels=new_levels, codes=new_codes,
+ names=new_names, verify_integrity=False)
+
+ def __getslice__(self, i, j):
+ return self.__getitem__(slice(i, j))
+
+ def _get_codes_for_sorting(self):
+ """
+        We categorize our codes by using the
+        available categories (all, not just observed),
+        excluding any missing ones (-1); this is in preparation
+        for sorting, where we need to disambiguate that -1 is not
+        a valid value
+ """
+ from pandas.core.arrays import Categorical
+
+ def cats(level_codes):
+ return np.arange(np.array(level_codes).max() + 1 if
+ len(level_codes) else 0,
+ dtype=level_codes.dtype)
+
+ return [Categorical.from_codes(level_codes, cats(level_codes),
+ ordered=True)
+ for level_codes in self.codes]
+
+ def sortlevel(self, level=0, ascending=True, sort_remaining=True):
+ """
+ Sort MultiIndex at the requested level. The result will respect the
+ original ordering of the associated factor at that level.
+
+ Parameters
+ ----------
+ level : list-like, int or str, default 0
+ If a string is given, must be a name of the level
+ If list-like must be names or ints of levels.
+        ascending : bool or list of bool, default True
+            False to sort in descending order.
+            Can also be a list to specify a directed ordering.
+        sort_remaining : bool, default True
+            Sort by the remaining levels after ``level``.
+
+ Returns
+ -------
+ sorted_index : pd.MultiIndex
+ Resulting index
+ indexer : np.ndarray
+ Indices of output values in original index
+ """
+ from pandas.core.sorting import indexer_from_factorized
+
+ if isinstance(level, (compat.string_types, int)):
+ level = [level]
+ level = [self._get_level_number(lev) for lev in level]
+ sortorder = None
+
+ # we have a directed ordering via ascending
+ if isinstance(ascending, list):
+ if not len(level) == len(ascending):
+ raise ValueError("level must have same length as ascending")
+
+ from pandas.core.sorting import lexsort_indexer
+ indexer = lexsort_indexer([self.codes[lev] for lev in level],
+ orders=ascending)
+
+ # level ordering
+ else:
+
+ codes = list(self.codes)
+ shape = list(self.levshape)
+
+ # partition codes and shape
+ primary = tuple(codes.pop(lev - i) for i, lev in enumerate(level))
+ primshp = tuple(shape.pop(lev - i) for i, lev in enumerate(level))
+
+ if sort_remaining:
+ primary += primary + tuple(codes)
+ primshp += primshp + tuple(shape)
+ else:
+ sortorder = level[0]
+
+ indexer = indexer_from_factorized(primary, primshp,
+ compress=False)
+
+ if not ascending:
+ indexer = indexer[::-1]
+
+ indexer = ensure_platform_int(indexer)
+ new_codes = [level_codes.take(indexer) for level_codes in self.codes]
+
+ new_index = MultiIndex(codes=new_codes, levels=self.levels,
+ names=self.names, sortorder=sortorder,
+ verify_integrity=False)
+
+ return new_index, indexer
+
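+    # Editor's note -- illustrative sketch only, not part of the upstream
+    # source: the indexer returned alongside the sorted index is what lets
+    # callers realign any associated data, e.g.:
+    #
+    #   >>> mi = pd.MultiIndex.from_arrays([[2, 1], ['b', 'a']])
+    #   >>> new_mi, indexer = mi.sortlevel(0)   # indexer is roughly [1, 0]
+    #   >>> new_mi.get_level_values(0)
+    #   Int64Index([1, 2], dtype='int64')
+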
+ def _convert_listlike_indexer(self, keyarr, kind=None):
+ """
+ Parameters
+ ----------
+ keyarr : list-like
+ Indexer to convert.
+
+ Returns
+ -------
+ tuple (indexer, keyarr)
+ indexer is an ndarray or None if cannot convert
+ keyarr are tuple-safe keys
+ """
+ indexer, keyarr = super(MultiIndex, self)._convert_listlike_indexer(
+ keyarr, kind=kind)
+
+ # are we indexing a specific level
+ if indexer is None and len(keyarr) and not isinstance(keyarr[0],
+ tuple):
+ level = 0
+ _, indexer = self.reindex(keyarr, level=level)
+
+ # take all
+ if indexer is None:
+ indexer = np.arange(len(self))
+
+ check = self.levels[0].get_indexer(keyarr)
+ mask = check == -1
+ if mask.any():
+ raise KeyError('%s not in index' % keyarr[mask])
+
+ return indexer, keyarr
+
+ @Appender(_index_shared_docs['get_indexer'] % _index_doc_kwargs)
+ def get_indexer(self, target, method=None, limit=None, tolerance=None):
+ method = missing.clean_reindex_fill_method(method)
+ target = ensure_index(target)
+
+ # empty indexer
+ if is_list_like(target) and not len(target):
+ return ensure_platform_int(np.array([]))
+
+ if not isinstance(target, MultiIndex):
+ try:
+ target = MultiIndex.from_tuples(target)
+ except (TypeError, ValueError):
+
+ # let's instead try with a straight Index
+ if method is None:
+ return Index(self.values).get_indexer(target,
+ method=method,
+ limit=limit,
+ tolerance=tolerance)
+
+ if not self.is_unique:
+ raise ValueError('Reindexing only valid with uniquely valued '
+ 'Index objects')
+
+ if method == 'pad' or method == 'backfill':
+ if tolerance is not None:
+ raise NotImplementedError("tolerance not implemented yet "
+ 'for MultiIndex')
+ indexer = self._engine.get_indexer(target, method, limit)
+ elif method == 'nearest':
+ raise NotImplementedError("method='nearest' not implemented yet "
+ 'for MultiIndex; see GitHub issue 9365')
+ else:
+ indexer = self._engine.get_indexer(target)
+
+ return ensure_platform_int(indexer)
+
+ @Appender(_index_shared_docs['get_indexer_non_unique'] % _index_doc_kwargs)
+ def get_indexer_non_unique(self, target):
+ return super(MultiIndex, self).get_indexer_non_unique(target)
+
+ def reindex(self, target, method=None, level=None, limit=None,
+ tolerance=None):
+ """
+ Create index with target's values (move/add/delete values as necessary)
+
+ Returns
+ -------
+ new_index : pd.MultiIndex
+ Resulting index
+ indexer : np.ndarray or None
+ Indices of output values in original index
+
+ """
+ # GH6552: preserve names when reindexing to non-named target
+ # (i.e. neither Index nor Series).
+ preserve_names = not hasattr(target, 'names')
+
+ if level is not None:
+ if method is not None:
+ raise TypeError('Fill method not supported if level passed')
+
+ # GH7774: preserve dtype/tz if target is empty and not an Index.
+ # target may be an iterator
+ target = ibase._ensure_has_len(target)
+ if len(target) == 0 and not isinstance(target, Index):
+ idx = self.levels[level]
+ attrs = idx._get_attributes_dict()
+ attrs.pop('freq', None) # don't preserve freq
+ target = type(idx)._simple_new(np.empty(0, dtype=idx.dtype),
+ **attrs)
+ else:
+ target = ensure_index(target)
+ target, indexer, _ = self._join_level(target, level, how='right',
+ return_indexers=True,
+ keep_order=False)
+ else:
+ target = ensure_index(target)
+ if self.equals(target):
+ indexer = None
+ else:
+ if self.is_unique:
+ indexer = self.get_indexer(target, method=method,
+ limit=limit,
+ tolerance=tolerance)
+ else:
+ raise ValueError("cannot handle a non-unique multi-index!")
+
+ if not isinstance(target, MultiIndex):
+ if indexer is None:
+ target = self
+ elif (indexer >= 0).all():
+ target = self.take(indexer)
+ else:
+ # hopefully?
+ target = MultiIndex.from_tuples(target)
+
+ if (preserve_names and target.nlevels == self.nlevels and
+ target.names != self.names):
+ target = target.copy(deep=False)
+ target.names = self.names
+
+ return target, indexer
+
+ def get_slice_bound(self, label, side, kind):
+
+ if not isinstance(label, tuple):
+ label = label,
+ return self._partial_tup_index(label, side=side)
+
+ def slice_locs(self, start=None, end=None, step=None, kind=None):
+ """
+ For an ordered MultiIndex, compute the slice locations for input
+ labels.
+
+ The input labels can be tuples representing partial levels, e.g. for a
+ MultiIndex with 3 levels, you can pass a single value (corresponding to
+ the first level), or a 1-, 2-, or 3-tuple.
+
+ Parameters
+ ----------
+ start : label or tuple, default None
+ If None, defaults to the beginning
+ end : label or tuple
+ If None, defaults to the end
+ step : int or None
+ Slice step
+ kind : string, optional, defaults None
+
+ Returns
+ -------
+ (start, end) : (int, int)
+
+ Notes
+ -----
+ This method only works if the MultiIndex is properly lexsorted. So,
+ if only the first 2 levels of a 3-level MultiIndex are lexsorted,
+ you can only pass two levels to ``.slice_locs``.
+
+ Examples
+ --------
+ >>> mi = pd.MultiIndex.from_arrays([list('abbd'), list('deff')],
+ ... names=['A', 'B'])
+
+ Get the slice locations from the beginning of 'b' in the first level
+ until the end of the multiindex:
+
+ >>> mi.slice_locs(start='b')
+ (1, 4)
+
+ Like above, but stop at the end of 'b' in the first level and 'f' in
+ the second level:
+
+ >>> mi.slice_locs(start='b', end=('b', 'f'))
+ (1, 3)
+
+ See Also
+ --------
+ MultiIndex.get_loc : Get location for a label or a tuple of labels.
+ MultiIndex.get_locs : Get location for a label/slice/list/mask or a
+ sequence of such.
+ """
+ # This function adds nothing to its parent implementation (the magic
+ # happens in get_slice_bound method), but it adds meaningful doc.
+ return super(MultiIndex, self).slice_locs(start, end, step, kind=kind)
+
+ def _partial_tup_index(self, tup, side='left'):
+ if len(tup) > self.lexsort_depth:
+ raise UnsortedIndexError(
+ 'Key length (%d) was greater than MultiIndex'
+ ' lexsort depth (%d)' %
+ (len(tup), self.lexsort_depth))
+
+ n = len(tup)
+ start, end = 0, len(self)
+ zipped = zip(tup, self.levels, self.codes)
+ for k, (lab, lev, labs) in enumerate(zipped):
+ section = labs[start:end]
+
+ if lab not in lev:
+ if not lev.is_type_compatible(lib.infer_dtype([lab],
+ skipna=False)):
+ raise TypeError('Level type mismatch: %s' % lab)
+
+ # short circuit
+ loc = lev.searchsorted(lab, side=side)
+ if side == 'right' and loc >= 0:
+ loc -= 1
+ return start + section.searchsorted(loc, side=side)
+
+ idx = lev.get_loc(lab)
+ if k < n - 1:
+ end = start + section.searchsorted(idx, side='right')
+ start = start + section.searchsorted(idx, side='left')
+ else:
+ return start + section.searchsorted(idx, side=side)
+
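+    # Editor's note -- illustrative sketch only, not part of the upstream
+    # source: each iteration above narrows [start, end) to the block of rows
+    # whose code at that level matches the label, using binary search, e.g.:
+    #
+    #   section = np.array([0, 0, 1, 1])       # codes of one level
+    #   section.searchsorted(1, side='left')   # -> 2
+    #   section.searchsorted(1, side='right')  # -> 4
+    #
+    # which is why the tuple may not be longer than lexsort_depth: the binary
+    # searches assume the codes inside the current block are sorted.
+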
+ def get_loc(self, key, method=None):
+ """
+ Get location for a label or a tuple of labels as an integer, slice or
+ boolean mask.
+
+ Parameters
+ ----------
+ key : label or tuple of labels (one for each level)
+ method : None
+
+ Returns
+ -------
+ loc : int, slice object or boolean mask
+ If the key is past the lexsort depth, the return may be a
+ boolean mask array, otherwise it is always a slice or int.
+
+ Examples
+        --------
+ >>> mi = pd.MultiIndex.from_arrays([list('abb'), list('def')])
+
+ >>> mi.get_loc('b')
+ slice(1, 3, None)
+
+ >>> mi.get_loc(('b', 'e'))
+ 1
+
+ Notes
+        -----
+ The key cannot be a slice, list of same-level labels, a boolean mask,
+ or a sequence of such. If you want to use those, use
+ :meth:`MultiIndex.get_locs` instead.
+
+ See Also
+ --------
+ Index.get_loc : The get_loc method for (single-level) index.
+ MultiIndex.slice_locs : Get slice location given start label(s) and
+ end label(s).
+ MultiIndex.get_locs : Get location for a label/slice/list/mask or a
+ sequence of such.
+ """
+ if method is not None:
+ raise NotImplementedError('only the default get_loc method is '
+ 'currently supported for MultiIndex')
+
+ def _maybe_to_slice(loc):
+ """convert integer indexer to boolean mask or slice if possible"""
+ if not isinstance(loc, np.ndarray) or loc.dtype != 'int64':
+ return loc
+
+ loc = lib.maybe_indices_to_slice(loc, len(self))
+ if isinstance(loc, slice):
+ return loc
+
+ mask = np.empty(len(self), dtype='bool')
+ mask.fill(False)
+ mask[loc] = True
+ return mask
+
+ if not isinstance(key, tuple):
+ loc = self._get_level_indexer(key, level=0)
+ return _maybe_to_slice(loc)
+
+ keylen = len(key)
+ if self.nlevels < keylen:
+ raise KeyError('Key length ({0}) exceeds index depth ({1})'
+ ''.format(keylen, self.nlevels))
+
+ if keylen == self.nlevels and self.is_unique:
+ return self._engine.get_loc(key)
+
+ # -- partial selection or non-unique index
+ # break the key into 2 parts based on the lexsort_depth of the index;
+ # the first part returns a continuous slice of the index; the 2nd part
+ # needs linear search within the slice
+ i = self.lexsort_depth
+ lead_key, follow_key = key[:i], key[i:]
+ start, stop = (self.slice_locs(lead_key, lead_key)
+ if lead_key else (0, len(self)))
+
+ if start == stop:
+ raise KeyError(key)
+
+ if not follow_key:
+ return slice(start, stop)
+
+ warnings.warn('indexing past lexsort depth may impact performance.',
+ PerformanceWarning, stacklevel=10)
+
+ loc = np.arange(start, stop, dtype='int64')
+
+ for i, k in enumerate(follow_key, len(lead_key)):
+ mask = self.codes[i][loc] == self.levels[i].get_loc(k)
+ if not mask.all():
+ loc = loc[mask]
+ if not len(loc):
+ raise KeyError(key)
+
+ return (_maybe_to_slice(loc) if len(loc) != stop - start else
+ slice(start, stop))
+
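+    # Editor's note -- illustrative sketch only, not part of the upstream
+    # source: for keys longer than lexsort_depth the lookup is two-phased.
+    # Assuming lexsort_depth == 1 and key ('b', 'z'):
+    #
+    #   lead_key, follow_key = ('b',), ('z',)
+    #   start, stop = self.slice_locs(('b',), ('b',))   # binary search
+    #   # ...then 'z' is matched by a linear scan of codes[1][start:stop],
+    #   # which is what the PerformanceWarning above refers to.
+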
+ def get_loc_level(self, key, level=0, drop_level=True):
+ """
+ Get both the location for the requested label(s) and the
+ resulting sliced index.
+
+ Parameters
+ ----------
+ key : label or sequence of labels
+ level : int/level name or list thereof, optional
+ drop_level : bool, default True
+ if ``False``, the resulting index will not drop any level.
+
+ Returns
+ -------
+ loc : A 2-tuple where the elements are:
+ Element 0: int, slice object or boolean array
+ Element 1: The resulting sliced multiindex/index. If the key
+ contains all levels, this will be ``None``.
+
+ Examples
+ --------
+ >>> mi = pd.MultiIndex.from_arrays([list('abb'), list('def')],
+ ... names=['A', 'B'])
+
+ >>> mi.get_loc_level('b')
+ (slice(1, 3, None), Index(['e', 'f'], dtype='object', name='B'))
+
+ >>> mi.get_loc_level('e', level='B')
+ (array([False, True, False], dtype=bool),
+ Index(['b'], dtype='object', name='A'))
+
+ >>> mi.get_loc_level(['b', 'e'])
+ (1, None)
+
+ See Also
+        --------
+ MultiIndex.get_loc : Get location for a label or a tuple of labels.
+ MultiIndex.get_locs : Get location for a label/slice/list/mask or a
+ sequence of such.
+ """
+
+ def maybe_droplevels(indexer, levels, drop_level):
+ if not drop_level:
+ return self[indexer]
+            # kludge around
+ orig_index = new_index = self[indexer]
+ levels = [self._get_level_number(i) for i in levels]
+ for i in sorted(levels, reverse=True):
+ try:
+ new_index = new_index.droplevel(i)
+ except ValueError:
+
+ # no dropping here
+ return orig_index
+ return new_index
+
+ if isinstance(level, (tuple, list)):
+ if len(key) != len(level):
+ raise AssertionError('Key for location must have same '
+ 'length as number of levels')
+ result = None
+ for lev, k in zip(level, key):
+ loc, new_index = self.get_loc_level(k, level=lev)
+ if isinstance(loc, slice):
+ mask = np.zeros(len(self), dtype=bool)
+ mask[loc] = True
+ loc = mask
+
+ result = loc if result is None else result & loc
+
+ return result, maybe_droplevels(result, level, drop_level)
+
+ level = self._get_level_number(level)
+
+ # kludge for #1796
+ if isinstance(key, list):
+ key = tuple(key)
+
+ if isinstance(key, tuple) and level == 0:
+
+ try:
+ if key in self.levels[0]:
+ indexer = self._get_level_indexer(key, level=level)
+ new_index = maybe_droplevels(indexer, [0], drop_level)
+ return indexer, new_index
+ except TypeError:
+ pass
+
+ if not any(isinstance(k, slice) for k in key):
+
+ # partial selection
+ # optionally get indexer to avoid re-calculation
+ def partial_selection(key, indexer=None):
+ if indexer is None:
+ indexer = self.get_loc(key)
+ ilevels = [i for i in range(len(key))
+ if key[i] != slice(None, None)]
+ return indexer, maybe_droplevels(indexer, ilevels,
+ drop_level)
+
+ if len(key) == self.nlevels and self.is_unique:
+ # Complete key in unique index -> standard get_loc
+ return (self._engine.get_loc(key), None)
+ else:
+ return partial_selection(key)
+ else:
+ indexer = None
+ for i, k in enumerate(key):
+ if not isinstance(k, slice):
+ k = self._get_level_indexer(k, level=i)
+ if isinstance(k, slice):
+ # everything
+ if k.start == 0 and k.stop == len(self):
+ k = slice(None, None)
+ else:
+ k_index = k
+
+ if isinstance(k, slice):
+ if k == slice(None, None):
+ continue
+ else:
+ raise TypeError(key)
+
+ if indexer is None:
+ indexer = k_index
+ else: # pragma: no cover
+ indexer &= k_index
+ if indexer is None:
+ indexer = slice(None, None)
+ ilevels = [i for i in range(len(key))
+ if key[i] != slice(None, None)]
+ return indexer, maybe_droplevels(indexer, ilevels, drop_level)
+ else:
+ indexer = self._get_level_indexer(key, level=level)
+ return indexer, maybe_droplevels(indexer, [level], drop_level)
+
+ def _get_level_indexer(self, key, level=0, indexer=None):
+ # return an indexer, boolean array or a slice showing where the key is
+ # in the totality of values
+ # if the indexer is provided, then use this
+
+ level_index = self.levels[level]
+ level_codes = self.codes[level]
+
+ def convert_indexer(start, stop, step, indexer=indexer,
+ codes=level_codes):
+ # given the inputs and the codes/indexer, compute an indexer set
+ # if we have a provided indexer, then this need not consider
+ # the entire labels set
+
+ r = np.arange(start, stop, step)
+ if indexer is not None and len(indexer) != len(codes):
+
+ # we have an indexer which maps the locations in the labels
+ # that we have already selected (and is not an indexer for the
+ # entire set) otherwise this is wasteful so we only need to
+ # examine locations that are in this set the only magic here is
+ # that the result are the mappings to the set that we have
+ # selected
+ from pandas import Series
+ mapper = Series(indexer)
+ indexer = codes.take(ensure_platform_int(indexer))
+ result = Series(Index(indexer).isin(r).nonzero()[0])
+ m = result.map(mapper)._ndarray_values
+
+ else:
+ m = np.zeros(len(codes), dtype=bool)
+ m[np.in1d(codes, r,
+ assume_unique=Index(codes).is_unique)] = True
+
+ return m
+
+ if isinstance(key, slice):
+            # handle a slice, returning a slice if we can
+ # otherwise a boolean indexer
+
+ try:
+ if key.start is not None:
+ start = level_index.get_loc(key.start)
+ else:
+ start = 0
+ if key.stop is not None:
+ stop = level_index.get_loc(key.stop)
+ else:
+ stop = len(level_index) - 1
+ step = key.step
+ except KeyError:
+
+ # we have a partial slice (like looking up a partial date
+ # string)
+ start = stop = level_index.slice_indexer(key.start, key.stop,
+ key.step, kind='loc')
+ step = start.step
+
+ if isinstance(start, slice) or isinstance(stop, slice):
+ # we have a slice for start and/or stop
+ # a partial date slicer on a DatetimeIndex generates a slice
+ # note that the stop ALREADY includes the stopped point (if
+ # it was a string sliced)
+ return convert_indexer(start.start, stop.stop, step)
+
+ elif level > 0 or self.lexsort_depth == 0 or step is not None:
+                # need semantics like right-searching, as when we are using
+                # a slice, so include stop + 1 (so that stop itself is
+                # included)
+ return convert_indexer(start, stop + 1, step)
+ else:
+ # sorted, so can return slice object -> view
+ i = level_codes.searchsorted(start, side='left')
+ j = level_codes.searchsorted(stop, side='right')
+ return slice(i, j, step)
+
+ else:
+
+ code = level_index.get_loc(key)
+
+ if level > 0 or self.lexsort_depth == 0:
+ # Desired level is not sorted
+ locs = np.array(level_codes == code, dtype=bool, copy=False)
+ if not locs.any():
+ # The label is present in self.levels[level] but unused:
+ raise KeyError(key)
+ return locs
+
+ i = level_codes.searchsorted(code, side='left')
+ j = level_codes.searchsorted(code, side='right')
+ if i == j:
+ # The label is present in self.levels[level] but unused:
+ raise KeyError(key)
+ return slice(i, j)
+
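+    # Editor's note -- illustrative sketch only, not part of the upstream
+    # source: on a lexsorted outermost level the result above is a cheap
+    # slice (a view), e.g. level codes [0, 0, 1, 1] with key code 1 give
+    # slice(2, 4); on deeper or unsorted levels it falls back to a full
+    # boolean mask, codes == 1 -> array([False, False,  True,  True]).
+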
+ def get_locs(self, seq):
+ """
+ Get location for a given label/slice/list/mask or a sequence of such as
+ an array of integers.
+
+ Parameters
+ ----------
+ seq : label/slice/list/mask or a sequence of such
+ You should use one of the above for each level.
+ If a level should not be used, set it to ``slice(None)``.
+
+ Returns
+ -------
+ locs : array of integers suitable for passing to iloc
+
+ Examples
+        --------
+ >>> mi = pd.MultiIndex.from_arrays([list('abb'), list('def')])
+
+ >>> mi.get_locs('b')
+ array([1, 2], dtype=int64)
+
+ >>> mi.get_locs([slice(None), ['e', 'f']])
+ array([1, 2], dtype=int64)
+
+ >>> mi.get_locs([[True, False, True], slice('e', 'f')])
+ array([2], dtype=int64)
+
+ See Also
+ --------
+ MultiIndex.get_loc : Get location for a label or a tuple of labels.
+ MultiIndex.slice_locs : Get slice location given start label(s) and
+ end label(s).
+ """
+ from .numeric import Int64Index
+
+ # must be lexsorted to at least as many levels
+ true_slices = [i for (i, s) in enumerate(com.is_true_slices(seq)) if s]
+ if true_slices and true_slices[-1] >= self.lexsort_depth:
+ raise UnsortedIndexError('MultiIndex slicing requires the index '
+ 'to be lexsorted: slicing on levels {0}, '
+ 'lexsort depth {1}'
+ .format(true_slices, self.lexsort_depth))
+ # indexer
+ # this is the list of all values that we want to select
+ n = len(self)
+ indexer = None
+
+ def _convert_to_indexer(r):
+ # return an indexer
+ if isinstance(r, slice):
+ m = np.zeros(n, dtype=bool)
+ m[r] = True
+ r = m.nonzero()[0]
+ elif com.is_bool_indexer(r):
+ if len(r) != n:
+ raise ValueError("cannot index with a boolean indexer "
+ "that is not the same length as the "
+ "index")
+ r = r.nonzero()[0]
+ return Int64Index(r)
+
+ def _update_indexer(idxr, indexer=indexer):
+ if indexer is None:
+ indexer = Index(np.arange(n))
+ if idxr is None:
+ return indexer
+ return indexer & idxr
+
+ for i, k in enumerate(seq):
+
+ if com.is_bool_indexer(k):
+ # a boolean indexer, must be the same length!
+ k = np.asarray(k)
+ indexer = _update_indexer(_convert_to_indexer(k),
+ indexer=indexer)
+
+ elif is_list_like(k):
+ # a collection of labels to include from this level (these
+ # are or'd)
+ indexers = None
+ for x in k:
+ try:
+ idxrs = _convert_to_indexer(
+ self._get_level_indexer(x, level=i,
+ indexer=indexer))
+ indexers = (idxrs if indexers is None
+ else indexers | idxrs)
+ except KeyError:
+
+ # ignore not founds
+ continue
+
+ if indexers is not None:
+ indexer = _update_indexer(indexers, indexer=indexer)
+ else:
+ # no matches we are done
+ return Int64Index([])._ndarray_values
+
+ elif com.is_null_slice(k):
+ # empty slice
+ indexer = _update_indexer(None, indexer=indexer)
+
+ elif isinstance(k, slice):
+
+ # a slice, include BOTH of the labels
+ indexer = _update_indexer(_convert_to_indexer(
+ self._get_level_indexer(k, level=i, indexer=indexer)),
+ indexer=indexer)
+ else:
+ # a single label
+ indexer = _update_indexer(_convert_to_indexer(
+ self.get_loc_level(k, level=i, drop_level=False)[0]),
+ indexer=indexer)
+
+ # empty indexer
+ if indexer is None:
+ return Int64Index([])._ndarray_values
+ return indexer._ndarray_values
+
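+    # Editor's note -- illustrative sketch only, not part of the upstream
+    # source: the loop above ORs the matches *within* a level (a list of
+    # labels) and ANDs the per-level indexers *across* levels, e.g.:
+    #
+    #   >>> mi = pd.MultiIndex.from_arrays([list('abb'), list('def')])
+    #   >>> mi.get_locs([['a', 'b'], 'f'])   # (a or b) and f -> position 2
+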
+ def truncate(self, before=None, after=None):
+ """
+ Slice index between two labels / tuples, return new MultiIndex
+
+ Parameters
+ ----------
+ before : label or tuple, can be partial. Default None
+ None defaults to start
+ after : label or tuple, can be partial. Default None
+ None defaults to end
+
+ Returns
+ -------
+ truncated : MultiIndex
+ """
+ if after and before and after < before:
+ raise ValueError('after < before')
+
+ i, j = self.levels[0].slice_locs(before, after)
+ left, right = self.slice_locs(before, after)
+
+ new_levels = list(self.levels)
+ new_levels[0] = new_levels[0][i:j]
+
+ new_codes = [level_codes[left:right] for level_codes in self.codes]
+ new_codes[0] = new_codes[0] - i
+
+ return MultiIndex(levels=new_levels, codes=new_codes,
+ verify_integrity=False)
+
+ def equals(self, other):
+ """
+ Determines if two MultiIndex objects have the same labeling information
+ (the levels themselves do not necessarily have to be the same)
+
+ See Also
+ --------
+ equal_levels
+ """
+ if self.is_(other):
+ return True
+
+ if not isinstance(other, Index):
+ return False
+
+ if not isinstance(other, MultiIndex):
+ other_vals = com.values_from_object(ensure_index(other))
+ return array_equivalent(self._ndarray_values, other_vals)
+
+ if self.nlevels != other.nlevels:
+ return False
+
+ if len(self) != len(other):
+ return False
+
+ for i in range(self.nlevels):
+ self_codes = self.codes[i]
+ self_codes = self_codes[self_codes != -1]
+ self_values = algos.take_nd(np.asarray(self.levels[i]._values),
+ self_codes, allow_fill=False)
+
+ other_codes = other.codes[i]
+ other_codes = other_codes[other_codes != -1]
+ other_values = algos.take_nd(
+ np.asarray(other.levels[i]._values),
+ other_codes, allow_fill=False)
+
+            # since we use NaT for both datetime64 and timedelta64,
+            # we can have a situation where a level is typed, say,
+            # timedelta64 in self (IOW it has values other than NaT)
+            # but typed datetime64 in other (where it is all NaT);
+            # these are still equivalent
+ if len(self_values) == 0 and len(other_values) == 0:
+ continue
+
+ if not array_equivalent(self_values, other_values):
+ return False
+
+ return True
+
+ def equal_levels(self, other):
+ """
+ Return True if the levels of both MultiIndex objects are the same
+
+ """
+ if self.nlevels != other.nlevels:
+ return False
+
+ for i in range(self.nlevels):
+ if not self.levels[i].equals(other.levels[i]):
+ return False
+ return True
+
+ def union(self, other, sort=None):
+ """
+ Form the union of two MultiIndex objects
+
+ Parameters
+ ----------
+ other : MultiIndex or array / Index of tuples
+ sort : False or None, default None
+ Whether to sort the resulting Index.
+
+ * None : Sort the result, except when
+
+ 1. `self` and `other` are equal.
+ 2. `self` has length 0.
+ 3. Some values in `self` or `other` cannot be compared.
+ A RuntimeWarning is issued in this case.
+
+ * False : do not sort the result.
+
+ .. versionadded:: 0.24.0
+
+ .. versionchanged:: 0.24.1
+
+ Changed the default value from ``True`` to ``None``
+ (without change in behaviour).
+
+ Returns
+ -------
+ Index
+
+        Examples
+        --------
+        >>> index.union(index2)
+ """
+ self._validate_sort_keyword(sort)
+ self._assert_can_do_setop(other)
+ other, result_names = self._convert_can_do_setop(other)
+
+ if len(other) == 0 or self.equals(other):
+ return self
+
+ # TODO: Index.union returns other when `len(self)` is 0.
+
+ uniq_tuples = lib.fast_unique_multiple([self._ndarray_values,
+ other._ndarray_values],
+ sort=sort)
+
+ return MultiIndex.from_arrays(lzip(*uniq_tuples), sortorder=0,
+ names=result_names)
+
+ def intersection(self, other, sort=False):
+ """
+ Form the intersection of two MultiIndex objects.
+
+ Parameters
+ ----------
+ other : MultiIndex or array / Index of tuples
+ sort : False or None, default False
+ Sort the resulting MultiIndex if possible
+
+ .. versionadded:: 0.24.0
+
+ .. versionchanged:: 0.24.1
+
+ Changed the default from ``True`` to ``False``, to match
+ behaviour from before 0.24.0
+
+ Returns
+ -------
+ Index
+ """
+ self._validate_sort_keyword(sort)
+ self._assert_can_do_setop(other)
+ other, result_names = self._convert_can_do_setop(other)
+
+ if self.equals(other):
+ return self
+
+ self_tuples = self._ndarray_values
+ other_tuples = other._ndarray_values
+ uniq_tuples = set(self_tuples) & set(other_tuples)
+
+ if sort is None:
+ uniq_tuples = sorted(uniq_tuples)
+
+ if len(uniq_tuples) == 0:
+ return MultiIndex(levels=self.levels,
+ codes=[[]] * self.nlevels,
+ names=result_names, verify_integrity=False)
+ else:
+ return MultiIndex.from_arrays(lzip(*uniq_tuples), sortorder=0,
+ names=result_names)
+
+ def difference(self, other, sort=None):
+ """
+ Compute set difference of two MultiIndex objects
+
+ Parameters
+ ----------
+ other : MultiIndex
+ sort : False or None, default None
+ Sort the resulting MultiIndex if possible
+
+ .. versionadded:: 0.24.0
+
+ .. versionchanged:: 0.24.1
+
+ Changed the default value from ``True`` to ``None``
+ (without change in behaviour).
+
+ Returns
+ -------
+ diff : MultiIndex
+ """
+ self._validate_sort_keyword(sort)
+ self._assert_can_do_setop(other)
+ other, result_names = self._convert_can_do_setop(other)
+
+ if len(other) == 0:
+ return self
+
+ if self.equals(other):
+ return MultiIndex(levels=self.levels,
+ codes=[[]] * self.nlevels,
+ names=result_names, verify_integrity=False)
+
+ this = self._get_unique_index()
+
+ indexer = this.get_indexer(other)
+ indexer = indexer.take((indexer != -1).nonzero()[0])
+
+ label_diff = np.setdiff1d(np.arange(this.size), indexer,
+ assume_unique=True)
+ difference = this.values.take(label_diff)
+ if sort is None:
+ difference = sorted(difference)
+
+ if len(difference) == 0:
+ return MultiIndex(levels=[[]] * self.nlevels,
+ codes=[[]] * self.nlevels,
+ names=result_names, verify_integrity=False)
+ else:
+ return MultiIndex.from_tuples(difference, sortorder=0,
+ names=result_names)
+
+ @Appender(_index_shared_docs['astype'])
+ def astype(self, dtype, copy=True):
+ dtype = pandas_dtype(dtype)
+ if is_categorical_dtype(dtype):
+ msg = '> 1 ndim Categorical are not supported at this time'
+ raise NotImplementedError(msg)
+ elif not is_object_dtype(dtype):
+ msg = ('Setting {cls} dtype to anything other than object '
+ 'is not supported').format(cls=self.__class__)
+ raise TypeError(msg)
+ elif copy is True:
+ return self._shallow_copy()
+ return self
+
+ def _convert_can_do_setop(self, other):
+ result_names = self.names
+
+ if not hasattr(other, 'names'):
+ if len(other) == 0:
+ other = MultiIndex(levels=[[]] * self.nlevels,
+ codes=[[]] * self.nlevels,
+ verify_integrity=False)
+ else:
+ msg = 'other must be a MultiIndex or a list of tuples'
+ try:
+ other = MultiIndex.from_tuples(other)
+ except TypeError:
+ raise TypeError(msg)
+ else:
+ result_names = self.names if self.names == other.names else None
+ return other, result_names
+
+ def insert(self, loc, item):
+ """
+ Make new MultiIndex inserting new item at location
+
+ Parameters
+ ----------
+ loc : int
+ item : tuple
+ Must be same length as number of levels in the MultiIndex
+
+ Returns
+ -------
+ new_index : Index
+ """
+ # Pad the key with empty strings if lower levels of the key
+ # aren't specified:
+ if not isinstance(item, tuple):
+ item = (item, ) + ('', ) * (self.nlevels - 1)
+ elif len(item) != self.nlevels:
+ raise ValueError('Item must have length equal to number of '
+ 'levels.')
+
+ new_levels = []
+ new_codes = []
+ for k, level, level_codes in zip(item, self.levels, self.codes):
+ if k not in level:
+ # have to insert into level
+ # must insert at end otherwise you have to recompute all the
+ # other codes
+ lev_loc = len(level)
+ level = level.insert(lev_loc, k)
+ else:
+ lev_loc = level.get_loc(k)
+
+ new_levels.append(level)
+ new_codes.append(np.insert(
+ ensure_int64(level_codes), loc, lev_loc))
+
+ return MultiIndex(levels=new_levels, codes=new_codes,
+ names=self.names, verify_integrity=False)
+
+ def delete(self, loc):
+ """
+ Make new index with passed location deleted
+
+ Returns
+ -------
+ new_index : MultiIndex
+ """
+ new_codes = [np.delete(level_codes, loc) for level_codes in self.codes]
+ return MultiIndex(levels=self.levels, codes=new_codes,
+ names=self.names, verify_integrity=False)
+
+ def _wrap_joined_index(self, joined, other):
+ names = self.names if self.names == other.names else None
+ return MultiIndex.from_tuples(joined, names=names)
+
+ @Appender(Index.isin.__doc__)
+ def isin(self, values, level=None):
+ if level is None:
+ values = MultiIndex.from_tuples(values,
+ names=self.names).values
+ return algos.isin(self.values, values)
+ else:
+ num = self._get_level_number(level)
+ levs = self.levels[num]
+ level_codes = self.codes[num]
+
+ sought_labels = levs.isin(values).nonzero()[0]
+ if levs.size == 0:
+ return np.zeros(len(level_codes), dtype=np.bool_)
+ else:
+ return np.lib.arraysetops.in1d(level_codes, sought_labels)
+
+
+MultiIndex._add_numeric_methods_disabled()
+MultiIndex._add_numeric_methods_add_sub_disabled()
+MultiIndex._add_logical_methods_disabled()
+
+
+def _sparsify(label_list, start=0, sentinel=''):
+ pivoted = lzip(*label_list)
+ k = len(label_list)
+
+ result = pivoted[:start + 1]
+ prev = pivoted[start]
+
+ for cur in pivoted[start + 1:]:
+ sparse_cur = []
+
+ for i, (p, t) in enumerate(zip(prev, cur)):
+ if i == k - 1:
+ sparse_cur.append(t)
+ result.append(sparse_cur)
+ break
+
+ if p == t:
+ sparse_cur.append(sentinel)
+ else:
+ sparse_cur.extend(cur[i:])
+ result.append(sparse_cur)
+ break
+
+ prev = cur
+
+ return lzip(*result)
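+
+# Editor's note -- illustrative sketch only, not part of the upstream source:
+# _sparsify blanks out a label when it repeats the one directly above it,
+# which is what produces the sparse MultiIndex display.  It takes and returns
+# per-level sequences, e.g.:
+#
+#   _sparsify([['a', 'a', 'b'], ['x', 'y', 'y']])
+#   # -> [('a', '', 'b'), ('x', 'y', 'y')]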
+
+
+def _get_na_rep(dtype):
+ return {np.datetime64: 'NaT', np.timedelta64: 'NaT'}.get(dtype, 'NaN')
diff --git a/contrib/python/pandas/py2/pandas/core/indexes/numeric.py b/contrib/python/pandas/py2/pandas/core/indexes/numeric.py
new file mode 100644
index 00000000000..379464f4fce
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/core/indexes/numeric.py
@@ -0,0 +1,450 @@
+import warnings
+
+import numpy as np
+
+from pandas._libs import index as libindex
+import pandas.compat as compat
+from pandas.util._decorators import Appender, cache_readonly
+
+from pandas.core.dtypes.common import (
+ is_bool, is_bool_dtype, is_dtype_equal, is_extension_array_dtype, is_float,
+ is_integer_dtype, is_scalar, needs_i8_conversion, pandas_dtype)
+import pandas.core.dtypes.concat as _concat
+from pandas.core.dtypes.missing import isna
+
+from pandas.core import algorithms
+import pandas.core.common as com
+import pandas.core.indexes.base as ibase
+from pandas.core.indexes.base import (
+ Index, InvalidIndexError, _index_shared_docs)
+from pandas.core.ops import get_op_result_name
+
+_num_index_shared_docs = dict()
+
+
+class NumericIndex(Index):
+ """
+ Provide numeric type operations
+
+ This is an abstract class
+
+ """
+ _is_numeric_dtype = True
+
+ def __new__(cls, data=None, dtype=None, copy=False, name=None,
+ fastpath=None):
+
+ if fastpath is not None:
+ warnings.warn("The 'fastpath' keyword is deprecated, and will be "
+ "removed in a future version.",
+ FutureWarning, stacklevel=2)
+ if fastpath:
+ return cls._simple_new(data, name=name)
+
+ # is_scalar, generators handled in coerce_to_ndarray
+ data = cls._coerce_to_ndarray(data)
+
+ if issubclass(data.dtype.type, compat.string_types):
+ cls._string_data_error(data)
+
+ if copy or not is_dtype_equal(data.dtype, cls._default_dtype):
+ subarr = np.array(data, dtype=cls._default_dtype, copy=copy)
+ cls._assert_safe_casting(data, subarr)
+ else:
+ subarr = data
+
+ if name is None and hasattr(data, 'name'):
+ name = data.name
+ return cls._simple_new(subarr, name=name)
+
+ @Appender(_index_shared_docs['_maybe_cast_slice_bound'])
+ def _maybe_cast_slice_bound(self, label, side, kind):
+ assert kind in ['ix', 'loc', 'getitem', None]
+
+ # we will try to coerce to integers
+ return self._maybe_cast_indexer(label)
+
+ @Appender(_index_shared_docs['_shallow_copy'])
+ def _shallow_copy(self, values=None, **kwargs):
+ if values is not None and not self._can_hold_na:
+ # Ensure we are not returning an Int64Index with float data:
+ return self._shallow_copy_with_infer(values=values, **kwargs)
+ return (super(NumericIndex, self)._shallow_copy(values=values,
+ **kwargs))
+
+ def _convert_for_op(self, value):
+ """ Convert value to be insertable to ndarray """
+
+ if is_bool(value) or is_bool_dtype(value):
+ # force conversion to object
+ # so we don't lose the bools
+ raise TypeError
+
+ return value
+
+ def _convert_tolerance(self, tolerance, target):
+ tolerance = np.asarray(tolerance)
+ if target.size != tolerance.size and tolerance.size > 1:
+ raise ValueError('list-like tolerance size must match '
+ 'target index size')
+ if not np.issubdtype(tolerance.dtype, np.number):
+ if tolerance.ndim > 0:
+ raise ValueError(('tolerance argument for %s must contain '
+ 'numeric elements if it is list type') %
+ (type(self).__name__,))
+ else:
+ raise ValueError(('tolerance argument for %s must be numeric '
+ 'if it is a scalar: %r') %
+ (type(self).__name__, tolerance))
+ return tolerance
+
+ @classmethod
+ def _assert_safe_casting(cls, data, subarr):
+ """
+ Subclasses need to override this only if the process of casting data
+ from some accepted dtype to the internal dtype(s) bears the risk of
+ truncation (e.g. float to int).
+ """
+ pass
+
+ def _concat_same_dtype(self, indexes, name):
+ return _concat._concat_index_same_dtype(indexes).rename(name)
+
+ @property
+ def is_all_dates(self):
+ """
+ Checks that all the labels are datetime objects
+ """
+ return False
+
+ @Appender(Index.insert.__doc__)
+ def insert(self, loc, item):
+ # treat NA values as nans:
+ if is_scalar(item) and isna(item):
+ item = self._na_value
+ return super(NumericIndex, self).insert(loc, item)
+
+
+_num_index_shared_docs['class_descr'] = """
+ Immutable ndarray implementing an ordered, sliceable set. The basic object
+ storing axis labels for all pandas objects. %(klass)s is a special case
+ of `Index` with purely %(ltype)s labels. %(extra)s
+
+ Parameters
+ ----------
+ data : array-like (1-dimensional)
+ dtype : NumPy dtype (default: %(dtype)s)
+ copy : bool
+ Make a copy of input ndarray
+ name : object
+ Name to be stored in the index
+
+ Attributes
+ ----------
+ None
+
+ Methods
+ -------
+ None
+
+ See Also
+ --------
+ Index : The base pandas Index type.
+
+ Notes
+ -----
+ An Index instance can **only** contain hashable objects.
+"""
+
+_int64_descr_args = dict(
+ klass='Int64Index',
+ ltype='integer',
+ dtype='int64',
+ extra=''
+)
+
+
+class IntegerIndex(NumericIndex):
+ """
+ This is an abstract class for Int64Index, UInt64Index.
+ """
+
+ def __contains__(self, key):
+ """
+        Check if key is a float with a non-zero fractional part; if so,
+        return False (it cannot be in an integer index).
+ """
+ hash(key)
+ try:
+ if is_float(key) and int(key) != key:
+ return False
+ return key in self._engine
+ except (OverflowError, TypeError, ValueError):
+ return False
+
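+    # Editor's note -- illustrative sketch only, not part of the upstream
+    # source: the fractional-part check above means 3.0 can be found in an
+    # Int64Index while 3.5 cannot, without raising:
+    #
+    #   >>> idx = pd.Int64Index([1, 2, 3])
+    #   >>> 3.0 in idx, 3.5 in idx
+    #   (True, False)
+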
+
+class Int64Index(IntegerIndex):
+ __doc__ = _num_index_shared_docs['class_descr'] % _int64_descr_args
+
+ _typ = 'int64index'
+ _can_hold_na = False
+ _engine_type = libindex.Int64Engine
+ _default_dtype = np.int64
+
+ @property
+ def inferred_type(self):
+ """Always 'integer' for ``Int64Index``"""
+ return 'integer'
+
+ @property
+ def asi8(self):
+ # do not cache or you'll create a memory leak
+ return self.values.view('i8')
+
+ @Appender(_index_shared_docs['_convert_scalar_indexer'])
+ def _convert_scalar_indexer(self, key, kind=None):
+ assert kind in ['ix', 'loc', 'getitem', 'iloc', None]
+
+ # don't coerce ilocs to integers
+ if kind != 'iloc':
+ key = self._maybe_cast_indexer(key)
+ return (super(Int64Index, self)
+ ._convert_scalar_indexer(key, kind=kind))
+
+ def _wrap_joined_index(self, joined, other):
+ name = get_op_result_name(self, other)
+ return Int64Index(joined, name=name)
+
+ @classmethod
+ def _assert_safe_casting(cls, data, subarr):
+ """
+ Ensure incoming data can be represented as ints.
+ """
+ if not issubclass(data.dtype.type, np.signedinteger):
+ if not np.array_equal(data, subarr):
+ raise TypeError('Unsafe NumPy casting, you must '
+ 'explicitly cast')
+
+
+Int64Index._add_numeric_methods()
+Int64Index._add_logical_methods()
+
+_uint64_descr_args = dict(
+ klass='UInt64Index',
+ ltype='unsigned integer',
+ dtype='uint64',
+ extra=''
+)
+
+
+class UInt64Index(IntegerIndex):
+ __doc__ = _num_index_shared_docs['class_descr'] % _uint64_descr_args
+
+ _typ = 'uint64index'
+ _can_hold_na = False
+ _engine_type = libindex.UInt64Engine
+ _default_dtype = np.uint64
+
+ @property
+ def inferred_type(self):
+ """Always 'integer' for ``UInt64Index``"""
+ return 'integer'
+
+ @property
+ def asi8(self):
+ # do not cache or you'll create a memory leak
+ return self.values.view('u8')
+
+ @Appender(_index_shared_docs['_convert_scalar_indexer'])
+ def _convert_scalar_indexer(self, key, kind=None):
+ assert kind in ['ix', 'loc', 'getitem', 'iloc', None]
+
+ # don't coerce ilocs to integers
+ if kind != 'iloc':
+ key = self._maybe_cast_indexer(key)
+ return (super(UInt64Index, self)
+ ._convert_scalar_indexer(key, kind=kind))
+
+ @Appender(_index_shared_docs['_convert_arr_indexer'])
+ def _convert_arr_indexer(self, keyarr):
+ # Cast the indexer to uint64 if possible so
+ # that the values returned from indexing are
+ # also uint64.
+ keyarr = com.asarray_tuplesafe(keyarr)
+ if is_integer_dtype(keyarr):
+ return com.asarray_tuplesafe(keyarr, dtype=np.uint64)
+ return keyarr
+
+ @Appender(_index_shared_docs['_convert_index_indexer'])
+ def _convert_index_indexer(self, keyarr):
+ # Cast the indexer to uint64 if possible so
+ # that the values returned from indexing are
+ # also uint64.
+ if keyarr.is_integer():
+ return keyarr.astype(np.uint64)
+ return keyarr
+
+ def _wrap_joined_index(self, joined, other):
+ name = get_op_result_name(self, other)
+ return UInt64Index(joined, name=name)
+
+ @classmethod
+ def _assert_safe_casting(cls, data, subarr):
+ """
+ Ensure incoming data can be represented as uints.
+ """
+ if not issubclass(data.dtype.type, np.unsignedinteger):
+ if not np.array_equal(data, subarr):
+ raise TypeError('Unsafe NumPy casting, you must '
+ 'explicitly cast')
+
+
+UInt64Index._add_numeric_methods()
+UInt64Index._add_logical_methods()
+
+_float64_descr_args = dict(
+ klass='Float64Index',
+ dtype='float64',
+ ltype='float',
+ extra=''
+)
+
+
+class Float64Index(NumericIndex):
+ __doc__ = _num_index_shared_docs['class_descr'] % _float64_descr_args
+
+ _typ = 'float64index'
+ _engine_type = libindex.Float64Engine
+ _default_dtype = np.float64
+
+ @property
+ def inferred_type(self):
+ """Always 'floating' for ``Float64Index``"""
+ return 'floating'
+
+ @Appender(_index_shared_docs['astype'])
+ def astype(self, dtype, copy=True):
+ dtype = pandas_dtype(dtype)
+ if needs_i8_conversion(dtype):
+ msg = ('Cannot convert Float64Index to dtype {dtype}; integer '
+ 'values are required for conversion').format(dtype=dtype)
+ raise TypeError(msg)
+ elif (is_integer_dtype(dtype) and
+ not is_extension_array_dtype(dtype)) and self.hasnans:
+ # TODO(jreback); this can change once we have an EA Index type
+ # GH 13149
+ raise ValueError('Cannot convert NA to integer')
+ return super(Float64Index, self).astype(dtype, copy=copy)
+
+ @Appender(_index_shared_docs['_convert_scalar_indexer'])
+ def _convert_scalar_indexer(self, key, kind=None):
+ assert kind in ['ix', 'loc', 'getitem', 'iloc', None]
+
+ if kind == 'iloc':
+ return self._validate_indexer('positional', key, kind)
+
+ return key
+
+ @Appender(_index_shared_docs['_convert_slice_indexer'])
+ def _convert_slice_indexer(self, key, kind=None):
+ # if we are not a slice, then we are done
+ if not isinstance(key, slice):
+ return key
+
+ if kind == 'iloc':
+ return super(Float64Index, self)._convert_slice_indexer(key,
+ kind=kind)
+
+ # translate to locations
+ return self.slice_indexer(key.start, key.stop, key.step, kind=kind)
+
+ def _format_native_types(self, na_rep='', float_format=None, decimal='.',
+ quoting=None, **kwargs):
+ from pandas.io.formats.format import FloatArrayFormatter
+ formatter = FloatArrayFormatter(self.values, na_rep=na_rep,
+ float_format=float_format,
+ decimal=decimal, quoting=quoting,
+ fixed_width=False)
+ return formatter.get_result_as_array()
+
+ def get_value(self, series, key):
+ """ we always want to get an index value, never a value """
+ if not is_scalar(key):
+ raise InvalidIndexError
+
+ k = com.values_from_object(key)
+ loc = self.get_loc(k)
+ new_values = com.values_from_object(series)[loc]
+
+ return new_values
+
+ def equals(self, other):
+ """
+ Determines if two Index objects contain the same elements.
+ """
+ if self is other:
+ return True
+
+ if not isinstance(other, Index):
+ return False
+
+ # need to compare nans locations and make sure that they are the same
+ # since nans don't compare equal this is a bit tricky
+ try:
+ if not isinstance(other, Float64Index):
+ other = self._constructor(other)
+ if (not is_dtype_equal(self.dtype, other.dtype) or
+ self.shape != other.shape):
+ return False
+ left, right = self._ndarray_values, other._ndarray_values
+ return ((left == right) | (self._isnan & other._isnan)).all()
+ except (TypeError, ValueError):
+ return False
+
+ def __contains__(self, other):
+ if super(Float64Index, self).__contains__(other):
+ return True
+
+ try:
+ # if other is a sequence this throws a ValueError
+ return np.isnan(other) and self.hasnans
+ except ValueError:
+ try:
+ return len(other) <= 1 and ibase._try_get_item(other) in self
+ except TypeError:
+ pass
+ except TypeError:
+ pass
+
+ return False
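+ # Rough sketch of the NaN handling above (assuming "import numpy as np"
+ # and "import pandas as pd"):
+ #   >>> np.nan in pd.Float64Index([1.0, np.nan])   # hasnans -> True
+ #   True
+ #   >>> np.nan in pd.Float64Index([1.0, 2.0])      # no NaN present
+ #   False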
+
+ @Appender(_index_shared_docs['get_loc'])
+ def get_loc(self, key, method=None, tolerance=None):
+ try:
+ if np.all(np.isnan(key)) or is_bool(key):
+ nan_idxs = self._nan_idxs
+ try:
+ return nan_idxs.item()
+ except (ValueError, IndexError):
+ # should only need to catch ValueError here but on numpy
+ # 1.7 .item() can raise IndexError when NaNs are present
+ if not len(nan_idxs):
+ raise KeyError(key)
+ return nan_idxs
+ except (TypeError, NotImplementedError):
+ pass
+ return super(Float64Index, self).get_loc(key, method=method,
+ tolerance=tolerance)
+
+ @cache_readonly
+ def is_unique(self):
+ return super(Float64Index, self).is_unique and self._nan_idxs.size < 2
+
+ @Appender(Index.isin.__doc__)
+ def isin(self, values, level=None):
+ if level is not None:
+ self._validate_index_level(level)
+ return algorithms.isin(np.array(self), values)
+
+
+Float64Index._add_numeric_methods()
+Float64Index._add_logical_methods_disabled()
diff --git a/contrib/python/pandas/py2/pandas/core/indexes/period.py b/contrib/python/pandas/py2/pandas/core/indexes/period.py
new file mode 100644
index 00000000000..a4bd7f9017e
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/core/indexes/period.py
@@ -0,0 +1,966 @@
+# pylint: disable=E1101,E1103,W0232
+from datetime import datetime, timedelta
+import warnings
+
+import numpy as np
+
+from pandas._libs import index as libindex
+from pandas._libs.tslibs import (
+ NaT, frequencies as libfrequencies, iNaT, resolution)
+from pandas._libs.tslibs.period import (
+ DIFFERENT_FREQ, IncompatibleFrequency, Period)
+from pandas.util._decorators import Appender, Substitution, cache_readonly
+
+from pandas.core.dtypes.common import (
+ is_bool_dtype, is_datetime64_any_dtype, is_float, is_float_dtype,
+ is_integer, is_integer_dtype, pandas_dtype)
+
+from pandas import compat
+from pandas.core import common as com
+from pandas.core.accessor import delegate_names
+from pandas.core.algorithms import unique1d
+from pandas.core.arrays.period import (
+ PeriodArray, period_array, validate_dtype_freq)
+from pandas.core.base import _shared_docs
+import pandas.core.indexes.base as ibase
+from pandas.core.indexes.base import _index_shared_docs, ensure_index
+from pandas.core.indexes.datetimelike import (
+ DatetimeIndexOpsMixin, DatetimelikeDelegateMixin)
+from pandas.core.indexes.datetimes import DatetimeIndex, Index, Int64Index
+from pandas.core.missing import isna
+from pandas.core.ops import get_op_result_name
+from pandas.core.tools.datetimes import DateParseError, parse_time_string
+
+from pandas.tseries import frequencies
+from pandas.tseries.offsets import DateOffset, Tick
+
+_index_doc_kwargs = dict(ibase._index_doc_kwargs)
+_index_doc_kwargs.update(
+ dict(target_klass='PeriodIndex or list of Periods'))
+
+
+# --- Period index sketch
+
+
+def _new_PeriodIndex(cls, **d):
+ # GH13277 for unpickling
+ values = d.pop('data')
+ if values.dtype == 'int64':
+ freq = d.pop('freq', None)
+ values = PeriodArray(values, freq=freq)
+ return cls._simple_new(values, **d)
+ else:
+ return cls(values, **d)
+
+
+class PeriodDelegateMixin(DatetimelikeDelegateMixin):
+ """
+ Delegate from PeriodIndex to PeriodArray.
+ """
+ _delegate_class = PeriodArray
+ _delegated_properties = PeriodArray._datetimelike_ops
+ _delegated_methods = (
+ set(PeriodArray._datetimelike_methods) | {'_addsub_int_array'}
+ )
+ _raw_properties = {'is_leap_year'}
+
+
+@delegate_names(PeriodArray,
+ PeriodDelegateMixin._delegated_properties,
+ typ='property')
+@delegate_names(PeriodArray,
+ PeriodDelegateMixin._delegated_methods,
+ typ="method",
+ overwrite=True)
+class PeriodIndex(DatetimeIndexOpsMixin, Int64Index, PeriodDelegateMixin):
+ """
+ Immutable ndarray holding ordinal values indicating regular periods in
+ time such as particular years, quarters, months, etc.
+
+ Index keys are boxed to Period objects which carry the metadata (e.g.,
+ frequency information).
+
+ Parameters
+ ----------
+ data : array-like (1-dimensional), optional
+ Optional period-like data to construct index with
+ copy : bool
+ Make a copy of input ndarray
+ freq : string or period object, optional
+ One of pandas period strings or corresponding objects
+ start : starting value, period-like, optional
+ If data is None, used as the start point in generating regular
+ period data.
+
+ .. deprecated:: 0.24.0
+
+ periods : int, optional, > 0
+ Number of periods to generate, if generating index. Takes precedence
+ over end argument
+
+ .. deprecated:: 0.24.0
+
+ end : end value, period-like, optional
+ If periods is None, generated index will extend to first conforming
+ period on or just past end argument
+
+ .. deprecated:: 0.24.0
+
+ year : int, array, or Series, default None
+ month : int, array, or Series, default None
+ quarter : int, array, or Series, default None
+ day : int, array, or Series, default None
+ hour : int, array, or Series, default None
+ minute : int, array, or Series, default None
+ second : int, array, or Series, default None
+ tz : object, default None
+ Timezone for converting datetime64 data to Periods
+ dtype : str or PeriodDtype, default None
+
+ Attributes
+ ----------
+ day
+ dayofweek
+ dayofyear
+ days_in_month
+ daysinmonth
+ end_time
+ freq
+ freqstr
+ hour
+ is_leap_year
+ minute
+ month
+ quarter
+ qyear
+ second
+ start_time
+ week
+ weekday
+ weekofyear
+ year
+
+ Methods
+ -------
+ asfreq
+ strftime
+ to_timestamp
+
+ Notes
+ -----
+ Creating a PeriodIndex based on `start`, `periods`, and `end` has
+ been deprecated in favor of :func:`period_range`.
+
+ Examples
+ --------
+ >>> idx = pd.PeriodIndex(year=year_arr, quarter=q_arr)
+
+ See Also
+ --------
+ Index : The base pandas Index type.
+ Period : Represents a period of time.
+ DatetimeIndex : Index with datetime64 data.
+ TimedeltaIndex : Index of timedelta64 data.
+ period_range : Create a fixed-frequency PeriodIndex.
+ """
+ _typ = 'periodindex'
+ _attributes = ['name', 'freq']
+
+ # define my properties & methods for delegation
+ _is_numeric_dtype = False
+ _infer_as_myclass = True
+
+ _data = None # type: PeriodArray
+
+ _engine_type = libindex.PeriodEngine
+
+ # ------------------------------------------------------------------------
+ # Index Constructors
+
+ def __new__(cls, data=None, ordinal=None, freq=None, start=None, end=None,
+ periods=None, tz=None, dtype=None, copy=False, name=None,
+ **fields):
+
+ valid_field_set = {'year', 'month', 'day', 'quarter',
+ 'hour', 'minute', 'second'}
+
+ if not set(fields).issubset(valid_field_set):
+ raise TypeError('__new__() got an unexpected keyword argument {}'.
+ format(list(set(fields) - valid_field_set)[0]))
+
+ if name is None and hasattr(data, 'name'):
+ name = data.name
+
+ if data is None and ordinal is None:
+ # range-based.
+ data, freq2 = PeriodArray._generate_range(start, end, periods,
+ freq, fields)
+ # PeriodArray._generate_range does validate that fields is
+ # empty when really using the range-based constructor.
+ if not fields:
+ msg = ("Creating a PeriodIndex by passing range "
+ "endpoints is deprecated. Use "
+ "`pandas.period_range` instead.")
+ # period_range differs from PeriodIndex for cases like
+ # start="2000", periods=4
+ # PeriodIndex interprets that as A-DEC freq.
+ # period_range interprets it as 'D' freq.
+ cond = (
+ freq is None and (
+ (start and not isinstance(start, Period)) or
+ (end and not isinstance(end, Period))
+ )
+ )
+ if cond:
+ msg += (
+ " Note that the default `freq` may differ. Pass "
+ "'freq=\"{}\"' to ensure the same output."
+ ).format(freq2.freqstr)
+ warnings.warn(msg, FutureWarning, stacklevel=2)
+ freq = freq2
+
+ data = PeriodArray(data, freq=freq)
+ else:
+ freq = validate_dtype_freq(dtype, freq)
+
+ # PeriodIndex allow PeriodIndex(period_index, freq=different)
+ # Let's not encourage that kind of behavior in PeriodArray.
+
+ if freq and isinstance(data, cls) and data.freq != freq:
+ # TODO: We can do some of these with no-copy / coercion?
+ # e.g. D -> 2D seems to be OK
+ data = data.asfreq(freq)
+
+ if data is None and ordinal is not None:
+ # we strangely ignore `ordinal` if data is passed.
+ ordinal = np.asarray(ordinal, dtype=np.int64)
+ data = PeriodArray(ordinal, freq)
+ else:
+ # don't pass copy here, since we copy later.
+ data = period_array(data=data, freq=freq)
+
+ if copy:
+ data = data.copy()
+
+ return cls._simple_new(data, name=name)
+
+ @classmethod
+ def _simple_new(cls, values, name=None, freq=None, **kwargs):
+ """
+ Create a new PeriodIndex.
+
+ Parameters
+ ----------
+ values : PeriodArray, PeriodIndex, Index[int64], ndarray[int64]
+ Values that can be converted to a PeriodArray without inference
+ or coercion.
+
+ """
+ # TODO: raising on floats is tested, but maybe not useful.
+ # Should the callers know not to pass floats?
+ # At the very least, I think we can ensure that lists aren't passed.
+ if isinstance(values, list):
+ values = np.asarray(values)
+ if is_float_dtype(values):
+ raise TypeError("PeriodIndex._simple_new does not accept floats.")
+ if freq:
+ freq = Period._maybe_convert_freq(freq)
+ values = PeriodArray(values, freq=freq)
+
+ if not isinstance(values, PeriodArray):
+ raise TypeError("PeriodIndex._simple_new only accepts PeriodArray")
+ result = object.__new__(cls)
+ result._data = values
+ # For groupby perf. See note in indexes/base about _index_data
+ result._index_data = values._data
+ result.name = name
+ result._reset_identity()
+ return result
+
+ # ------------------------------------------------------------------------
+ # Data
+
+ @property
+ def values(self):
+ return np.asarray(self)
+
+ @property
+ def freq(self):
+ return self._data.freq
+
+ @freq.setter
+ def freq(self, value):
+ value = Period._maybe_convert_freq(value)
+ # TODO: When this deprecation is enforced, PeriodIndex.freq can
+ # be removed entirely, and we'll just inherit.
+ msg = ('Setting {cls}.freq has been deprecated and will be '
+ 'removed in a future version; use {cls}.asfreq instead. '
+ 'The {cls}.freq setter is not guaranteed to work.')
+ warnings.warn(msg.format(cls=type(self).__name__),
+ FutureWarning, stacklevel=2)
+ # PeriodArray._freq isn't actually mutable. We set the private _freq
+ # here, but people shouldn't be doing this anyway.
+ self._data._freq = value
+
+ def _shallow_copy(self, values=None, **kwargs):
+ # TODO: simplify, figure out type of values
+ if values is None:
+ values = self._data
+
+ if isinstance(values, type(self)):
+ values = values._values
+
+ if not isinstance(values, PeriodArray):
+ if (isinstance(values, np.ndarray) and
+ is_integer_dtype(values.dtype)):
+ values = PeriodArray(values, freq=self.freq)
+ else:
+ # in particular, I would like to avoid period_array here.
+ # Some people seem to be calling us with unexpected types
+ # Index.difference -> ndarray[Period]
+ # DatetimelikeIndexOpsMixin.repeat -> ndarray[ordinal]
+ # I think that once all of Datetime* are EAs, we can simplify
+ # this quite a bit.
+ values = period_array(values, freq=self.freq)
+
+ # We don't allow changing `freq` in _shallow_copy.
+ validate_dtype_freq(self.dtype, kwargs.get('freq'))
+ attributes = self._get_attributes_dict()
+
+ attributes.update(kwargs)
+ if not len(values) and 'dtype' not in kwargs:
+ attributes['dtype'] = self.dtype
+ return self._simple_new(values, **attributes)
+
+ def _shallow_copy_with_infer(self, values=None, **kwargs):
+ """ we always want to return a PeriodIndex """
+ return self._shallow_copy(values=values, **kwargs)
+
+ @property
+ def _box_func(self):
+ """Maybe box an ordinal or Period"""
+ # TODO(DatetimeArray): Avoid double-boxing
+ # PeriodArray takes care of boxing already, so we need to check
+ # whether we're given an ordinal or a Period. It seems like some
+ # places outside of indexes/period.py are calling this _box_func,
+ # but passing data that's already boxed.
+ def func(x):
+ if isinstance(x, Period) or x is NaT:
+ return x
+ else:
+ return Period._from_ordinal(ordinal=x, freq=self.freq)
+ return func
+
+ def _maybe_convert_timedelta(self, other):
+ """
+ Convert timedelta-like input to an integer multiple of self.freq
+
+ Parameters
+ ----------
+ other : timedelta, np.timedelta64, DateOffset, int, np.ndarray
+
+ Returns
+ -------
+ converted : int, np.ndarray[int64]
+
+ Raises
+ ------
+ IncompatibleFrequency : if the input cannot be written as a multiple
+ of self.freq. Note IncompatibleFrequency subclasses ValueError.
+ """
+ if isinstance(
+ other, (timedelta, np.timedelta64, Tick, np.ndarray)):
+ offset = frequencies.to_offset(self.freq.rule_code)
+ if isinstance(offset, Tick):
+ # _check_timedeltalike_freq_compat will raise if incompatible
+ delta = self._data._check_timedeltalike_freq_compat(other)
+ return delta
+ elif isinstance(other, DateOffset):
+ freqstr = other.rule_code
+ base = libfrequencies.get_base_alias(freqstr)
+ if base == self.freq.rule_code:
+ return other.n
+
+ msg = DIFFERENT_FREQ.format(cls=type(self).__name__,
+ own_freq=self.freqstr,
+ other_freq=other.freqstr)
+ raise IncompatibleFrequency(msg)
+ elif is_integer(other):
+ # integer is passed to .shift via
+ # _add_datetimelike_methods basically
+ # but ufunc may pass integer to _add_delta
+ return other
+
+ # raise when input doesn't have freq
+ msg = DIFFERENT_FREQ.format(cls=type(self).__name__,
+ own_freq=self.freqstr,
+ other_freq=None)
+ raise IncompatibleFrequency(msg)
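+ # Hedged usage sketch: for a daily-frequency index such as
+ # pd.period_range('2019-01-01', periods=3, freq='D'), a value like
+ # pd.Timedelta(days=2) would be expected to convert to the integer 2,
+ # while pd.Timedelta(hours=1) raises IncompatibleFrequency because it is
+ # not a whole multiple of the daily frequency.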
+
+ # ------------------------------------------------------------------------
+ # Rendering Methods
+
+ def _format_native_types(self, na_rep=u'NaT', quoting=None, **kwargs):
+ # just dispatch, return ndarray
+ return self._data._format_native_types(na_rep=na_rep,
+ quoting=quoting,
+ **kwargs)
+
+ def _mpl_repr(self):
+ # how to represent ourselves to matplotlib
+ return self.astype(object).values
+
+ @property
+ def _formatter_func(self):
+ return self.array._formatter(boxed=False)
+
+ # ------------------------------------------------------------------------
+ # Indexing
+
+ @cache_readonly
+ def _engine(self):
+ return self._engine_type(lambda: self, len(self))
+
+ @Appender(_index_shared_docs['contains'])
+ def __contains__(self, key):
+ if isinstance(key, Period):
+ if key.freq != self.freq:
+ return False
+ else:
+ return key.ordinal in self._engine
+ else:
+ try:
+ self.get_loc(key)
+ return True
+ except Exception:
+ return False
+
+ contains = __contains__
+
+ @cache_readonly
+ def _int64index(self):
+ return Int64Index._simple_new(self.asi8, name=self.name)
+
+ # ------------------------------------------------------------------------
+ # Index Methods
+
+ def _coerce_scalar_to_index(self, item):
+ """
+ we need to coerce a scalar into something compatible with our index type
+
+ Parameters
+ ----------
+ item : scalar item to coerce
+ """
+ return PeriodIndex([item], **self._get_attributes_dict())
+
+ def __array__(self, dtype=None):
+ if is_integer_dtype(dtype):
+ return self.asi8
+ else:
+ return self.astype(object).values
+
+ def __array_wrap__(self, result, context=None):
+ """
+ Gets called after a ufunc. Needs additional handling as
+ PeriodIndex stores internal data as int dtype
+
+ Replace this with __numpy_ufunc__ in a future version
+ """
+ if isinstance(context, tuple) and len(context) > 0:
+ func = context[0]
+ if func is np.add:
+ pass
+ elif func is np.subtract:
+ name = self.name
+ left = context[1][0]
+ right = context[1][1]
+ if (isinstance(left, PeriodIndex) and
+ isinstance(right, PeriodIndex)):
+ name = left.name if left.name == right.name else None
+ return Index(result, name=name)
+ elif isinstance(left, Period) or isinstance(right, Period):
+ return Index(result, name=name)
+ elif isinstance(func, np.ufunc):
+ if 'M->M' not in func.types:
+ msg = "ufunc '{0}' not supported for the PeriodIndex"
+ # This should be TypeError, but TypeError cannot be raised
+ # from here because numpy catches.
+ raise ValueError(msg.format(func.__name__))
+
+ if is_bool_dtype(result):
+ return result
+ # the result is object dtype array of Period
+ # cannot pass _simple_new as it is
+ return type(self)(result, freq=self.freq, name=self.name)
+
+ def asof_locs(self, where, mask):
+ """
+ where : array of timestamps
+ mask : array of booleans where data is not NA
+
+ """
+ where_idx = where
+ if isinstance(where_idx, DatetimeIndex):
+ where_idx = PeriodIndex(where_idx.values, freq=self.freq)
+
+ locs = self._ndarray_values[mask].searchsorted(
+ where_idx._ndarray_values, side='right')
+
+ locs = np.where(locs > 0, locs - 1, 0)
+ result = np.arange(len(self))[mask].take(locs)
+
+ first = mask.argmax()
+ result[(locs == 0) & (where_idx._ndarray_values <
+ self._ndarray_values[first])] = -1
+
+ return result
+
+ @Appender(_index_shared_docs['astype'])
+ def astype(self, dtype, copy=True, how='start'):
+ dtype = pandas_dtype(dtype)
+
+ if is_datetime64_any_dtype(dtype):
+ # 'how' is index-specific, isn't part of the EA interface.
+ tz = getattr(dtype, 'tz', None)
+ return self.to_timestamp(how=how).tz_localize(tz)
+
+ # TODO: should probably raise on `how` here, so we don't ignore it.
+ return super(PeriodIndex, self).astype(dtype, copy=copy)
+
+ @Substitution(klass='PeriodIndex')
+ @Appender(_shared_docs['searchsorted'])
+ def searchsorted(self, value, side='left', sorter=None):
+ if isinstance(value, Period):
+ if value.freq != self.freq:
+ msg = DIFFERENT_FREQ.format(cls=type(self).__name__,
+ own_freq=self.freqstr,
+ other_freq=value.freqstr)
+ raise IncompatibleFrequency(msg)
+ value = value.ordinal
+ elif isinstance(value, compat.string_types):
+ try:
+ value = Period(value, freq=self.freq).ordinal
+ except DateParseError:
+ raise KeyError("Cannot interpret '{}' as period".format(value))
+
+ return self._ndarray_values.searchsorted(value, side=side,
+ sorter=sorter)
+
+ @property
+ def is_all_dates(self):
+ return True
+
+ @property
+ def is_full(self):
+ """
+ Returns True if this PeriodIndex is range-like in that all Periods
+ between start and end are present, in order.
+ """
+ if len(self) == 0:
+ return True
+ if not self.is_monotonic:
+ raise ValueError('Index is not monotonic')
+ values = self.asi8
+ return ((values[1:] - values[:-1]) < 2).all()
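+ # Sketch of the ordinal-gap check above: consecutive monthly periods such
+ # as PeriodIndex(['2018-01', '2018-02', '2018-03'], freq='M') give
+ # is_full == True (all gaps are 1), while dropping '2018-02' leaves a gap
+ # of 2 ordinals and gives False.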
+
+ @property
+ def inferred_type(self):
+ # b/c data is represented as ints make sure we can't have ambiguous
+ # indexing
+ return 'period'
+
+ def get_value(self, series, key):
+ """
+ Fast lookup of value from 1-dimensional ndarray. Only use this if you
+ know what you're doing
+ """
+ s = com.values_from_object(series)
+ try:
+ return com.maybe_box(self,
+ super(PeriodIndex, self).get_value(s, key),
+ series, key)
+ except (KeyError, IndexError):
+ try:
+ asdt, parsed, reso = parse_time_string(key, self.freq)
+ grp = resolution.Resolution.get_freq_group(reso)
+ freqn = resolution.get_freq_group(self.freq)
+
+ vals = self._ndarray_values
+
+ # if our data is higher resolution than requested key, slice
+ if grp < freqn:
+ iv = Period(asdt, freq=(grp, 1))
+ ord1 = iv.asfreq(self.freq, how='S').ordinal
+ ord2 = iv.asfreq(self.freq, how='E').ordinal
+
+ if ord2 < vals[0] or ord1 > vals[-1]:
+ raise KeyError(key)
+
+ pos = np.searchsorted(self._ndarray_values, [ord1, ord2])
+ key = slice(pos[0], pos[1] + 1)
+ return series[key]
+ elif grp == freqn:
+ key = Period(asdt, freq=self.freq).ordinal
+ return com.maybe_box(self, self._engine.get_value(s, key),
+ series, key)
+ else:
+ raise KeyError(key)
+ except TypeError:
+ pass
+
+ period = Period(key, self.freq)
+ key = period.value if isna(period) else period.ordinal
+ return com.maybe_box(self, self._engine.get_value(s, key),
+ series, key)
+
+ @Appender(_index_shared_docs['get_indexer'] % _index_doc_kwargs)
+ def get_indexer(self, target, method=None, limit=None, tolerance=None):
+ target = ensure_index(target)
+
+ if hasattr(target, 'freq') and target.freq != self.freq:
+ msg = DIFFERENT_FREQ.format(cls=type(self).__name__,
+ own_freq=self.freqstr,
+ other_freq=target.freqstr)
+ raise IncompatibleFrequency(msg)
+
+ if isinstance(target, PeriodIndex):
+ target = target.asi8
+
+ if tolerance is not None:
+ tolerance = self._convert_tolerance(tolerance, target)
+ return Index.get_indexer(self._int64index, target, method,
+ limit, tolerance)
+
+ def _get_unique_index(self, dropna=False):
+ """
+ wrap Index._get_unique_index to handle NaT
+ """
+ res = super(PeriodIndex, self)._get_unique_index(dropna=dropna)
+ if dropna:
+ res = res.dropna()
+ return res
+
+ @Appender(Index.unique.__doc__)
+ def unique(self, level=None):
+ # override the Index.unique method for performance GH#23083
+ if level is not None:
+ # this should never occur, but is retained to make the signature
+ # match Index.unique
+ self._validate_index_level(level)
+
+ values = self._ndarray_values
+ result = unique1d(values)
+ return self._shallow_copy(result)
+
+ def get_loc(self, key, method=None, tolerance=None):
+ """
+ Get integer location for requested label
+
+ Returns
+ -------
+ loc : int
+ """
+ try:
+ return self._engine.get_loc(key)
+ except KeyError:
+ if is_integer(key):
+ raise
+
+ try:
+ asdt, parsed, reso = parse_time_string(key, self.freq)
+ key = asdt
+ except TypeError:
+ pass
+ except DateParseError:
+ # A string with invalid format
+ raise KeyError("Cannot interpret '{}' as period".format(key))
+
+ try:
+ key = Period(key, freq=self.freq)
+ except ValueError:
+ # we cannot construct the Period
+ # as we have an invalid type
+ raise KeyError(key)
+
+ try:
+ ordinal = iNaT if key is NaT else key.ordinal
+ if tolerance is not None:
+ tolerance = self._convert_tolerance(tolerance,
+ np.asarray(key))
+ return self._int64index.get_loc(ordinal, method, tolerance)
+
+ except KeyError:
+ raise KeyError(key)
+
+ def _maybe_cast_slice_bound(self, label, side, kind):
+ """
+ If label is a string or a datetime, cast it to Period.ordinal according
+ to resolution.
+
+ Parameters
+ ----------
+ label : object
+ side : {'left', 'right'}
+ kind : {'ix', 'loc', 'getitem'}
+
+ Returns
+ -------
+ bound : Period or object
+
+ Notes
+ -----
+ Value of `side` parameter should be validated in caller.
+
+ """
+ assert kind in ['ix', 'loc', 'getitem']
+
+ if isinstance(label, datetime):
+ return Period(label, freq=self.freq)
+ elif isinstance(label, compat.string_types):
+ try:
+ _, parsed, reso = parse_time_string(label, self.freq)
+ bounds = self._parsed_string_to_bounds(reso, parsed)
+ return bounds[0 if side == 'left' else 1]
+ except Exception:
+ raise KeyError(label)
+ elif is_integer(label) or is_float(label):
+ self._invalid_indexer('slice', label)
+
+ return label
+
+ def _parsed_string_to_bounds(self, reso, parsed):
+ if reso == 'year':
+ t1 = Period(year=parsed.year, freq='A')
+ elif reso == 'month':
+ t1 = Period(year=parsed.year, month=parsed.month, freq='M')
+ elif reso == 'quarter':
+ q = (parsed.month - 1) // 3 + 1
+ t1 = Period(year=parsed.year, quarter=q, freq='Q-DEC')
+ elif reso == 'day':
+ t1 = Period(year=parsed.year, month=parsed.month, day=parsed.day,
+ freq='D')
+ elif reso == 'hour':
+ t1 = Period(year=parsed.year, month=parsed.month, day=parsed.day,
+ hour=parsed.hour, freq='H')
+ elif reso == 'minute':
+ t1 = Period(year=parsed.year, month=parsed.month, day=parsed.day,
+ hour=parsed.hour, minute=parsed.minute, freq='T')
+ elif reso == 'second':
+ t1 = Period(year=parsed.year, month=parsed.month, day=parsed.day,
+ hour=parsed.hour, minute=parsed.minute,
+ second=parsed.second, freq='S')
+ else:
+ raise KeyError(reso)
+ return (t1.asfreq(self.freq, how='start'),
+ t1.asfreq(self.freq, how='end'))
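+ # Hedged example of the bounds computed above: for a monthly-frequency
+ # index and a quarter-resolution string such as '2017Q2', t1 is
+ # Period('2017Q2', 'Q-DEC') and the returned bounds would be expected to
+ # be (Period('2017-04', 'M'), Period('2017-06', 'M')).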
+
+ def _get_string_slice(self, key):
+ if not self.is_monotonic:
+ raise ValueError('Partial indexing only valid for '
+ 'ordered time series')
+
+ key, parsed, reso = parse_time_string(key, self.freq)
+ grp = resolution.Resolution.get_freq_group(reso)
+ freqn = resolution.get_freq_group(self.freq)
+ if reso in ['day', 'hour', 'minute', 'second'] and not grp < freqn:
+ raise KeyError(key)
+
+ t1, t2 = self._parsed_string_to_bounds(reso, parsed)
+ return slice(self.searchsorted(t1.ordinal, side='left'),
+ self.searchsorted(t2.ordinal, side='right'))
+
+ def _convert_tolerance(self, tolerance, target):
+ tolerance = DatetimeIndexOpsMixin._convert_tolerance(self, tolerance,
+ target)
+ if target.size != tolerance.size and tolerance.size > 1:
+ raise ValueError('list-like tolerance size must match '
+ 'target index size')
+ return self._maybe_convert_timedelta(tolerance)
+
+ def insert(self, loc, item):
+ if not isinstance(item, Period) or self.freq != item.freq:
+ return self.astype(object).insert(loc, item)
+
+ idx = np.concatenate((self[:loc].asi8, np.array([item.ordinal]),
+ self[loc:].asi8))
+ return self._shallow_copy(idx)
+
+ def join(self, other, how='left', level=None, return_indexers=False,
+ sort=False):
+ """
+ See Index.join
+ """
+ self._assert_can_do_setop(other)
+
+ result = Int64Index.join(self, other, how=how, level=level,
+ return_indexers=return_indexers,
+ sort=sort)
+
+ if return_indexers:
+ result, lidx, ridx = result
+ return self._apply_meta(result), lidx, ridx
+ return self._apply_meta(result)
+
+ def _assert_can_do_setop(self, other):
+ super(PeriodIndex, self)._assert_can_do_setop(other)
+
+ if not isinstance(other, PeriodIndex):
+ raise ValueError('can only call with other PeriodIndex-ed objects')
+
+ if self.freq != other.freq:
+ msg = DIFFERENT_FREQ.format(cls=type(self).__name__,
+ own_freq=self.freqstr,
+ other_freq=other.freqstr)
+ raise IncompatibleFrequency(msg)
+
+ def _wrap_setop_result(self, other, result):
+ name = get_op_result_name(self, other)
+ result = self._apply_meta(result)
+ result.name = name
+ return result
+
+ def _apply_meta(self, rawarr):
+ if not isinstance(rawarr, PeriodIndex):
+ rawarr = PeriodIndex._simple_new(rawarr, freq=self.freq,
+ name=self.name)
+ return rawarr
+
+ def __setstate__(self, state):
+ """Necessary for making this object picklable"""
+
+ if isinstance(state, dict):
+ super(PeriodIndex, self).__setstate__(state)
+
+ elif isinstance(state, tuple):
+
+ # < 0.15 compat
+ if len(state) == 2:
+ nd_state, own_state = state
+ data = np.empty(nd_state[1], dtype=nd_state[2])
+ np.ndarray.__setstate__(data, nd_state)
+
+ # backcompat
+ freq = Period._maybe_convert_freq(own_state[1])
+
+ else: # pragma: no cover
+ data = np.empty(state)
+ np.ndarray.__setstate__(self, state)
+ freq = None # ?
+
+ data = PeriodArray(data, freq=freq)
+ self._data = data
+
+ else:
+ raise Exception("invalid pickle state")
+
+ _unpickle_compat = __setstate__
+
+ @property
+ def flags(self):
+ """ return the ndarray.flags for the underlying data """
+ warnings.warn("{obj}.flags is deprecated and will be removed "
+ "in a future version".format(obj=type(self).__name__),
+ FutureWarning, stacklevel=2)
+ return self._ndarray_values.flags
+
+ def item(self):
+ """
+ return the first element of the underlying data as a python
+ scalar
+ """
+ # TODO(DatetimeArray): remove
+ if len(self) == 1:
+ return self[0]
+ else:
+ # copy numpy's message here because Py26 raises an IndexError
+ raise ValueError('can only convert an array of size 1 to a '
+ 'Python scalar')
+
+ @property
+ def data(self):
+ """ return the data pointer of the underlying data """
+ warnings.warn("{obj}.data is deprecated and will be removed "
+ "in a future version".format(obj=type(self).__name__),
+ FutureWarning, stacklevel=2)
+ return np.asarray(self._data).data
+
+ @property
+ def base(self):
+ """ return the base object if the memory of the underlying data is
+ shared
+ """
+ warnings.warn("{obj}.base is deprecated and will be removed "
+ "in a future version".format(obj=type(self).__name__),
+ FutureWarning, stacklevel=2)
+ return np.asarray(self._data)
+
+
+PeriodIndex._add_comparison_ops()
+PeriodIndex._add_numeric_methods_disabled()
+PeriodIndex._add_logical_methods_disabled()
+PeriodIndex._add_datetimelike_methods()
+
+
+def period_range(start=None, end=None, periods=None, freq=None, name=None):
+ """
+ Return a fixed-frequency PeriodIndex, with day (calendar) as the default
+ frequency.
+
+ Parameters
+ ----------
+ start : string or period-like, default None
+ Left bound for generating periods
+ end : string or period-like, default None
+ Right bound for generating periods
+ periods : integer, default None
+ Number of periods to generate
+ freq : string or DateOffset, optional
+ Frequency alias. By default the freq is taken from `start` or `end`
+ if those are Period objects. Otherwise, the default is ``"D"`` for
+ daily frequency.
+
+ name : string, default None
+ Name of the resulting PeriodIndex
+
+ Returns
+ -------
+ prng : PeriodIndex
+
+ Notes
+ -----
+ Of the three parameters: ``start``, ``end``, and ``periods``, exactly two
+ must be specified.
+
+ To learn more about the frequency strings, please see `this link
+ <http://pandas.pydata.org/pandas-docs/stable/timeseries.html#offset-aliases>`__.
+
+ Examples
+ --------
+
+ >>> pd.period_range(start='2017-01-01', end='2018-01-01', freq='M')
+ PeriodIndex(['2017-01', '2017-02', '2017-03', '2017-04', '2017-05',
+ '2017-06', '2017-07', '2017-08', '2017-09', '2017-10',
+ '2017-11', '2017-12', '2018-01'],
+ dtype='period[M]', freq='M')
+
+ If ``start`` or ``end`` are ``Period`` objects, they will be used as anchor
+ endpoints for a ``PeriodIndex`` with frequency matching that of the
+ ``period_range`` constructor.
+
+ >>> pd.period_range(start=pd.Period('2017Q1', freq='Q'),
+ ... end=pd.Period('2017Q2', freq='Q'), freq='M')
+ PeriodIndex(['2017-03', '2017-04', '2017-05', '2017-06'],
+ dtype='period[M]', freq='M')
+ """
+ if com.count_not_none(start, end, periods) != 2:
+ raise ValueError('Of the three parameters: start, end, and periods, '
+ 'exactly two must be specified')
+ if freq is None and (not isinstance(start, Period)
+ and not isinstance(end, Period)):
+ freq = 'D'
+
+ data, freq = PeriodArray._generate_range(start, end, periods, freq,
+ fields={})
+ data = PeriodArray(data, freq=freq)
+ return PeriodIndex(data, name=name)
diff --git a/contrib/python/pandas/py2/pandas/core/indexes/range.py b/contrib/python/pandas/py2/pandas/core/indexes/range.py
new file mode 100644
index 00000000000..5aafe9734b6
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/core/indexes/range.py
@@ -0,0 +1,702 @@
+from datetime import timedelta
+import operator
+from sys import getsizeof
+import warnings
+
+import numpy as np
+
+from pandas._libs import index as libindex, lib
+import pandas.compat as compat
+from pandas.compat import get_range_parameters, lrange, range
+from pandas.compat.numpy import function as nv
+from pandas.util._decorators import Appender, cache_readonly
+
+from pandas.core.dtypes import concat as _concat
+from pandas.core.dtypes.common import (
+ is_int64_dtype, is_integer, is_scalar, is_timedelta64_dtype)
+from pandas.core.dtypes.generic import (
+ ABCDataFrame, ABCSeries, ABCTimedeltaIndex)
+
+from pandas.core import ops
+import pandas.core.common as com
+import pandas.core.indexes.base as ibase
+from pandas.core.indexes.base import Index, _index_shared_docs
+from pandas.core.indexes.numeric import Int64Index
+
+
+class RangeIndex(Int64Index):
+ """
+ Immutable Index implementing a monotonic integer range.
+
+ RangeIndex is a memory-saving special case of Int64Index limited to
+ representing monotonic ranges. Using RangeIndex may in some instances
+ improve computing speed.
+
+ This is the default index type used
+ by DataFrame and Series when no explicit index is provided by the user.
+
+ Parameters
+ ----------
+ start : int (default: 0), or other RangeIndex instance
+ If int and "stop" is not given, interpreted as "stop" instead.
+ stop : int (default: 0)
+ step : int (default: 1)
+ name : object, optional
+ Name to be stored in the index
+ copy : bool, default False
+ Unused, accepted for homogeneity with other index types.
+
+ Attributes
+ ----------
+ None
+
+ Methods
+ -------
+ from_range
+
+ See Also
+ --------
+ Index : The base pandas Index type.
+ Int64Index : Index of int64 data.
+ """
+
+ _typ = 'rangeindex'
+ _engine_type = libindex.Int64Engine
+
+ # --------------------------------------------------------------------
+ # Constructors
+
+ def __new__(cls, start=None, stop=None, step=None,
+ dtype=None, copy=False, name=None, fastpath=None):
+
+ if fastpath is not None:
+ warnings.warn("The 'fastpath' keyword is deprecated, and will be "
+ "removed in a future version.",
+ FutureWarning, stacklevel=2)
+ if fastpath:
+ return cls._simple_new(start, stop, step, name=name)
+
+ cls._validate_dtype(dtype)
+
+ # RangeIndex
+ if isinstance(start, RangeIndex):
+ if name is None:
+ name = start.name
+ return cls._simple_new(name=name,
+ **dict(start._get_data_as_items()))
+
+ # validate the arguments
+ def ensure_int(value, field):
+ msg = ("RangeIndex(...) must be called with integers,"
+ " {value} was passed for {field}")
+ if not is_scalar(value):
+ raise TypeError(msg.format(value=type(value).__name__,
+ field=field))
+ try:
+ new_value = int(value)
+ assert(new_value == value)
+ except (TypeError, ValueError, AssertionError):
+ raise TypeError(msg.format(value=type(value).__name__,
+ field=field))
+
+ return new_value
+
+ if com._all_none(start, stop, step):
+ msg = "RangeIndex(...) must be called with integers"
+ raise TypeError(msg)
+ elif start is None:
+ start = 0
+ else:
+ start = ensure_int(start, 'start')
+ if stop is None:
+ stop = start
+ start = 0
+ else:
+ stop = ensure_int(stop, 'stop')
+ if step is None:
+ step = 1
+ elif step == 0:
+ raise ValueError("Step must not be zero")
+ else:
+ step = ensure_int(step, 'step')
+
+ return cls._simple_new(start, stop, step, name)
+
+ @classmethod
+ def from_range(cls, data, name=None, dtype=None, **kwargs):
+ """ Create RangeIndex from a range (py3), or xrange (py2) object. """
+ if not isinstance(data, range):
+ raise TypeError(
+ '{0}(...) must be called with object coercible to a '
+ 'range, {1} was passed'.format(cls.__name__, repr(data)))
+
+ start, stop, step = get_range_parameters(data)
+ return RangeIndex(start, stop, step, dtype=dtype, name=name, **kwargs)
+
+ @classmethod
+ def _simple_new(cls, start, stop=None, step=None, name=None,
+ dtype=None, **kwargs):
+ result = object.__new__(cls)
+
+ # handle passed None, non-integers
+ if start is None and stop is None:
+ # empty
+ start, stop, step = 0, 0, 1
+
+ if start is None or not is_integer(start):
+ try:
+
+ return RangeIndex(start, stop, step, name=name, **kwargs)
+ except TypeError:
+ return Index(start, stop, step, name=name, **kwargs)
+
+ result._start = start
+ result._stop = stop or 0
+ result._step = step or 1
+ result.name = name
+ for k, v in compat.iteritems(kwargs):
+ setattr(result, k, v)
+
+ result._reset_identity()
+ return result
+
+ # --------------------------------------------------------------------
+
+ @staticmethod
+ def _validate_dtype(dtype):
+ """ require dtype to be None or int64 """
+ if not (dtype is None or is_int64_dtype(dtype)):
+ raise TypeError('Invalid to pass a non-int64 dtype to RangeIndex')
+
+ @cache_readonly
+ def _constructor(self):
+ """ return the class to use for construction """
+ return Int64Index
+
+ @cache_readonly
+ def _data(self):
+ return np.arange(self._start, self._stop, self._step, dtype=np.int64)
+
+ @cache_readonly
+ def _int64index(self):
+ return Int64Index._simple_new(self._data, name=self.name)
+
+ def _get_data_as_items(self):
+ """ return a list of tuples of start, stop, step """
+ return [('start', self._start),
+ ('stop', self._stop),
+ ('step', self._step)]
+
+ def __reduce__(self):
+ d = self._get_attributes_dict()
+ d.update(dict(self._get_data_as_items()))
+ return ibase._new_Index, (self.__class__, d), None
+
+ # --------------------------------------------------------------------
+ # Rendering Methods
+
+ def _format_attrs(self):
+ """
+ Return a list of tuples of the (attr, formatted_value)
+ """
+ attrs = self._get_data_as_items()
+ if self.name is not None:
+ attrs.append(('name', ibase.default_pprint(self.name)))
+ return attrs
+
+ def _format_data(self, name=None):
+ # we are formatting thru the attributes
+ return None
+
+ # --------------------------------------------------------------------
+
+ @cache_readonly
+ def nbytes(self):
+ """
+ Return the number of bytes in the underlying data.
+ On implementations where this is undetermined (PyPy),
+ assume 24 bytes for each value.
+ """
+ return sum(getsizeof(getattr(self, v), 24) for v in
+ ['_start', '_stop', '_step'])
+
+ def memory_usage(self, deep=False):
+ """
+ Memory usage of my values
+
+ Parameters
+ ----------
+ deep : bool
+ Introspect the data deeply, interrogate
+ `object` dtypes for system-level memory consumption
+
+ Returns
+ -------
+ bytes used
+
+ Notes
+ -----
+ Memory usage does not include memory consumed by elements that
+ are not components of the array if deep=False
+
+ See Also
+ --------
+ numpy.ndarray.nbytes
+ """
+ return self.nbytes
+
+ @property
+ def dtype(self):
+ return np.dtype(np.int64)
+
+ @property
+ def is_unique(self):
+ """ return if the index has unique values """
+ return True
+
+ @cache_readonly
+ def is_monotonic_increasing(self):
+ return self._step > 0 or len(self) <= 1
+
+ @cache_readonly
+ def is_monotonic_decreasing(self):
+ return self._step < 0 or len(self) <= 1
+
+ @property
+ def has_duplicates(self):
+ return False
+
+ def tolist(self):
+ return lrange(self._start, self._stop, self._step)
+
+ @Appender(_index_shared_docs['_shallow_copy'])
+ def _shallow_copy(self, values=None, **kwargs):
+ if values is None:
+ name = kwargs.get("name", self.name)
+ return RangeIndex._simple_new(
+ name=name, **dict(self._get_data_as_items()))
+ else:
+ kwargs.setdefault('name', self.name)
+ return self._int64index._shallow_copy(values, **kwargs)
+
+ @Appender(ibase._index_shared_docs['copy'])
+ def copy(self, name=None, deep=False, dtype=None, **kwargs):
+ self._validate_dtype(dtype)
+ if name is None:
+ name = self.name
+ return RangeIndex._simple_new(
+ name=name, **dict(self._get_data_as_items()))
+
+ def _minmax(self, meth):
+ no_steps = len(self) - 1
+ if no_steps == -1:
+ return np.nan
+ elif ((meth == 'min' and self._step > 0) or
+ (meth == 'max' and self._step < 0)):
+ return self._start
+
+ return self._start + self._step * no_steps
+
+ def min(self, axis=None, skipna=True):
+ """The minimum value of the RangeIndex"""
+ nv.validate_minmax_axis(axis)
+ return self._minmax('min')
+
+ def max(self, axis=None, skipna=True):
+ """The maximum value of the RangeIndex"""
+ nv.validate_minmax_axis(axis)
+ return self._minmax('max')
+
+ def argsort(self, *args, **kwargs):
+ """
+ Returns the indices that would sort the index and its
+ underlying data.
+
+ Returns
+ -------
+ argsorted : numpy array
+
+ See Also
+ --------
+ numpy.ndarray.argsort
+ """
+ nv.validate_argsort(args, kwargs)
+
+ if self._step > 0:
+ return np.arange(len(self))
+ else:
+ return np.arange(len(self) - 1, -1, -1)
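+ # Sketch: for an increasing range the result is simply arange(len(self));
+ # for a decreasing one such as RangeIndex(5, 0, -1) this returns
+ # array([4, 3, 2, 1, 0]), i.e. the reversed positions.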
+
+ def equals(self, other):
+ """
+ Determines if two Index objects contain the same elements.
+ """
+ if isinstance(other, RangeIndex):
+ ls = len(self)
+ lo = len(other)
+ return (ls == lo == 0 or
+ ls == lo == 1 and
+ self._start == other._start or
+ ls == lo and
+ self._start == other._start and
+ self._step == other._step)
+
+ return super(RangeIndex, self).equals(other)
+
+ def intersection(self, other, sort=False):
+ """
+ Form the intersection of two Index objects.
+
+ Parameters
+ ----------
+ other : Index or array-like
+ sort : False or None, default False
+ Sort the resulting index if possible
+
+ .. versionadded:: 0.24.0
+
+ .. versionchanged:: 0.24.1
+
+ Changed the default to ``False`` to match the behaviour
+ from before 0.24.0.
+
+ Returns
+ -------
+ intersection : Index
+ """
+ self._validate_sort_keyword(sort)
+
+ if self.equals(other):
+ return self._get_reconciled_name_object(other)
+
+ if not isinstance(other, RangeIndex):
+ return super(RangeIndex, self).intersection(other, sort=sort)
+
+ if not len(self) or not len(other):
+ return RangeIndex._simple_new(None)
+
+ first = self[::-1] if self._step < 0 else self
+ second = other[::-1] if other._step < 0 else other
+
+ # check whether intervals intersect
+ # deals with in- and decreasing ranges
+ int_low = max(first._start, second._start)
+ int_high = min(first._stop, second._stop)
+ if int_high <= int_low:
+ return RangeIndex._simple_new(None)
+
+ # Method hint: linear Diophantine equation
+ # solve intersection problem
+ # performance hint: for identical step sizes, could use
+ # cheaper alternative
+ gcd, s, t = first._extended_gcd(first._step, second._step)
+
+ # check whether element sets intersect
+ if (first._start - second._start) % gcd:
+ return RangeIndex._simple_new(None)
+
+ # calculate parameters for the RangeIndex describing the
+ # intersection disregarding the lower bounds
+ tmp_start = first._start + (second._start - first._start) * \
+ first._step // gcd * s
+ new_step = first._step * second._step // gcd
+ new_index = RangeIndex._simple_new(tmp_start, int_high, new_step)
+
+ # adjust index to limiting interval
+ new_index._start = new_index._min_fitting_element(int_low)
+
+ if (self._step < 0 and other._step < 0) is not (new_index._step < 0):
+ new_index = new_index[::-1]
+ if sort is None:
+ new_index = new_index.sort_values()
+ return new_index
+
+ def _min_fitting_element(self, lower_limit):
+ """Returns the smallest element greater than or equal to the limit"""
+ no_steps = -(-(lower_limit - self._start) // abs(self._step))
+ return self._start + abs(self._step) * no_steps
+
+ def _max_fitting_element(self, upper_limit):
+ """Returns the largest element smaller than or equal to the limit"""
+ no_steps = (upper_limit - self._start) // abs(self._step)
+ return self._start + abs(self._step) * no_steps
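+ # Worked illustration (hypothetical values): for RangeIndex(0, 20, 3) the
+ # elements are 0, 3, ..., 18, so _min_fitting_element(7) -> 9 and
+ # _max_fitting_element(7) -> 6.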
+
+ def _extended_gcd(self, a, b):
+ """
+ Extended Euclidean algorithm to solve Bezout's identity:
+ a*x + b*y = gcd(a, b)
+ Finds one particular solution for x, y: s, t
+ Returns: gcd, s, t
+ """
+ s, old_s = 0, 1
+ t, old_t = 1, 0
+ r, old_r = b, a
+ while r:
+ quotient = old_r // r
+ old_r, r = r, old_r - quotient * r
+ old_s, s = s, old_s - quotient * s
+ old_t, t = t, old_t - quotient * t
+ return old_r, old_s, old_t
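+ # Worked example of the identity above: _extended_gcd(12, 8) returns
+ # (4, 1, -1), since 12*1 + 8*(-1) == 4 == gcd(12, 8).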
+
+ def union(self, other):
+ """
+ Form the union of two Index objects and sort if possible
+
+ Parameters
+ ----------
+ other : Index or array-like
+
+ Returns
+ -------
+ union : Index
+ """
+ self._assert_can_do_setop(other)
+ if len(other) == 0 or self.equals(other) or len(self) == 0:
+ return super(RangeIndex, self).union(other)
+
+ if isinstance(other, RangeIndex):
+ start_s, step_s = self._start, self._step
+ end_s = self._start + self._step * (len(self) - 1)
+ start_o, step_o = other._start, other._step
+ end_o = other._start + other._step * (len(other) - 1)
+ if self._step < 0:
+ start_s, step_s, end_s = end_s, -step_s, start_s
+ if other._step < 0:
+ start_o, step_o, end_o = end_o, -step_o, start_o
+ if len(self) == 1 and len(other) == 1:
+ step_s = step_o = abs(self._start - other._start)
+ elif len(self) == 1:
+ step_s = step_o
+ elif len(other) == 1:
+ step_o = step_s
+ start_r = min(start_s, start_o)
+ end_r = max(end_s, end_o)
+ if step_o == step_s:
+ if ((start_s - start_o) % step_s == 0 and
+ (start_s - end_o) <= step_s and
+ (start_o - end_s) <= step_s):
+ return RangeIndex(start_r, end_r + step_s, step_s)
+ if ((step_s % 2 == 0) and
+ (abs(start_s - start_o) <= step_s / 2) and
+ (abs(end_s - end_o) <= step_s / 2)):
+ return RangeIndex(start_r, end_r + step_s / 2, step_s / 2)
+ elif step_o % step_s == 0:
+ if ((start_o - start_s) % step_s == 0 and
+ (start_o + step_s >= start_s) and
+ (end_o - step_s <= end_s)):
+ return RangeIndex(start_r, end_r + step_s, step_s)
+ elif step_s % step_o == 0:
+ if ((start_s - start_o) % step_o == 0 and
+ (start_s + step_o >= start_o) and
+ (end_s - step_o <= end_o)):
+ return RangeIndex(start_r, end_r + step_o, step_o)
+
+ return self._int64index.union(other)
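+ # Hedged example of the same-step branch above: RangeIndex(0, 10, 2) union
+ # RangeIndex(10, 20, 2) passes the overlap checks and would be expected to
+ # collapse to RangeIndex(start=0, stop=20, step=2) instead of falling back
+ # to the Int64Index path.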
+
+ @Appender(_index_shared_docs['join'])
+ def join(self, other, how='left', level=None, return_indexers=False,
+ sort=False):
+ if how == 'outer' and self is not other:
+ # note: could return RangeIndex in more circumstances
+ return self._int64index.join(other, how, level, return_indexers,
+ sort)
+
+ return super(RangeIndex, self).join(other, how, level, return_indexers,
+ sort)
+
+ def _concat_same_dtype(self, indexes, name):
+ return _concat._concat_rangeindex_same_dtype(indexes).rename(name)
+
+ def __len__(self):
+ """
+ return the length of the RangeIndex
+ """
+ return max(0, -(-(self._stop - self._start) // self._step))
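+ # The expression above is a ceiling division on the step count; e.g. for
+ # start=0, stop=10, step=3 it gives -(-(10 - 0) // 3) == 4, matching
+ # len(range(0, 10, 3)).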
+
+ @property
+ def size(self):
+ return len(self)
+
+ def __getitem__(self, key):
+ """
+ Conserve RangeIndex type for scalar and slice keys.
+ """
+ super_getitem = super(RangeIndex, self).__getitem__
+
+ if is_scalar(key):
+ if not lib.is_integer(key):
+ raise IndexError("only integers, slices (`:`), "
+ "ellipsis (`...`), numpy.newaxis (`None`) "
+ "and integer or boolean "
+ "arrays are valid indices")
+ n = com.cast_scalar_indexer(key)
+ if n != key:
+ return super_getitem(key)
+ if n < 0:
+ n = len(self) + key
+ if n < 0 or n > len(self) - 1:
+ raise IndexError("index {key} is out of bounds for axis 0 "
+ "with size {size}".format(key=key,
+ size=len(self)))
+ return self._start + n * self._step
+
+ if isinstance(key, slice):
+
+ # This is basically PySlice_GetIndicesEx, but delegating to our
+ # super routines if we don't have integers
+
+ length = len(self)
+
+ # complete missing slice information
+ step = 1 if key.step is None else key.step
+ if key.start is None:
+ start = length - 1 if step < 0 else 0
+ else:
+ start = key.start
+
+ if start < 0:
+ start += length
+ if start < 0:
+ start = -1 if step < 0 else 0
+ if start >= length:
+ start = length - 1 if step < 0 else length
+
+ if key.stop is None:
+ stop = -1 if step < 0 else length
+ else:
+ stop = key.stop
+
+ if stop < 0:
+ stop += length
+ if stop < 0:
+ stop = -1
+ if stop > length:
+ stop = length
+
+ # delegate non-integer slices
+ if (start != int(start) or
+ stop != int(stop) or
+ step != int(step)):
+ return super_getitem(key)
+
+ # convert indexes to values
+ start = self._start + self._step * start
+ stop = self._start + self._step * stop
+ step = self._step * step
+
+ return RangeIndex._simple_new(start, stop, step, name=self.name)
+
+ # fall back to Int64Index
+ return super_getitem(key)
+
+ def __floordiv__(self, other):
+ if isinstance(other, (ABCSeries, ABCDataFrame)):
+ return NotImplemented
+
+ if is_integer(other) and other != 0:
+ if (len(self) == 0 or
+ self._start % other == 0 and
+ self._step % other == 0):
+ start = self._start // other
+ step = self._step // other
+ stop = start + len(self) * step
+ return RangeIndex._simple_new(
+ start, stop, step, name=self.name)
+ if len(self) == 1:
+ start = self._start // other
+ return RangeIndex._simple_new(
+ start, start + 1, 1, name=self.name)
+ return self._int64index // other
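+ # Sketch of the fast path above: RangeIndex(0, 10, 2) // 2 keeps the result
+ # as a range, RangeIndex(start=0, stop=5, step=1), because both the start
+ # and the step divide evenly; otherwise the Int64Index fallback is used.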
+
+ @classmethod
+ def _add_numeric_methods_binary(cls):
+ """ add in numeric methods, specialized to RangeIndex """
+
+ def _make_evaluate_binop(op, step=False):
+ """
+ Parameters
+ ----------
+ op : callable that accepts 2 params
+ perform the binary op
+ step : callable, optional, default False
+ op to apply to the step param if not None
+ if False, use the existing step
+ """
+
+ def _evaluate_numeric_binop(self, other):
+ if isinstance(other, (ABCSeries, ABCDataFrame)):
+ return NotImplemented
+ elif isinstance(other, ABCTimedeltaIndex):
+ # Defer to TimedeltaIndex implementation
+ return NotImplemented
+ elif isinstance(other, (timedelta, np.timedelta64)):
+ # GH#19333 is_integer evaluated True on timedelta64,
+ # so we need to catch these explicitly
+ return op(self._int64index, other)
+ elif is_timedelta64_dtype(other):
+ # Must be an np.ndarray; GH#22390
+ return op(self._int64index, other)
+
+ other = self._validate_for_numeric_binop(other, op)
+ attrs = self._get_attributes_dict()
+ attrs = self._maybe_update_attributes(attrs)
+
+ left, right = self, other
+
+ try:
+ # apply if we have an override
+ if step:
+ with np.errstate(all='ignore'):
+ rstep = step(left._step, right)
+
+ # we don't have a representable op
+ # so return a base index
+ if not is_integer(rstep) or not rstep:
+ raise ValueError
+
+ else:
+ rstep = left._step
+
+ with np.errstate(all='ignore'):
+ rstart = op(left._start, right)
+ rstop = op(left._stop, right)
+
+ result = RangeIndex(rstart,
+ rstop,
+ rstep,
+ **attrs)
+
+ # for compat with numpy / Int64Index
+ # even if we can represent as a RangeIndex, return
+ # as a Float64Index if we have float-like descriptors
+ if not all(is_integer(x) for x in
+ [rstart, rstop, rstep]):
+ result = result.astype('float64')
+
+ return result
+
+ except (ValueError, TypeError, ZeroDivisionError):
+ # Defer to Int64Index implementation
+ return op(self._int64index, other)
+ # TODO: Do attrs get handled reliably?
+
+ name = '__{name}__'.format(name=op.__name__)
+ return compat.set_function_name(_evaluate_numeric_binop, name, cls)
+
+ cls.__add__ = _make_evaluate_binop(operator.add)
+ cls.__radd__ = _make_evaluate_binop(ops.radd)
+ cls.__sub__ = _make_evaluate_binop(operator.sub)
+ cls.__rsub__ = _make_evaluate_binop(ops.rsub)
+ cls.__mul__ = _make_evaluate_binop(operator.mul, step=operator.mul)
+ cls.__rmul__ = _make_evaluate_binop(ops.rmul, step=ops.rmul)
+ cls.__truediv__ = _make_evaluate_binop(operator.truediv,
+ step=operator.truediv)
+ cls.__rtruediv__ = _make_evaluate_binop(ops.rtruediv,
+ step=ops.rtruediv)
+ if not compat.PY3:
+ cls.__div__ = _make_evaluate_binop(operator.div, step=operator.div)
+ cls.__rdiv__ = _make_evaluate_binop(ops.rdiv, step=ops.rdiv)
+
+
+RangeIndex._add_numeric_methods()
+RangeIndex._add_logical_methods()
diff --git a/contrib/python/pandas/py2/pandas/core/indexes/timedeltas.py b/contrib/python/pandas/py2/pandas/core/indexes/timedeltas.py
new file mode 100644
index 00000000000..cbe5ae19883
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/core/indexes/timedeltas.py
@@ -0,0 +1,804 @@
+""" implement the TimedeltaIndex """
+from datetime import datetime
+import warnings
+
+import numpy as np
+
+from pandas._libs import (
+ NaT, Timedelta, index as libindex, join as libjoin, lib)
+import pandas.compat as compat
+from pandas.util._decorators import Appender, Substitution
+
+from pandas.core.dtypes.common import (
+ _TD_DTYPE, ensure_int64, is_float, is_integer, is_list_like, is_scalar,
+ is_timedelta64_dtype, is_timedelta64_ns_dtype, pandas_dtype)
+import pandas.core.dtypes.concat as _concat
+from pandas.core.dtypes.missing import isna
+
+from pandas.core.accessor import delegate_names
+from pandas.core.arrays import datetimelike as dtl
+from pandas.core.arrays.timedeltas import TimedeltaArray, _is_convertible_to_td
+from pandas.core.base import _shared_docs
+import pandas.core.common as com
+from pandas.core.indexes.base import Index, _index_shared_docs
+from pandas.core.indexes.datetimelike import (
+ DatetimeIndexOpsMixin, DatetimelikeDelegateMixin, maybe_unwrap_index,
+ wrap_arithmetic_op)
+from pandas.core.indexes.numeric import Int64Index
+from pandas.core.ops import get_op_result_name
+
+from pandas.tseries.frequencies import to_offset
+
+
+def _make_wrapped_arith_op(opname):
+
+ meth = getattr(TimedeltaArray, opname)
+
+ def method(self, other):
+ result = meth(self._data, maybe_unwrap_index(other))
+ return wrap_arithmetic_op(self, other, result)
+
+ method.__name__ = opname
+ return method
+
+
+class TimedeltaDelegateMixin(DatetimelikeDelegateMixin):
+ # Most attrs are dispatched via datetimelike_{ops,methods}
+ # Some are "raw" methods, where the result is not re-boxed in an Index
+ # We also have a few "extra" attrs, which may or may not be raw,
+ # which we don't want to expose in the .dt accessor.
+ _delegate_class = TimedeltaArray
+ _delegated_properties = (TimedeltaArray._datetimelike_ops + [
+ 'components',
+ ])
+ _delegated_methods = TimedeltaArray._datetimelike_methods + [
+ '_box_values',
+ ]
+ _raw_properties = {
+ 'components',
+ }
+ _raw_methods = {
+ 'to_pytimedelta',
+ }
+
+
+@delegate_names(TimedeltaArray,
+ TimedeltaDelegateMixin._delegated_properties,
+ typ="property")
+@delegate_names(TimedeltaArray,
+ TimedeltaDelegateMixin._delegated_methods,
+ typ="method", overwrite=False)
+class TimedeltaIndex(DatetimeIndexOpsMixin, dtl.TimelikeOps, Int64Index,
+ TimedeltaDelegateMixin):
+ """
+ Immutable ndarray of timedelta64 data, represented internally as int64,
+ whose values can be boxed to Timedelta objects
+
+ Parameters
+ ----------
+ data : array-like (1-dimensional), optional
+ Optional timedelta-like data to construct index with
+ unit : unit of the data (D, h, m, s, ms, us, ns), optional
+ Used when the input is an integer or float number
+ freq : string or pandas offset object, optional
+ One of pandas date offset strings or corresponding objects. The string
+ 'infer' can be passed in order to set the frequency of the index as the
+ inferred frequency upon creation
+ copy : bool
+ Make a copy of input ndarray
+ start : starting value, timedelta-like, optional
+ If data is None, start is used as the start point in generating regular
+ timedelta data.
+
+ .. deprecated:: 0.24.0
+
+ periods : int, optional, > 0
+ Number of periods to generate, if generating index. Takes precedence
+ over end argument
+
+ .. deprecated:: 0.24.0
+
+ end : end time, timedelta-like, optional
+ If periods is None, the generated index will extend to the first conforming
+ time on or just past the end argument
+
+ .. deprecated:: 0.24.0
+
+ closed : string or None, default None
+ Make the interval closed with respect to the given frequency to
+ the 'left', 'right', or both sides (None)
+
+ .. deprecated:: 0.24.0
+
+ name : object
+ Name to be stored in the index
+
+ Attributes
+ ----------
+ days
+ seconds
+ microseconds
+ nanoseconds
+ components
+ inferred_freq
+
+ Methods
+ -------
+ to_pytimedelta
+ to_series
+ round
+ floor
+ ceil
+ to_frame
+
+ See Also
+ --------
+ Index : The base pandas Index type.
+ Timedelta : Represents a duration between two dates or times.
+ DatetimeIndex : Index of datetime64 data.
+ PeriodIndex : Index of Period data.
+ timedelta_range : Create a fixed-frequency TimedeltaIndex.
+
+ Notes
+ -----
+ To learn more about the frequency strings, please see `this link
+ <http://pandas.pydata.org/pandas-docs/stable/timeseries.html#offset-aliases>`__.
+
+ Creating a TimedeltaIndex based on `start`, `periods`, and `end` has
+ been deprecated in favor of :func:`timedelta_range`.
+ """
+
+ _typ = 'timedeltaindex'
+ _join_precedence = 10
+
+ def _join_i8_wrapper(joinf, **kwargs):
+ return DatetimeIndexOpsMixin._join_i8_wrapper(
+ joinf, dtype='m8[ns]', **kwargs)
+
+ _inner_indexer = _join_i8_wrapper(libjoin.inner_join_indexer_int64)
+ _outer_indexer = _join_i8_wrapper(libjoin.outer_join_indexer_int64)
+ _left_indexer = _join_i8_wrapper(libjoin.left_join_indexer_int64)
+ _left_indexer_unique = _join_i8_wrapper(
+ libjoin.left_join_indexer_unique_int64, with_indexers=False)
+
+ _engine_type = libindex.TimedeltaEngine
+
+ _comparables = ['name', 'freq']
+ _attributes = ['name', 'freq']
+ _is_numeric_dtype = True
+ _infer_as_myclass = True
+
+ _freq = None
+
+ _box_func = TimedeltaArray._box_func
+ _bool_ops = TimedeltaArray._bool_ops
+ _object_ops = TimedeltaArray._object_ops
+ _field_ops = TimedeltaArray._field_ops
+ _datetimelike_ops = TimedeltaArray._datetimelike_ops
+ _datetimelike_methods = TimedeltaArray._datetimelike_methods
+ _other_ops = TimedeltaArray._other_ops
+
+ # -------------------------------------------------------------------
+ # Constructors
+
+ def __new__(cls, data=None, unit=None, freq=None, start=None, end=None,
+ periods=None, closed=None, dtype=_TD_DTYPE, copy=False,
+ name=None, verify_integrity=None):
+
+ if verify_integrity is not None:
+ warnings.warn("The 'verify_integrity' argument is deprecated, "
+ "will be removed in a future version.",
+ FutureWarning, stacklevel=2)
+ else:
+ verify_integrity = True
+
+ if data is None:
+ freq, freq_infer = dtl.maybe_infer_freq(freq)
+ warnings.warn("Creating a TimedeltaIndex by passing range "
+ "endpoints is deprecated. Use "
+ "`pandas.timedelta_range` instead.",
+ FutureWarning, stacklevel=2)
+ result = TimedeltaArray._generate_range(start, end, periods, freq,
+ closed=closed)
+ return cls._simple_new(result._data, freq=freq, name=name)
+
+ if is_scalar(data):
+ raise TypeError('{cls}() must be called with a '
+ 'collection of some kind, {data} was passed'
+ .format(cls=cls.__name__, data=repr(data)))
+
+ if isinstance(data, TimedeltaArray):
+ if copy:
+ data = data.copy()
+ return cls._simple_new(data, name=name, freq=freq)
+
+ if (isinstance(data, TimedeltaIndex) and
+ freq is None and name is None):
+ if copy:
+ return data.copy()
+ else:
+ return data._shallow_copy()
+
+ # - Cases checked above all return/raise before reaching here - #
+
+ tdarr = TimedeltaArray._from_sequence(data, freq=freq, unit=unit,
+ dtype=dtype, copy=copy)
+ return cls._simple_new(tdarr._data, freq=tdarr.freq, name=name)
+
+ @classmethod
+ def _simple_new(cls, values, name=None, freq=None, dtype=_TD_DTYPE):
+ # `dtype` is passed by _shallow_copy in corner cases, should always
+ # be timedelta64[ns] if present
+ if not isinstance(values, TimedeltaArray):
+ values = TimedeltaArray._simple_new(values, dtype=dtype,
+ freq=freq)
+ else:
+ if freq is None:
+ freq = values.freq
+ assert isinstance(values, TimedeltaArray), type(values)
+ assert dtype == _TD_DTYPE, dtype
+ assert values.dtype == 'm8[ns]', values.dtype
+
+ tdarr = TimedeltaArray._simple_new(values._data, freq=freq)
+ result = object.__new__(cls)
+ result._data = tdarr
+ result.name = name
+ # For groupby perf. See note in indexes/base about _index_data
+ result._index_data = tdarr._data
+
+ result._reset_identity()
+ return result
+
+ # -------------------------------------------------------------------
+
+ def __setstate__(self, state):
+ """Necessary for making this object picklable"""
+ if isinstance(state, dict):
+ super(TimedeltaIndex, self).__setstate__(state)
+ else:
+ raise Exception("invalid pickle state")
+ _unpickle_compat = __setstate__
+
+ def _maybe_update_attributes(self, attrs):
+ """ Update Index attributes (e.g. freq) depending on op """
+ freq = attrs.get('freq', None)
+ if freq is not None:
+ # no need to infer if freq is None
+ attrs['freq'] = 'infer'
+ return attrs
+
+ # -------------------------------------------------------------------
+ # Rendering Methods
+
+ @property
+ def _formatter_func(self):
+ from pandas.io.formats.format import _get_format_timedelta64
+ return _get_format_timedelta64(self, box=True)
+
+ def _format_native_types(self, na_rep='NaT', date_format=None, **kwargs):
+ from pandas.io.formats.format import Timedelta64Formatter
+ return Timedelta64Formatter(values=self,
+ nat_rep=na_rep,
+ justify='all').get_result()
+
+ # -------------------------------------------------------------------
+ # Wrapping TimedeltaArray
+
+ __mul__ = _make_wrapped_arith_op("__mul__")
+ __rmul__ = _make_wrapped_arith_op("__rmul__")
+ __floordiv__ = _make_wrapped_arith_op("__floordiv__")
+ __rfloordiv__ = _make_wrapped_arith_op("__rfloordiv__")
+ __mod__ = _make_wrapped_arith_op("__mod__")
+ __rmod__ = _make_wrapped_arith_op("__rmod__")
+ __divmod__ = _make_wrapped_arith_op("__divmod__")
+ __rdivmod__ = _make_wrapped_arith_op("__rdivmod__")
+ __truediv__ = _make_wrapped_arith_op("__truediv__")
+ __rtruediv__ = _make_wrapped_arith_op("__rtruediv__")
+ if compat.PY2:
+ __div__ = __truediv__
+ __rdiv__ = __rtruediv__
+
+ # Compat for frequency inference, see GH#23789
+ _is_monotonic_increasing = Index.is_monotonic_increasing
+ _is_monotonic_decreasing = Index.is_monotonic_decreasing
+ _is_unique = Index.is_unique
+
+ @property
+ def _box_func(self):
+ return lambda x: Timedelta(x, unit='ns')
+
+ def __getitem__(self, key):
+ result = self._data.__getitem__(key)
+ if is_scalar(result):
+ return result
+ return type(self)(result, name=self.name)
+
+ # -------------------------------------------------------------------
+
+ @Appender(_index_shared_docs['astype'])
+ def astype(self, dtype, copy=True):
+ dtype = pandas_dtype(dtype)
+ if is_timedelta64_dtype(dtype) and not is_timedelta64_ns_dtype(dtype):
+ # Have to repeat the check for 'timedelta64' (not ns) dtype
+ # so that we can return a numeric index, since pandas will return
+ # a TimedeltaIndex when dtype='timedelta'
+ result = self._data.astype(dtype, copy=copy)
+ if self.hasnans:
+ return Index(result, name=self.name)
+ return Index(result.astype('i8'), name=self.name)
+ return DatetimeIndexOpsMixin.astype(self, dtype, copy=copy)
+
+ def union(self, other):
+ """
+ Specialized union for TimedeltaIndex objects. If combining
+ overlapping ranges with the same DateOffset, this will be much
+ faster than Index.union.
+
+ Parameters
+ ----------
+ other : TimedeltaIndex or array-like
+
+ Returns
+ -------
+ y : Index or TimedeltaIndex
+ """
+ self._assert_can_do_setop(other)
+
+ if len(other) == 0 or self.equals(other) or len(self) == 0:
+ return super(TimedeltaIndex, self).union(other)
+
+ if not isinstance(other, TimedeltaIndex):
+ try:
+ other = TimedeltaIndex(other)
+ except (TypeError, ValueError):
+ pass
+ this, other = self, other
+
+ if this._can_fast_union(other):
+ return this._fast_union(other)
+ else:
+ result = Index.union(this, other)
+ if isinstance(result, TimedeltaIndex):
+ if result.freq is None:
+ result.freq = to_offset(result.inferred_freq)
+ return result
+
+ def join(self, other, how='left', level=None, return_indexers=False,
+ sort=False):
+ """
+ See Index.join
+ """
+ if _is_convertible_to_index(other):
+ try:
+ other = TimedeltaIndex(other)
+ except (TypeError, ValueError):
+ pass
+
+ return Index.join(self, other, how=how, level=level,
+ return_indexers=return_indexers,
+ sort=sort)
+
+ def _wrap_joined_index(self, joined, other):
+ name = get_op_result_name(self, other)
+ if (isinstance(other, TimedeltaIndex) and self.freq == other.freq and
+ self._can_fast_union(other)):
+ joined = self._shallow_copy(joined, name=name)
+ return joined
+ else:
+ return self._simple_new(joined, name)
+
+ def _can_fast_union(self, other):
+ if not isinstance(other, TimedeltaIndex):
+ return False
+
+ freq = self.freq
+
+ if freq is None or freq != other.freq:
+ return False
+
+ if not self.is_monotonic or not other.is_monotonic:
+ return False
+
+ if len(self) == 0 or len(other) == 0:
+ return True
+
+ # to make our life easier, "sort" the two ranges
+ if self[0] <= other[0]:
+ left, right = self, other
+ else:
+ left, right = other, self
+
+ right_start = right[0]
+ left_end = left[-1]
+
+ # Only need to "adjoin", not overlap
+ return (right_start == left_end + freq) or right_start in left
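+ # Hypothetical example of the check above (values invented for
+ # illustration): with freq='D', [1d, 2d] and [3d, 4d] can be
+ # fast-unioned because right_start (3d) == left_end (2d) + freq,
+ # whereas [1d, 2d] and [5d, 6d] cannot and fall back to Index.union.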
+
+ def _fast_union(self, other):
+ if len(other) == 0:
+ return self.view(type(self))
+
+ if len(self) == 0:
+ return other.view(type(self))
+
+ # to make our life easier, "sort" the two ranges
+ if self[0] <= other[0]:
+ left, right = self, other
+ else:
+ left, right = other, self
+
+ left_end = left[-1]
+ right_end = right[-1]
+
+ # concatenate
+ if left_end < right_end:
+ loc = right.searchsorted(left_end, side='right')
+ right_chunk = right.values[loc:]
+ dates = _concat._concat_compat((left.values, right_chunk))
+ return self._shallow_copy(dates)
+ else:
+ return left
+
+ def intersection(self, other):
+ """
+ Specialized intersection for TimedeltaIndex objects. May be much faster
+ than Index.intersection
+
+ Parameters
+ ----------
+ other : TimedeltaIndex or array-like
+
+ Returns
+ -------
+ y : Index or TimedeltaIndex
+ """
+ self._assert_can_do_setop(other)
+
+ if self.equals(other):
+ return self._get_reconciled_name_object(other)
+
+ if not isinstance(other, TimedeltaIndex):
+ try:
+ other = TimedeltaIndex(other)
+ except (TypeError, ValueError):
+ pass
+ result = Index.intersection(self, other)
+ return result
+
+ if len(self) == 0:
+ return self
+ if len(other) == 0:
+ return other
+ # to make our life easier, "sort" the two ranges
+ if self[0] <= other[0]:
+ left, right = self, other
+ else:
+ left, right = other, self
+
+ end = min(left[-1], right[-1])
+ start = right[0]
+
+ if end < start:
+ return type(self)(data=[])
+ else:
+ lslice = slice(*left.slice_locs(start, end))
+ left_chunk = left.values[lslice]
+ return self._shallow_copy(left_chunk)
+
+ def _maybe_promote(self, other):
+ if other.inferred_type == 'timedelta':
+ other = TimedeltaIndex(other)
+ return self, other
+
+ def get_value(self, series, key):
+ """
+ Fast lookup of value from 1-dimensional ndarray. Only use this if you
+ know what you're doing
+ """
+
+ if _is_convertible_to_td(key):
+ key = Timedelta(key)
+ return self.get_value_maybe_box(series, key)
+
+ try:
+ return com.maybe_box(self, Index.get_value(self, series, key),
+ series, key)
+ except KeyError:
+ try:
+ loc = self._get_string_slice(key)
+ return series[loc]
+ except (TypeError, ValueError, KeyError):
+ pass
+
+ try:
+ return self.get_value_maybe_box(series, key)
+ except (TypeError, ValueError, KeyError):
+ raise KeyError(key)
+
+ def get_value_maybe_box(self, series, key):
+ if not isinstance(key, Timedelta):
+ key = Timedelta(key)
+ values = self._engine.get_value(com.values_from_object(series), key)
+ return com.maybe_box(self, values, series, key)
+
+ def get_loc(self, key, method=None, tolerance=None):
+ """
+ Get integer location for requested label
+
+ Returns
+ -------
+ loc : int
+ """
+ if is_list_like(key) or (isinstance(key, datetime) and key is not NaT):
+ # GH#20464 datetime check here is to ensure we don't allow
+ # datetime objects to be incorrectly treated as timedelta
+ # objects; NaT is a special case because it plays a double role
+ # as Not-A-Timedelta
+ raise TypeError
+
+ if isna(key):
+ key = NaT
+
+ if tolerance is not None:
+ # try converting tolerance now, so errors don't get swallowed by
+ # the try/except clauses below
+ tolerance = self._convert_tolerance(tolerance, np.asarray(key))
+
+ if _is_convertible_to_td(key):
+ key = Timedelta(key)
+ return Index.get_loc(self, key, method, tolerance)
+
+ try:
+ return Index.get_loc(self, key, method, tolerance)
+ except (KeyError, ValueError, TypeError):
+ try:
+ return self._get_string_slice(key)
+ except (TypeError, KeyError, ValueError):
+ pass
+
+ try:
+ stamp = Timedelta(key)
+ return Index.get_loc(self, stamp, method, tolerance)
+ except (KeyError, ValueError):
+ raise KeyError(key)
+
+ def _maybe_cast_slice_bound(self, label, side, kind):
+ """
+ If label is a string, cast it to timedelta according to resolution.
+
+ Parameters
+ ----------
+ label : object
+ side : {'left', 'right'}
+ kind : {'ix', 'loc', 'getitem'}
+
+ Returns
+ -------
+ label : object
+
+ """
+ assert kind in ['ix', 'loc', 'getitem', None]
+
+ if isinstance(label, compat.string_types):
+ parsed = Timedelta(label)
+ lbound = parsed.round(parsed.resolution)
+ if side == 'left':
+ return lbound
+ else:
+ return (lbound + to_offset(parsed.resolution) -
+ Timedelta(1, 'ns'))
+ elif ((is_integer(label) or is_float(label)) and
+ not is_timedelta64_dtype(label)):
+ self._invalid_indexer('slice', label)
+
+ return label
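+ # Illustrative behaviour of the string path above (assuming default
+ # Timedelta parsing): a right-side bound of '1 hour' is rounded to its
+ # resolution and extended by one resolution step minus 1ns, so slicing
+ # with string bounds includes the whole period the string denotes.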
+
+ def _get_string_slice(self, key):
+ if is_integer(key) or is_float(key) or key is NaT:
+ self._invalid_indexer('slice', key)
+ loc = self._partial_td_slice(key)
+ return loc
+
+ def _partial_td_slice(self, key):
+
+ # given a key, try to figure out a location for a partial slice
+ if not isinstance(key, compat.string_types):
+ return key
+
+ raise NotImplementedError
+
+ @Substitution(klass='TimedeltaIndex')
+ @Appender(_shared_docs['searchsorted'])
+ def searchsorted(self, value, side='left', sorter=None):
+ if isinstance(value, (np.ndarray, Index)):
+ value = np.array(value, dtype=_TD_DTYPE, copy=False)
+ else:
+ value = Timedelta(value).asm8.view(_TD_DTYPE)
+
+ return self.values.searchsorted(value, side=side, sorter=sorter)
+
+ def is_type_compatible(self, typ):
+ return typ == self.inferred_type or typ == 'timedelta'
+
+ @property
+ def inferred_type(self):
+ return 'timedelta64'
+
+ @property
+ def is_all_dates(self):
+ return True
+
+ def insert(self, loc, item):
+ """
+ Make new Index inserting new item at location
+
+ Parameters
+ ----------
+ loc : int
+ item : object
+ If not a timedelta-like or a numpy integer-like value, the returned
+ Index dtype will be object rather than timedelta64.
+
+ Returns
+ -------
+ new_index : Index
+ """
+ # try to convert if possible
+ if _is_convertible_to_td(item):
+ try:
+ item = Timedelta(item)
+ except Exception:
+ pass
+ elif is_scalar(item) and isna(item):
+ # GH 18295
+ item = self._na_value
+
+ freq = None
+ if isinstance(item, Timedelta) or (is_scalar(item) and isna(item)):
+
+ # check freq can be preserved on edge cases
+ if self.freq is not None:
+ if ((loc == 0 or loc == -len(self)) and
+ item + self.freq == self[0]):
+ freq = self.freq
+ elif (loc == len(self)) and item - self.freq == self[-1]:
+ freq = self.freq
+ item = Timedelta(item).asm8.view(_TD_DTYPE)
+
+ try:
+ new_tds = np.concatenate((self[:loc].asi8, [item.view(np.int64)],
+ self[loc:].asi8))
+ return self._shallow_copy(new_tds, freq=freq)
+
+ except (AttributeError, TypeError):
+
+ # fall back to object index
+ if isinstance(item, compat.string_types):
+ return self.astype(object).insert(loc, item)
+ raise TypeError(
+ "cannot insert TimedeltaIndex with incompatible label")
+
+ def delete(self, loc):
+ """
+ Make a new TimedeltaIndex with passed location(s) deleted.
+
+ Parameters
+ ----------
+ loc: int, slice or array of ints
+ Indicate which sub-arrays to remove.
+
+ Returns
+ -------
+ new_index : TimedeltaIndex
+ """
+ new_tds = np.delete(self.asi8, loc)
+
+ freq = 'infer'
+ if is_integer(loc):
+ if loc in (0, -len(self), -1, len(self) - 1):
+ freq = self.freq
+ else:
+ if is_list_like(loc):
+ loc = lib.maybe_indices_to_slice(
+ ensure_int64(np.array(loc)), len(self))
+ if isinstance(loc, slice) and loc.step in (1, None):
+ if (loc.start in (0, None) or loc.stop in (len(self), None)):
+ freq = self.freq
+
+ return TimedeltaIndex(new_tds, name=self.name, freq=freq)
+
+
+TimedeltaIndex._add_comparison_ops()
+TimedeltaIndex._add_numeric_methods_unary()
+TimedeltaIndex._add_logical_methods_disabled()
+TimedeltaIndex._add_datetimelike_methods()
+
+
+def _is_convertible_to_index(other):
+ """
+ Return a boolean indicating whether conversion to a TimedeltaIndex can be attempted.
+ """
+ if isinstance(other, TimedeltaIndex):
+ return True
+ elif (len(other) > 0 and
+ other.inferred_type not in ('floating', 'mixed-integer', 'integer',
+ 'mixed-integer-float', 'mixed')):
+ return True
+ return False
+
+
+def timedelta_range(start=None, end=None, periods=None, freq=None,
+ name=None, closed=None):
+ """
+ Return a fixed frequency TimedeltaIndex, with day as the default
+ frequency
+
+ Parameters
+ ----------
+ start : string or timedelta-like, default None
+ Left bound for generating timedeltas
+ end : string or timedelta-like, default None
+ Right bound for generating timedeltas
+ periods : integer, default None
+ Number of periods to generate
+ freq : string or DateOffset, default 'D'
+ Frequency strings can have multiples, e.g. '5H'
+ name : string, default None
+ Name of the resulting TimedeltaIndex
+ closed : string, default None
+ Make the interval closed with respect to the given frequency to
+ the 'left', 'right', or both sides (None)
+
+ Returns
+ -------
+ rng : TimedeltaIndex
+
+ Notes
+ -----
+ Of the four parameters ``start``, ``end``, ``periods``, and ``freq``,
+ exactly three must be specified. If ``freq`` is omitted, the resulting
+ ``TimedeltaIndex`` will have ``periods`` linearly spaced elements between
+ ``start`` and ``end`` (closed on both sides).
+
+ To learn more about the frequency strings, please see `this link
+ <http://pandas.pydata.org/pandas-docs/stable/timeseries.html#offset-aliases>`__.
+
+ Examples
+ --------
+
+ >>> pd.timedelta_range(start='1 day', periods=4)
+ TimedeltaIndex(['1 days', '2 days', '3 days', '4 days'],
+ dtype='timedelta64[ns]', freq='D')
+
+ The ``closed`` parameter specifies which endpoint is included. The default
+ behavior is to include both endpoints.
+
+ >>> pd.timedelta_range(start='1 day', periods=4, closed='right')
+ TimedeltaIndex(['2 days', '3 days', '4 days'],
+ dtype='timedelta64[ns]', freq='D')
+
+ The ``freq`` parameter specifies the frequency of the TimedeltaIndex.
+ Only fixed frequencies can be passed, non-fixed frequencies such as
+ 'M' (month end) will raise.
+
+ >>> pd.timedelta_range(start='1 day', end='2 days', freq='6H')
+ TimedeltaIndex(['1 days 00:00:00', '1 days 06:00:00', '1 days 12:00:00',
+ '1 days 18:00:00', '2 days 00:00:00'],
+ dtype='timedelta64[ns]', freq='6H')
+
+ Specify ``start``, ``end``, and ``periods``; the frequency is generated
+ automatically (linearly spaced).
+
+ >>> pd.timedelta_range(start='1 day', end='5 days', periods=4)
+ TimedeltaIndex(['1 days 00:00:00', '2 days 08:00:00', '3 days 16:00:00',
+ '5 days 00:00:00'],
+ dtype='timedelta64[ns]', freq=None)
+ """
+ if freq is None and com._any_none(periods, start, end):
+ freq = 'D'
+
+ freq, freq_infer = dtl.maybe_infer_freq(freq)
+ tdarr = TimedeltaArray._generate_range(start, end, periods, freq,
+ closed=closed)
+ return TimedeltaIndex._simple_new(tdarr._data, freq=tdarr.freq, name=name)
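+ # Minimal usage sketch for timedelta_range (assumes a standard pandas
+ # build; mirrors the docstring above):
+ #
+ #   >>> import pandas as pd
+ #   >>> pd.timedelta_range(start='1 day', end='2 days', freq='12H')
+ #   TimedeltaIndex(['1 days 00:00:00', '1 days 12:00:00', '2 days 00:00:00'],
+ #                  dtype='timedelta64[ns]', freq='12H')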
diff --git a/contrib/python/pandas/py2/pandas/core/indexing.py b/contrib/python/pandas/py2/pandas/core/indexing.py
new file mode 100755
index 00000000000..bbcde8f3b33
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/core/indexing.py
@@ -0,0 +1,2766 @@
+# pylint: disable=W0223
+import textwrap
+import warnings
+
+import numpy as np
+
+from pandas._libs.indexing import _NDFrameIndexerBase
+import pandas.compat as compat
+from pandas.compat import range, zip
+from pandas.errors import AbstractMethodError
+from pandas.util._decorators import Appender
+
+from pandas.core.dtypes.common import (
+ ensure_platform_int, is_float, is_integer, is_integer_dtype, is_iterator,
+ is_list_like, is_scalar, is_sequence, is_sparse)
+from pandas.core.dtypes.generic import ABCDataFrame, ABCPanel, ABCSeries
+from pandas.core.dtypes.missing import _infer_fill_value, isna
+
+import pandas.core.common as com
+from pandas.core.index import Index, MultiIndex
+
+
+# the supported indexers
+def get_indexers_list():
+
+ return [
+ ('ix', _IXIndexer),
+ ('iloc', _iLocIndexer),
+ ('loc', _LocIndexer),
+ ('at', _AtIndexer),
+ ('iat', _iAtIndexer),
+ ]
+
+
+# "null slice"
+_NS = slice(None, None)
+
+
+# the public IndexSlicerMaker
+class _IndexSlice(object):
+ """
+ Create an object to more easily perform multi-index slicing
+
+ See Also
+ --------
+ MultiIndex.remove_unused_levels : New MultiIndex with no unused levels.
+
+ Notes
+ -----
+ See :ref:`Defined Levels <advanced.shown_levels>`
+ for further info on slicing a MultiIndex.
+
+ Examples
+ --------
+
+ >>> midx = pd.MultiIndex.from_product([['A0','A1'], ['B0','B1','B2','B3']])
+ >>> columns = ['foo', 'bar']
+ >>> dfmi = pd.DataFrame(np.arange(16).reshape((len(midx), len(columns))),
+ index=midx, columns=columns)
+
+ Using the default slice command:
+
+ >>> dfmi.loc[(slice(None), slice('B0', 'B1')), :]
+ foo bar
+ A0 B0 0 1
+ B1 2 3
+ A1 B0 8 9
+ B1 10 11
+
+ Using the IndexSlice class for a more intuitive command:
+
+ >>> idx = pd.IndexSlice
+ >>> dfmi.loc[idx[:, 'B0':'B1'], :]
+ foo bar
+ A0 B0 0 1
+ B1 2 3
+ A1 B0 8 9
+ B1 10 11
+ """
+
+ def __getitem__(self, arg):
+ return arg
+
+
+IndexSlice = _IndexSlice()
+
+
+class IndexingError(Exception):
+ pass
+
+
+class _NDFrameIndexer(_NDFrameIndexerBase):
+ _valid_types = None
+ _exception = KeyError
+ axis = None
+
+ def __call__(self, axis=None):
+ # we need to return a copy of ourselves
+ new_self = self.__class__(self.name, self.obj)
+
+ if axis is not None:
+ axis = self.obj._get_axis_number(axis)
+ new_self.axis = axis
+ return new_self
+
+ def __iter__(self):
+ raise NotImplementedError('ix is not iterable')
+
+ def __getitem__(self, key):
+ if type(key) is tuple:
+ key = tuple(com.apply_if_callable(x, self.obj)
+ for x in key)
+ try:
+ values = self.obj._get_value(*key)
+ if is_scalar(values):
+ return values
+ except Exception:
+ pass
+
+ return self._getitem_tuple(key)
+ else:
+ # we by definition only have the 0th axis
+ axis = self.axis or 0
+
+ key = com.apply_if_callable(key, self.obj)
+ return self._getitem_axis(key, axis=axis)
+
+ def _get_label(self, label, axis=None):
+ if axis is None:
+ axis = self.axis or 0
+
+ if self.ndim == 1:
+ # for perf reasons we want to try _xs first
+ # as its basically direct indexing
+ # but will fail when the index is not present
+ # see GH5667
+ return self.obj._xs(label, axis=axis)
+ elif isinstance(label, tuple) and isinstance(label[axis], slice):
+ raise IndexingError('no slices here, handle elsewhere')
+
+ return self.obj._xs(label, axis=axis)
+
+ def _get_loc(self, key, axis=None):
+ if axis is None:
+ axis = self.axis
+ return self.obj._ixs(key, axis=axis)
+
+ def _slice(self, obj, axis=None, kind=None):
+ if axis is None:
+ axis = self.axis
+ return self.obj._slice(obj, axis=axis, kind=kind)
+
+ def _get_setitem_indexer(self, key):
+ if self.axis is not None:
+ return self._convert_tuple(key, is_setter=True)
+
+ axis = self.obj._get_axis(0)
+
+ if isinstance(axis, MultiIndex) and self.name != 'iloc':
+ try:
+ return axis.get_loc(key)
+ except Exception:
+ pass
+
+ if isinstance(key, tuple):
+ try:
+ return self._convert_tuple(key, is_setter=True)
+ except IndexingError:
+ pass
+
+ if isinstance(key, range):
+ return self._convert_range(key, is_setter=True)
+
+ try:
+ return self._convert_to_indexer(key, is_setter=True)
+ except TypeError as e:
+
+ # invalid indexer type vs 'other' indexing errors
+ if 'cannot do' in str(e):
+ raise
+ raise IndexingError(key)
+
+ def __setitem__(self, key, value):
+ if isinstance(key, tuple):
+ key = tuple(com.apply_if_callable(x, self.obj)
+ for x in key)
+ else:
+ key = com.apply_if_callable(key, self.obj)
+ indexer = self._get_setitem_indexer(key)
+ self._setitem_with_indexer(indexer, value)
+
+ def _validate_key(self, key, axis):
+ """
+ Ensure that key is valid for current indexer.
+
+ Parameters
+ ----------
+ key : scalar, slice or list-like
+ The key requested
+
+ axis : int
+ Dimension on which the indexing is being made
+
+ Raises
+ ------
+ TypeError
+ If the key (or some element of it) has wrong type
+
+ IndexError
+ If the key (or some element of it) is out of bounds
+
+ KeyError
+ If the key was not found
+ """
+ raise AbstractMethodError(self)
+
+ def _has_valid_tuple(self, key):
+ """ check the key for valid keys across my indexer """
+ for i, k in enumerate(key):
+ if i >= self.obj.ndim:
+ raise IndexingError('Too many indexers')
+ try:
+ self._validate_key(k, i)
+ except ValueError:
+ raise ValueError("Location based indexing can only have "
+ "[{types}] types"
+ .format(types=self._valid_types))
+
+ def _is_nested_tuple_indexer(self, tup):
+ if any(isinstance(ax, MultiIndex) for ax in self.obj.axes):
+ return any(is_nested_tuple(tup, ax) for ax in self.obj.axes)
+ return False
+
+ def _convert_tuple(self, key, is_setter=False):
+ keyidx = []
+ if self.axis is not None:
+ axis = self.obj._get_axis_number(self.axis)
+ for i in range(self.ndim):
+ if i == axis:
+ keyidx.append(self._convert_to_indexer(
+ key, axis=axis, is_setter=is_setter))
+ else:
+ keyidx.append(slice(None))
+ else:
+ for i, k in enumerate(key):
+ if i >= self.obj.ndim:
+ raise IndexingError('Too many indexers')
+ idx = self._convert_to_indexer(k, axis=i, is_setter=is_setter)
+ keyidx.append(idx)
+ return tuple(keyidx)
+
+ def _convert_range(self, key, is_setter=False):
+ """ convert a range argument """
+ return list(key)
+
+ def _convert_scalar_indexer(self, key, axis):
+ # if we are accessing via lowered dim, use the last dim
+ if axis is None:
+ axis = 0
+ ax = self.obj._get_axis(min(axis, self.ndim - 1))
+ # a scalar
+ return ax._convert_scalar_indexer(key, kind=self.name)
+
+ def _convert_slice_indexer(self, key, axis):
+ # if we are accessing via lowered dim, use the last dim
+ ax = self.obj._get_axis(min(axis, self.ndim - 1))
+ return ax._convert_slice_indexer(key, kind=self.name)
+
+ def _has_valid_setitem_indexer(self, indexer):
+ return True
+
+ def _has_valid_positional_setitem_indexer(self, indexer):
+ """ validate that an positional indexer cannot enlarge its target
+ will raise if needed, does not modify the indexer externally
+ """
+ if isinstance(indexer, dict):
+ raise IndexError("{0} cannot enlarge its target object"
+ .format(self.name))
+ else:
+ if not isinstance(indexer, tuple):
+ indexer = self._tuplify(indexer)
+ for ax, i in zip(self.obj.axes, indexer):
+ if isinstance(i, slice):
+ # should check the stop slice?
+ pass
+ elif is_list_like_indexer(i):
+ # should check the elements?
+ pass
+ elif is_integer(i):
+ if i >= len(ax):
+ raise IndexError("{name} cannot enlarge its target "
+ "object".format(name=self.name))
+ elif isinstance(i, dict):
+ raise IndexError("{name} cannot enlarge its target object"
+ .format(name=self.name))
+
+ return True
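+ # Illustrative consequence of the check above: on a 3-row Series s,
+ # s.iloc[3] = 1 raises IndexError here because positional setting may
+ # not enlarge the target, whereas s.loc[3] = 1 goes through the label
+ # path and is allowed to enlarge the index.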
+
+ def _setitem_with_indexer(self, indexer, value):
+ self._has_valid_setitem_indexer(indexer)
+
+ # also has the side effect of consolidating in-place
+ from pandas import Series
+ info_axis = self.obj._info_axis_number
+
+ # maybe partial set
+ take_split_path = self.obj._is_mixed_type
+
+ # if there is only one block/type, still have to take split path
+ # unless the block is one-dimensional or it can hold the value
+ if not take_split_path and self.obj._data.blocks:
+ blk, = self.obj._data.blocks
+ if 1 < blk.ndim: # in case of dict, keys are indices
+ val = list(value.values()) if isinstance(value,
+ dict) else value
+ take_split_path = not blk._can_hold_element(val)
+
+ if isinstance(indexer, tuple) and len(indexer) == len(self.obj.axes):
+
+ for i, ax in zip(indexer, self.obj.axes):
+
+ # if we have any multi-indexes that have non-trivial slices
+ # (not null slices) then we must take the split path, xref
+ # GH 10360
+ if (isinstance(ax, MultiIndex) and
+ not (is_integer(i) or com.is_null_slice(i))):
+ take_split_path = True
+ break
+
+ if isinstance(indexer, tuple):
+ nindexer = []
+ for i, idx in enumerate(indexer):
+ if isinstance(idx, dict):
+
+ # reindex the axis to the new value
+ # and set inplace
+ key, _ = convert_missing_indexer(idx)
+
+ # if this is the items axes, then take the main missing
+ # path first
+ # this correctly sets the dtype and avoids cache issues
+ # essentially this separates out the block that is needed
+ # to possibly be modified
+ if self.ndim > 1 and i == self.obj._info_axis_number:
+
+ # add the new item, and set the value
+ # must have all defined axes if we have a scalar
+ # or a list-like on the non-info axes if we have a
+ # list-like
+ len_non_info_axes = [
+ len(_ax) for _i, _ax in enumerate(self.obj.axes)
+ if _i != i
+ ]
+ if any(not l for l in len_non_info_axes):
+ if not is_list_like_indexer(value):
+ raise ValueError("cannot set a frame with no "
+ "defined index and a scalar")
+ self.obj[key] = value
+ return self.obj
+
+ # add a new item with the dtype setup
+ self.obj[key] = _infer_fill_value(value)
+
+ new_indexer = convert_from_missing_indexer_tuple(
+ indexer, self.obj.axes)
+ self._setitem_with_indexer(new_indexer, value)
+
+ return self.obj
+
+ # reindex the axis
+ # make sure to clear the cache because we are
+ # just replacing the block manager here
+ # so the object is the same
+ index = self.obj._get_axis(i)
+ labels = index.insert(len(index), key)
+ self.obj._data = self.obj.reindex(labels, axis=i)._data
+ self.obj._maybe_update_cacher(clear=True)
+ self.obj._is_copy = None
+
+ nindexer.append(labels.get_loc(key))
+
+ else:
+ nindexer.append(idx)
+
+ indexer = tuple(nindexer)
+ else:
+
+ indexer, missing = convert_missing_indexer(indexer)
+
+ if missing:
+
+ # reindex the axis to the new value
+ # and set inplace
+ if self.ndim == 1:
+ index = self.obj.index
+ new_index = index.insert(len(index), indexer)
+
+ # we have a coerced indexer, e.g. a float
+ # that matches in an Int64Index, so
+ # we will not create a duplicate index, rather
+ # index to that element
+ # e.g. 0.0 -> 0
+ # GH12246
+ if index.is_unique:
+ new_indexer = index.get_indexer([new_index[-1]])
+ if (new_indexer != -1).any():
+ return self._setitem_with_indexer(new_indexer,
+ value)
+
+ # this preserves dtype of the value
+ new_values = Series([value])._values
+ if len(self.obj._values):
+ try:
+ new_values = np.concatenate([self.obj._values,
+ new_values])
+ except TypeError:
+ as_obj = self.obj.astype(object)
+ new_values = np.concatenate([as_obj,
+ new_values])
+ self.obj._data = self.obj._constructor(
+ new_values, index=new_index, name=self.obj.name)._data
+ self.obj._maybe_update_cacher(clear=True)
+ return self.obj
+
+ elif self.ndim == 2:
+
+ # no columns and scalar
+ if not len(self.obj.columns):
+ raise ValueError("cannot set a frame with no defined "
+ "columns")
+
+ # append a Series
+ if isinstance(value, Series):
+
+ value = value.reindex(index=self.obj.columns,
+ copy=True)
+ value.name = indexer
+
+ # a list-list
+ else:
+
+ # must have conforming columns
+ if is_list_like_indexer(value):
+ if len(value) != len(self.obj.columns):
+ raise ValueError("cannot set a row with "
+ "mismatched columns")
+
+ value = Series(value, index=self.obj.columns,
+ name=indexer)
+
+ self.obj._data = self.obj.append(value)._data
+ self.obj._maybe_update_cacher(clear=True)
+ return self.obj
+
+ # set using setitem (Panel and > dims)
+ elif self.ndim >= 3:
+ return self.obj.__setitem__(indexer, value)
+
+ # set
+ item_labels = self.obj._get_axis(info_axis)
+
+ # align and set the values
+ if take_split_path:
+
+ if not isinstance(indexer, tuple):
+ indexer = self._tuplify(indexer)
+
+ if isinstance(value, ABCSeries):
+ value = self._align_series(indexer, value)
+
+ info_idx = indexer[info_axis]
+ if is_integer(info_idx):
+ info_idx = [info_idx]
+ labels = item_labels[info_idx]
+
+ # if we have a partial multiindex, then need to adjust the plane
+ # indexer here
+ if (len(labels) == 1 and
+ isinstance(self.obj[labels[0]].axes[0], MultiIndex)):
+ item = labels[0]
+ obj = self.obj[item]
+ index = obj.index
+ idx = indexer[:info_axis][0]
+
+ plane_indexer = tuple([idx]) + indexer[info_axis + 1:]
+ lplane_indexer = length_of_indexer(plane_indexer[0], index)
+
+ # require that we are setting the right number of values that
+ # we are indexing
+ if is_list_like_indexer(value) and np.iterable(
+ value) and lplane_indexer != len(value):
+
+ if len(obj[idx]) != len(value):
+ raise ValueError("cannot set using a multi-index "
+ "selection indexer with a different "
+ "length than the value")
+
+ # make sure we have an ndarray
+ value = getattr(value, 'values', value).ravel()
+
+ # we can directly set the series here
+ # as we select a slice indexer on the mi
+ idx = index._convert_slice_indexer(idx)
+ obj._consolidate_inplace()
+ obj = obj.copy()
+ obj._data = obj._data.setitem(indexer=tuple([idx]),
+ value=value)
+ self.obj[item] = obj
+ return
+
+ # non-mi
+ else:
+ plane_indexer = indexer[:info_axis] + indexer[info_axis + 1:]
+ if info_axis > 0:
+ plane_axis = self.obj.axes[:info_axis][0]
+ lplane_indexer = length_of_indexer(plane_indexer[0],
+ plane_axis)
+ else:
+ lplane_indexer = 0
+
+ def setter(item, v):
+ s = self.obj[item]
+ pi = plane_indexer[0] if lplane_indexer == 1 else plane_indexer
+
+ # perform the equivalent of a setitem on the info axis
+ # as we have a null slice or a slice with full bounds
+ # which means essentially reassign to the columns of a
+ # multi-dim object
+ # GH6149 (null slice), GH10408 (full bounds)
+ if (isinstance(pi, tuple) and
+ all(com.is_null_slice(idx) or
+ com.is_full_slice(idx, len(self.obj))
+ for idx in pi)):
+ s = v
+ else:
+ # set the item, possibly having a dtype change
+ s._consolidate_inplace()
+ s = s.copy()
+ s._data = s._data.setitem(indexer=pi, value=v)
+ s._maybe_update_cacher(clear=True)
+
+ # reset the sliced object if unique
+ self.obj[item] = s
+
+ def can_do_equal_len():
+ """ return True if we have an equal len settable """
+ if (not len(labels) == 1 or not np.iterable(value) or
+ is_scalar(plane_indexer[0])):
+ return False
+
+ item = labels[0]
+ index = self.obj[item].index
+
+ values_len = len(value)
+ # equal len list/ndarray
+ if len(index) == values_len:
+ return True
+ elif lplane_indexer == values_len:
+ return True
+
+ return False
+
+ # we need an iterable with an ndim of at least 1,
+ # e.g. don't pass through np.array(0)
+ if is_list_like_indexer(value) and getattr(value, 'ndim', 1) > 0:
+
+ # we have an equal len Frame
+ if isinstance(value, ABCDataFrame) and value.ndim > 1:
+ sub_indexer = list(indexer)
+ multiindex_indexer = isinstance(labels, MultiIndex)
+
+ for item in labels:
+ if item in value:
+ sub_indexer[info_axis] = item
+ v = self._align_series(
+ tuple(sub_indexer), value[item],
+ multiindex_indexer)
+ else:
+ v = np.nan
+
+ setter(item, v)
+
+ # we have an equal len ndarray/convertible to our labels
+ # hasattr first, to avoid coercing to ndarray without reason.
+ # But we may be relying on the ndarray coercion to check ndim.
+ # Why not just convert to an ndarray earlier on if needed?
+ elif ((hasattr(value, 'ndim') and value.ndim == 2)
+ or (not hasattr(value, 'ndim') and
+ np.array(value).ndim) == 2):
+
+ # note that this coerces the dtype if we are mixed
+ # GH 7551
+ value = np.array(value, dtype=object)
+ if len(labels) != value.shape[1]:
+ raise ValueError('Must have equal len keys and value '
+ 'when setting with an ndarray')
+
+ for i, item in enumerate(labels):
+
+ # setting with a list, recoerces
+ setter(item, value[:, i].tolist())
+
+ # we have an equal len list/ndarray
+ elif can_do_equal_len():
+ setter(labels[0], value)
+
+ # per label values
+ else:
+
+ if len(labels) != len(value):
+ raise ValueError('Must have equal len keys and value '
+ 'when setting with an iterable')
+
+ for item, v in zip(labels, value):
+ setter(item, v)
+ else:
+
+ # scalar
+ for item in labels:
+ setter(item, value)
+
+ else:
+ if isinstance(indexer, tuple):
+ indexer = maybe_convert_ix(*indexer)
+
+ # if we are setting on the info axis ONLY
+ # set using those methods to avoid block-splitting
+ # logic here
+ if (len(indexer) > info_axis and
+ is_integer(indexer[info_axis]) and
+ all(com.is_null_slice(idx)
+ for i, idx in enumerate(indexer)
+ if i != info_axis) and
+ item_labels.is_unique):
+ self.obj[item_labels[indexer[info_axis]]] = value
+ return
+
+ if isinstance(value, (ABCSeries, dict)):
+ # TODO(EA): ExtensionBlock.setitem this causes issues with
+ # setting for extensionarrays that store dicts. Need to decide
+ # if it's worth supporting that.
+ value = self._align_series(indexer, Series(value))
+
+ elif isinstance(value, ABCDataFrame):
+ value = self._align_frame(indexer, value)
+
+ if isinstance(value, ABCPanel):
+ value = self._align_panel(indexer, value)
+
+ # check for chained assignment
+ self.obj._check_is_chained_assignment_possible()
+
+ # actually do the set
+ self.obj._consolidate_inplace()
+ self.obj._data = self.obj._data.setitem(indexer=indexer,
+ value=value)
+ self.obj._maybe_update_cacher(clear=True)
+
+ def _align_series(self, indexer, ser, multiindex_indexer=False):
+ """
+ Parameters
+ ----------
+ indexer : tuple, slice, scalar
+ The indexer used to get the locations that will be set to
+ `ser`
+
+ ser : pd.Series
+ The values to assign to the locations specified by `indexer`
+
+ multiindex_indexer : boolean, optional
+ Defaults to False. Should be set to True if `indexer` was from
+ a `pd.MultiIndex`, to avoid unnecessary broadcasting.
+
+
+ Returns
+ -------
+ `np.array` of `ser` broadcast to the appropriate shape for assignment
+ to the locations selected by `indexer`
+
+ """
+ if isinstance(indexer, (slice, np.ndarray, list, Index)):
+ indexer = tuple([indexer])
+
+ if isinstance(indexer, tuple):
+
+ # flatten np.ndarray indexers
+ def ravel(i):
+ return i.ravel() if isinstance(i, np.ndarray) else i
+ indexer = tuple(map(ravel, indexer))
+
+ aligners = [not com.is_null_slice(idx) for idx in indexer]
+ sum_aligners = sum(aligners)
+ single_aligner = sum_aligners == 1
+ is_frame = self.obj.ndim == 2
+ is_panel = self.obj.ndim >= 3
+ obj = self.obj
+
+ # are we a single alignable value on a non-primary
+ # dim (e.g. panel: 1,2, or frame: 0)?
+ # hence we need to align to a single axis dimension
+ # rather than find all valid dims
+
+ # frame
+ if is_frame:
+ single_aligner = single_aligner and aligners[0]
+
+ # panel
+ elif is_panel:
+ single_aligner = (single_aligner and
+ (aligners[1] or aligners[2]))
+
+ # we have a frame, with multiple indexers on both axes; and a
+ # series, so need to broadcast (see GH5206)
+ if (sum_aligners == self.ndim and
+ all(is_sequence(_) for _ in indexer)):
+ ser = ser.reindex(obj.axes[0][indexer[0]], copy=True)._values
+
+ # single indexer
+ if len(indexer) > 1 and not multiindex_indexer:
+ len_indexer = len(indexer[1])
+ ser = np.tile(ser, len_indexer).reshape(len_indexer, -1).T
+
+ return ser
+
+ for i, idx in enumerate(indexer):
+ ax = obj.axes[i]
+
+ # multiple aligners (or null slices)
+ if is_sequence(idx) or isinstance(idx, slice):
+ if single_aligner and com.is_null_slice(idx):
+ continue
+ new_ix = ax[idx]
+ if not is_list_like_indexer(new_ix):
+ new_ix = Index([new_ix])
+ else:
+ new_ix = Index(new_ix)
+ if ser.index.equals(new_ix) or not len(new_ix):
+ return ser._values.copy()
+
+ return ser.reindex(new_ix)._values
+
+ # 2 dims
+ elif single_aligner and is_frame:
+
+ # reindex along index
+ ax = self.obj.axes[1]
+ if ser.index.equals(ax) or not len(ax):
+ return ser._values.copy()
+ return ser.reindex(ax)._values
+
+ # >2 dims
+ elif single_aligner:
+
+ broadcast = []
+ for n, labels in enumerate(self.obj._get_plane_axes(i)):
+
+ # reindex along the matching dimensions
+ if len(labels & ser.index):
+ ser = ser.reindex(labels)
+ else:
+ broadcast.append((n, len(labels)))
+
+ # broadcast along other dims
+ ser = ser._values.copy()
+ for (axis, l) in broadcast:
+ shape = [-1] * (len(broadcast) + 1)
+ shape[axis] = l
+ ser = np.tile(ser, l).reshape(shape)
+
+ if self.obj.ndim == 3:
+ ser = ser.T
+
+ return ser
+
+ elif is_scalar(indexer):
+ ax = self.obj._get_axis(1)
+
+ if ser.index.equals(ax):
+ return ser._values.copy()
+
+ return ser.reindex(ax)._values
+
+ raise ValueError('Incompatible indexer with Series')
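+ # Illustrative note on the frame case above: when both axes are indexed
+ # with list-likes and a Series is assigned, the Series is first reindexed
+ # against the selected row labels and then tiled across the selected
+ # columns, so a 1-D Series broadcasts into the 2-D target region.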
+
+ def _align_frame(self, indexer, df):
+ is_frame = self.obj.ndim == 2
+ is_panel = self.obj.ndim >= 3
+
+ if isinstance(indexer, tuple):
+
+ idx, cols = None, None
+ sindexers = []
+ for i, ix in enumerate(indexer):
+ ax = self.obj.axes[i]
+ if is_sequence(ix) or isinstance(ix, slice):
+ if isinstance(ix, np.ndarray):
+ ix = ix.ravel()
+ if idx is None:
+ idx = ax[ix]
+ elif cols is None:
+ cols = ax[ix]
+ else:
+ break
+ else:
+ sindexers.append(i)
+
+ # panel
+ if is_panel:
+
+ # need to conform to the convention
+ # as we are not selecting on the items axis
+ # and we have a single indexer
+ # GH 7763
+ if len(sindexers) == 1 and sindexers[0] != 0:
+ df = df.T
+
+ if idx is None:
+ idx = df.index
+ if cols is None:
+ cols = df.columns
+
+ if idx is not None and cols is not None:
+
+ if df.index.equals(idx) and df.columns.equals(cols):
+ val = df.copy()._values
+ else:
+ val = df.reindex(idx, columns=cols)._values
+ return val
+
+ elif ((isinstance(indexer, slice) or is_list_like_indexer(indexer)) and
+ is_frame):
+ ax = self.obj.index[indexer]
+ if df.index.equals(ax):
+ val = df.copy()._values
+ else:
+
+ # we have a multi-index and are trying to align
+ # with a particular level, GH3738
+ if (isinstance(ax, MultiIndex) and
+ isinstance(df.index, MultiIndex) and
+ ax.nlevels != df.index.nlevels):
+ raise TypeError("cannot align on a multi-index with out "
+ "specifying the join levels")
+
+ val = df.reindex(index=ax)._values
+ return val
+
+ elif is_scalar(indexer) and is_panel:
+ idx = self.obj.axes[1]
+ cols = self.obj.axes[2]
+
+ # by definition we are indexing on the 0th axis
+ # a passed in dataframe which is actually a transpose
+ # of what is needed
+ if idx.equals(df.index) and cols.equals(df.columns):
+ return df.copy()._values
+
+ return df.reindex(idx, columns=cols)._values
+
+ raise ValueError('Incompatible indexer with DataFrame')
+
+ def _align_panel(self, indexer, df):
+ raise NotImplementedError("cannot set using an indexer with a Panel "
+ "yet!")
+
+ def _getitem_tuple(self, tup):
+ try:
+ return self._getitem_lowerdim(tup)
+ except IndexingError:
+ pass
+
+ # no multi-index, so validate all of the indexers
+ self._has_valid_tuple(tup)
+
+ # ugly hack for GH #836
+ if self._multi_take_opportunity(tup):
+ return self._multi_take(tup)
+
+ # no shortcut needed
+ retval = self.obj
+ for i, key in enumerate(tup):
+ if i >= self.obj.ndim:
+ raise IndexingError('Too many indexers')
+
+ if com.is_null_slice(key):
+ continue
+
+ retval = getattr(retval, self.name)._getitem_axis(key, axis=i)
+
+ return retval
+
+ def _multi_take_opportunity(self, tup):
+ """
+ Check whether we can use ``_multi_take``.
+ Currently the limitation is that all axes being indexed must be
+ indexed with list-likes.
+
+ Parameters
+ ----------
+ tup : tuple
+ Tuple of indexers, one per axis
+
+ Returns
+ -------
+ boolean: Whether the current indexing can be passed through _multi_take
+ """
+ if not all(is_list_like_indexer(x) for x in tup):
+ return False
+
+ # just too complicated
+ if any(com.is_bool_indexer(x) for x in tup):
+ return False
+
+ return True
+
+ def _multi_take(self, tup):
+ """
+ Create the indexers for the passed tuple of keys, and execute the take
+ operation. This allows the take operation to be executed all at once -
+ rather than once for each dimension - improving efficiency.
+
+ Parameters
+ ----------
+ tup : tuple
+ Tuple of indexers, one per axis
+
+ Returns
+ -------
+ values: same type as the object being indexed
+ """
+ # GH 836
+ o = self.obj
+ d = {axis: self._get_listlike_indexer(key, axis)
+ for (key, axis) in zip(tup, o._AXIS_ORDERS)}
+ return o._reindex_with_indexers(d, copy=True, allow_dups=True)
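+ # Illustrative path (hypothetical frame): df.loc[['a', 'b'], ['x', 'y']]
+ # reaches this method, building one (labels, indexer) pair per axis and
+ # reindexing both axes in a single _reindex_with_indexers call instead
+ # of two sequential takes.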
+
+ def _convert_for_reindex(self, key, axis=None):
+ return key
+
+ def _handle_lowerdim_multi_index_axis0(self, tup):
+ # we have an axis0 multi-index, handle or raise
+
+ try:
+ # fast path for series or for tup devoid of slices
+ return self._get_label(tup, axis=self.axis)
+ except TypeError:
+ # slices are unhashable
+ pass
+ except Exception as e1:
+ if isinstance(tup[0], (slice, Index)):
+ raise IndexingError("Handle elsewhere")
+
+ # raise the error if we are not sorted
+ ax0 = self.obj._get_axis(0)
+ if not ax0.is_lexsorted_for_tuple(tup):
+ raise e1
+
+ return None
+
+ def _getitem_lowerdim(self, tup):
+
+ # we can directly get the axis result since the axis is specified
+ if self.axis is not None:
+ axis = self.obj._get_axis_number(self.axis)
+ return self._getitem_axis(tup, axis=axis)
+
+ # we may have a nested tuples indexer here
+ if self._is_nested_tuple_indexer(tup):
+ return self._getitem_nested_tuple(tup)
+
+ # we may be using a tuple to represent multiple dimensions here
+ ax0 = self.obj._get_axis(0)
+ # ...but iloc should handle the tuple as simple integer-location
+ # instead of checking it as multiindex representation (GH 13797)
+ if isinstance(ax0, MultiIndex) and self.name != 'iloc':
+ result = self._handle_lowerdim_multi_index_axis0(tup)
+ if result is not None:
+ return result
+
+ if len(tup) > self.obj.ndim:
+ raise IndexingError("Too many indexers. handle elsewhere")
+
+ # to avoid wasted computation
+ # df.ix[d1:d2, 0] -> columns first (True)
+ # df.ix[0, ['C', 'B', 'A']] -> rows first (False)
+ for i, key in enumerate(tup):
+ if is_label_like(key) or isinstance(key, tuple):
+ section = self._getitem_axis(key, axis=i)
+
+ # we have yielded a scalar ?
+ if not is_list_like_indexer(section):
+ return section
+
+ elif section.ndim == self.ndim:
+ # we're in the middle of slicing through a MultiIndex
+ # revise the key wrt to `section` by inserting an _NS
+ new_key = tup[:i] + (_NS,) + tup[i + 1:]
+
+ else:
+ new_key = tup[:i] + tup[i + 1:]
+
+ # unfortunately need an odious kludge here because of
+ # DataFrame transposing convention
+ if (isinstance(section, ABCDataFrame) and i > 0 and
+ len(new_key) == 2):
+ a, b = new_key
+ new_key = b, a
+
+ if len(new_key) == 1:
+ new_key, = new_key
+
+ # Slices should return views, but calling iloc/loc with a null
+ # slice returns a new object.
+ if com.is_null_slice(new_key):
+ return section
+ # This is an elided recursive call to iloc/loc/etc'
+ return getattr(section, self.name)[new_key]
+
+ raise IndexingError('not applicable')
+
+ def _getitem_nested_tuple(self, tup):
+ # we have a nested tuple so have at least 1 multi-index level
+ # we should be able to match up the dimensionality here
+
+ # we have too many indexers for our dim, but have at least 1
+ # multi-index dimension, try to see if we have something like
+ # a tuple passed to a series with a multi-index
+ if len(tup) > self.ndim:
+ result = self._handle_lowerdim_multi_index_axis0(tup)
+ if result is not None:
+ return result
+
+ # this is a series with a multi-index specified a tuple of
+ # selectors
+ return self._getitem_axis(tup, axis=self.axis)
+
+ # handle the multi-axis by taking sections and reducing
+ # this is iterative
+ obj = self.obj
+ axis = 0
+ for i, key in enumerate(tup):
+
+ if com.is_null_slice(key):
+ axis += 1
+ continue
+
+ current_ndim = obj.ndim
+ obj = getattr(obj, self.name)._getitem_axis(key, axis=axis)
+ axis += 1
+
+ # if we have a scalar, we are done
+ if is_scalar(obj) or not hasattr(obj, 'ndim'):
+ break
+
+ # has the dim of the obj changed?
+ # GH 7199
+ if obj.ndim < current_ndim:
+
+ # GH 7516
+ # if had a 3 dim and are going to a 2d
+ # axes are reversed on a DataFrame
+ if i >= 1 and current_ndim == 3 and obj.ndim == 2:
+ obj = obj.T
+
+ axis -= 1
+
+ return obj
+
+ def _getitem_axis(self, key, axis=None):
+
+ if axis is None:
+ axis = self.axis or 0
+
+ if is_iterator(key):
+ key = list(key)
+ self._validate_key(key, axis)
+
+ labels = self.obj._get_axis(axis)
+ if isinstance(key, slice):
+ return self._get_slice_axis(key, axis=axis)
+ elif (is_list_like_indexer(key) and
+ not (isinstance(key, tuple) and
+ isinstance(labels, MultiIndex))):
+
+ if hasattr(key, 'ndim') and key.ndim > 1:
+ raise ValueError('Cannot index with multidimensional key')
+
+ return self._getitem_iterable(key, axis=axis)
+ else:
+
+ # maybe coerce a float scalar to integer
+ key = labels._maybe_cast_indexer(key)
+
+ if is_integer(key):
+ if axis == 0 and isinstance(labels, MultiIndex):
+ try:
+ return self._get_label(key, axis=axis)
+ except (KeyError, TypeError):
+ if self.obj.index.levels[0].is_integer():
+ raise
+
+ # this is the fallback! (for a non-float, non-integer index)
+ if not labels.is_floating() and not labels.is_integer():
+ return self._get_loc(key, axis=axis)
+
+ return self._get_label(key, axis=axis)
+
+ def _get_listlike_indexer(self, key, axis, raise_missing=False):
+ """
+ Transform a list-like of keys into a new index and an indexer.
+
+ Parameters
+ ----------
+ key : list-like
+ Target labels
+ axis: int
+ Dimension on which the indexing is being made
+ raise_missing: bool
+ Whether to raise a KeyError if some labels are not found. Will be
+ removed in the future, and then this method will always behave as
+ if raise_missing=True.
+
+ Raises
+ ------
+ KeyError
+ If at least one key was requested but none was found, and
+ raise_missing=True.
+
+ Returns
+ -------
+ keyarr: Index
+ New index (coinciding with 'key' if the axis is unique)
+ values : array-like
+ An indexer for the return object; -1 denotes keys not found
+ """
+ o = self.obj
+ ax = o._get_axis(axis)
+
+ # Have the index compute an indexer or return None
+ # if it cannot handle:
+ indexer, keyarr = ax._convert_listlike_indexer(key,
+ kind=self.name)
+ # We only act on all found values:
+ if indexer is not None and (indexer != -1).all():
+ self._validate_read_indexer(key, indexer, axis,
+ raise_missing=raise_missing)
+ return ax[indexer], indexer
+
+ if ax.is_unique:
+ # If we are trying to get actual keys from empty Series, we
+ # patiently wait for a KeyError later on - otherwise, convert
+ if len(ax) or not len(key):
+ key = self._convert_for_reindex(key, axis)
+ indexer = ax.get_indexer_for(key)
+ keyarr = ax.reindex(keyarr)[0]
+ else:
+ keyarr, indexer, new_indexer = ax._reindex_non_unique(keyarr)
+
+ self._validate_read_indexer(keyarr, indexer,
+ o._get_axis_number(axis),
+ raise_missing=raise_missing)
+ return keyarr, indexer
+
+ def _getitem_iterable(self, key, axis=None):
+ """
+ Index the current object with an iterable key (which can be a boolean
+ indexer, or a collection of keys).
+
+ Parameters
+ ----------
+ key : iterable
+ Target labels, or boolean indexer
+ axis: int, default None
+ Dimension on which the indexing is being made
+
+ Raises
+ ------
+ KeyError
+ If no key was found. Will change in the future to raise if not all
+ keys were found.
+ IndexingError
+ If the boolean indexer is unalignable with the object being
+ indexed.
+
+ Returns
+ -------
+ scalar, DataFrame, or Series : indexed value(s)
+ """
+
+ if axis is None:
+ axis = self.axis or 0
+
+ self._validate_key(key, axis)
+
+ labels = self.obj._get_axis(axis)
+
+ if com.is_bool_indexer(key):
+ # A boolean indexer
+ key = check_bool_indexer(labels, key)
+ inds, = key.nonzero()
+ return self.obj._take(inds, axis=axis)
+ else:
+ # A collection of keys
+ keyarr, indexer = self._get_listlike_indexer(key, axis,
+ raise_missing=False)
+ return self.obj._reindex_with_indexers({axis: [keyarr, indexer]},
+ copy=True, allow_dups=True)
+
+ def _validate_read_indexer(self, key, indexer, axis, raise_missing=False):
+ """
+ Check that indexer can be used to return a result (e.g. at least one
+ element was found, unless the list of keys was actually empty).
+
+ Parameters
+ ----------
+ key : list-like
+ Target labels (only used to show correct error message)
+ indexer: array-like of booleans
+ Indices corresponding to the key (with -1 indicating not found)
+ axis: int
+ Dimension on which the indexing is being made
+ raise_missing: bool
+ Whether to raise a KeyError if some labels are not found. Will be
+ removed in the future, and then this method will always behave as
+ if raise_missing=True.
+
+ Raises
+ ------
+ KeyError
+ If at least one key was requested but none was found, and
+ raise_missing=True.
+ """
+
+ ax = self.obj._get_axis(axis)
+
+ if len(key) == 0:
+ return
+
+ # Count missing values:
+ missing = (indexer < 0).sum()
+
+ if missing:
+ if missing == len(indexer):
+ raise KeyError(
+ u"None of [{key}] are in the [{axis}]".format(
+ key=key, axis=self.obj._get_axis_name(axis)))
+
+ # We (temporarily) allow for some missing keys with .loc, except in
+ # some cases (e.g. setting) in which "raise_missing" will be False
+ if not(self.name == 'loc' and not raise_missing):
+ not_found = list(set(key) - set(ax))
+ raise KeyError("{} not in index".format(not_found))
+
+ # we skip the warning on Categorical/Interval
+ # as this check is actually done (check for
+ # non-missing values), but a bit later in the
+ # code, so we want to avoid warning & then
+ # just raising
+
+ _missing_key_warning = textwrap.dedent("""
+ Passing list-likes to .loc or [] with any missing label will raise
+ KeyError in the future, you can use .reindex() as an alternative.
+
+ See the documentation here:
+ https://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike""") # noqa
+
+ if not (ax.is_categorical() or ax.is_interval()):
+ warnings.warn(_missing_key_warning,
+ FutureWarning, stacklevel=6)
+
+ def _convert_to_indexer(self, obj, axis=None, is_setter=False,
+ raise_missing=False):
+ """
+ Convert indexing key into something we can use to do actual fancy
+ indexing on an ndarray
+
+ Examples
+ ix[:5] -> slice(0, 5)
+ ix[[1,2,3]] -> [1,2,3]
+ ix[['foo', 'bar', 'baz']] -> [i, j, k] (indices of foo, bar, baz)
+
+ Going by the Zen of Python ('In the face of ambiguity, refuse the
+ temptation to guess'), should we raise AmbiguousIndexError with
+ integer labels? No - prefer label-based indexing.
+ """
+ if axis is None:
+ axis = self.axis or 0
+
+ labels = self.obj._get_axis(axis)
+
+ if isinstance(obj, slice):
+ return self._convert_slice_indexer(obj, axis)
+
+ # try to find out correct indexer, if not type correct raise
+ try:
+ obj = self._convert_scalar_indexer(obj, axis)
+ except TypeError:
+
+ # but we will allow setting
+ if is_setter:
+ pass
+
+ # see if we are positional in nature
+ is_int_index = labels.is_integer()
+ is_int_positional = is_integer(obj) and not is_int_index
+
+ # if we are a label return me
+ try:
+ return labels.get_loc(obj)
+ except LookupError:
+ if isinstance(obj, tuple) and isinstance(labels, MultiIndex):
+ if is_setter and len(obj) == labels.nlevels:
+ return {'key': obj}
+ raise
+ except TypeError:
+ pass
+ except (ValueError):
+ if not is_int_positional:
+ raise
+
+ # a positional
+ if is_int_positional:
+
+ # if we are setting and its not a valid location
+ # its an insert which fails by definition
+ if is_setter:
+
+ # always valid
+ if self.name == 'loc':
+ return {'key': obj}
+
+ # a positional
+ if (obj >= self.obj.shape[axis] and
+ not isinstance(labels, MultiIndex)):
+ raise ValueError("cannot set by positional indexing with "
+ "enlargement")
+
+ return obj
+
+ if is_nested_tuple(obj, labels):
+ return labels.get_locs(obj)
+
+ elif is_list_like_indexer(obj):
+
+ if com.is_bool_indexer(obj):
+ obj = check_bool_indexer(labels, obj)
+ inds, = obj.nonzero()
+ return inds
+ else:
+ # When setting, missing keys are not allowed, even with .loc:
+ kwargs = {'raise_missing': True if is_setter else
+ raise_missing}
+ return self._get_listlike_indexer(obj, axis, **kwargs)[1]
+ else:
+ try:
+ return labels.get_loc(obj)
+ except LookupError:
+ # allow a not found key only if we are a setter
+ if not is_list_like_indexer(obj) and is_setter:
+ return {'key': obj}
+ raise
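+ # Illustrative resolutions of the logic above (hypothetical object index
+ # ['a', 'b', 'c']): .loc['b'] resolves via labels.get_loc('b') -> 1, while
+ # setting .loc[5] = 0 with a missing label returns {'key': 5} so that
+ # _setitem_with_indexer can enlarge the axis.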
+
+ def _tuplify(self, loc):
+ tup = [slice(None, None) for _ in range(self.ndim)]
+ tup[0] = loc
+ return tuple(tup)
+
+ def _get_slice_axis(self, slice_obj, axis=None):
+ obj = self.obj
+
+ if axis is None:
+ axis = self.axis or 0
+
+ if not need_slice(slice_obj):
+ return obj.copy(deep=False)
+ indexer = self._convert_slice_indexer(slice_obj, axis)
+
+ if isinstance(indexer, slice):
+ return self._slice(indexer, axis=axis, kind='iloc')
+ else:
+ return self.obj._take(indexer, axis=axis)
+
+
+class _IXIndexer(_NDFrameIndexer):
+ """A primarily label-location based indexer, with integer position
+ fallback.
+
+ Warning: Starting in 0.20.0, the .ix indexer is deprecated, in
+ favor of the more strict .iloc and .loc indexers.
+
+ ``.ix[]`` supports mixed integer and label based access. It is
+ primarily label based, but will fall back to integer positional
+ access unless the corresponding axis is of integer type.
+
+ ``.ix`` is the most general indexer and will support any of the
+ inputs in ``.loc`` and ``.iloc``. ``.ix`` also supports floating
+ point label schemes. ``.ix`` is exceptionally useful when dealing
+ with mixed positional and label based hierarchical indexes.
+
+ However, when an axis is integer based, ONLY label based access
+ and not positional access is supported. Thus, in such cases, it's
+ usually better to be explicit and use ``.iloc`` or ``.loc``.
+
+ See more at :ref:`Advanced Indexing <advanced>`.
+ """
+
+ _ix_deprecation_warning = textwrap.dedent("""
+ .ix is deprecated. Please use
+ .loc for label based indexing or
+ .iloc for positional indexing
+
+ See the documentation here:
+ http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated""") # noqa
+
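+ # Migration sketch for the deprecation above (hypothetical DataFrame
+ # ``df`` with a non-integer index; not part of the upstream source):
+ #
+ # df.ix[0, 'col']                         # deprecated, mixed semantics
+ # df.loc[df.index[0], 'col']              # explicit label-based lookup
+ # df.iloc[0, df.columns.get_loc('col')]   # explicit position-based lookup
+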
+ def __init__(self, name, obj):
+ warnings.warn(self._ix_deprecation_warning,
+ DeprecationWarning, stacklevel=2)
+ super(_IXIndexer, self).__init__(name, obj)
+
+ @Appender(_NDFrameIndexer._validate_key.__doc__)
+ def _validate_key(self, key, axis):
+ if isinstance(key, slice):
+ return True
+
+ elif com.is_bool_indexer(key):
+ return True
+
+ elif is_list_like_indexer(key):
+ return True
+
+ else:
+
+ self._convert_scalar_indexer(key, axis)
+
+ return True
+
+ def _convert_for_reindex(self, key, axis=None):
+ """
+ Transform a list of keys into a new array ready to be used as axis of
+ the object we return (e.g. including NaNs).
+
+ Parameters
+ ----------
+ key : list-like
+ Target labels
+ axis: int
+ Where the indexing is being made
+
+ Returns
+ -------
+ list-like of labels
+ """
+
+ if axis is None:
+ axis = self.axis or 0
+ labels = self.obj._get_axis(axis)
+
+ if com.is_bool_indexer(key):
+ key = check_bool_indexer(labels, key)
+ return labels[key]
+
+ if isinstance(key, Index):
+ keyarr = labels._convert_index_indexer(key)
+ else:
+ # asarray can be unsafe, NumPy strings are weird
+ keyarr = com.asarray_tuplesafe(key)
+
+ if is_integer_dtype(keyarr):
+ # Cast the indexer to uint64 if possible so
+ # that the values returned from indexing are
+ # also uint64.
+ keyarr = labels._convert_arr_indexer(keyarr)
+
+ if not labels.is_integer():
+ keyarr = ensure_platform_int(keyarr)
+ return labels.take(keyarr)
+
+ return keyarr
+
+
+class _LocationIndexer(_NDFrameIndexer):
+ _exception = Exception
+
+ def __getitem__(self, key):
+ if type(key) is tuple:
+ key = tuple(com.apply_if_callable(x, self.obj)
+ for x in key)
+ try:
+ if self._is_scalar_access(key):
+ return self._getitem_scalar(key)
+ except (KeyError, IndexError, AttributeError):
+ pass
+ return self._getitem_tuple(key)
+ else:
+ # we by definition only have the 0th axis
+ axis = self.axis or 0
+
+ maybe_callable = com.apply_if_callable(key, self.obj)
+ return self._getitem_axis(maybe_callable, axis=axis)
+
+ def _is_scalar_access(self, key):
+ raise NotImplementedError()
+
+ def _getitem_scalar(self, key):
+ raise NotImplementedError()
+
+ def _getitem_axis(self, key, axis=None):
+ raise NotImplementedError()
+
+ def _getbool_axis(self, key, axis=None):
+ if axis is None:
+ axis = self.axis or 0
+ labels = self.obj._get_axis(axis)
+ key = check_bool_indexer(labels, key)
+ inds, = key.nonzero()
+ try:
+ return self.obj._take(inds, axis=axis)
+ except Exception as detail:
+ raise self._exception(detail)
+
+ def _get_slice_axis(self, slice_obj, axis=None):
+ """ this is pretty simple as we just have to deal with labels """
+ if axis is None:
+ axis = self.axis or 0
+
+ obj = self.obj
+ if not need_slice(slice_obj):
+ return obj.copy(deep=False)
+
+ labels = obj._get_axis(axis)
+ indexer = labels.slice_indexer(slice_obj.start, slice_obj.stop,
+ slice_obj.step, kind=self.name)
+
+ if isinstance(indexer, slice):
+ return self._slice(indexer, axis=axis, kind='iloc')
+ else:
+ return self.obj._take(indexer, axis=axis)
+
+
+class _LocIndexer(_LocationIndexer):
+ """
+ Access a group of rows and columns by label(s) or a boolean array.
+
+ ``.loc[]`` is primarily label based, but may also be used with a
+ boolean array.
+
+ Allowed inputs are:
+
+ - A single label, e.g. ``5`` or ``'a'``, (note that ``5`` is
+ interpreted as a *label* of the index, and **never** as an
+ integer position along the index).
+ - A list or array of labels, e.g. ``['a', 'b', 'c']``.
+ - A slice object with labels, e.g. ``'a':'f'``.
+
+ .. warning:: Note that contrary to usual python slices, **both** the
+ start and the stop are included
+
+ - A boolean array of the same length as the axis being sliced,
+ e.g. ``[True, False, True]``.
+ - A ``callable`` function with one argument (the calling Series, DataFrame
+ or Panel) and that returns valid output for indexing (one of the above)
+
+ See more at :ref:`Selection by Label <indexing.label>`
+
+ Raises
+ ------
+ KeyError:
+ when any items are not found
+
+ See Also
+ --------
+ DataFrame.at : Access a single value for a row/column label pair.
+ DataFrame.iloc : Access group of rows and columns by integer position(s).
+ DataFrame.xs : Returns a cross-section (row(s) or column(s)) from the
+ Series/DataFrame.
+ Series.loc : Access group of values using labels.
+
+ Examples
+ --------
+ **Getting values**
+
+ >>> df = pd.DataFrame([[1, 2], [4, 5], [7, 8]],
+ ... index=['cobra', 'viper', 'sidewinder'],
+ ... columns=['max_speed', 'shield'])
+ >>> df
+ max_speed shield
+ cobra 1 2
+ viper 4 5
+ sidewinder 7 8
+
+ Single label. Note this returns the row as a Series.
+
+ >>> df.loc['viper']
+ max_speed 4
+ shield 5
+ Name: viper, dtype: int64
+
+ List of labels. Note using ``[[]]`` returns a DataFrame.
+
+ >>> df.loc[['viper', 'sidewinder']]
+ max_speed shield
+ viper 4 5
+ sidewinder 7 8
+
+ Single label for row and column
+
+ >>> df.loc['cobra', 'shield']
+ 2
+
+ Slice with labels for row and single label for column. As mentioned
+ above, note that both the start and stop of the slice are included.
+
+ >>> df.loc['cobra':'viper', 'max_speed']
+ cobra 1
+ viper 4
+ Name: max_speed, dtype: int64
+
+ Boolean list with the same length as the row axis
+
+ >>> df.loc[[False, False, True]]
+ max_speed shield
+ sidewinder 7 8
+
+ Conditional that returns a boolean Series
+
+ >>> df.loc[df['shield'] > 6]
+ max_speed shield
+ sidewinder 7 8
+
+ Conditional that returns a boolean Series with column labels specified
+
+ >>> df.loc[df['shield'] > 6, ['max_speed']]
+ max_speed
+ sidewinder 7
+
+ Callable that returns a boolean Series
+
+ >>> df.loc[lambda df: df['shield'] == 8]
+ max_speed shield
+ sidewinder 7 8
+
+ **Setting values**
+
+ Set value for all items matching the list of labels
+
+ >>> df.loc[['viper', 'sidewinder'], ['shield']] = 50
+ >>> df
+ max_speed shield
+ cobra 1 2
+ viper 4 50
+ sidewinder 7 50
+
+ Set value for an entire row
+
+ >>> df.loc['cobra'] = 10
+ >>> df
+ max_speed shield
+ cobra 10 10
+ viper 4 50
+ sidewinder 7 50
+
+ Set value for an entire column
+
+ >>> df.loc[:, 'max_speed'] = 30
+ >>> df
+ max_speed shield
+ cobra 30 10
+ viper 30 50
+ sidewinder 30 50
+
+ Set value for rows matching callable condition
+
+ >>> df.loc[df['shield'] > 35] = 0
+ >>> df
+ max_speed shield
+ cobra 30 10
+ viper 0 0
+ sidewinder 0 0
+
+ **Getting values on a DataFrame with an index that has integer labels**
+
+ Another example using integers for the index
+
+ >>> df = pd.DataFrame([[1, 2], [4, 5], [7, 8]],
+ ... index=[7, 8, 9], columns=['max_speed', 'shield'])
+ >>> df
+ max_speed shield
+ 7 1 2
+ 8 4 5
+ 9 7 8
+
+ Slice with integer labels for rows. As mentioned above, note that both
+ the start and stop of the slice are included.
+
+ >>> df.loc[7:9]
+ max_speed shield
+ 7 1 2
+ 8 4 5
+ 9 7 8
+
+ **Getting values with a MultiIndex**
+
+ A number of examples using a DataFrame with a MultiIndex
+
+ >>> tuples = [
+ ... ('cobra', 'mark i'), ('cobra', 'mark ii'),
+ ... ('sidewinder', 'mark i'), ('sidewinder', 'mark ii'),
+ ... ('viper', 'mark ii'), ('viper', 'mark iii')
+ ... ]
+ >>> index = pd.MultiIndex.from_tuples(tuples)
+ >>> values = [[12, 2], [0, 4], [10, 20],
+ ... [1, 4], [7, 1], [16, 36]]
+ >>> df = pd.DataFrame(values, columns=['max_speed', 'shield'], index=index)
+ >>> df
+ max_speed shield
+ cobra mark i 12 2
+ mark ii 0 4
+ sidewinder mark i 10 20
+ mark ii 1 4
+ viper mark ii 7 1
+ mark iii 16 36
+
+ Single label. Note this returns a DataFrame with a single index.
+
+ >>> df.loc['cobra']
+ max_speed shield
+ mark i 12 2
+ mark ii 0 4
+
+ Single index tuple. Note this returns a Series.
+
+ >>> df.loc[('cobra', 'mark ii')]
+ max_speed 0
+ shield 4
+ Name: (cobra, mark ii), dtype: int64
+
+ Single label for row and column. Similar to passing in a tuple, this
+ returns a Series.
+
+ >>> df.loc['cobra', 'mark i']
+ max_speed 12
+ shield 2
+ Name: (cobra, mark i), dtype: int64
+
+ Single tuple. Note using ``[[]]`` returns a DataFrame.
+
+ >>> df.loc[[('cobra', 'mark ii')]]
+ max_speed shield
+ cobra mark ii 0 4
+
+ Single tuple for the index with a single label for the column
+
+ >>> df.loc[('cobra', 'mark i'), 'shield']
+ 2
+
+ Slice from index tuple to single label
+
+ >>> df.loc[('cobra', 'mark i'):'viper']
+ max_speed shield
+ cobra mark i 12 2
+ mark ii 0 4
+ sidewinder mark i 10 20
+ mark ii 1 4
+ viper mark ii 7 1
+ mark iii 16 36
+
+ Slice from index tuple to index tuple
+
+ >>> df.loc[('cobra', 'mark i'):('viper', 'mark ii')]
+ max_speed shield
+ cobra mark i 12 2
+ mark ii 0 4
+ sidewinder mark i 10 20
+ mark ii 1 4
+ viper mark ii 7 1
+ """
+
+ _valid_types = ("labels (MUST BE IN THE INDEX), slices of labels (BOTH "
+ "endpoints included! Can be slices of integers if the "
+ "index is integers), listlike of labels, boolean")
+ _exception = KeyError
+
+ @Appender(_NDFrameIndexer._validate_key.__doc__)
+ def _validate_key(self, key, axis):
+
+ # valid for a collection of labels (we check their presence later)
+ # slice of labels (where start-end in labels)
+ # slice of integers (only if in the labels)
+ # boolean
+
+ if isinstance(key, slice):
+ return
+
+ if com.is_bool_indexer(key):
+ return
+
+ if not is_list_like_indexer(key):
+ self._convert_scalar_indexer(key, axis)
+
+ def _is_scalar_access(self, key):
+ # this is a shortcut accessor to both .loc and .iloc
+ # that provides access equivalent to .at and .iat:
+ # a) avoid getting things via sections (to minimize dtype changes)
+ # b) provide a performant path
+ if not hasattr(key, '__len__'):
+ return False
+
+ if len(key) != self.ndim:
+ return False
+
+ for i, k in enumerate(key):
+ if not is_scalar(k):
+ return False
+
+ ax = self.obj.axes[i]
+ if isinstance(ax, MultiIndex):
+ return False
+
+ if not ax.is_unique:
+ return False
+
+ return True
+
+ def _getitem_scalar(self, key):
+ # a fast-path to scalar access
+ # if not, raise
+ values = self.obj._get_value(*key)
+ return values
+
+ def _get_partial_string_timestamp_match_key(self, key, labels):
+ """Translate any partial string timestamp matches in key, returning the
+ new key (GH 10331)"""
+ if isinstance(labels, MultiIndex):
+ if (isinstance(key, compat.string_types) and
+ labels.levels[0].is_all_dates):
+ # Convert key '2016-01-01' to
+ # ('2016-01-01'[, slice(None, None, None)]+)
+ key = tuple([key] + [slice(None)] * (len(labels.levels) - 1))
+
+ if isinstance(key, tuple):
+ # Convert (..., '2016-01-01', ...) in tuple to
+ # (..., slice('2016-01-01', '2016-01-01', None), ...)
+ new_key = []
+ for i, component in enumerate(key):
+ if (isinstance(component, compat.string_types) and
+ labels.levels[i].is_all_dates):
+ new_key.append(slice(component, component, None))
+ else:
+ new_key.append(component)
+ key = tuple(new_key)
+
+ return key
+
+ def _getitem_axis(self, key, axis=None):
+ if axis is None:
+ axis = self.axis or 0
+
+ if is_iterator(key):
+ key = list(key)
+
+ labels = self.obj._get_axis(axis)
+ key = self._get_partial_string_timestamp_match_key(key, labels)
+
+ if isinstance(key, slice):
+ self._validate_key(key, axis)
+ return self._get_slice_axis(key, axis=axis)
+ elif com.is_bool_indexer(key):
+ return self._getbool_axis(key, axis=axis)
+ elif is_list_like_indexer(key):
+
+ # convert various list-like indexers
+ # to a list of keys
+ # we will use the *values* of the object
+ # and NOT the index if its a PandasObject
+ if isinstance(labels, MultiIndex):
+
+ if isinstance(key, (ABCSeries, np.ndarray)) and key.ndim <= 1:
+ # Series, or 0,1 ndim ndarray
+ # GH 14730
+ key = list(key)
+ elif isinstance(key, ABCDataFrame):
+ # GH 15438
+ raise NotImplementedError("Indexing a MultiIndex with a "
+ "DataFrame key is not "
+ "implemented")
+ elif hasattr(key, 'ndim') and key.ndim > 1:
+ raise NotImplementedError("Indexing a MultiIndex with a "
+ "multidimensional key is not "
+ "implemented")
+
+ if (not isinstance(key, tuple) and len(key) > 1 and
+ not isinstance(key[0], tuple)):
+ key = tuple([key])
+
+ # an iterable multi-selection
+ if not (isinstance(key, tuple) and isinstance(labels, MultiIndex)):
+
+ if hasattr(key, 'ndim') and key.ndim > 1:
+ raise ValueError('Cannot index with multidimensional key')
+
+ return self._getitem_iterable(key, axis=axis)
+
+ # nested tuple slicing
+ if is_nested_tuple(key, labels):
+ locs = labels.get_locs(key)
+ indexer = [slice(None)] * self.ndim
+ indexer[axis] = locs
+ return self.obj.iloc[tuple(indexer)]
+
+ # fall thru to straight lookup
+ self._validate_key(key, axis)
+ return self._get_label(key, axis=axis)
+
+
+class _iLocIndexer(_LocationIndexer):
+ """
+ Purely integer-location based indexing for selection by position.
+
+ ``.iloc[]`` is primarily integer position based (from ``0`` to
+ ``length-1`` of the axis), but may also be used with a boolean
+ array.
+
+ Allowed inputs are:
+
+ - An integer, e.g. ``5``.
+ - A list or array of integers, e.g. ``[4, 3, 0]``.
+ - A slice object with ints, e.g. ``1:7``.
+ - A boolean array.
+ - A ``callable`` function with one argument (the calling Series, DataFrame
+ or Panel) and that returns valid output for indexing (one of the above).
+ This is useful in method chains, when you don't have a reference to the
+ calling object, but would like to base your selection on some value.
+
+ ``.iloc`` will raise ``IndexError`` if a requested indexer is
+ out-of-bounds, except *slice* indexers which allow out-of-bounds
+ indexing (this conforms with python/numpy *slice* semantics).
+
+ See more at :ref:`Selection by Position <indexing.integer>`.
+
+ See Also
+ --------
+ DataFrame.iat : Fast integer location scalar accessor.
+ DataFrame.loc : Purely label-location based indexer for selection by label.
+ Series.iloc : Purely integer-location based indexing for
+ selection by position.
+
+ Examples
+ --------
+
+ >>> mydict = [{'a': 1, 'b': 2, 'c': 3, 'd': 4},
+ ... {'a': 100, 'b': 200, 'c': 300, 'd': 400},
+ ... {'a': 1000, 'b': 2000, 'c': 3000, 'd': 4000 }]
+ >>> df = pd.DataFrame(mydict)
+ >>> df
+ a b c d
+ 0 1 2 3 4
+ 1 100 200 300 400
+ 2 1000 2000 3000 4000
+
+ **Indexing just the rows**
+
+ With a scalar integer.
+
+ >>> type(df.iloc[0])
+ <class 'pandas.core.series.Series'>
+ >>> df.iloc[0]
+ a 1
+ b 2
+ c 3
+ d 4
+ Name: 0, dtype: int64
+
+ With a list of integers.
+
+ >>> df.iloc[[0]]
+ a b c d
+ 0 1 2 3 4
+ >>> type(df.iloc[[0]])
+ <class 'pandas.core.frame.DataFrame'>
+
+ >>> df.iloc[[0, 1]]
+ a b c d
+ 0 1 2 3 4
+ 1 100 200 300 400
+
+ With a `slice` object.
+
+ >>> df.iloc[:3]
+ a b c d
+ 0 1 2 3 4
+ 1 100 200 300 400
+ 2 1000 2000 3000 4000
+
+ With a boolean mask the same length as the index.
+
+ >>> df.iloc[[True, False, True]]
+ a b c d
+ 0 1 2 3 4
+ 2 1000 2000 3000 4000
+
+ With a callable, useful in method chains. The `x` passed
+ to the ``lambda`` is the DataFrame being sliced. This selects
+ the rows whose index label is even.
+
+ >>> df.iloc[lambda x: x.index % 2 == 0]
+ a b c d
+ 0 1 2 3 4
+ 2 1000 2000 3000 4000
+
+ **Indexing both axes**
+
+ You can mix the indexer types for the index and columns. Use ``:`` to
+ select the entire axis.
+
+ With scalar integers.
+
+ >>> df.iloc[0, 1]
+ 2
+
+ With lists of integers.
+
+ >>> df.iloc[[0, 2], [1, 3]]
+ b d
+ 0 2 4
+ 2 2000 4000
+
+ With `slice` objects.
+
+ >>> df.iloc[1:3, 0:3]
+ a b c
+ 1 100 200 300
+ 2 1000 2000 3000
+
+ With a boolean array whose length matches the columns.
+
+ >>> df.iloc[:, [True, False, True, False]]
+ a c
+ 0 1 3
+ 1 100 300
+ 2 1000 3000
+
+ With a callable function that expects the Series or DataFrame.
+
+ >>> df.iloc[:, lambda df: [0, 2]]
+ a c
+ 0 1 3
+ 1 100 300
+ 2 1000 3000
+ """
+
+ _valid_types = ("integer, integer slice (START point is INCLUDED, END "
+ "point is EXCLUDED), listlike of integers, boolean array")
+ _exception = IndexError
+
+ def _validate_key(self, key, axis):
+ if com.is_bool_indexer(key):
+ if hasattr(key, 'index') and isinstance(key.index, Index):
+ if key.index.inferred_type == 'integer':
+ raise NotImplementedError("iLocation based boolean "
+ "indexing on an integer type "
+ "is not available")
+ raise ValueError("iLocation based boolean indexing cannot use "
+ "an indexable as a mask")
+ return
+
+ if isinstance(key, slice):
+ return
+ elif is_integer(key):
+ self._validate_integer(key, axis)
+ elif isinstance(key, tuple):
+ # a tuple should already have been caught by this point
+ # so don't treat a tuple as a valid indexer
+ raise IndexingError('Too many indexers')
+ elif is_list_like_indexer(key):
+ # check that the key does not exceed the maximum size of the index
+ arr = np.array(key)
+ len_axis = len(self.obj._get_axis(axis))
+
+ if len(arr) and (arr.max() >= len_axis or arr.min() < -len_axis):
+ raise IndexError("positional indexers are out-of-bounds")
+ else:
+ raise ValueError("Can only index by location with "
+ "a [{types}]".format(types=self._valid_types))
+
+ def _has_valid_setitem_indexer(self, indexer):
+ self._has_valid_positional_setitem_indexer(indexer)
+
+ def _is_scalar_access(self, key):
+ # this is a shortcut accessor to both .loc and .iloc
+ # that provides access equivalent to .at and .iat:
+ # a) avoid getting things via sections (to minimize dtype changes)
+ # b) provide a performant path
+ if not hasattr(key, '__len__'):
+ return False
+
+ if len(key) != self.ndim:
+ return False
+
+ for i, k in enumerate(key):
+ if not is_integer(k):
+ return False
+
+ ax = self.obj.axes[i]
+ if not ax.is_unique:
+ return False
+
+ return True
+
+ def _getitem_scalar(self, key):
+ # a fast-path to scalar access
+ # if not, raise
+ values = self.obj._get_value(*key, takeable=True)
+ return values
+
+ def _validate_integer(self, key, axis):
+ """
+ Check that 'key' is a valid position in the desired axis.
+
+ Parameters
+ ----------
+ key : int
+ Requested position
+ axis : int
+ Desired axis
+
+ Returns
+ -------
+ None
+
+ Raises
+ ------
+ IndexError
+ If 'key' is not a valid position in axis 'axis'
+ """
+
+ len_axis = len(self.obj._get_axis(axis))
+ if key >= len_axis or key < -len_axis:
+ raise IndexError("single positional indexer is out-of-bounds")
+
+ def _getitem_tuple(self, tup):
+
+ self._has_valid_tuple(tup)
+ try:
+ return self._getitem_lowerdim(tup)
+ except IndexingError:
+ pass
+
+ retval = self.obj
+ axis = 0
+ for i, key in enumerate(tup):
+ if i >= self.obj.ndim:
+ raise IndexingError('Too many indexers')
+
+ if com.is_null_slice(key):
+ axis += 1
+ continue
+
+ retval = getattr(retval, self.name)._getitem_axis(key, axis=axis)
+
+ # if the dim was reduced, then pass a lower-dim the next time
+ if retval.ndim < self.ndim:
+ axis -= 1
+
+ # try to get for the next axis
+ axis += 1
+
+ return retval
+
+ def _get_slice_axis(self, slice_obj, axis=None):
+ if axis is None:
+ axis = self.axis or 0
+ obj = self.obj
+
+ if not need_slice(slice_obj):
+ return obj.copy(deep=False)
+
+ slice_obj = self._convert_slice_indexer(slice_obj, axis)
+ if isinstance(slice_obj, slice):
+ return self._slice(slice_obj, axis=axis, kind='iloc')
+ else:
+ return self.obj._take(slice_obj, axis=axis)
+
+ def _get_list_axis(self, key, axis=None):
+ """
+ Return Series values by list or array of integers
+
+ Parameters
+ ----------
+ key : list-like positional indexer
+ axis : int (can only be zero)
+
+ Returns
+ -------
+ Series object
+ """
+ if axis is None:
+ axis = self.axis or 0
+ try:
+ return self.obj._take(key, axis=axis)
+ except IndexError:
+ # re-raise with different error message
+ raise IndexError("positional indexers are out-of-bounds")
+
+ def _getitem_axis(self, key, axis=None):
+ if axis is None:
+ axis = self.axis or 0
+
+ if isinstance(key, slice):
+ return self._get_slice_axis(key, axis=axis)
+
+ if isinstance(key, list):
+ key = np.asarray(key)
+
+ if com.is_bool_indexer(key):
+ self._validate_key(key, axis)
+ return self._getbool_axis(key, axis=axis)
+
+ # a list of integers
+ elif is_list_like_indexer(key):
+ return self._get_list_axis(key, axis=axis)
+
+ # a single integer
+ else:
+ if not is_integer(key):
+ raise TypeError("Cannot index by location index with a "
+ "non-integer key")
+
+ # validate the location
+ self._validate_integer(key, axis)
+
+ return self._get_loc(key, axis=axis)
+
+ def _convert_to_indexer(self, obj, axis=None, is_setter=False):
+ """ much simpler as we only have to deal with our valid types """
+ if axis is None:
+ axis = self.axis or 0
+
+ # we may need to convert a float key
+ if isinstance(obj, slice):
+ return self._convert_slice_indexer(obj, axis)
+
+ elif is_float(obj):
+ return self._convert_scalar_indexer(obj, axis)
+
+ try:
+ self._validate_key(obj, axis)
+ return obj
+ except ValueError:
+ raise ValueError("Can only index by location with "
+ "a [{types}]".format(types=self._valid_types))
+
+
+class _ScalarAccessIndexer(_NDFrameIndexer):
+ """ access scalars quickly """
+
+ def _convert_key(self, key, is_setter=False):
+ return list(key)
+
+ def __getitem__(self, key):
+ if not isinstance(key, tuple):
+
+ # we could have a convertible item here (e.g. Timestamp)
+ if not is_list_like_indexer(key):
+ key = tuple([key])
+ else:
+ raise ValueError('Invalid call for scalar access (getting)!')
+
+ key = self._convert_key(key)
+ return self.obj._get_value(*key, takeable=self._takeable)
+
+ def __setitem__(self, key, value):
+ if isinstance(key, tuple):
+ key = tuple(com.apply_if_callable(x, self.obj)
+ for x in key)
+ else:
+ # scalar callable may return tuple
+ key = com.apply_if_callable(key, self.obj)
+
+ if not isinstance(key, tuple):
+ key = self._tuplify(key)
+ if len(key) != self.obj.ndim:
+ raise ValueError('Not enough indexers for scalar access '
+ '(setting)!')
+ key = list(self._convert_key(key, is_setter=True))
+ key.append(value)
+ self.obj._set_value(*key, takeable=self._takeable)
+
+
+class _AtIndexer(_ScalarAccessIndexer):
+ """
+ Access a single value for a row/column label pair.
+
+ Similar to ``loc``, in that both provide label-based lookups. Use
+ ``at`` if you only need to get or set a single value in a DataFrame
+ or Series.
+
+ Raises
+ ------
+ KeyError
+ When label does not exist in DataFrame
+
+ See Also
+ --------
+ DataFrame.iat : Access a single value for a row/column pair by integer
+ position.
+ DataFrame.loc : Access a group of rows and columns by label(s).
+ Series.at : Access a single value using a label.
+
+ Examples
+ --------
+ >>> df = pd.DataFrame([[0, 2, 3], [0, 4, 1], [10, 20, 30]],
+ ... index=[4, 5, 6], columns=['A', 'B', 'C'])
+ >>> df
+ A B C
+ 4 0 2 3
+ 5 0 4 1
+ 6 10 20 30
+
+ Get value at specified row/column pair
+
+ >>> df.at[4, 'B']
+ 2
+
+ Set value at specified row/column pair
+
+ >>> df.at[4, 'B'] = 10
+ >>> df.at[4, 'B']
+ 10
+
+ Get value within a Series
+
+ >>> df.loc[5].at['B']
+ 4
+ """
+
+ _takeable = False
+
+ def _convert_key(self, key, is_setter=False):
+ """ require the keys to be the same type as the index (so we don't
+ fall back)
+ """
+
+ # allow arbitrary setting
+ if is_setter:
+ return list(key)
+
+ for ax, i in zip(self.obj.axes, key):
+ if ax.is_integer():
+ if not is_integer(i):
+ raise ValueError("At based indexing on an integer index "
+ "can only have integer indexers")
+ else:
+ if is_integer(i) and not ax.holds_integer():
+ raise ValueError("At based indexing on an non-integer "
+ "index can only have non-integer "
+ "indexers")
+ return key
+
+
+class _iAtIndexer(_ScalarAccessIndexer):
+ """
+ Access a single value for a row/column pair by integer position.
+
+ Similar to ``iloc``, in that both provide integer-based lookups. Use
+ ``iat`` if you only need to get or set a single value in a DataFrame
+ or Series.
+
+ Raises
+ ------
+ IndexError
+ When integer position is out of bounds
+
+ See Also
+ --------
+ DataFrame.at : Access a single value for a row/column label pair.
+ DataFrame.loc : Access a group of rows and columns by label(s).
+ DataFrame.iloc : Access a group of rows and columns by integer position(s).
+
+ Examples
+ --------
+ >>> df = pd.DataFrame([[0, 2, 3], [0, 4, 1], [10, 20, 30]],
+ ... columns=['A', 'B', 'C'])
+ >>> df
+ A B C
+ 0 0 2 3
+ 1 0 4 1
+ 2 10 20 30
+
+ Get value at specified row/column pair
+
+ >>> df.iat[1, 2]
+ 1
+
+ Set value at specified row/column pair
+
+ >>> df.iat[1, 2] = 10
+ >>> df.iat[1, 2]
+ 10
+
+ Get value within a series
+
+ >>> df.loc[0].iat[1]
+ 2
+ """
+
+ _takeable = True
+
+ def _has_valid_setitem_indexer(self, indexer):
+ self._has_valid_positional_setitem_indexer(indexer)
+
+ def _convert_key(self, key, is_setter=False):
+ """ require integer args (and convert to label arguments) """
+ for a, i in zip(self.obj.axes, key):
+ if not is_integer(i):
+ raise ValueError("iAt based indexing can only have integer "
+ "indexers")
+ return key
+
+
+def length_of_indexer(indexer, target=None):
+ """
+ return the length of a single non-tuple indexer which could be a slice
+ """
+ if target is not None and isinstance(indexer, slice):
+ target_len = len(target)
+ start = indexer.start
+ stop = indexer.stop
+ step = indexer.step
+ if start is None:
+ start = 0
+ elif start < 0:
+ start += target_len
+ if stop is None or stop > target_len:
+ stop = target_len
+ elif stop < 0:
+ stop += target_len
+ if step is None:
+ step = 1
+ elif step < 0:
+ step = -step
+ return (stop - start + step - 1) // step
+ elif isinstance(indexer, (ABCSeries, Index, np.ndarray, list)):
+ return len(indexer)
+ elif not is_list_like_indexer(indexer):
+ return 1
+ raise AssertionError("cannot find the length of the indexer")
+
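+# A minimal sketch of how length_of_indexer resolves a slice against a
+# concrete target (hypothetical inputs, not part of the upstream source):
+#
+# >>> length_of_indexer(slice(None, None, 2), [0, 1, 2, 3, 4])
+# 3
+# >>> length_of_indexer(slice(1, -1), [0, 1, 2, 3, 4])
+# 3
+# >>> length_of_indexer([10, 20, 30])
+# 3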
+
+def convert_to_index_sliceable(obj, key):
+ """
+ if we are index sliceable, then return my slicer, otherwise return None
+ """
+ idx = obj.index
+ if isinstance(key, slice):
+ return idx._convert_slice_indexer(key, kind='getitem')
+
+ elif isinstance(key, compat.string_types):
+
+ # we are an actual column
+ if obj._data.items.contains(key):
+ return None
+
+ # We might have a datetimelike string that we can translate to a
+ # slice here via partial string indexing
+ if idx.is_all_dates:
+ try:
+ return idx._get_string_slice(key)
+ except (KeyError, ValueError, NotImplementedError):
+ return None
+
+ return None
+
+
+def check_bool_indexer(ax, key):
+ # boolean indexing, need to check that the data are aligned, otherwise
+ # disallowed
+
+ # this function assumes that is_bool_indexer(key) == True
+
+ result = key
+ if isinstance(key, ABCSeries) and not key.index.equals(ax):
+ result = result.reindex(ax)
+ mask = isna(result._values)
+ if mask.any():
+ raise IndexingError('Unalignable boolean Series provided as '
+ 'indexer (index of the boolean Series and of '
+ 'the indexed object do not match)')
+ result = result.astype(bool)._values
+ elif is_sparse(result):
+ result = result.to_dense()
+ result = np.asarray(result, dtype=bool)
+ else:
+ # is_bool_indexer has already checked for nulls in the case of an
+ # object array key, so no check needed here
+ result = np.asarray(result, dtype=bool)
+
+ return result
+
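+# Sketch of the alignment behaviour above (hypothetical inputs, not part of
+# the upstream source): a boolean Series keyed by the same labels in a
+# different order is realigned to ``ax`` before becoming a plain ndarray:
+#
+# >>> ax = pd.Index(['a', 'b', 'c'])
+# >>> key = pd.Series([True, False, True], index=['c', 'a', 'b'])
+# >>> check_bool_indexer(ax, key)
+# array([False,  True,  True])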
+
+def check_setitem_lengths(indexer, value, values):
+ """
+ Validate that value and indexer are the same length.
+
+ A special case is allowed for when the indexer is a boolean array
+ and the number of true values equals the length of ``value``. In
+ this case, no exception is raised.
+
+ Parameters
+ ----------
+ indexer : sequence
+ The key for the setitem
+ value : array-like
+ The value for the setitem
+ values : array-like
+ The values being set into
+
+ Returns
+ -------
+ None
+
+ Raises
+ ------
+ ValueError
+ When the indexer is an ndarray or list and the lengths don't
+ match.
+ """
+ # boolean with truth values == len of the value is ok too
+ if isinstance(indexer, (np.ndarray, list)):
+ if is_list_like(value) and len(indexer) != len(value):
+ if not (isinstance(indexer, np.ndarray) and
+ indexer.dtype == np.bool_ and
+ len(indexer[indexer]) == len(value)):
+ raise ValueError("cannot set using a list-like indexer "
+ "with a different length than the value")
+ # slice
+ elif isinstance(indexer, slice):
+
+ if is_list_like(value) and len(values):
+ if len(value) != length_of_indexer(indexer, values):
+ raise ValueError("cannot set using a slice indexer with a "
+ "different length than the value")
+
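+# Sketch of the special case described above (hypothetical inputs): a boolean
+# indexer whose number of True values matches len(value) is accepted, while a
+# plain list indexer of a different length raises:
+#
+# >>> vals = np.arange(5)
+# >>> mask = np.array([True, False, True, False, False])
+# >>> check_setitem_lengths(mask, [10, 20], vals)       # OK: 2 True, 2 values
+# >>> check_setitem_lengths([0, 1, 2], [10, 20], vals)  # raises ValueError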
+
+def convert_missing_indexer(indexer):
+ """
+ reverse convert a missing indexer, which is a dict;
+ return the scalar indexer and a boolean indicating if we converted
+ """
+
+ if isinstance(indexer, dict):
+
+ # a missing key (but not a tuple indexer)
+ indexer = indexer['key']
+
+ if isinstance(indexer, bool):
+ raise KeyError("cannot use a single bool to index into setitem")
+ return indexer, True
+
+ return indexer, False
+
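+# For illustration (hypothetical inputs, not part of the upstream source):
+#
+# >>> convert_missing_indexer({'key': 'new_label'})
+# ('new_label', True)
+# >>> convert_missing_indexer(5)
+# (5, False)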
+
+def convert_from_missing_indexer_tuple(indexer, axes):
+ """
+ create a filtered indexer that doesn't have any missing indexers
+ """
+
+ def get_indexer(_i, _idx):
+ return (axes[_i].get_loc(_idx['key']) if isinstance(_idx, dict) else
+ _idx)
+
+ return tuple(get_indexer(_i, _idx) for _i, _idx in enumerate(indexer))
+
+
+def maybe_convert_indices(indices, n):
+ """
+ Attempt to convert indices into valid, positive indices.
+
+ If we have negative indices, translate to positive here.
+ If we have indices that are out-of-bounds, raise an IndexError.
+
+ Parameters
+ ----------
+ indices : array-like
+ The array of indices that we are to convert.
+ n : int
+ The number of elements in the array that we are indexing.
+
+ Returns
+ -------
+ valid_indices : array-like
+ An array-like of positive indices that correspond to the ones
+ that were passed in initially to this function.
+
+ Raises
+ ------
+ IndexError : one of the converted indices either exceeded the number
+ of elements (specified by `n`) OR was still negative.
+ """
+
+ if isinstance(indices, list):
+ indices = np.array(indices)
+ if len(indices) == 0:
+ # If list is empty, np.array will return float and cause indexing
+ # errors.
+ return np.empty(0, dtype=np.intp)
+
+ mask = indices < 0
+ if mask.any():
+ indices = indices.copy()
+ indices[mask] += n
+
+ mask = (indices >= n) | (indices < 0)
+ if mask.any():
+ raise IndexError("indices are out-of-bounds")
+ return indices
+
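+# Sketch of the negative-index translation described above (hypothetical
+# inputs, not part of the upstream source):
+#
+# >>> maybe_convert_indices(np.array([0, -1, -2]), 5)
+# array([0, 4, 3])
+# >>> maybe_convert_indices(np.array([5]), 5)   # raises IndexError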
+
+def validate_indices(indices, n):
+ """
+ Perform bounds-checking for an indexer.
+
+ -1 is allowed for indicating missing values.
+
+ Parameters
+ ----------
+ indices : ndarray
+ n : int
+ length of the array being indexed
+
+ Raises
+ ------
+ ValueError
+
+ Examples
+ --------
+ >>> validate_indices([1, 2], 3)
+ # OK
+ >>> validate_indices([1, -2], 3)
+ ValueError
+ >>> validate_indices([1, 2, 3], 3)
+ IndexError
+ >>> validate_indices([-1, -1], 0)
+ # OK
+ >>> validate_indices([0, 1], 0)
+ IndexError
+ """
+ if len(indices):
+ min_idx = indices.min()
+ if min_idx < -1:
+ msg = ("'indices' contains values less than allowed ({} < {})"
+ .format(min_idx, -1))
+ raise ValueError(msg)
+
+ max_idx = indices.max()
+ if max_idx >= n:
+ raise IndexError("indices are out-of-bounds")
+
+
+def maybe_convert_ix(*args):
+ """
+ We likely want to take the cross-product
+ """
+
+ ixify = True
+ for arg in args:
+ if not isinstance(arg, (np.ndarray, list, ABCSeries, Index)):
+ ixify = False
+
+ if ixify:
+ return np.ix_(*args)
+ else:
+ return args
+
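+# A minimal sketch (hypothetical inputs): list-like arguments are combined
+# with np.ix_ into an open mesh for cross-product indexing; otherwise the
+# arguments pass through unchanged:
+#
+# >>> a, b = maybe_convert_ix([0, 1], [2, 3])   # same as np.ix_([0, 1], [2, 3])
+# >>> a.shape, b.shape
+# ((2, 1), (1, 2))
+# >>> maybe_convert_ix(slice(0, 2), [2, 3])
+# (slice(0, 2, None), [2, 3])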
+
+def is_nested_tuple(tup, labels):
+ # check for a compatible nested tuple and multiindexes among the axes
+ if not isinstance(tup, tuple):
+ return False
+
+ for i, k in enumerate(tup):
+
+ if is_list_like(k) or isinstance(k, slice):
+ return isinstance(labels, MultiIndex)
+
+ return False
+
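+# Sketch (hypothetical inputs): a tuple counts as "nested" only when one of
+# its elements is list-like or a slice and the labels are a MultiIndex:
+#
+# >>> mi = pd.MultiIndex.from_tuples([('a', 1), ('a', 2), ('b', 1)])
+# >>> is_nested_tuple((['a', 'b'], 1), mi)
+# True
+# >>> is_nested_tuple(('a', 1), mi)
+# False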
+
+def is_list_like_indexer(key):
+ # allow a list_like, but exclude NamedTuples which can be indexers
+ return is_list_like(key) and not (isinstance(key, tuple) and
+ type(key) is not tuple)
+
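+# Sketch (hypothetical inputs): named tuples are excluded so that they can
+# still act as labels rather than as list-like indexers:
+#
+# >>> from collections import namedtuple
+# >>> Key = namedtuple('Key', ['a', 'b'])
+# >>> is_list_like_indexer([1, 2])
+# True
+# >>> is_list_like_indexer(Key(1, 2))
+# False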
+
+def is_label_like(key):
+ # select a label or row
+ return not isinstance(key, slice) and not is_list_like_indexer(key)
+
+
+def need_slice(obj):
+ return (obj.start is not None or obj.stop is not None or
+ (obj.step is not None and obj.step != 1))
+
+
+def maybe_droplevels(index, key):
+ # drop levels
+ original_index = index
+ if isinstance(key, tuple):
+ for _ in key:
+ try:
+ index = index.droplevel(0)
+ except ValueError:
+ # we have dropped too much, so back out
+ return original_index
+ else:
+ try:
+ index = index.droplevel(0)
+ except ValueError:
+ pass
+
+ return index
+
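+# Sketch (hypothetical inputs): one level is dropped per element of a tuple
+# key, backing out if that would drop too much:
+#
+# >>> mi = pd.MultiIndex.from_tuples([('a', 1, 'x'), ('a', 2, 'y')])
+# >>> maybe_droplevels(mi, ('a',)).nlevels
+# 2
+# >>> maybe_droplevels(mi, ('a', 1)).nlevels
+# 1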
+
+def _non_reducing_slice(slice_):
+ """
+ Ensure that a slice doesn't reduce to a Series or Scalar.
+
+ Any user-passed `subset` should have this called on it
+ to make sure we're always working with DataFrames.
+ """
+ # default to column slice, like DataFrame
+ # ['A', 'B'] -> IndexSlices[:, ['A', 'B']]
+ kinds = tuple(list(compat.string_types) + [ABCSeries, np.ndarray, Index,
+ list])
+ if isinstance(slice_, kinds):
+ slice_ = IndexSlice[:, slice_]
+
+ def pred(part):
+ # true when slice does *not* reduce, False when part is a tuple,
+ # i.e. MultiIndex slice
+ return ((isinstance(part, slice) or is_list_like(part))
+ and not isinstance(part, tuple))
+
+ if not is_list_like(slice_):
+ if not isinstance(slice_, slice):
+ # a 1-d slice, like df.loc[1]
+ slice_ = [[slice_]]
+ else:
+ # slice(a, b, c)
+ slice_ = [slice_] # to tuplize later
+ else:
+ slice_ = [part if pred(part) else [part] for part in slice_]
+ return tuple(slice_)
+
+
+def _maybe_numeric_slice(df, slice_, include_bool=False):
+ """
+ want nice defaults for background_gradient that don't break
+ with non-numeric data. But if slice_ is passed go with that.
+ """
+ if slice_ is None:
+ dtypes = [np.number]
+ if include_bool:
+ dtypes.append(bool)
+ slice_ = IndexSlice[:, df.select_dtypes(include=dtypes).columns]
+ return slice_
diff --git a/contrib/python/pandas/py2/pandas/core/internals/__init__.py b/contrib/python/pandas/py2/pandas/core/internals/__init__.py
new file mode 100644
index 00000000000..7878613a8b1
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/core/internals/__init__.py
@@ -0,0 +1,13 @@
+# -*- coding: utf-8 -*-
+from .blocks import ( # noqa:F401
+ _block2d_to_blocknd, _factor_indexer, _block_shape, # io.pytables
+ _safe_reshape, # io.packers
+ make_block, # io.pytables, io.packers
+ FloatBlock, IntBlock, ComplexBlock, BoolBlock, ObjectBlock,
+ TimeDeltaBlock, DatetimeBlock, DatetimeTZBlock,
+ CategoricalBlock, ExtensionBlock, Block)
+from .managers import ( # noqa:F401
+ BlockManager, SingleBlockManager,
+ create_block_manager_from_arrays, create_block_manager_from_blocks,
+ items_overlap_with_suffix, # reshape.merge
+ concatenate_block_managers) # reshape.concat, reshape.merge
diff --git a/contrib/python/pandas/py2/pandas/core/internals/arrays.py b/contrib/python/pandas/py2/pandas/core/internals/arrays.py
new file mode 100644
index 00000000000..18af328bfa7
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/core/internals/arrays.py
@@ -0,0 +1,55 @@
+"""
+Methods for cleaning, validating, and unboxing arrays.
+"""
+from pandas.core.dtypes.generic import ABCIndexClass, ABCPandasArray, ABCSeries
+
+
+def extract_array(obj, extract_numpy=False):
+ """
+ Extract the ndarray or ExtensionArray from a Series or Index.
+
+ For all other types, `obj` is just returned as is.
+
+ Parameters
+ ----------
+ obj : object
+ For Series / Index, the underlying ExtensionArray is unboxed.
+ For Numpy-backed ExtensionArrays, the ndarray is extracted.
+
+ extract_numpy : bool, default False
+ Whether to extract the ndarray from a PandasArray
+
+ Returns
+ -------
+ arr : object
+
+ Examples
+ --------
+ >>> extract_array(pd.Series(['a', 'b', 'c'], dtype='category'))
+ [a, b, c]
+ Categories (3, object): [a, b, c]
+
+ Other objects like lists, arrays, and DataFrames are just passed through.
+
+ >>> extract_array([1, 2, 3])
+ [1, 2, 3]
+
+ For an ndarray-backed Series / Index a PandasArray is returned.
+
+ >>> extract_array(pd.Series([1, 2, 3]))
+ <PandasArray>
+ [1, 2, 3]
+ Length: 3, dtype: int64
+
+ To extract all the way down to the ndarray, pass ``extract_numpy=True``.
+
+ >>> extract_array(pd.Series([1, 2, 3]), extract_numpy=True)
+ array([1, 2, 3])
+ """
+ if isinstance(obj, (ABCIndexClass, ABCSeries)):
+ obj = obj.array
+
+ if extract_numpy and isinstance(obj, ABCPandasArray):
+ obj = obj.to_numpy()
+
+ return obj
diff --git a/contrib/python/pandas/py2/pandas/core/internals/blocks.py b/contrib/python/pandas/py2/pandas/core/internals/blocks.py
new file mode 100644
index 00000000000..86306e7f1c5
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/core/internals/blocks.py
@@ -0,0 +1,3299 @@
+# -*- coding: utf-8 -*-
+from datetime import date, datetime, timedelta
+import functools
+import inspect
+import re
+import warnings
+
+import numpy as np
+
+from pandas._libs import internals as libinternals, lib, tslib, tslibs
+from pandas._libs.tslibs import Timedelta, conversion, is_null_datetimelike
+import pandas.compat as compat
+from pandas.compat import range, zip
+from pandas.util._validators import validate_bool_kwarg
+
+from pandas.core.dtypes.cast import (
+ astype_nansafe, find_common_type, infer_dtype_from,
+ infer_dtype_from_scalar, maybe_convert_objects, maybe_downcast_to_dtype,
+ maybe_infer_dtype_type, maybe_promote, maybe_upcast, soft_convert_objects)
+from pandas.core.dtypes.common import (
+ _NS_DTYPE, _TD_DTYPE, ensure_platform_int, is_bool_dtype, is_categorical,
+ is_categorical_dtype, is_datetime64_dtype, is_datetime64tz_dtype,
+ is_dtype_equal, is_extension_array_dtype, is_extension_type,
+ is_float_dtype, is_integer, is_integer_dtype, is_interval_dtype,
+ is_list_like, is_numeric_v_string_like, is_object_dtype, is_period_dtype,
+ is_re, is_re_compilable, is_sparse, is_timedelta64_dtype, pandas_dtype)
+import pandas.core.dtypes.concat as _concat
+from pandas.core.dtypes.dtypes import (
+ CategoricalDtype, ExtensionDtype, PandasExtensionDtype)
+from pandas.core.dtypes.generic import (
+ ABCDataFrame, ABCDatetimeIndex, ABCExtensionArray, ABCIndexClass,
+ ABCSeries)
+from pandas.core.dtypes.missing import (
+ _isna_compat, array_equivalent, isna, notna)
+
+import pandas.core.algorithms as algos
+from pandas.core.arrays import (
+ Categorical, DatetimeArray, ExtensionArray, TimedeltaArray)
+from pandas.core.base import PandasObject
+import pandas.core.common as com
+from pandas.core.indexes.datetimes import DatetimeIndex
+from pandas.core.indexing import check_setitem_lengths
+from pandas.core.internals.arrays import extract_array
+import pandas.core.missing as missing
+from pandas.core.nanops import nanpercentile
+
+from pandas.io.formats.printing import pprint_thing
+
+
+class Block(PandasObject):
+ """
+ Canonical n-dimensional unit of homogeneous dtype contained in a pandas
+ data structure
+
+ Index-ignorant; let the container take care of that
+ """
+ __slots__ = ['_mgr_locs', 'values', 'ndim']
+ is_numeric = False
+ is_float = False
+ is_integer = False
+ is_complex = False
+ is_datetime = False
+ is_datetimetz = False
+ is_timedelta = False
+ is_bool = False
+ is_object = False
+ is_categorical = False
+ is_sparse = False
+ is_extension = False
+ _box_to_block_values = True
+ _can_hold_na = False
+ _can_consolidate = True
+ _verify_integrity = True
+ _validate_ndim = True
+ _ftype = 'dense'
+ _concatenator = staticmethod(np.concatenate)
+
+ def __init__(self, values, placement, ndim=None):
+ self.ndim = self._check_ndim(values, ndim)
+ self.mgr_locs = placement
+ self.values = values
+
+ if (self._validate_ndim and self.ndim and
+ len(self.mgr_locs) != len(self.values)):
+ raise ValueError(
+ 'Wrong number of items passed {val}, placement implies '
+ '{mgr}'.format(val=len(self.values), mgr=len(self.mgr_locs)))
+
+ def _check_ndim(self, values, ndim):
+ """ndim inference and validation.
+
+ Infers ndim from 'values' if not provided to __init__.
+ Validates that values.ndim and ndim are consistent if and only if
+ the class variable '_validate_ndim' is True.
+
+ Parameters
+ ----------
+ values : array-like
+ ndim : int or None
+
+ Returns
+ -------
+ ndim : int
+
+ Raises
+ ------
+ ValueError : the number of dimensions does not match
+ """
+ if ndim is None:
+ ndim = values.ndim
+
+ if self._validate_ndim and values.ndim != ndim:
+ msg = ("Wrong number of dimensions. values.ndim != ndim "
+ "[{} != {}]")
+ raise ValueError(msg.format(values.ndim, ndim))
+
+ return ndim
+
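+ # Illustrative behaviour of the check above (``blk`` is a hypothetical
+ # Block instance; not part of the upstream source):
+ #
+ # >>> blk._check_ndim(np.array([1, 2, 3]), ndim=None)   # inferred -> 1
+ # 1
+ # >>> blk._check_ndim(np.array([1, 2, 3]), ndim=2)
+ # ValueError: Wrong number of dimensions. values.ndim != ndim [1 != 2]
+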
+ @property
+ def _holder(self):
+ """The array-like that can hold the underlying values.
+
+ None for 'Block', overridden by subclasses that don't
+ use an ndarray.
+ """
+ return None
+
+ @property
+ def _consolidate_key(self):
+ return (self._can_consolidate, self.dtype.name)
+
+ @property
+ def _is_single_block(self):
+ return self.ndim == 1
+
+ @property
+ def is_view(self):
+ """ return a boolean if I am possibly a view """
+ return self.values.base is not None
+
+ @property
+ def is_datelike(self):
+ """ return True if I am datelike (datetime or timedelta) """
+ return self.is_datetime or self.is_timedelta
+
+ def is_categorical_astype(self, dtype):
+ """
+ validate that `dtype` is astype-able to categorical;
+ return a boolean indicating whether it is a categorical dtype
+ """
+ if dtype is Categorical or dtype is CategoricalDtype:
+ # this is a pd.Categorical, but is not
+ # a valid type for astypeing
+ raise TypeError("invalid type {0} for astype".format(dtype))
+
+ elif is_categorical_dtype(dtype):
+ return True
+
+ return False
+
+ def external_values(self, dtype=None):
+ """ return an outside world format, currently just the ndarray """
+ return self.values
+
+ def internal_values(self, dtype=None):
+ """ return an internal format, currently just the ndarray
+ this should be the pure internal API format
+ """
+ return self.values
+
+ def formatting_values(self):
+ """Return the internal values used by the DataFrame/SeriesFormatter"""
+ return self.internal_values()
+
+ def get_values(self, dtype=None):
+ """
+ return an internal format, currently just the ndarray
+ this is often overridden to handle to_dense like operations
+ """
+ if is_object_dtype(dtype):
+ return self.values.astype(object)
+ return self.values
+
+ def to_dense(self):
+ return self.values.view()
+
+ @property
+ def _na_value(self):
+ return np.nan
+
+ @property
+ def fill_value(self):
+ return np.nan
+
+ @property
+ def mgr_locs(self):
+ return self._mgr_locs
+
+ @mgr_locs.setter
+ def mgr_locs(self, new_mgr_locs):
+ if not isinstance(new_mgr_locs, libinternals.BlockPlacement):
+ new_mgr_locs = libinternals.BlockPlacement(new_mgr_locs)
+
+ self._mgr_locs = new_mgr_locs
+
+ @property
+ def array_dtype(self):
+ """ the dtype to return if I want to construct this block as an
+ array
+ """
+ return self.dtype
+
+ def make_block(self, values, placement=None, ndim=None):
+ """
+ Create a new block, with type inference, propagating any values that are
+ not specified
+ """
+ if placement is None:
+ placement = self.mgr_locs
+ if ndim is None:
+ ndim = self.ndim
+
+ return make_block(values, placement=placement, ndim=ndim)
+
+ def make_block_same_class(self, values, placement=None, ndim=None,
+ dtype=None):
+ """ Wrap given values in a block of same type as self. """
+ if dtype is not None:
+ # issue 19431 fastparquet is passing this
+ warnings.warn("dtype argument is deprecated, will be removed "
+ "in a future release.", DeprecationWarning)
+ if placement is None:
+ placement = self.mgr_locs
+ return make_block(values, placement=placement, ndim=ndim,
+ klass=self.__class__, dtype=dtype)
+
+ def __unicode__(self):
+
+ # don't want to print out all of the items here
+ name = pprint_thing(self.__class__.__name__)
+ if self._is_single_block:
+
+ result = '{name}: {len} dtype: {dtype}'.format(
+ name=name, len=len(self), dtype=self.dtype)
+
+ else:
+
+ shape = ' x '.join(pprint_thing(s) for s in self.shape)
+ result = '{name}: {index}, {shape}, dtype: {dtype}'.format(
+ name=name, index=pprint_thing(self.mgr_locs.indexer),
+ shape=shape, dtype=self.dtype)
+
+ return result
+
+ def __len__(self):
+ return len(self.values)
+
+ def __getstate__(self):
+ return self.mgr_locs.indexer, self.values
+
+ def __setstate__(self, state):
+ self.mgr_locs = libinternals.BlockPlacement(state[0])
+ self.values = state[1]
+ self.ndim = self.values.ndim
+
+ def _slice(self, slicer):
+ """ return a slice of my values """
+ return self.values[slicer]
+
+ def reshape_nd(self, labels, shape, ref_items):
+ """
+ Parameters
+ ----------
+ labels : list of new axis labels
+ shape : new shape
+ ref_items : new ref_items
+
+ return a new block that is transformed to a nd block
+ """
+ return _block2d_to_blocknd(values=self.get_values().T,
+ placement=self.mgr_locs, shape=shape,
+ labels=labels, ref_items=ref_items)
+
+ def getitem_block(self, slicer, new_mgr_locs=None):
+ """
+ Perform __getitem__-like, return result as block.
+
+ As of now, only supports slices that preserve dimensionality.
+ """
+ if new_mgr_locs is None:
+ if isinstance(slicer, tuple):
+ axis0_slicer = slicer[0]
+ else:
+ axis0_slicer = slicer
+ new_mgr_locs = self.mgr_locs[axis0_slicer]
+
+ new_values = self._slice(slicer)
+
+ if self._validate_ndim and new_values.ndim != self.ndim:
+ raise ValueError("Only same dim slicing is allowed")
+
+ return self.make_block_same_class(new_values, new_mgr_locs)
+
+ @property
+ def shape(self):
+ return self.values.shape
+
+ @property
+ def dtype(self):
+ return self.values.dtype
+
+ @property
+ def ftype(self):
+ if getattr(self.values, '_pandas_ftype', False):
+ dtype = self.dtype.subtype
+ else:
+ dtype = self.dtype
+ return "{dtype}:{ftype}".format(dtype=dtype, ftype=self._ftype)
+
+ def merge(self, other):
+ return _merge_blocks([self, other])
+
+ def concat_same_type(self, to_concat, placement=None):
+ """
+ Concatenate list of single blocks of the same type.
+ """
+ values = self._concatenator([blk.values for blk in to_concat],
+ axis=self.ndim - 1)
+ return self.make_block_same_class(
+ values, placement=placement or slice(0, len(values), 1))
+
+ def iget(self, i):
+ return self.values[i]
+
+ def set(self, locs, values):
+ """
+ Modify Block in-place with new item value
+
+ Returns
+ -------
+ None
+ """
+ self.values[locs] = values
+
+ def delete(self, loc):
+ """
+ Delete given loc(-s) from block in-place.
+ """
+ self.values = np.delete(self.values, loc, 0)
+ self.mgr_locs = self.mgr_locs.delete(loc)
+
+ def apply(self, func, **kwargs):
+ """ apply the function to my values; return a block if we are not
+ one
+ """
+ with np.errstate(all='ignore'):
+ result = func(self.values, **kwargs)
+ if not isinstance(result, Block):
+ result = self.make_block(values=_block_shape(result,
+ ndim=self.ndim))
+
+ return result
+
+ def fillna(self, value, limit=None, inplace=False, downcast=None):
+ """ fillna on the block with the value. If we fail, then convert to
+ ObjectBlock and try again
+ """
+ inplace = validate_bool_kwarg(inplace, 'inplace')
+
+ if not self._can_hold_na:
+ if inplace:
+ return self
+ else:
+ return self.copy()
+
+ mask = isna(self.values)
+ if limit is not None:
+ if not is_integer(limit):
+ raise ValueError('Limit must be an integer')
+ if limit < 1:
+ raise ValueError('Limit must be greater than 0')
+ if self.ndim > 2:
+ raise NotImplementedError("number of dimensions for 'fillna' "
+ "is currently limited to 2")
+ mask[mask.cumsum(self.ndim - 1) > limit] = False
+
+ # fillna, but if we cannot coerce, then try again as an ObjectBlock
+ try:
+ values, _ = self._try_coerce_args(self.values, value)
+ blocks = self.putmask(mask, value, inplace=inplace)
+ blocks = [b.make_block(values=self._try_coerce_result(b.values))
+ for b in blocks]
+ return self._maybe_downcast(blocks, downcast)
+ except (TypeError, ValueError):
+
+ # we can't process the value, but nothing to do
+ if not mask.any():
+ return self if inplace else self.copy()
+
+ # operate column-by-column
+ def f(m, v, i):
+ block = self.coerce_to_target_dtype(value)
+
+ # slice out our block
+ if i is not None:
+ block = block.getitem_block(slice(i, i + 1))
+ return block.fillna(value,
+ limit=limit,
+ inplace=inplace,
+ downcast=None)
+
+ return self.split_and_operate(mask, f, inplace)
+
+ def split_and_operate(self, mask, f, inplace):
+ """
+ split the block per-column, and apply the callable f
+ per-column, return a new block for each. Handle
+ masking which will not change a block unless needed.
+
+ Parameters
+ ----------
+ mask : 2-d boolean mask
+ f : callable accepting (1d-mask, 1d values, indexer)
+ inplace : boolean
+
+ Returns
+ -------
+ list of blocks
+ """
+
+ if mask is None:
+ mask = np.ones(self.shape, dtype=bool)
+ new_values = self.values
+
+ def make_a_block(nv, ref_loc):
+ if isinstance(nv, Block):
+ block = nv
+ elif isinstance(nv, list):
+ block = nv[0]
+ else:
+ # Put back the dimension that was taken from it and make
+ # a block out of the result.
+ try:
+ nv = _block_shape(nv, ndim=self.ndim)
+ except (AttributeError, NotImplementedError):
+ pass
+ block = self.make_block(values=nv,
+ placement=ref_loc)
+ return block
+
+ # ndim == 1
+ if self.ndim == 1:
+ if mask.any():
+ nv = f(mask, new_values, None)
+ else:
+ nv = new_values if inplace else new_values.copy()
+ block = make_a_block(nv, self.mgr_locs)
+ return [block]
+
+ # ndim > 1
+ new_blocks = []
+ for i, ref_loc in enumerate(self.mgr_locs):
+ m = mask[i]
+ v = new_values[i]
+
+ # need a new block
+ if m.any():
+ nv = f(m, v, i)
+ else:
+ nv = v if inplace else v.copy()
+
+ block = make_a_block(nv, [ref_loc])
+ new_blocks.append(block)
+
+ return new_blocks
+
+ def _maybe_downcast(self, blocks, downcast=None):
+
+ # no need to downcast our float
+ # unless indicated
+ if downcast is None and self.is_float:
+ return blocks
+ elif downcast is None and (self.is_timedelta or self.is_datetime):
+ return blocks
+
+ if not isinstance(blocks, list):
+ blocks = [blocks]
+ return _extend_blocks([b.downcast(downcast) for b in blocks])
+
+ def downcast(self, dtypes=None):
+ """ try to downcast each item to the dict of dtypes if present """
+
+ # turn it off completely
+ if dtypes is False:
+ return self
+
+ values = self.values
+
+ # single block handling
+ if self._is_single_block:
+
+ # try to cast all non-floats here
+ if dtypes is None:
+ dtypes = 'infer'
+
+ nv = maybe_downcast_to_dtype(values, dtypes)
+ return self.make_block(nv)
+
+ # ndim > 1
+ if dtypes is None:
+ return self
+
+ if not (dtypes == 'infer' or isinstance(dtypes, dict)):
+ raise ValueError("downcast must have a dictionary or 'infer' as "
+ "its argument")
+
+ # operate column-by-column
+ # this is expensive as it splits the blocks items-by-item
+ def f(m, v, i):
+
+ if dtypes == 'infer':
+ dtype = 'infer'
+ else:
+ raise AssertionError("dtypes as dict is not supported yet")
+
+ if dtype is not None:
+ v = maybe_downcast_to_dtype(v, dtype)
+ return v
+
+ return self.split_and_operate(None, f, False)
+
+ def astype(self, dtype, copy=False, errors='raise', values=None, **kwargs):
+ return self._astype(dtype, copy=copy, errors=errors, values=values,
+ **kwargs)
+
+ def _astype(self, dtype, copy=False, errors='raise', values=None,
+ **kwargs):
+ """Coerce to the new type
+
+ Parameters
+ ----------
+ dtype : str, dtype convertible
+ copy : boolean, default False
+ copy if indicated
+ errors : str, {'raise', 'ignore'}, default 'raise'
+ - ``raise`` : allow exceptions to be raised
+ - ``ignore`` : suppress exceptions. On error return original object
+
+ Returns
+ -------
+ Block
+ """
+ errors_legal_values = ('raise', 'ignore')
+
+ if errors not in errors_legal_values:
+ invalid_arg = ("Expected value of kwarg 'errors' to be one of {}. "
+ "Supplied value is '{}'".format(
+ list(errors_legal_values), errors))
+ raise ValueError(invalid_arg)
+
+ if (inspect.isclass(dtype) and
+ issubclass(dtype, (PandasExtensionDtype, ExtensionDtype))):
+ msg = ("Expected an instance of {}, but got the class instead. "
+ "Try instantiating 'dtype'.".format(dtype.__name__))
+ raise TypeError(msg)
+
+ # may need to convert to categorical
+ if self.is_categorical_astype(dtype):
+
+ # deprecated 17636
+ if ('categories' in kwargs or 'ordered' in kwargs):
+ if isinstance(dtype, CategoricalDtype):
+ raise TypeError(
+ "Cannot specify a CategoricalDtype and also "
+ "`categories` or `ordered`. Use "
+ "`dtype=CategoricalDtype(categories, ordered)`"
+ " instead.")
+ warnings.warn("specifying 'categories' or 'ordered' in "
+ ".astype() is deprecated; pass a "
+ "CategoricalDtype instead",
+ FutureWarning, stacklevel=7)
+
+ categories = kwargs.get('categories', None)
+ ordered = kwargs.get('ordered', None)
+ if com._any_not_none(categories, ordered):
+ dtype = CategoricalDtype(categories, ordered)
+
+ if is_categorical_dtype(self.values):
+ # GH 10696/18593: update an existing categorical efficiently
+ return self.make_block(self.values.astype(dtype, copy=copy))
+
+ return self.make_block(Categorical(self.values, dtype=dtype))
+
+ # convert dtypes if needed
+ dtype = pandas_dtype(dtype)
+ # astype processing
+ if is_dtype_equal(self.dtype, dtype):
+ if copy:
+ return self.copy()
+ return self
+
+ klass = None
+ if is_sparse(self.values):
+ # special case sparse, Series[Sparse].astype(object) is sparse
+ klass = ExtensionBlock
+ elif is_object_dtype(dtype):
+ klass = ObjectBlock
+ elif is_extension_array_dtype(dtype):
+ klass = ExtensionBlock
+
+ try:
+ # force the copy here
+ if values is None:
+
+ if self.is_extension:
+ values = self.values.astype(dtype)
+ else:
+ if issubclass(dtype.type,
+ (compat.text_type, compat.string_types)):
+
+ # use native type formatting for datetime/tz/timedelta
+ if self.is_datelike:
+ values = self.to_native_types()
+
+ # astype formatting
+ else:
+ values = self.get_values()
+
+ else:
+ values = self.get_values(dtype=dtype)
+
+ # _astype_nansafe works fine with 1-d only
+ values = astype_nansafe(values.ravel(), dtype, copy=True)
+
+ # TODO(extension)
+ # should we make this attribute?
+ try:
+ values = values.reshape(self.shape)
+ except AttributeError:
+ pass
+
+ newb = make_block(values, placement=self.mgr_locs,
+ klass=klass, ndim=self.ndim)
+ except Exception: # noqa: E722
+ if errors == 'raise':
+ raise
+ newb = self.copy() if copy else self
+
+ if newb.is_numeric and self.is_numeric:
+ if newb.shape != self.shape:
+ raise TypeError(
+ "cannot set astype for copy = [{copy}] for dtype "
+ "({dtype} [{shape}]) to different shape "
+ "({newb_dtype} [{newb_shape}])".format(
+ copy=copy, dtype=self.dtype.name,
+ shape=self.shape, newb_dtype=newb.dtype.name,
+ newb_shape=newb.shape))
+ return newb
+
+ def convert(self, copy=True, **kwargs):
+ """ attempt to coerce any object types to better types; return a copy
+ of the block (if copy = True). By definition we are not an ObjectBlock
+ here!
+ """
+
+ return self.copy() if copy else self
+
+ def _can_hold_element(self, element):
+ """ require the same dtype as ourselves """
+ dtype = self.values.dtype.type
+ tipo = maybe_infer_dtype_type(element)
+ if tipo is not None:
+ return issubclass(tipo.type, dtype)
+ return isinstance(element, dtype)
+
+ def _try_cast_result(self, result, dtype=None):
+ """ try to cast the result to our original type, as we may have
+ round-tripped through object in the meantime
+ """
+ if dtype is None:
+ dtype = self.dtype
+
+ if self.is_integer or self.is_bool or self.is_datetime:
+ pass
+ elif self.is_float and result.dtype == self.dtype:
+
+ # protect against a bool/object showing up here
+ if isinstance(dtype, compat.string_types) and dtype == 'infer':
+ return result
+ if not isinstance(dtype, type):
+ dtype = dtype.type
+ if issubclass(dtype, (np.bool_, np.object_)):
+ if issubclass(dtype, np.bool_):
+ if isna(result).all():
+ return result.astype(np.bool_)
+ else:
+ result = result.astype(np.object_)
+ result[result == 1] = True
+ result[result == 0] = False
+ return result
+ else:
+ return result.astype(np.object_)
+
+ return result
+
+ # may need to change the dtype here
+ return maybe_downcast_to_dtype(result, dtype)
+
+ def _try_coerce_args(self, values, other):
+ """ provide coercion to our input arguments """
+
+ if np.any(notna(other)) and not self._can_hold_element(other):
+ # coercion issues
+ # let higher levels handle
+ raise TypeError("cannot convert {} to an {}".format(
+ type(other).__name__,
+ type(self).__name__.lower().replace('Block', '')))
+
+ return values, other
+
+ def _try_coerce_result(self, result):
+ """ reverse of try_coerce_args """
+ return result
+
+ def _try_coerce_and_cast_result(self, result, dtype=None):
+ result = self._try_coerce_result(result)
+ result = self._try_cast_result(result, dtype=dtype)
+ return result
+
+ def to_native_types(self, slicer=None, na_rep='nan', quoting=None,
+ **kwargs):
+ """ convert to our native types format, slicing if desired """
+
+ values = self.get_values()
+
+ if slicer is not None:
+ values = values[:, slicer]
+ mask = isna(values)
+
+ if not self.is_object and not quoting:
+ values = values.astype(str)
+ else:
+ values = np.array(values, dtype='object')
+
+ values[mask] = na_rep
+ return values
+
+ # block actions ####
+ def copy(self, deep=True):
+ """ copy constructor """
+ values = self.values
+ if deep:
+ values = values.copy()
+ return self.make_block_same_class(values)
+
+ def replace(self, to_replace, value, inplace=False, filter=None,
+ regex=False, convert=True):
+ """replace the to_replace value with value, possible to create new
+ blocks here this is just a call to putmask. regex is not used here.
+ It is used in ObjectBlocks. It is here for API compatibility.
+ """
+
+ inplace = validate_bool_kwarg(inplace, 'inplace')
+ original_to_replace = to_replace
+
+ # try to replace, if we raise an error, convert to ObjectBlock and
+ # retry
+ try:
+ values, to_replace = self._try_coerce_args(self.values,
+ to_replace)
+ mask = missing.mask_missing(values, to_replace)
+ if filter is not None:
+ filtered_out = ~self.mgr_locs.isin(filter)
+ mask[filtered_out.nonzero()[0]] = False
+
+ blocks = self.putmask(mask, value, inplace=inplace)
+ if convert:
+ blocks = [b.convert(by_item=True, numeric=False,
+ copy=not inplace) for b in blocks]
+ return blocks
+ except (TypeError, ValueError):
+ # GH 22083, TypeError or ValueError occurred within error handling
+ # causes infinite loop. Cast and retry only if not objectblock.
+ if is_object_dtype(self):
+ raise
+
+ # try again with a compatible block
+ block = self.astype(object)
+ return block.replace(to_replace=original_to_replace,
+ value=value,
+ inplace=inplace,
+ filter=filter,
+ regex=regex,
+ convert=convert)
+
+ def _replace_single(self, *args, **kwargs):
+ """ no-op on a non-ObjectBlock """
+ return self if kwargs['inplace'] else self.copy()
+
+ def setitem(self, indexer, value):
+ """Set the value inplace, returning a a maybe different typed block.
+
+ Parameters
+ ----------
+ indexer : tuple, list-like, array-like, slice
+ The subset of self.values to set
+ value : object
+ The value being set
+
+ Returns
+ -------
+ Block
+
+ Notes
+ -----
+ `indexer` is a direct slice/positional indexer. `value` must
+ be a compatible shape.
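+
+        Examples
+        --------
+        A minimal user-level sketch (only the public pandas API is assumed;
+        the upcast shown is the typical outcome when the block cannot hold
+        the new value):
+
+        >>> import pandas as pd
+        >>> s = pd.Series([1, 2, 3])
+        >>> s[0] = 1.5
+        >>> s.dtype
+        dtype('float64')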
+ """
+ # coerce None values, if appropriate
+ if value is None:
+ if self.is_numeric:
+ value = np.nan
+
+ # coerce if block dtype can store value
+ values = self.values
+ try:
+ values, value = self._try_coerce_args(values, value)
+ # can keep its own dtype
+ if hasattr(value, 'dtype') and is_dtype_equal(values.dtype,
+ value.dtype):
+ dtype = self.dtype
+ else:
+ dtype = 'infer'
+
+ except (TypeError, ValueError):
+ # current dtype cannot store value, coerce to common dtype
+ find_dtype = False
+
+ if hasattr(value, 'dtype'):
+ dtype = value.dtype
+ find_dtype = True
+
+ elif lib.is_scalar(value):
+ if isna(value):
+ # NaN promotion is handled in latter path
+ dtype = False
+ else:
+ dtype, _ = infer_dtype_from_scalar(value,
+ pandas_dtype=True)
+ find_dtype = True
+ else:
+ dtype = 'infer'
+
+ if find_dtype:
+ dtype = find_common_type([values.dtype, dtype])
+ if not is_dtype_equal(self.dtype, dtype):
+ b = self.astype(dtype)
+ return b.setitem(indexer, value)
+
+ # value must be storeable at this moment
+ arr_value = np.array(value)
+
+ # cast the values to a type that can hold nan (if necessary)
+ if not self._can_hold_element(value):
+ dtype, _ = maybe_promote(arr_value.dtype)
+ values = values.astype(dtype)
+
+ transf = (lambda x: x.T) if self.ndim == 2 else (lambda x: x)
+ values = transf(values)
+
+ # length checking
+ check_setitem_lengths(indexer, value, values)
+
+ def _is_scalar_indexer(indexer):
+ # return True if we are all scalar indexers
+
+ if arr_value.ndim == 1:
+ if not isinstance(indexer, tuple):
+ indexer = tuple([indexer])
+ return any(isinstance(idx, np.ndarray) and len(idx) == 0
+ for idx in indexer)
+ return False
+
+ def _is_empty_indexer(indexer):
+ # return a boolean if we have an empty indexer
+
+ if is_list_like(indexer) and not len(indexer):
+ return True
+ if arr_value.ndim == 1:
+ if not isinstance(indexer, tuple):
+ indexer = tuple([indexer])
+ return any(isinstance(idx, np.ndarray) and len(idx) == 0
+ for idx in indexer)
+ return False
+
+ # empty indexers
+ # 8669 (empty)
+ if _is_empty_indexer(indexer):
+ pass
+
+ # setting a single element for each dim and with a rhs that could
+ # be say a list
+ # GH 6043
+ elif _is_scalar_indexer(indexer):
+ values[indexer] = value
+
+ # if we are an exact match (ex-broadcasting),
+ # then use the resultant dtype
+ elif (len(arr_value.shape) and
+ arr_value.shape[0] == values.shape[0] and
+ np.prod(arr_value.shape) == np.prod(values.shape)):
+ values[indexer] = value
+ try:
+ values = values.astype(arr_value.dtype)
+ except ValueError:
+ pass
+
+ # set
+ else:
+ values[indexer] = value
+
+ # coerce and try to infer the dtypes of the result
+ values = self._try_coerce_and_cast_result(values, dtype)
+ block = self.make_block(transf(values))
+ return block
+
+ def putmask(self, mask, new, align=True, inplace=False, axis=0,
+ transpose=False):
+ """ putmask the data to the block; it is possible that we may create a
+ new dtype of block
+
+ return the resulting block(s)
+
+ Parameters
+ ----------
+ mask : the condition to respect
+ new : a ndarray/object
+ align : boolean, perform alignment on other/cond, default is True
+ inplace : perform inplace modification, default is False
+ axis : int
+ transpose : boolean
+ Set to True if self is stored with axes reversed
+
+ Returns
+ -------
+ a list of new blocks, the result of the putmask
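+
+        Examples
+        --------
+        Roughly, a boolean-mask assignment at the user level exercises this
+        path (public pandas API only; the float upcast is the typical
+        outcome):
+
+        >>> import pandas as pd
+        >>> s = pd.Series([1, 2, 3])
+        >>> s[s > 1] = 9.5
+        >>> s.tolist()
+        [1.0, 9.5, 9.5]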
+ """
+
+ new_values = self.values if inplace else self.values.copy()
+
+ new = getattr(new, 'values', new)
+ mask = getattr(mask, 'values', mask)
+
+ # if we are passed a scalar None, convert it here
+ if not is_list_like(new) and isna(new) and not self.is_object:
+ new = self.fill_value
+
+ if self._can_hold_element(new):
+ _, new = self._try_coerce_args(new_values, new)
+
+ if transpose:
+ new_values = new_values.T
+
+ # If the default repeat behavior in np.putmask would go in the
+ # wrong direction, then explicitly repeat and reshape new instead
+ if getattr(new, 'ndim', 0) >= 1:
+ if self.ndim - 1 == new.ndim and axis == 1:
+ new = np.repeat(
+ new, new_values.shape[-1]).reshape(self.shape)
+ new = new.astype(new_values.dtype)
+
+            # we require an exact match between the length of the values
+            # we are setting and the masked positions (or a compatible
+            # length). np.putmask doesn't check this and will simply
+            # truncate / pad the output, but we want sane error messages
+            #
+            # TODO: this probably needs some better checking
+            # for 2D cases
+ if ((is_list_like(new) and
+ np.any(mask[mask]) and
+ getattr(new, 'ndim', 1) == 1)):
+
+ if not (mask.shape[-1] == len(new) or
+ mask[mask].shape[-1] == len(new) or
+ len(new) == 1):
+ raise ValueError("cannot assign mismatch "
+ "length to masked array")
+
+ np.putmask(new_values, mask, new)
+
+ # maybe upcast me
+ elif mask.any():
+ if transpose:
+ mask = mask.T
+ if isinstance(new, np.ndarray):
+ new = new.T
+ axis = new_values.ndim - axis - 1
+
+ # Pseudo-broadcast
+ if getattr(new, 'ndim', 0) >= 1:
+ if self.ndim - 1 == new.ndim:
+ new_shape = list(new.shape)
+ new_shape.insert(axis, 1)
+ new = new.reshape(tuple(new_shape))
+
+ # operate column-by-column
+ def f(m, v, i):
+
+ if i is None:
+ # ndim==1 case.
+ n = new
+ else:
+
+ if isinstance(new, np.ndarray):
+ n = np.squeeze(new[i % new.shape[0]])
+ else:
+ n = np.array(new)
+
+ # type of the new block
+ dtype, _ = maybe_promote(n.dtype)
+
+ # we need to explicitly astype here to make a copy
+ n = n.astype(dtype)
+
+ nv = _putmask_smart(v, m, n)
+ return nv
+
+ new_blocks = self.split_and_operate(mask, f, inplace)
+ return new_blocks
+
+ if inplace:
+ return [self]
+
+ if transpose:
+ new_values = new_values.T
+
+ return [self.make_block(new_values)]
+
+ def coerce_to_target_dtype(self, other):
+ """
+ coerce the current block to a dtype compat for other
+ we will return a block, possibly object, and not raise
+
+ we can also safely try to coerce to the same dtype
+ and will receive the same block
+ """
+
+ # if we cannot then coerce to object
+ dtype, _ = infer_dtype_from(other, pandas_dtype=True)
+
+ if is_dtype_equal(self.dtype, dtype):
+ return self
+
+ if self.is_bool or is_object_dtype(dtype) or is_bool_dtype(dtype):
+ # we don't upcast to bool
+ return self.astype(object)
+
+ elif ((self.is_float or self.is_complex) and
+ (is_integer_dtype(dtype) or is_float_dtype(dtype))):
+ # don't coerce float/complex to int
+ return self
+
+ elif (self.is_datetime or
+ is_datetime64_dtype(dtype) or
+ is_datetime64tz_dtype(dtype)):
+
+ # not a datetime
+ if not ((is_datetime64_dtype(dtype) or
+ is_datetime64tz_dtype(dtype)) and self.is_datetime):
+ return self.astype(object)
+
+ # don't upcast timezone with different timezone or no timezone
+ mytz = getattr(self.dtype, 'tz', None)
+ othertz = getattr(dtype, 'tz', None)
+
+ if str(mytz) != str(othertz):
+ return self.astype(object)
+
+ raise AssertionError("possible recursion in "
+ "coerce_to_target_dtype: {} {}".format(
+ self, other))
+
+ elif (self.is_timedelta or is_timedelta64_dtype(dtype)):
+
+ # not a timedelta
+ if not (is_timedelta64_dtype(dtype) and self.is_timedelta):
+ return self.astype(object)
+
+ raise AssertionError("possible recursion in "
+ "coerce_to_target_dtype: {} {}".format(
+ self, other))
+
+ try:
+ return self.astype(dtype)
+ except (ValueError, TypeError, OverflowError):
+ pass
+
+ return self.astype(object)
+
+ def interpolate(self, method='pad', axis=0, index=None, values=None,
+ inplace=False, limit=None, limit_direction='forward',
+ limit_area=None, fill_value=None, coerce=False,
+ downcast=None, **kwargs):
+
+ inplace = validate_bool_kwarg(inplace, 'inplace')
+
+ def check_int_bool(self, inplace):
+ # Only FloatBlocks will contain NaNs.
+ # timedelta subclasses IntBlock
+ if (self.is_bool or self.is_integer) and not self.is_timedelta:
+ if inplace:
+ return self
+ else:
+ return self.copy()
+
+ # a fill na type method
+ try:
+ m = missing.clean_fill_method(method)
+ except ValueError:
+ m = None
+
+ if m is not None:
+ r = check_int_bool(self, inplace)
+ if r is not None:
+ return r
+ return self._interpolate_with_fill(method=m, axis=axis,
+ inplace=inplace, limit=limit,
+ fill_value=fill_value,
+ coerce=coerce,
+ downcast=downcast)
+ # try an interp method
+ try:
+ m = missing.clean_interp_method(method, **kwargs)
+ except ValueError:
+ m = None
+
+ if m is not None:
+ r = check_int_bool(self, inplace)
+ if r is not None:
+ return r
+ return self._interpolate(method=m, index=index, values=values,
+ axis=axis, limit=limit,
+ limit_direction=limit_direction,
+ limit_area=limit_area,
+ fill_value=fill_value, inplace=inplace,
+ downcast=downcast, **kwargs)
+
+ raise ValueError("invalid method '{0}' to interpolate.".format(method))
+
+ def _interpolate_with_fill(self, method='pad', axis=0, inplace=False,
+ limit=None, fill_value=None, coerce=False,
+ downcast=None):
+ """ fillna but using the interpolate machinery """
+
+ inplace = validate_bool_kwarg(inplace, 'inplace')
+
+ # if we are coercing, then don't force the conversion
+ # if the block can't hold the type
+ if coerce:
+ if not self._can_hold_na:
+ if inplace:
+ return [self]
+ else:
+ return [self.copy()]
+
+ values = self.values if inplace else self.values.copy()
+ values, fill_value = self._try_coerce_args(values, fill_value)
+ values = missing.interpolate_2d(values, method=method, axis=axis,
+ limit=limit, fill_value=fill_value,
+ dtype=self.dtype)
+ values = self._try_coerce_result(values)
+
+ blocks = [self.make_block_same_class(values, ndim=self.ndim)]
+ return self._maybe_downcast(blocks, downcast)
+
+ def _interpolate(self, method=None, index=None, values=None,
+ fill_value=None, axis=0, limit=None,
+ limit_direction='forward', limit_area=None,
+ inplace=False, downcast=None, **kwargs):
+ """ interpolate using scipy wrappers """
+
+ inplace = validate_bool_kwarg(inplace, 'inplace')
+ data = self.values if inplace else self.values.copy()
+
+ # only deal with floats
+ if not self.is_float:
+ if not self.is_integer:
+ return self
+ data = data.astype(np.float64)
+
+ if fill_value is None:
+ fill_value = self.fill_value
+
+ if method in ('krogh', 'piecewise_polynomial', 'pchip'):
+ if not index.is_monotonic:
+ raise ValueError("{0} interpolation requires that the "
+ "index be monotonic.".format(method))
+ # process 1-d slices in the axis direction
+
+ def func(x):
+
+ # process a 1-d slice, returning it
+ # should the axis argument be handled below in apply_along_axis?
+ # i.e. not an arg to missing.interpolate_1d
+ return missing.interpolate_1d(index, x, method=method, limit=limit,
+ limit_direction=limit_direction,
+ limit_area=limit_area,
+ fill_value=fill_value,
+ bounds_error=False, **kwargs)
+
+ # interp each column independently
+ interp_values = np.apply_along_axis(func, axis, data)
+
+ blocks = [self.make_block_same_class(interp_values)]
+ return self._maybe_downcast(blocks, downcast)
+
+ def take_nd(self, indexer, axis, new_mgr_locs=None, fill_tuple=None):
+ """
+        Take values according to indexer and return them as a block.
+
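+        For example, at the user level (public pandas API only):
+
+        >>> import pandas as pd
+        >>> pd.Series([10, 20, 30]).take([2, 0]).tolist()
+        [30, 10]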
+ """
+
+ # algos.take_nd dispatches for DatetimeTZBlock, CategoricalBlock
+ # so need to preserve types
+ # sparse is treated like an ndarray, but needs .get_values() shaping
+
+ values = self.values
+ if self.is_sparse:
+ values = self.get_values()
+
+ if fill_tuple is None:
+ fill_value = self.fill_value
+ new_values = algos.take_nd(values, indexer, axis=axis,
+ allow_fill=False, fill_value=fill_value)
+ else:
+ fill_value = fill_tuple[0]
+ new_values = algos.take_nd(values, indexer, axis=axis,
+ allow_fill=True, fill_value=fill_value)
+
+ if new_mgr_locs is None:
+ if axis == 0:
+ slc = libinternals.indexer_as_slice(indexer)
+ if slc is not None:
+ new_mgr_locs = self.mgr_locs[slc]
+ else:
+ new_mgr_locs = self.mgr_locs[indexer]
+ else:
+ new_mgr_locs = self.mgr_locs
+
+ if not is_dtype_equal(new_values.dtype, self.dtype):
+ return self.make_block(new_values, new_mgr_locs)
+ else:
+ return self.make_block_same_class(new_values, new_mgr_locs)
+
+ def diff(self, n, axis=1):
+ """ return block for the diff of the values """
+ new_values = algos.diff(self.values, n, axis=axis)
+ return [self.make_block(values=new_values)]
+
+ def shift(self, periods, axis=0, fill_value=None):
+ """ shift the block by periods, possibly upcast """
+
+ # convert integer to float if necessary. need to do a lot more than
+ # that, handle boolean etc also
+ new_values, fill_value = maybe_upcast(self.values, fill_value)
+
+ # make sure array sent to np.roll is c_contiguous
+ f_ordered = new_values.flags.f_contiguous
+ if f_ordered:
+ new_values = new_values.T
+ axis = new_values.ndim - axis - 1
+
+ if np.prod(new_values.shape):
+ new_values = np.roll(new_values, ensure_platform_int(periods),
+ axis=axis)
+
+ axis_indexer = [slice(None)] * self.ndim
+ if periods > 0:
+ axis_indexer[axis] = slice(None, periods)
+ else:
+ axis_indexer[axis] = slice(periods, None)
+ new_values[tuple(axis_indexer)] = fill_value
+
+ # restore original order
+ if f_ordered:
+ new_values = new_values.T
+
+ return [self.make_block(new_values)]
+
+ def where(self, other, cond, align=True, errors='raise',
+ try_cast=False, axis=0, transpose=False):
+ """
+ evaluate the block; return result block(s) from the result
+
+ Parameters
+ ----------
+ other : a ndarray/object
+ cond : the condition to respect
+ align : boolean, perform alignment on other/cond
+ errors : str, {'raise', 'ignore'}, default 'raise'
+ - ``raise`` : allow exceptions to be raised
+ - ``ignore`` : suppress exceptions. On error return original object
+
+ axis : int
+ transpose : boolean
+ Set to True if self is stored with axes reversed
+
+ Returns
+ -------
+ a new block(s), the result of the func
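+
+        Examples
+        --------
+        A user-level sketch (public pandas API only; NaN forces the typical
+        upcast to float):
+
+        >>> import pandas as pd
+        >>> s = pd.Series([1, 2, 3])
+        >>> s.where(s > 1).tolist()
+        [nan, 2.0, 3.0]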
+ """
+ import pandas.core.computation.expressions as expressions
+ assert errors in ['raise', 'ignore']
+
+ values = self.values
+ orig_other = other
+ if transpose:
+ values = values.T
+
+ other = getattr(other, '_values', getattr(other, 'values', other))
+ cond = getattr(cond, 'values', cond)
+
+ # If the default broadcasting would go in the wrong direction, then
+ # explicitly reshape other instead
+ if getattr(other, 'ndim', 0) >= 1:
+ if values.ndim - 1 == other.ndim and axis == 1:
+ other = other.reshape(tuple(other.shape + (1, )))
+ elif transpose and values.ndim == self.ndim - 1:
+ cond = cond.T
+
+ if not hasattr(cond, 'shape'):
+ raise ValueError("where must have a condition that is ndarray "
+ "like")
+
+ # our where function
+ def func(cond, values, other):
+ if cond.ravel().all():
+ return values
+
+ values, other = self._try_coerce_args(values, other)
+
+ try:
+ return self._try_coerce_result(expressions.where(
+ cond, values, other))
+ except Exception as detail:
+ if errors == 'raise':
+ raise TypeError(
+ 'Could not operate [{other!r}] with block values '
+ '[{detail!s}]'.format(other=other, detail=detail))
+ else:
+ # return the values
+ result = np.empty(values.shape, dtype='float64')
+ result.fill(np.nan)
+ return result
+
+ # see if we can operate on the entire block, or need item-by-item
+ # or if we are a single block (ndim == 1)
+ try:
+ result = func(cond, values, other)
+ except TypeError:
+
+ # we cannot coerce, return a compat dtype
+ # we are explicitly ignoring errors
+ block = self.coerce_to_target_dtype(other)
+ blocks = block.where(orig_other, cond, align=align,
+ errors=errors,
+ try_cast=try_cast, axis=axis,
+ transpose=transpose)
+ return self._maybe_downcast(blocks, 'infer')
+
+ if self._can_hold_na or self.ndim == 1:
+
+ if transpose:
+ result = result.T
+
+ # try to cast if requested
+ if try_cast:
+ result = self._try_cast_result(result)
+
+ return self.make_block(result)
+
+ # might need to separate out blocks
+ axis = cond.ndim - 1
+ cond = cond.swapaxes(axis, 0)
+ mask = np.array([cond[i].all() for i in range(cond.shape[0])],
+ dtype=bool)
+
+ result_blocks = []
+ for m in [mask, ~mask]:
+ if m.any():
+ r = self._try_cast_result(result.take(m.nonzero()[0],
+ axis=axis))
+ result_blocks.append(
+ self.make_block(r.T, placement=self.mgr_locs[m]))
+
+ return result_blocks
+
+ def equals(self, other):
+ if self.dtype != other.dtype or self.shape != other.shape:
+ return False
+ return array_equivalent(self.values, other.values)
+
+ def _unstack(self, unstacker_func, new_columns, n_rows, fill_value):
+ """Return a list of unstacked blocks of self
+
+ Parameters
+ ----------
+ unstacker_func : callable
+ Partially applied unstacker.
+ new_columns : Index
+ All columns of the unstacked BlockManager.
+ n_rows : int
+ Only used in ExtensionBlock.unstack
+ fill_value : int
+ Only used in ExtensionBlock.unstack
+
+ Returns
+ -------
+ blocks : list of Block
+ New blocks of unstacked values.
+ mask : array_like of bool
+ The mask of columns of `blocks` we should keep.
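+
+        Examples
+        --------
+        A user-level sketch of the operation this backs (public pandas API
+        only):
+
+        >>> import pandas as pd
+        >>> s = pd.Series([1, 2, 3, 4],
+        ...               index=pd.MultiIndex.from_product([["a", "b"],
+        ...                                                 [0, 1]]))
+        >>> s.unstack().shape
+        (2, 2)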
+ """
+ unstacker = unstacker_func(self.values.T)
+ new_items = unstacker.get_new_columns()
+ new_placement = new_columns.get_indexer(new_items)
+ new_values, mask = unstacker.get_new_values()
+
+ mask = mask.any(0)
+ new_values = new_values.T[mask]
+ new_placement = new_placement[mask]
+
+ blocks = [make_block(new_values, placement=new_placement)]
+ return blocks, mask
+
+ def quantile(self, qs, interpolation='linear', axis=0):
+ """
+        compute the quantiles of the block
+
+ Parameters
+ ----------
+        qs : scalar or list
+            The quantile(s) to be computed.
+        interpolation : str, default 'linear'
+            Type of interpolation.
+        axis : int, default 0
+            Axis to compute on.
+
+ Returns
+ -------
+ Block
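+
+        Examples
+        --------
+        A user-level sketch (public pandas API only):
+
+        >>> import pandas as pd
+        >>> pd.Series([1, 2, 3, 4]).quantile([0.25, 0.5]).tolist()
+        [1.75, 2.5]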
+ """
+ if self.is_datetimetz:
+ # TODO: cleanup this special case.
+ # We need to operate on i8 values for datetimetz
+ # but `Block.get_values()` returns an ndarray of objects
+ # right now. We need an API for "values to do numeric-like ops on"
+ values = self.values.asi8
+
+ # TODO: NonConsolidatableMixin shape
+ # Usual shape inconsistencies for ExtensionBlocks
+ if self.ndim > 1:
+ values = values[None, :]
+ else:
+ values = self.get_values()
+ values, _ = self._try_coerce_args(values, values)
+
+ is_empty = values.shape[axis] == 0
+ orig_scalar = not is_list_like(qs)
+ if orig_scalar:
+ # make list-like, unpack later
+ qs = [qs]
+
+ if is_empty:
+ if self.ndim == 1:
+ result = self._na_value
+ else:
+ # create the array of na_values
+ # 2d len(values) * len(qs)
+ result = np.repeat(np.array([self.fill_value] * len(qs)),
+ len(values)).reshape(len(values),
+ len(qs))
+ else:
+ # asarray needed for Sparse, see GH#24600
+ # TODO: Why self.values and not values?
+ mask = np.asarray(isna(self.values))
+ result = nanpercentile(values, np.array(qs) * 100,
+ axis=axis, na_value=self.fill_value,
+ mask=mask, ndim=self.ndim,
+ interpolation=interpolation)
+
+ result = np.array(result, copy=False)
+ if self.ndim > 1:
+ result = result.T
+
+ if orig_scalar and not lib.is_scalar(result):
+ # result could be scalar in case with is_empty and self.ndim == 1
+ assert result.shape[-1] == 1, result.shape
+ result = result[..., 0]
+ result = lib.item_from_zerodim(result)
+
+ ndim = getattr(result, 'ndim', None) or 0
+ result = self._try_coerce_result(result)
+ return make_block(result,
+ placement=np.arange(len(result)),
+ ndim=ndim)
+
+ def _replace_coerce(self, to_replace, value, inplace=True, regex=False,
+ convert=False, mask=None):
+ """
+ Replace value corresponding to the given boolean array with another
+ value.
+
+ Parameters
+ ----------
+ to_replace : object or pattern
+ Scalar to replace or regular expression to match.
+ value : object
+ Replacement object.
+ inplace : bool, default False
+ Perform inplace modification.
+ regex : bool, default False
+ If true, perform regular expression substitution.
+ convert : bool, default True
+ If true, try to coerce any object types to better types.
+ mask : array-like of bool, optional
+            True indicates the corresponding element is ignored.
+
+ Returns
+ -------
+ A new block if there is anything to replace or the original block.
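+
+        Examples
+        --------
+        Roughly, a user-level replacement that cannot be held by the
+        original dtype coerces to object (public pandas API only):
+
+        >>> import pandas as pd
+        >>> pd.Series([1, 2, 3]).replace(2, "two").tolist()
+        [1, 'two', 3]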
+ """
+
+ if mask.any():
+ if not regex:
+ self = self.coerce_to_target_dtype(value)
+ return self.putmask(mask, value, inplace=inplace)
+ else:
+ return self._replace_single(to_replace, value, inplace=inplace,
+ regex=regex,
+ convert=convert,
+ mask=mask)
+ return self
+
+
+class NonConsolidatableMixIn(object):
+ """ hold methods for the nonconsolidatable blocks """
+ _can_consolidate = False
+ _verify_integrity = False
+ _validate_ndim = False
+
+ def __init__(self, values, placement, ndim=None):
+ """Initialize a non-consolidatable block.
+
+ 'ndim' may be inferred from 'placement'.
+
+        This will continue to call __init__ for the other base
+ classes mixed in with this Mixin.
+ """
+ # Placement must be converted to BlockPlacement so that we can check
+ # its length
+ if not isinstance(placement, libinternals.BlockPlacement):
+ placement = libinternals.BlockPlacement(placement)
+
+ # Maybe infer ndim from placement
+ if ndim is None:
+ if len(placement) != 1:
+ ndim = 1
+ else:
+ ndim = 2
+ super(NonConsolidatableMixIn, self).__init__(values, placement,
+ ndim=ndim)
+
+ @property
+ def shape(self):
+ if self.ndim == 1:
+ return (len(self.values)),
+ return (len(self.mgr_locs), len(self.values))
+
+ def iget(self, col):
+
+ if self.ndim == 2 and isinstance(col, tuple):
+ col, loc = col
+ if not com.is_null_slice(col) and col != 0:
+ raise IndexError("{0} only contains one item".format(self))
+ return self.values[loc]
+ else:
+ if col != 0:
+ raise IndexError("{0} only contains one item".format(self))
+ return self.values
+
+ def should_store(self, value):
+ return isinstance(value, self._holder)
+
+ def set(self, locs, values, check=False):
+ assert locs.tolist() == [0]
+ self.values = values
+
+ def putmask(self, mask, new, align=True, inplace=False, axis=0,
+ transpose=False):
+ """
+ putmask the data to the block; we must be a single block and not
+ generate other blocks
+
+ return the resulting block
+
+ Parameters
+ ----------
+ mask : the condition to respect
+ new : a ndarray/object
+ align : boolean, perform alignment on other/cond, default is True
+ inplace : perform inplace modification, default is False
+
+ Returns
+ -------
+ a new block, the result of the putmask
+ """
+ inplace = validate_bool_kwarg(inplace, 'inplace')
+
+ # use block's copy logic.
+ # .values may be an Index which does shallow copy by default
+ new_values = self.values if inplace else self.copy().values
+ new_values, new = self._try_coerce_args(new_values, new)
+
+ if isinstance(new, np.ndarray) and len(new) == len(mask):
+ new = new[mask]
+
+ mask = _safe_reshape(mask, new_values.shape)
+
+ new_values[mask] = new
+ new_values = self._try_coerce_result(new_values)
+ return [self.make_block(values=new_values)]
+
+ def _try_cast_result(self, result, dtype=None):
+ return result
+
+ def _get_unstack_items(self, unstacker, new_columns):
+ """
+ Get the placement, values, and mask for a Block unstack.
+
+ This is shared between ObjectBlock and ExtensionBlock. They
+ differ in that ObjectBlock passes the values, while ExtensionBlock
+ passes the dummy ndarray of positions to be used by a take
+ later.
+
+ Parameters
+ ----------
+ unstacker : pandas.core.reshape.reshape._Unstacker
+ new_columns : Index
+ All columns of the unstacked BlockManager.
+
+ Returns
+ -------
+ new_placement : ndarray[int]
+ The placement of the new columns in `new_columns`.
+ new_values : Union[ndarray, ExtensionArray]
+ The first return value from _Unstacker.get_new_values.
+ mask : ndarray[bool]
+ The second return value from _Unstacker.get_new_values.
+ """
+ # shared with ExtensionBlock
+ new_items = unstacker.get_new_columns()
+ new_placement = new_columns.get_indexer(new_items)
+ new_values, mask = unstacker.get_new_values()
+
+ mask = mask.any(0)
+ return new_placement, new_values, mask
+
+
+class ExtensionBlock(NonConsolidatableMixIn, Block):
+ """Block for holding extension types.
+
+ Notes
+ -----
+ This holds all 3rd-party extension array types. It's also the immediate
+ parent class for our internal extension types' blocks, CategoricalBlock.
+
+ ExtensionArrays are limited to 1-D.
+ """
+ is_extension = True
+
+ def __init__(self, values, placement, ndim=None):
+ values = self._maybe_coerce_values(values)
+ super(ExtensionBlock, self).__init__(values, placement, ndim)
+
+ def _maybe_coerce_values(self, values):
+ """Unbox to an extension array.
+
+ This will unbox an ExtensionArray stored in an Index or Series.
+ ExtensionArrays pass through. No dtype coercion is done.
+
+ Parameters
+ ----------
+ values : Index, Series, ExtensionArray
+
+ Returns
+ -------
+ ExtensionArray
+ """
+ if isinstance(values, (ABCIndexClass, ABCSeries)):
+ values = values._values
+ return values
+
+ @property
+ def _holder(self):
+ # For extension blocks, the holder is values-dependent.
+ return type(self.values)
+
+ @property
+ def fill_value(self):
+ # Used in reindex_indexer
+ return self.values.dtype.na_value
+
+ @property
+ def _can_hold_na(self):
+ # The default ExtensionArray._can_hold_na is True
+ return self._holder._can_hold_na
+
+ @property
+ def is_view(self):
+ """Extension arrays are never treated as views."""
+ return False
+
+ @property
+ def is_numeric(self):
+ return self.values.dtype._is_numeric
+
+ def setitem(self, indexer, value):
+ """Set the value inplace, returning a same-typed block.
+
+ This differs from Block.setitem by not allowing setitem to change
+ the dtype of the Block.
+
+ Parameters
+ ----------
+ indexer : tuple, list-like, array-like, slice
+ The subset of self.values to set
+ value : object
+ The value being set
+
+ Returns
+ -------
+ Block
+
+ Notes
+ -----
+ `indexer` is a direct slice/positional indexer. `value` must
+ be a compatible shape.
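+
+        Examples
+        --------
+        A user-level sketch with a categorical (public pandas API only);
+        the dtype is preserved:
+
+        >>> import pandas as pd
+        >>> s = pd.Series(pd.Categorical(["a", "b"], categories=["a", "b"]))
+        >>> s[0] = "b"
+        >>> s.dtype
+        CategoricalDtype(categories=['a', 'b'], ordered=False)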
+ """
+ if isinstance(indexer, tuple):
+ # we are always 1-D
+ indexer = indexer[0]
+
+ check_setitem_lengths(indexer, value, self.values)
+ self.values[indexer] = value
+ return self
+
+ def get_values(self, dtype=None):
+ # ExtensionArrays must be iterable, so this works.
+ values = np.asarray(self.values)
+ if values.ndim == self.ndim - 1:
+ values = values.reshape((1,) + values.shape)
+ return values
+
+ def to_dense(self):
+ return np.asarray(self.values)
+
+ def take_nd(self, indexer, axis=0, new_mgr_locs=None, fill_tuple=None):
+ """
+ Take values according to indexer and return them as a block.
+ """
+ if fill_tuple is None:
+ fill_value = None
+ else:
+ fill_value = fill_tuple[0]
+
+        # axis doesn't matter; we are really a single-dim object
+        # but are passed the axis depending on the calling routine
+        # if it's REALLY axis 0, then this will be a reindex and not a take
+ new_values = self.values.take(indexer, fill_value=fill_value,
+ allow_fill=True)
+
+ if self.ndim == 1 and new_mgr_locs is None:
+ new_mgr_locs = [0]
+ else:
+ if new_mgr_locs is None:
+ new_mgr_locs = self.mgr_locs
+
+ return self.make_block_same_class(new_values, new_mgr_locs)
+
+ def _can_hold_element(self, element):
+ # XXX: We may need to think about pushing this onto the array.
+ # We're doing the same as CategoricalBlock here.
+ return True
+
+ def _slice(self, slicer):
+ """ return a slice of my values """
+
+ # slice the category
+ # return same dims as we currently have
+
+ if isinstance(slicer, tuple) and len(slicer) == 2:
+ if not com.is_null_slice(slicer[0]):
+ raise AssertionError("invalid slicing for a 1-ndim "
+ "categorical")
+ slicer = slicer[1]
+
+ return self.values[slicer]
+
+ def formatting_values(self):
+ # Deprecating the ability to override _formatting_values.
+        # Do the warning here, it's only used in pandas, since we
+ # have to check if the subclass overrode it.
+ fv = getattr(type(self.values), '_formatting_values', None)
+ if fv and fv != ExtensionArray._formatting_values:
+ msg = (
+ "'ExtensionArray._formatting_values' is deprecated. "
+ "Specify 'ExtensionArray._formatter' instead."
+ )
+ warnings.warn(msg, DeprecationWarning, stacklevel=10)
+ return self.values._formatting_values()
+
+ return self.values
+
+ def concat_same_type(self, to_concat, placement=None):
+ """
+ Concatenate list of single blocks of the same type.
+ """
+ values = self._holder._concat_same_type(
+ [blk.values for blk in to_concat])
+ placement = placement or slice(0, len(values), 1)
+ return self.make_block_same_class(values, ndim=self.ndim,
+ placement=placement)
+
+ def fillna(self, value, limit=None, inplace=False, downcast=None):
+ values = self.values if inplace else self.values.copy()
+ values = values.fillna(value=value, limit=limit)
+ return [self.make_block_same_class(values=values,
+ placement=self.mgr_locs,
+ ndim=self.ndim)]
+
+ def interpolate(self, method='pad', axis=0, inplace=False, limit=None,
+ fill_value=None, **kwargs):
+
+ values = self.values if inplace else self.values.copy()
+ return self.make_block_same_class(
+ values=values.fillna(value=fill_value, method=method,
+ limit=limit),
+ placement=self.mgr_locs)
+
+ def shift(self, periods, axis=0, fill_value=None):
+ ## type: (int, Optional[BlockPlacement]) -> List[ExtensionBlock]
+ """
+ Shift the block by `periods`.
+
+ Dispatches to underlying ExtensionArray and re-boxes in an
+ ExtensionBlock.
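+
+        For example, at the user level (public pandas API only; missing
+        values fill the vacated positions):
+
+        >>> import pandas as pd
+        >>> s = pd.Series(pd.Categorical(["a", "b", "c"]))
+        >>> s.shift(1).tolist()
+        [nan, 'a', 'b']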
+ """
+ return [
+ self.make_block_same_class(
+ self.values.shift(periods=periods, fill_value=fill_value),
+ placement=self.mgr_locs, ndim=self.ndim)
+ ]
+
+ def where(self, other, cond, align=True, errors='raise',
+ try_cast=False, axis=0, transpose=False):
+ if isinstance(other, ABCDataFrame):
+ # ExtensionArrays are 1-D, so if we get here then
+ # `other` should be a DataFrame with a single column.
+ assert other.shape[1] == 1
+ other = other.iloc[:, 0]
+
+ other = extract_array(other, extract_numpy=True)
+
+ if isinstance(cond, ABCDataFrame):
+ assert cond.shape[1] == 1
+ cond = cond.iloc[:, 0]
+
+ cond = extract_array(cond, extract_numpy=True)
+
+ if lib.is_scalar(other) and isna(other):
+ # The default `other` for Series / Frame is np.nan
+ # we want to replace that with the correct NA value
+ # for the type
+ other = self.dtype.na_value
+
+ if is_sparse(self.values):
+ # TODO(SparseArray.__setitem__): remove this if condition
+ # We need to re-infer the type of the data after doing the
+ # where, for cases where the subtypes don't match
+ dtype = None
+ else:
+ dtype = self.dtype
+
+ try:
+ result = self.values.copy()
+ icond = ~cond
+ if lib.is_scalar(other):
+ result[icond] = other
+ else:
+ result[icond] = other[icond]
+ except (NotImplementedError, TypeError):
+ # NotImplementedError for class not implementing `__setitem__`
+ # TypeError for SparseArray, which implements just to raise
+ # a TypeError
+ result = self._holder._from_sequence(
+ np.where(cond, self.values, other),
+ dtype=dtype,
+ )
+
+ return self.make_block_same_class(result, placement=self.mgr_locs)
+
+ @property
+ def _ftype(self):
+ return getattr(self.values, '_pandas_ftype', Block._ftype)
+
+ def _unstack(self, unstacker_func, new_columns, n_rows, fill_value):
+ # ExtensionArray-safe unstack.
+ # We override ObjectBlock._unstack, which unstacks directly on the
+ # values of the array. For EA-backed blocks, this would require
+ # converting to a 2-D ndarray of objects.
+ # Instead, we unstack an ndarray of integer positions, followed by
+ # a `take` on the actual values.
+ dummy_arr = np.arange(n_rows)
+ dummy_unstacker = functools.partial(unstacker_func, fill_value=-1)
+ unstacker = dummy_unstacker(dummy_arr)
+
+ new_placement, new_values, mask = self._get_unstack_items(
+ unstacker, new_columns
+ )
+
+ blocks = [
+ self.make_block_same_class(
+ self.values.take(indices, allow_fill=True,
+ fill_value=fill_value),
+ [place])
+ for indices, place in zip(new_values.T, new_placement)
+ ]
+ return blocks, mask
+
+
+class ObjectValuesExtensionBlock(ExtensionBlock):
+ """
+ Block providing backwards-compatibility for `.values`.
+
+ Used by PeriodArray and IntervalArray to ensure that
+ Series[T].values is an ndarray of objects.
+ """
+
+ def external_values(self, dtype=None):
+ return self.values.astype(object)
+
+
+class NumericBlock(Block):
+ __slots__ = ()
+ is_numeric = True
+ _can_hold_na = True
+
+
+class FloatOrComplexBlock(NumericBlock):
+ __slots__ = ()
+
+ def equals(self, other):
+ if self.dtype != other.dtype or self.shape != other.shape:
+ return False
+ left, right = self.values, other.values
+ return ((left == right) | (np.isnan(left) & np.isnan(right))).all()
+
+
+class FloatBlock(FloatOrComplexBlock):
+ __slots__ = ()
+ is_float = True
+
+ def _can_hold_element(self, element):
+ tipo = maybe_infer_dtype_type(element)
+ if tipo is not None:
+ return (issubclass(tipo.type, (np.floating, np.integer)) and
+ not issubclass(tipo.type, (np.datetime64, np.timedelta64)))
+ return (
+ isinstance(
+ element, (float, int, np.floating, np.int_, compat.long))
+ and not isinstance(element, (bool, np.bool_, datetime, timedelta,
+ np.datetime64, np.timedelta64)))
+
+ def to_native_types(self, slicer=None, na_rep='', float_format=None,
+ decimal='.', quoting=None, **kwargs):
+ """ convert to our native types format, slicing if desired """
+
+ values = self.values
+ if slicer is not None:
+ values = values[:, slicer]
+
+ # see gh-13418: no special formatting is desired at the
+ # output (important for appropriate 'quoting' behaviour),
+ # so do not pass it through the FloatArrayFormatter
+ if float_format is None and decimal == '.':
+ mask = isna(values)
+
+ if not quoting:
+ values = values.astype(str)
+ else:
+ values = np.array(values, dtype='object')
+
+ values[mask] = na_rep
+ return values
+
+ from pandas.io.formats.format import FloatArrayFormatter
+ formatter = FloatArrayFormatter(values, na_rep=na_rep,
+ float_format=float_format,
+ decimal=decimal, quoting=quoting,
+ fixed_width=False)
+ return formatter.get_result_as_array()
+
+ def should_store(self, value):
+ # when inserting a column should not coerce integers to floats
+ # unnecessarily
+ return (issubclass(value.dtype.type, np.floating) and
+ value.dtype == self.dtype)
+
+
+class ComplexBlock(FloatOrComplexBlock):
+ __slots__ = ()
+ is_complex = True
+
+ def _can_hold_element(self, element):
+ tipo = maybe_infer_dtype_type(element)
+ if tipo is not None:
+ return issubclass(tipo.type,
+ (np.floating, np.integer, np.complexfloating))
+ return (
+ isinstance(
+ element,
+ (float, int, complex, np.float_, np.int_, compat.long))
+ and not isinstance(element, (bool, np.bool_)))
+
+ def should_store(self, value):
+ return issubclass(value.dtype.type, np.complexfloating)
+
+
+class IntBlock(NumericBlock):
+ __slots__ = ()
+ is_integer = True
+ _can_hold_na = False
+
+ def _can_hold_element(self, element):
+ tipo = maybe_infer_dtype_type(element)
+ if tipo is not None:
+ return (issubclass(tipo.type, np.integer) and
+ not issubclass(tipo.type, (np.datetime64,
+ np.timedelta64)) and
+ self.dtype.itemsize >= tipo.itemsize)
+ return is_integer(element)
+
+ def should_store(self, value):
+ return is_integer_dtype(value) and value.dtype == self.dtype
+
+
+class DatetimeLikeBlockMixin(object):
+ """Mixin class for DatetimeBlock, DatetimeTZBlock, and TimedeltaBlock."""
+
+ @property
+ def _holder(self):
+ return DatetimeArray
+
+ @property
+ def _na_value(self):
+ return tslibs.NaT
+
+ @property
+ def fill_value(self):
+ return tslibs.iNaT
+
+ def get_values(self, dtype=None):
+ """
+ return object dtype as boxed values, such as Timestamps/Timedelta
+ """
+ if is_object_dtype(dtype):
+ values = self.values
+
+ if self.ndim > 1:
+ values = values.ravel()
+
+ values = lib.map_infer(values, self._box_func)
+
+ if self.ndim > 1:
+ values = values.reshape(self.values.shape)
+
+ return values
+ return self.values
+
+
+class DatetimeBlock(DatetimeLikeBlockMixin, Block):
+ __slots__ = ()
+ is_datetime = True
+ _can_hold_na = True
+
+ def __init__(self, values, placement, ndim=None):
+ values = self._maybe_coerce_values(values)
+ super(DatetimeBlock, self).__init__(values,
+ placement=placement, ndim=ndim)
+
+ def _maybe_coerce_values(self, values):
+ """Input validation for values passed to __init__. Ensure that
+ we have datetime64ns, coercing if necessary.
+
+ Parameters
+ ----------
+ values : array-like
+ Must be convertible to datetime64
+
+ Returns
+ -------
+ values : ndarray[datetime64ns]
+
+ Overridden by DatetimeTZBlock.
+ """
+ if values.dtype != _NS_DTYPE:
+ values = conversion.ensure_datetime64ns(values)
+
+ if isinstance(values, DatetimeArray):
+ values = values._data
+
+ assert isinstance(values, np.ndarray), type(values)
+ return values
+
+ def _astype(self, dtype, **kwargs):
+ """
+ these automatically copy, so copy=True has no effect
+        re-raise on an exception if errors == 'raise'
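+
+        For example, at the user level (public pandas API only; naive
+        values are treated as UTC and then converted):
+
+        >>> import pandas as pd
+        >>> s = pd.Series(pd.date_range("2019-01-01", periods=2))
+        >>> s.astype("datetime64[ns, UTC]").dt.tz is not None
+        True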
+ """
+ dtype = pandas_dtype(dtype)
+
+ # if we are passed a datetime64[ns, tz]
+ if is_datetime64tz_dtype(dtype):
+ values = self.values
+ if getattr(values, 'tz', None) is None:
+ values = DatetimeIndex(values).tz_localize('UTC')
+ values = values.tz_convert(dtype.tz)
+ return self.make_block(values)
+
+ # delegate
+ return super(DatetimeBlock, self)._astype(dtype=dtype, **kwargs)
+
+ def _can_hold_element(self, element):
+ tipo = maybe_infer_dtype_type(element)
+ if tipo is not None:
+ return tipo == _NS_DTYPE or tipo == np.int64
+ return (is_integer(element) or isinstance(element, datetime) or
+ isna(element))
+
+ def _try_coerce_args(self, values, other):
+ """
+ Coerce values and other to dtype 'i8'. NaN and NaT convert to
+ the smallest i8, and will correctly round-trip to NaT if converted
+ back in _try_coerce_result. values is always ndarray-like, other
+ may not be
+
+ Parameters
+ ----------
+ values : ndarray-like
+ other : ndarray-like or scalar
+
+ Returns
+ -------
+ base-type values, base-type other
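+
+        Examples
+        --------
+        A sketch of the i8 convention (NumPy API only; NaT maps to the
+        smallest int64):
+
+        >>> import numpy as np
+        >>> vals = np.array(["2019-01-01", "NaT"], dtype="datetime64[ns]")
+        >>> int(vals.view("i8")[1]) == np.iinfo(np.int64).min
+        True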
+ """
+
+ values = values.view('i8')
+
+ if isinstance(other, bool):
+ raise TypeError
+ elif is_null_datetimelike(other):
+ other = tslibs.iNaT
+ elif isinstance(other, (datetime, np.datetime64, date)):
+ other = self._box_func(other)
+ if getattr(other, 'tz') is not None:
+ raise TypeError("cannot coerce a Timestamp with a tz on a "
+ "naive Block")
+ other = other.asm8.view('i8')
+ elif hasattr(other, 'dtype') and is_datetime64_dtype(other):
+ other = other.astype('i8', copy=False).view('i8')
+ else:
+ # coercion issues
+ # let higher levels handle
+ raise TypeError(other)
+
+ return values, other
+
+ def _try_coerce_result(self, result):
+ """ reverse of try_coerce_args """
+ if isinstance(result, np.ndarray):
+ if result.dtype.kind in ['i', 'f']:
+ result = result.astype('M8[ns]')
+
+ elif isinstance(result, (np.integer, np.float, np.datetime64)):
+ result = self._box_func(result)
+ return result
+
+ @property
+ def _box_func(self):
+ return tslibs.Timestamp
+
+ def to_native_types(self, slicer=None, na_rep=None, date_format=None,
+ quoting=None, **kwargs):
+ """ convert to our native types format, slicing if desired """
+
+ values = self.values
+ i8values = self.values.view('i8')
+
+ if slicer is not None:
+ i8values = i8values[..., slicer]
+
+ from pandas.io.formats.format import _get_format_datetime64_from_values
+ fmt = _get_format_datetime64_from_values(values, date_format)
+
+ result = tslib.format_array_from_datetime(
+ i8values.ravel(), tz=getattr(self.values, 'tz', None),
+ format=fmt, na_rep=na_rep).reshape(i8values.shape)
+ return np.atleast_2d(result)
+
+ def should_store(self, value):
+ return (issubclass(value.dtype.type, np.datetime64) and
+ not is_datetime64tz_dtype(value) and
+ not is_extension_array_dtype(value))
+
+ def set(self, locs, values):
+ """
+ Modify Block in-place with new item value
+
+ Returns
+ -------
+ None
+ """
+ values = conversion.ensure_datetime64ns(values, copy=False)
+
+ self.values[locs] = values
+
+ def external_values(self):
+ return np.asarray(self.values.astype('datetime64[ns]', copy=False))
+
+
+class DatetimeTZBlock(ExtensionBlock, DatetimeBlock):
+ """ implement a datetime64 block with a tz attribute """
+ __slots__ = ()
+ is_datetimetz = True
+ is_extension = True
+
+ @property
+ def _holder(self):
+ return DatetimeArray
+
+ def _maybe_coerce_values(self, values):
+ """Input validation for values passed to __init__. Ensure that
+ we have datetime64TZ, coercing if necessary.
+
+        Parameters
+        ----------
+ values : array-like
+ Must be convertible to datetime64
+
+ Returns
+ -------
+ values : DatetimeArray
+ """
+ if not isinstance(values, self._holder):
+ values = self._holder(values)
+
+ if values.tz is None:
+ raise ValueError("cannot create a DatetimeTZBlock without a tz")
+
+ return values
+
+ @property
+ def is_view(self):
+ """ return a boolean if I am possibly a view """
+ # check the ndarray values of the DatetimeIndex values
+ return self.values._data.base is not None
+
+ def copy(self, deep=True):
+ """ copy constructor """
+ values = self.values
+ if deep:
+ values = values.copy(deep=True)
+ return self.make_block_same_class(values)
+
+ def get_values(self, dtype=None):
+ """
+ Returns an ndarray of values.
+
+ Parameters
+ ----------
+ dtype : np.dtype
+ Only `object`-like dtypes are respected here (not sure
+ why).
+
+ Returns
+ -------
+ values : ndarray
+            When ``dtype=object``, then an object-dtype ndarray of
+ boxed values is returned. Otherwise, an M8[ns] ndarray
+ is returned.
+
+ DatetimeArray is always 1-d. ``get_values`` will reshape
+ the return value to be the same dimensionality as the
+ block.
+ """
+ values = self.values
+ if is_object_dtype(dtype):
+ values = values._box_values(values._data)
+
+ values = np.asarray(values)
+
+ if self.ndim == 2:
+ # Ensure that our shape is correct for DataFrame.
+ # ExtensionArrays are always 1-D, even in a DataFrame when
+ # the analogous NumPy-backed column would be a 2-D ndarray.
+ values = values.reshape(1, -1)
+ return values
+
+ def to_dense(self):
+ # we request M8[ns] dtype here, even though it discards tzinfo,
+ # as lots of code (e.g. anything using values_from_object)
+ # expects that behavior.
+ return np.asarray(self.values, dtype=_NS_DTYPE)
+
+ def _slice(self, slicer):
+ """ return a slice of my values """
+ if isinstance(slicer, tuple):
+ col, loc = slicer
+ if not com.is_null_slice(col) and col != 0:
+ raise IndexError("{0} only contains one item".format(self))
+ return self.values[loc]
+ return self.values[slicer]
+
+ def _try_coerce_args(self, values, other):
+ """
+ localize and return i8 for the values
+
+ Parameters
+ ----------
+ values : ndarray-like
+ other : ndarray-like or scalar
+
+ Returns
+ -------
+ base-type values, base-type other
+ """
+ # asi8 is a view, needs copy
+ values = _block_shape(values.view("i8"), ndim=self.ndim)
+
+ if isinstance(other, ABCSeries):
+ other = self._holder(other)
+
+ if isinstance(other, bool):
+ raise TypeError
+ elif is_datetime64_dtype(other):
+ # add the tz back
+ other = self._holder(other, dtype=self.dtype)
+
+ elif is_null_datetimelike(other):
+ other = tslibs.iNaT
+ elif isinstance(other, self._holder):
+ if other.tz != self.values.tz:
+ raise ValueError("incompatible or non tz-aware value")
+ other = _block_shape(other.asi8, ndim=self.ndim)
+ elif isinstance(other, (np.datetime64, datetime, date)):
+ other = tslibs.Timestamp(other)
+ tz = getattr(other, 'tz', None)
+
+ # test we can have an equal time zone
+ if tz is None or str(tz) != str(self.values.tz):
+ raise ValueError("incompatible or non tz-aware value")
+ other = other.value
+ else:
+ raise TypeError(other)
+
+ return values, other
+
+ def _try_coerce_result(self, result):
+ """ reverse of try_coerce_args """
+ if isinstance(result, np.ndarray):
+ if result.dtype.kind in ['i', 'f']:
+ result = result.astype('M8[ns]')
+
+ elif isinstance(result, (np.integer, np.float, np.datetime64)):
+ result = self._box_func(result)
+
+ if isinstance(result, np.ndarray):
+ # allow passing of > 1dim if its trivial
+
+ if result.ndim > 1:
+ result = result.reshape(np.prod(result.shape))
+ # GH#24096 new values invalidates a frequency
+ result = self._holder._simple_new(result, freq=None,
+ dtype=self.values.dtype)
+
+ return result
+
+ @property
+ def _box_func(self):
+ return lambda x: tslibs.Timestamp(x, tz=self.dtype.tz)
+
+ def diff(self, n, axis=0):
+ """1st discrete difference
+
+ Parameters
+ ----------
+ n : int, number of periods to diff
+ axis : int, axis to diff upon. default 0
+
+        Returns
+        -------
+ A list with a new TimeDeltaBlock.
+
+        Notes
+        -----
+ The arguments here are mimicking shift so they are called correctly
+ by apply.
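+
+        Examples
+        --------
+        At the user level the result is timedelta-typed (public pandas API
+        only):
+
+        >>> import pandas as pd
+        >>> s = pd.Series(pd.date_range("2019-01-01", periods=3, tz="UTC"))
+        >>> s.diff().dtype
+        dtype('<m8[ns]')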
+ """
+ if axis == 0:
+ # Cannot currently calculate diff across multiple blocks since this
+ # function is invoked via apply
+ raise NotImplementedError
+ new_values = (self.values - self.shift(n, axis=axis)[0].values).asi8
+
+ # Reshape the new_values like how algos.diff does for timedelta data
+ new_values = new_values.reshape(1, len(new_values))
+ new_values = new_values.astype('timedelta64[ns]')
+ return [TimeDeltaBlock(new_values, placement=self.mgr_locs.indexer)]
+
+ def concat_same_type(self, to_concat, placement=None):
+ # need to handle concat([tz1, tz2]) here, since DatetimeArray
+ # only handles cases where all the tzs are the same.
+ # Instead of placing the condition here, it could also go into the
+ # is_uniform_join_units check, but I'm not sure what is better.
+ if len({x.dtype for x in to_concat}) > 1:
+ values = _concat._concat_datetime([x.values for x in to_concat])
+ placement = placement or slice(0, len(values), 1)
+
+ if self.ndim > 1:
+ values = np.atleast_2d(values)
+ return ObjectBlock(values, ndim=self.ndim, placement=placement)
+ return super(DatetimeTZBlock, self).concat_same_type(to_concat,
+ placement)
+
+ def fillna(self, value, limit=None, inplace=False, downcast=None):
+ # We support filling a DatetimeTZ with a `value` whose timezone
+ # is different by coercing to object.
+ try:
+ return super(DatetimeTZBlock, self).fillna(
+ value, limit, inplace, downcast
+ )
+ except (ValueError, TypeError):
+ # different timezones, or a non-tz
+ return self.astype(object).fillna(
+ value, limit=limit, inplace=inplace, downcast=downcast
+ )
+
+ def setitem(self, indexer, value):
+ # https://github.com/pandas-dev/pandas/issues/24020
+ # Need a dedicated setitem until #24020 (type promotion in setitem
+ # for extension arrays) is designed and implemented.
+ try:
+ return super(DatetimeTZBlock, self).setitem(indexer, value)
+ except (ValueError, TypeError):
+ newb = make_block(self.values.astype(object),
+ placement=self.mgr_locs,
+ klass=ObjectBlock,)
+ return newb.setitem(indexer, value)
+
+ def equals(self, other):
+ # override for significant performance improvement
+ if self.dtype != other.dtype or self.shape != other.shape:
+ return False
+ return (self.values.view('i8') == other.values.view('i8')).all()
+
+
+class TimeDeltaBlock(DatetimeLikeBlockMixin, IntBlock):
+ __slots__ = ()
+ is_timedelta = True
+ _can_hold_na = True
+ is_numeric = False
+
+ def __init__(self, values, placement, ndim=None):
+ if values.dtype != _TD_DTYPE:
+ values = conversion.ensure_timedelta64ns(values)
+ if isinstance(values, TimedeltaArray):
+ values = values._data
+ assert isinstance(values, np.ndarray), type(values)
+ super(TimeDeltaBlock, self).__init__(values,
+ placement=placement, ndim=ndim)
+
+ @property
+ def _holder(self):
+ return TimedeltaArray
+
+ @property
+ def _box_func(self):
+ return lambda x: Timedelta(x, unit='ns')
+
+ def _can_hold_element(self, element):
+ tipo = maybe_infer_dtype_type(element)
+ if tipo is not None:
+ return issubclass(tipo.type, (np.timedelta64, np.int64))
+ return is_integer(element) or isinstance(
+ element, (timedelta, np.timedelta64, np.int64))
+
+ def fillna(self, value, **kwargs):
+
+ # allow filling with integers to be
+ # interpreted as nanoseconds
+ if is_integer(value) and not isinstance(value, np.timedelta64):
+ # Deprecation GH#24694, GH#19233
+ warnings.warn("Passing integers to fillna is deprecated, will "
+ "raise a TypeError in a future version. To retain "
+ "the old behavior, pass pd.Timedelta(seconds=n) "
+ "instead.",
+ FutureWarning, stacklevel=6)
+ value = Timedelta(value, unit='s')
+ return super(TimeDeltaBlock, self).fillna(value, **kwargs)
+
+ def _try_coerce_args(self, values, other):
+ """
+ Coerce values and other to int64, with null values converted to
+ iNaT. values is always ndarray-like, other may not be
+
+ Parameters
+ ----------
+ values : ndarray-like
+ other : ndarray-like or scalar
+
+ Returns
+ -------
+ base-type values, base-type other
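+
+        Examples
+        --------
+        Scalars coerce through their nanosecond integer value (public
+        pandas API only):
+
+        >>> import pandas as pd
+        >>> pd.Timedelta("1s").value
+        1000000000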
+ """
+ values = values.view('i8')
+
+ if isinstance(other, bool):
+ raise TypeError
+ elif is_null_datetimelike(other):
+ other = tslibs.iNaT
+ elif isinstance(other, (timedelta, np.timedelta64)):
+ other = Timedelta(other).value
+ elif hasattr(other, 'dtype') and is_timedelta64_dtype(other):
+ other = other.astype('i8', copy=False).view('i8')
+ else:
+ # coercion issues
+ # let higher levels handle
+ raise TypeError(other)
+
+ return values, other
+
+ def _try_coerce_result(self, result):
+ """ reverse of try_coerce_args / try_operate """
+ if isinstance(result, np.ndarray):
+ mask = isna(result)
+ if result.dtype.kind in ['i', 'f']:
+ result = result.astype('m8[ns]')
+ result[mask] = tslibs.iNaT
+
+ elif isinstance(result, (np.integer, np.float)):
+ result = self._box_func(result)
+
+ return result
+
+ def should_store(self, value):
+ return (issubclass(value.dtype.type, np.timedelta64) and
+ not is_extension_array_dtype(value))
+
+ def to_native_types(self, slicer=None, na_rep=None, quoting=None,
+ **kwargs):
+ """ convert to our native types format, slicing if desired """
+
+ values = self.values
+ if slicer is not None:
+ values = values[:, slicer]
+ mask = isna(values)
+
+ rvalues = np.empty(values.shape, dtype=object)
+ if na_rep is None:
+ na_rep = 'NaT'
+ rvalues[mask] = na_rep
+ imask = (~mask).ravel()
+
+ # FIXME:
+ # should use the formats.format.Timedelta64Formatter here
+ # to figure what format to pass to the Timedelta
+ # e.g. to not show the decimals say
+ rvalues.flat[imask] = np.array([Timedelta(val)._repr_base(format='all')
+ for val in values.ravel()[imask]],
+ dtype=object)
+ return rvalues
+
+ def external_values(self, dtype=None):
+ return np.asarray(self.values.astype("timedelta64[ns]", copy=False))
+
+
+class BoolBlock(NumericBlock):
+ __slots__ = ()
+ is_bool = True
+ _can_hold_na = False
+
+ def _can_hold_element(self, element):
+ tipo = maybe_infer_dtype_type(element)
+ if tipo is not None:
+ return issubclass(tipo.type, np.bool_)
+ return isinstance(element, (bool, np.bool_))
+
+ def should_store(self, value):
+ return (issubclass(value.dtype.type, np.bool_) and not
+ is_extension_array_dtype(value))
+
+ def replace(self, to_replace, value, inplace=False, filter=None,
+ regex=False, convert=True):
+ inplace = validate_bool_kwarg(inplace, 'inplace')
+ to_replace_values = np.atleast_1d(to_replace)
+ if not np.can_cast(to_replace_values, bool):
+ return self
+ return super(BoolBlock, self).replace(to_replace, value,
+ inplace=inplace, filter=filter,
+ regex=regex, convert=convert)
+
+
+class ObjectBlock(Block):
+ __slots__ = ()
+ is_object = True
+ _can_hold_na = True
+
+ def __init__(self, values, placement=None, ndim=2):
+ if issubclass(values.dtype.type, compat.string_types):
+ values = np.array(values, dtype=object)
+
+ super(ObjectBlock, self).__init__(values, ndim=ndim,
+ placement=placement)
+
+ @property
+ def is_bool(self):
+ """ we can be a bool if we have only bool values but are of type
+ object
+ """
+ return lib.is_bool_array(self.values.ravel())
+
+ # TODO: Refactor when convert_objects is removed since there will be 1 path
+ def convert(self, *args, **kwargs):
+ """ attempt to coerce any object types to better types return a copy of
+ the block (if copy = True) by definition we ARE an ObjectBlock!!!!!
+
+ can return multiple blocks!
+ """
+
+ if args:
+ raise NotImplementedError
+ by_item = kwargs.get('by_item', True)
+
+ new_inputs = ['coerce', 'datetime', 'numeric', 'timedelta']
+ new_style = False
+ for kw in new_inputs:
+ new_style |= kw in kwargs
+
+ if new_style:
+ fn = soft_convert_objects
+ fn_inputs = new_inputs
+ else:
+ fn = maybe_convert_objects
+ fn_inputs = ['convert_dates', 'convert_numeric',
+ 'convert_timedeltas']
+ fn_inputs += ['copy']
+
+ fn_kwargs = {key: kwargs[key] for key in fn_inputs if key in kwargs}
+
+ # operate column-by-column
+ def f(m, v, i):
+ shape = v.shape
+ values = fn(v.ravel(), **fn_kwargs)
+ try:
+ values = values.reshape(shape)
+ values = _block_shape(values, ndim=self.ndim)
+ except (AttributeError, NotImplementedError):
+ pass
+
+ return values
+
+ if by_item and not self._is_single_block:
+ blocks = self.split_and_operate(None, f, False)
+ else:
+ values = f(None, self.values.ravel(), None)
+ blocks = [make_block(values, ndim=self.ndim,
+ placement=self.mgr_locs)]
+
+ return blocks
+
+ def set(self, locs, values):
+ """
+ Modify Block in-place with new item value
+
+ Returns
+ -------
+ None
+ """
+ try:
+ self.values[locs] = values
+ except (ValueError):
+
+ # broadcasting error
+ # see GH6171
+ new_shape = list(values.shape)
+ new_shape[0] = len(self.items)
+ self.values = np.empty(tuple(new_shape), dtype=self.dtype)
+ self.values.fill(np.nan)
+ self.values[locs] = values
+
+ def _maybe_downcast(self, blocks, downcast=None):
+
+ if downcast is not None:
+ return blocks
+
+ # split and convert the blocks
+ return _extend_blocks([b.convert(datetime=True, numeric=False)
+ for b in blocks])
+
+ def _can_hold_element(self, element):
+ return True
+
+ def _try_coerce_args(self, values, other):
+ """ provide coercion to our input arguments """
+
+ if isinstance(other, ABCDatetimeIndex):
+ # May get a DatetimeIndex here. Unbox it.
+ other = other.array
+
+ if isinstance(other, DatetimeArray):
+ # hit in pandas/tests/indexing/test_coercion.py
+ # ::TestWhereCoercion::test_where_series_datetime64[datetime64tz]
+ # when falling back to ObjectBlock.where
+ other = other.astype(object)
+
+ return values, other
+
+ def should_store(self, value):
+ return not (issubclass(value.dtype.type,
+ (np.integer, np.floating, np.complexfloating,
+ np.datetime64, np.bool_)) or
+ # TODO(ExtensionArray): remove is_extension_type
+ # when all extension arrays have been ported.
+ is_extension_type(value) or
+ is_extension_array_dtype(value))
+
+ def replace(self, to_replace, value, inplace=False, filter=None,
+ regex=False, convert=True):
+ to_rep_is_list = is_list_like(to_replace)
+ value_is_list = is_list_like(value)
+ both_lists = to_rep_is_list and value_is_list
+ either_list = to_rep_is_list or value_is_list
+
+ result_blocks = []
+ blocks = [self]
+
+ if not either_list and is_re(to_replace):
+ return self._replace_single(to_replace, value, inplace=inplace,
+ filter=filter, regex=True,
+ convert=convert)
+ elif not (either_list or regex):
+ return super(ObjectBlock, self).replace(to_replace, value,
+ inplace=inplace,
+ filter=filter, regex=regex,
+ convert=convert)
+ elif both_lists:
+ for to_rep, v in zip(to_replace, value):
+ result_blocks = []
+ for b in blocks:
+ result = b._replace_single(to_rep, v, inplace=inplace,
+ filter=filter, regex=regex,
+ convert=convert)
+ result_blocks = _extend_blocks(result, result_blocks)
+ blocks = result_blocks
+ return result_blocks
+
+ elif to_rep_is_list and regex:
+ for to_rep in to_replace:
+ result_blocks = []
+ for b in blocks:
+ result = b._replace_single(to_rep, value, inplace=inplace,
+ filter=filter, regex=regex,
+ convert=convert)
+ result_blocks = _extend_blocks(result, result_blocks)
+ blocks = result_blocks
+ return result_blocks
+
+ return self._replace_single(to_replace, value, inplace=inplace,
+ filter=filter, convert=convert,
+ regex=regex)
+
+ def _replace_single(self, to_replace, value, inplace=False, filter=None,
+ regex=False, convert=True, mask=None):
+ """
+ Replace elements by the given value.
+
+ Parameters
+ ----------
+ to_replace : object or pattern
+ Scalar to replace or regular expression to match.
+ value : object
+ Replacement object.
+ inplace : bool, default False
+ Perform inplace modification.
+ filter : list, optional
+ regex : bool, default False
+ If true, perform regular expression substitution.
+ convert : bool, default True
+ If true, try to coerce any object types to better types.
+ mask : array-like of bool, optional
+            True indicates the corresponding element is ignored.
+
+ Returns
+ -------
+        A new block containing the result of the replacement.
+ """
+ inplace = validate_bool_kwarg(inplace, 'inplace')
+
+ # to_replace is regex compilable
+ to_rep_re = regex and is_re_compilable(to_replace)
+
+ # regex is regex compilable
+ regex_re = is_re_compilable(regex)
+
+ # only one will survive
+ if to_rep_re and regex_re:
+ raise AssertionError('only one of to_replace and regex can be '
+ 'regex compilable')
+
+ # if regex was passed as something that can be a regex (rather than a
+ # boolean)
+ if regex_re:
+ to_replace = regex
+
+ regex = regex_re or to_rep_re
+
+        # try to get the pattern attribute (compiled re); otherwise
+        # to_replace is a plain string
+ try:
+ pattern = to_replace.pattern
+ except AttributeError:
+ pattern = to_replace
+
+ # if the pattern is not empty and to_replace is either a string or a
+ # regex
+ if regex and pattern:
+ rx = re.compile(to_replace)
+ else:
+ # if the thing to replace is not a string or compiled regex call
+ # the superclass method -> to_replace is some kind of object
+ return super(ObjectBlock, self).replace(to_replace, value,
+ inplace=inplace,
+ filter=filter, regex=regex)
+
+ new_values = self.values if inplace else self.values.copy()
+
+ # deal with replacing values with objects (strings) that match but
+ # whose replacement is not a string (numeric, nan, object)
+ if isna(value) or not isinstance(value, compat.string_types):
+
+ def re_replacer(s):
+ try:
+ return value if rx.search(s) is not None else s
+ except TypeError:
+ return s
+ else:
+            # value is guaranteed to be a string here; s can be either a
+            # string or null. If it's null it is returned unchanged.
+ def re_replacer(s):
+ try:
+ return rx.sub(value, s)
+ except TypeError:
+ return s
+
+ f = np.vectorize(re_replacer, otypes=[self.dtype])
+
+ if filter is None:
+ filt = slice(None)
+ else:
+ filt = self.mgr_locs.isin(filter).nonzero()[0]
+
+ if mask is None:
+ new_values[filt] = f(new_values[filt])
+ else:
+ new_values[filt][mask] = f(new_values[filt][mask])
+
+ # convert
+ block = self.make_block(new_values)
+ if convert:
+ block = block.convert(by_item=True, numeric=False)
+ return block
+
+ def _replace_coerce(self, to_replace, value, inplace=True, regex=False,
+ convert=False, mask=None):
+ """
+ Replace value corresponding to the given boolean array with another
+ value.
+
+ Parameters
+ ----------
+ to_replace : object or pattern
+ Scalar to replace or regular expression to match.
+ value : object
+ Replacement object.
+        inplace : bool, default True
+ Perform inplace modification.
+ regex : bool, default False
+ If true, perform regular expression substitution.
+        convert : bool, default False
+ If true, try to coerce any object types to better types.
+ mask : array-like of bool, optional
+            True indicates the corresponding element is ignored.
+
+ Returns
+ -------
+ A new block if there is anything to replace or the original block.
+ """
+ if mask.any():
+ block = super(ObjectBlock, self)._replace_coerce(
+ to_replace=to_replace, value=value, inplace=inplace,
+ regex=regex, convert=convert, mask=mask)
+ if convert:
+ block = [b.convert(by_item=True, numeric=False, copy=True)
+ for b in block]
+ return block
+ return self
+
+
+class CategoricalBlock(ExtensionBlock):
+ __slots__ = ()
+ is_categorical = True
+ _verify_integrity = True
+ _can_hold_na = True
+ _concatenator = staticmethod(_concat._concat_categorical)
+
+ def __init__(self, values, placement, ndim=None):
+ from pandas.core.arrays.categorical import _maybe_to_categorical
+
+ # coerce to categorical if we can
+ super(CategoricalBlock, self).__init__(_maybe_to_categorical(values),
+ placement=placement,
+ ndim=ndim)
+
+ @property
+ def _holder(self):
+ return Categorical
+
+ @property
+ def array_dtype(self):
+ """ the dtype to return if I want to construct this block as an
+ array
+ """
+ return np.object_
+
+ def _try_coerce_result(self, result):
+ """ reverse of try_coerce_args """
+
+ # GH12564: CategoricalBlock is 1-dim only
+ # while returned results could be any dim
+ if ((not is_categorical_dtype(result)) and
+ isinstance(result, np.ndarray)):
+ result = _block_shape(result, ndim=self.ndim)
+
+ return result
+
+ def to_dense(self):
+ # Categorical.get_values returns a DatetimeIndex for datetime
+ # categories, so we can't simply use `np.asarray(self.values)` like
+ # other types.
+ return self.values.get_values()
+
+ def to_native_types(self, slicer=None, na_rep='', quoting=None, **kwargs):
+ """ convert to our native types format, slicing if desired """
+
+ values = self.values
+ if slicer is not None:
+ # Categorical is always one dimension
+ values = values[slicer]
+ mask = isna(values)
+ values = np.array(values, dtype='object')
+ values[mask] = na_rep
+
+ # we are expected to return a 2-d ndarray
+ return values.reshape(1, len(values))
+
+ def concat_same_type(self, to_concat, placement=None):
+ """
+ Concatenate list of single blocks of the same type.
+
+        Note that this CategoricalBlock.concat_same_type *may* not
+ return a CategoricalBlock. When the categories in `to_concat`
+ differ, this will return an object ndarray.
+
+ If / when we decide we don't like that behavior:
+
+ 1. Change Categorical._concat_same_type to use union_categoricals
+ 2. Delete this method.
+ """
+ values = self._concatenator([blk.values for blk in to_concat],
+ axis=self.ndim - 1)
+ # not using self.make_block_same_class as values can be object dtype
+ return make_block(
+ values, placement=placement or slice(0, len(values), 1),
+ ndim=self.ndim)
+
+ def where(self, other, cond, align=True, errors='raise',
+ try_cast=False, axis=0, transpose=False):
+ # TODO(CategoricalBlock.where):
+ # This can all be deleted in favor of ExtensionBlock.where once
+ # we enforce the deprecation.
+ object_msg = (
+ "Implicitly converting categorical to object-dtype ndarray. "
+ "One or more of the values in 'other' are not present in this "
+ "categorical's categories. A future version of pandas will raise "
+ "a ValueError when 'other' contains different categories.\n\n"
+ "To preserve the current behavior, add the new categories to "
+ "the categorical before calling 'where', or convert the "
+ "categorical to a different dtype."
+ )
+ try:
+ # Attempt to do preserve categorical dtype.
+ result = super(CategoricalBlock, self).where(
+ other, cond, align, errors, try_cast, axis, transpose
+ )
+ except (TypeError, ValueError):
+ warnings.warn(object_msg, FutureWarning, stacklevel=6)
+ result = self.astype(object).where(other, cond, align=align,
+ errors=errors,
+ try_cast=try_cast,
+ axis=axis, transpose=transpose)
+ return result
+
+
+# -----------------------------------------------------------------
+# Constructor Helpers
+
+def get_block_type(values, dtype=None):
+ """
+ Find the appropriate Block subclass to use for the given values and dtype.
+
+ Parameters
+ ----------
+ values : ndarray-like
+ dtype : numpy or pandas dtype
+
+ Returns
+ -------
+ cls : class, subclass of Block
+ """
+ dtype = dtype or values.dtype
+ vtype = dtype.type
+
+ if is_sparse(dtype):
+ # Need this first(ish) so that Sparse[datetime] is sparse
+ cls = ExtensionBlock
+ elif is_categorical(values):
+ cls = CategoricalBlock
+ elif issubclass(vtype, np.datetime64):
+ assert not is_datetime64tz_dtype(values)
+ cls = DatetimeBlock
+ elif is_datetime64tz_dtype(values):
+ cls = DatetimeTZBlock
+ elif is_interval_dtype(dtype) or is_period_dtype(dtype):
+ cls = ObjectValuesExtensionBlock
+ elif is_extension_array_dtype(values):
+ cls = ExtensionBlock
+ elif issubclass(vtype, np.floating):
+ cls = FloatBlock
+ elif issubclass(vtype, np.timedelta64):
+ assert issubclass(vtype, np.integer)
+ cls = TimeDeltaBlock
+ elif issubclass(vtype, np.complexfloating):
+ cls = ComplexBlock
+ elif issubclass(vtype, np.integer):
+ cls = IntBlock
+ elif dtype == np.bool_:
+ cls = BoolBlock
+ else:
+ cls = ObjectBlock
+ return cls
+
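+# Illustrative sketch (not part of the upstream pandas source): how
+# get_block_type maps dtypes to Block subclasses. Kept as commented
+# doctest-style lines so the module's import-time behaviour is unchanged.
+#
+#   >>> import numpy as np
+#   >>> get_block_type(np.array([1, 2, 3])).__name__
+#   'IntBlock'
+#   >>> get_block_type(np.array([1.5, np.nan])).__name__
+#   'FloatBlock'
+#   >>> get_block_type(np.array(['a', 'b'], dtype=object)).__name__
+#   'ObjectBlock'
+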
+
+def make_block(values, placement, klass=None, ndim=None, dtype=None,
+ fastpath=None):
+ if fastpath is not None:
+ # GH#19265 pyarrow is passing this
+ warnings.warn("fastpath argument is deprecated, will be removed "
+ "in a future release.", DeprecationWarning)
+ if klass is None:
+ dtype = dtype or values.dtype
+ klass = get_block_type(values, dtype)
+
+ elif klass is DatetimeTZBlock and not is_datetime64tz_dtype(values):
+ # TODO: This is no longer hit internally; does it need to be retained
+ # for e.g. pyarrow?
+ values = DatetimeArray._simple_new(values, dtype=dtype)
+
+ return klass(values, ndim=ndim, placement=placement)
+
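+# Illustrative sketch (not part of the upstream pandas source): make_block
+# infers the Block subclass from the values' dtype when `klass` is omitted.
+# Commented out so nothing executes at import time.
+#
+#   >>> import numpy as np
+#   >>> blk = make_block(np.arange(6).reshape(2, 3), placement=[0, 1])
+#   >>> type(blk).__name__, blk.shape
+#   ('IntBlock', (2, 3))
+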
+
+# -----------------------------------------------------------------
+
+def _extend_blocks(result, blocks=None):
+ """ return a new extended blocks, givin the result """
+ from pandas.core.internals import BlockManager
+ if blocks is None:
+ blocks = []
+ if isinstance(result, list):
+ for r in result:
+ if isinstance(r, list):
+ blocks.extend(r)
+ else:
+ blocks.append(r)
+ elif isinstance(result, BlockManager):
+ blocks.extend(result.blocks)
+ else:
+ blocks.append(result)
+ return blocks
+
+
+def _block_shape(values, ndim=1, shape=None):
+ """ guarantee the shape of the values to be at least 1 d """
+ if values.ndim < ndim:
+ if shape is None:
+ shape = values.shape
+ if not is_extension_array_dtype(values):
+ # TODO: https://github.com/pandas-dev/pandas/issues/23023
+ # block.shape is incorrect for "2D" ExtensionArrays
+ # We can't, and don't need to, reshape.
+ values = values.reshape(tuple((1, ) + shape))
+ return values
+
+
+def _merge_blocks(blocks, dtype=None, _can_consolidate=True):
+
+ if len(blocks) == 1:
+ return blocks[0]
+
+ if _can_consolidate:
+
+ if dtype is None:
+ if len({b.dtype for b in blocks}) != 1:
+ raise AssertionError("_merge_blocks are invalid!")
+ dtype = blocks[0].dtype
+
+ # FIXME: optimization potential in case all mgrs contain slices and
+ # combination of those slices is a slice, too.
+ new_mgr_locs = np.concatenate([b.mgr_locs.as_array for b in blocks])
+ new_values = np.vstack([b.values for b in blocks])
+
+ argsort = np.argsort(new_mgr_locs)
+ new_values = new_values[argsort]
+ new_mgr_locs = new_mgr_locs[argsort]
+
+ return make_block(new_values, placement=new_mgr_locs)
+
+ # no merge
+ return blocks
+
+
+def _block2d_to_blocknd(values, placement, shape, labels, ref_items):
+ """ pivot to the labels shape """
+ panel_shape = (len(placement),) + shape
+
+ # TODO: lexsort depth needs to be 2!!
+
+ # Create observation selection vector using major and minor
+ # labels, for converting to panel format.
+ selector = _factor_indexer(shape[1:], labels)
+ mask = np.zeros(np.prod(shape), dtype=bool)
+ mask.put(selector, True)
+
+ if mask.all():
+ pvalues = np.empty(panel_shape, dtype=values.dtype)
+ else:
+ dtype, fill_value = maybe_promote(values.dtype)
+ pvalues = np.empty(panel_shape, dtype=dtype)
+ pvalues.fill(fill_value)
+
+ for i in range(len(placement)):
+ pvalues[i].flat[mask] = values[:, i]
+
+ return make_block(pvalues, placement=placement)
+
+
+def _safe_reshape(arr, new_shape):
+ """
+ If possible, reshape `arr` to have shape `new_shape`,
+ with a couple of exceptions (see gh-13012):
+
+ 1) If `arr` is a ExtensionArray or Index, `arr` will be
+ returned as is.
+ 2) If `arr` is a Series, the `_values` attribute will
+ be reshaped and returned.
+
+ Parameters
+ ----------
+ arr : array-like, object to be reshaped
+ new_shape : int or tuple of ints, the new shape
+ """
+ if isinstance(arr, ABCSeries):
+ arr = arr._values
+ if not isinstance(arr, ABCExtensionArray):
+ arr = arr.reshape(new_shape)
+ return arr
+
+
+def _factor_indexer(shape, labels):
+ """
+ given a tuple of shape and a list of Categorical labels, return the
+ expanded label indexer
+ """
+ mult = np.array(shape)[::-1].cumprod()[::-1]
+ return ensure_platform_int(
+ np.sum(np.array(labels).T * np.append(mult, [1]), axis=1).T)
+
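+# Illustrative worked example (not part of the upstream pandas source) of the
+# row-major flattening performed by _factor_indexer: for a trailing shape of
+# (3,), the indexer is major_label * 3 + minor_label.
+#
+#   >>> _factor_indexer((3,), [[0, 0, 1], [0, 2, 1]])
+#   array([0, 2, 4])
+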
+
+def _putmask_smart(v, m, n):
+ """
+ Return a new ndarray, try to preserve dtype if possible.
+
+ Parameters
+ ----------
+ v : `values`, updated in-place (array like)
+ m : `mask`, applies to both sides (array like)
+ n : `new values` either scalar or an array like aligned with `values`
+
+ Returns
+ -------
+ values : ndarray with updated values
+ this *may* be a copy of the original
+
+ See Also
+ --------
+ ndarray.putmask
+ """
+
+ # we cannot use np.asarray() here as we cannot have conversions
+    # that numpy does when numerics are mixed with strings
+
+ # n should be the length of the mask or a scalar here
+ if not is_list_like(n):
+ n = np.repeat(n, len(m))
+ elif isinstance(n, np.ndarray) and n.ndim == 0: # numpy scalar
+ n = np.repeat(np.array(n, ndmin=1), len(m))
+
+    # see if we are only masking values that, if put there,
+    # will work in the current dtype
+ try:
+ nn = n[m]
+
+ # make sure that we have a nullable type
+ # if we have nulls
+ if not _isna_compat(v, nn[0]):
+ raise ValueError
+
+ # we ignore ComplexWarning here
+ with warnings.catch_warnings(record=True):
+ warnings.simplefilter("ignore", np.ComplexWarning)
+ nn_at = nn.astype(v.dtype)
+
+ # avoid invalid dtype comparisons
+ # between numbers & strings
+
+ # only compare integers/floats
+ # don't compare integers to datetimelikes
+ if (not is_numeric_v_string_like(nn, nn_at) and
+ (is_float_dtype(nn.dtype) or
+ is_integer_dtype(nn.dtype) and
+ is_float_dtype(nn_at.dtype) or
+ is_integer_dtype(nn_at.dtype))):
+
+ comp = (nn == nn_at)
+ if is_list_like(comp) and comp.all():
+ nv = v.copy()
+ nv[m] = nn_at
+ return nv
+ except (ValueError, IndexError, TypeError, OverflowError):
+ pass
+
+ n = np.asarray(n)
+
+ def _putmask_preserve(nv, n):
+ try:
+ nv[m] = n[m]
+ except (IndexError, ValueError):
+ nv[m] = n
+ return nv
+
+ # preserves dtype if possible
+ if v.dtype.kind == n.dtype.kind:
+ return _putmask_preserve(v, n)
+
+ # change the dtype if needed
+ dtype, _ = maybe_promote(n.dtype)
+
+ if is_extension_type(v.dtype) and is_object_dtype(dtype):
+ v = v.get_values(dtype)
+ else:
+ v = v.astype(dtype)
+
+ return _putmask_preserve(v, n)
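+
+# Illustrative sketch (not part of the upstream pandas source): when the new
+# values fit the existing dtype, _putmask_smart keeps it; otherwise it
+# upcasts before assigning. Commented out so it never runs at import time.
+#
+#   >>> v = np.array([1, 2, 3])
+#   >>> _putmask_smart(v, np.array([False, True, False]), 9)
+#   array([1, 9, 3])                  # dtype preserved (int)
+#   >>> _putmask_smart(v, np.array([False, True, False]), 1.5)
+#   array([1. , 1.5, 3. ])            # upcast to float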
diff --git a/contrib/python/pandas/py2/pandas/core/internals/concat.py b/contrib/python/pandas/py2/pandas/core/internals/concat.py
new file mode 100644
index 00000000000..cb982749626
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/core/internals/concat.py
@@ -0,0 +1,485 @@
+# -*- coding: utf-8 -*-
+# TODO: Needs a better name; too many modules are already called "concat"
+from collections import defaultdict
+import copy
+
+import numpy as np
+
+from pandas._libs import internals as libinternals, tslibs
+from pandas.util._decorators import cache_readonly
+
+from pandas.core.dtypes.cast import maybe_promote
+from pandas.core.dtypes.common import (
+ _get_dtype, is_categorical_dtype, is_datetime64_dtype,
+ is_datetime64tz_dtype, is_extension_array_dtype, is_float_dtype,
+ is_numeric_dtype, is_sparse, is_timedelta64_dtype)
+import pandas.core.dtypes.concat as _concat
+from pandas.core.dtypes.missing import isna
+
+import pandas.core.algorithms as algos
+
+
+def get_mgr_concatenation_plan(mgr, indexers):
+ """
+ Construct concatenation plan for given block manager and indexers.
+
+ Parameters
+ ----------
+ mgr : BlockManager
+ indexers : dict of {axis: indexer}
+
+ Returns
+ -------
+ plan : list of (BlockPlacement, JoinUnit) tuples
+
+ """
+    # Calculate post-reindex shape, except for the item axis, which will be
+    # handled separately for each block anyway.
+ mgr_shape = list(mgr.shape)
+ for ax, indexer in indexers.items():
+ mgr_shape[ax] = len(indexer)
+ mgr_shape = tuple(mgr_shape)
+
+ if 0 in indexers:
+ ax0_indexer = indexers.pop(0)
+ blknos = algos.take_1d(mgr._blknos, ax0_indexer, fill_value=-1)
+ blklocs = algos.take_1d(mgr._blklocs, ax0_indexer, fill_value=-1)
+ else:
+
+ if mgr._is_single_block:
+ blk = mgr.blocks[0]
+ return [(blk.mgr_locs, JoinUnit(blk, mgr_shape, indexers))]
+
+ ax0_indexer = None
+ blknos = mgr._blknos
+ blklocs = mgr._blklocs
+
+ plan = []
+ for blkno, placements in libinternals.get_blkno_placements(blknos,
+ mgr.nblocks,
+ group=False):
+
+ assert placements.is_slice_like
+
+ join_unit_indexers = indexers.copy()
+
+ shape = list(mgr_shape)
+ shape[0] = len(placements)
+ shape = tuple(shape)
+
+ if blkno == -1:
+ unit = JoinUnit(None, shape)
+ else:
+ blk = mgr.blocks[blkno]
+ ax0_blk_indexer = blklocs[placements.indexer]
+
+ unit_no_ax0_reindexing = (len(placements) == len(blk.mgr_locs) and
+ # Fastpath detection of join unit not
+ # needing to reindex its block: no ax0
+ # reindexing took place and block
+ # placement was sequential before.
+ ((ax0_indexer is None and
+ blk.mgr_locs.is_slice_like and
+ blk.mgr_locs.as_slice.step == 1) or
+ # Slow-ish detection: all indexer locs
+ # are sequential (and length match is
+ # checked above).
+ (np.diff(ax0_blk_indexer) == 1).all()))
+
+ # Omit indexer if no item reindexing is required.
+ if unit_no_ax0_reindexing:
+ join_unit_indexers.pop(0, None)
+ else:
+ join_unit_indexers[0] = ax0_blk_indexer
+
+ unit = JoinUnit(blk, shape, join_unit_indexers)
+
+ plan.append((placements, unit))
+
+ return plan
+
+
+class JoinUnit(object):
+
+ def __init__(self, block, shape, indexers=None):
+ # Passing shape explicitly is required for cases when block is None.
+ if indexers is None:
+ indexers = {}
+ self.block = block
+ self.indexers = indexers
+ self.shape = shape
+
+ def __repr__(self):
+ return '{name}({block!r}, {indexers})'.format(
+ name=self.__class__.__name__, block=self.block,
+ indexers=self.indexers)
+
+ @cache_readonly
+ def needs_filling(self):
+ for indexer in self.indexers.values():
+ # FIXME: cache results of indexer == -1 checks.
+ if (indexer == -1).any():
+ return True
+
+ return False
+
+ @cache_readonly
+ def dtype(self):
+ if self.block is None:
+ raise AssertionError("Block is None, no dtype")
+
+ if not self.needs_filling:
+ return self.block.dtype
+ else:
+ return _get_dtype(maybe_promote(self.block.dtype,
+ self.block.fill_value)[0])
+
+ @cache_readonly
+ def is_na(self):
+ if self.block is None:
+ return True
+
+ if not self.block._can_hold_na:
+ return False
+
+        # Usually it's enough to check only a small fraction of the values to
+        # see if a block is NOT null; chunking should help in such cases. The
+        # chunk size of 1000 was chosen rather arbitrarily.
+ values = self.block.values
+ if self.block.is_categorical:
+ values_flat = values.categories
+ elif is_sparse(self.block.values.dtype):
+ return False
+ elif self.block.is_extension:
+ values_flat = values
+ else:
+ values_flat = values.ravel(order='K')
+ total_len = values_flat.shape[0]
+ chunk_len = max(total_len // 40, 1000)
+ for i in range(0, total_len, chunk_len):
+ if not isna(values_flat[i:i + chunk_len]).all():
+ return False
+
+ return True
+
+ def get_reindexed_values(self, empty_dtype, upcasted_na):
+ if upcasted_na is None:
+ # No upcasting is necessary
+ fill_value = self.block.fill_value
+ values = self.block.get_values()
+ else:
+ fill_value = upcasted_na
+
+ if self.is_na:
+ if getattr(self.block, 'is_object', False):
+ # we want to avoid filling with np.nan if we are
+ # using None; we already know that we are all
+ # nulls
+ values = self.block.values.ravel(order='K')
+ if len(values) and values[0] is None:
+ fill_value = None
+
+ if (getattr(self.block, 'is_datetimetz', False) or
+ is_datetime64tz_dtype(empty_dtype)):
+ if self.block is None:
+ array = empty_dtype.construct_array_type()
+ return array(np.full(self.shape[1], fill_value.value),
+ dtype=empty_dtype)
+ pass
+ elif getattr(self.block, 'is_categorical', False):
+ pass
+ elif getattr(self.block, 'is_sparse', False):
+ pass
+ elif getattr(self.block, 'is_extension', False):
+ pass
+ else:
+ missing_arr = np.empty(self.shape, dtype=empty_dtype)
+ missing_arr.fill(fill_value)
+ return missing_arr
+
+ if not self.indexers:
+ if not self.block._can_consolidate:
+ # preserve these for validation in _concat_compat
+ return self.block.values
+
+ if self.block.is_bool and not self.block.is_categorical:
+ # External code requested filling/upcasting, bool values must
+ # be upcasted to object to avoid being upcasted to numeric.
+ values = self.block.astype(np.object_).values
+ elif self.block.is_extension:
+ values = self.block.values
+ else:
+ # No dtype upcasting is done here, it will be performed during
+ # concatenation itself.
+ values = self.block.get_values()
+
+ if not self.indexers:
+ # If there's no indexing to be done, we want to signal outside
+ # code that this array must be copied explicitly. This is done
+ # by returning a view and checking `retval.base`.
+ values = values.view()
+
+ else:
+ for ax, indexer in self.indexers.items():
+ values = algos.take_nd(values, indexer, axis=ax,
+ fill_value=fill_value)
+
+ return values
+
+
+def concatenate_join_units(join_units, concat_axis, copy):
+ """
+ Concatenate values from several join units along selected axis.
+ """
+ if concat_axis == 0 and len(join_units) > 1:
+ # Concatenating join units along ax0 is handled in _merge_blocks.
+ raise AssertionError("Concatenating join units along axis0")
+
+ empty_dtype, upcasted_na = get_empty_dtype_and_na(join_units)
+
+ to_concat = [ju.get_reindexed_values(empty_dtype=empty_dtype,
+ upcasted_na=upcasted_na)
+ for ju in join_units]
+
+ if len(to_concat) == 1:
+ # Only one block, nothing to concatenate.
+ concat_values = to_concat[0]
+ if copy:
+ if isinstance(concat_values, np.ndarray):
+ # non-reindexed (=not yet copied) arrays are made into a view
+ # in JoinUnit.get_reindexed_values
+ if concat_values.base is not None:
+ concat_values = concat_values.copy()
+ else:
+ concat_values = concat_values.copy()
+ else:
+ concat_values = _concat._concat_compat(to_concat, axis=concat_axis)
+
+ return concat_values
+
+
+def get_empty_dtype_and_na(join_units):
+ """
+ Return dtype and N/A values to use when concatenating specified units.
+
+ Returned N/A value may be None which means there was no casting involved.
+
+ Returns
+ -------
+ dtype
+ na
+ """
+ if len(join_units) == 1:
+ blk = join_units[0].block
+ if blk is None:
+ return np.float64, np.nan
+
+ if is_uniform_reindex(join_units):
+ # XXX: integrate property
+ empty_dtype = join_units[0].block.dtype
+ upcasted_na = join_units[0].block.fill_value
+ return empty_dtype, upcasted_na
+
+ has_none_blocks = False
+ dtypes = [None] * len(join_units)
+ for i, unit in enumerate(join_units):
+ if unit.block is None:
+ has_none_blocks = True
+ else:
+ dtypes[i] = unit.dtype
+
+ upcast_classes = defaultdict(list)
+ null_upcast_classes = defaultdict(list)
+ for dtype, unit in zip(dtypes, join_units):
+ if dtype is None:
+ continue
+
+ if is_categorical_dtype(dtype):
+ upcast_cls = 'category'
+ elif is_datetime64tz_dtype(dtype):
+ upcast_cls = 'datetimetz'
+ elif issubclass(dtype.type, np.bool_):
+ upcast_cls = 'bool'
+ elif issubclass(dtype.type, np.object_):
+ upcast_cls = 'object'
+ elif is_datetime64_dtype(dtype):
+ upcast_cls = 'datetime'
+ elif is_timedelta64_dtype(dtype):
+ upcast_cls = 'timedelta'
+ elif is_sparse(dtype):
+ upcast_cls = dtype.subtype.name
+ elif is_extension_array_dtype(dtype):
+ upcast_cls = 'object'
+ elif is_float_dtype(dtype) or is_numeric_dtype(dtype):
+ upcast_cls = dtype.name
+ else:
+ upcast_cls = 'float'
+
+ # Null blocks should not influence upcast class selection, unless there
+ # are only null blocks, when same upcasting rules must be applied to
+ # null upcast classes.
+ if unit.is_na:
+ null_upcast_classes[upcast_cls].append(dtype)
+ else:
+ upcast_classes[upcast_cls].append(dtype)
+
+ if not upcast_classes:
+ upcast_classes = null_upcast_classes
+
+ # create the result
+ if 'object' in upcast_classes:
+ return np.dtype(np.object_), np.nan
+ elif 'bool' in upcast_classes:
+ if has_none_blocks:
+ return np.dtype(np.object_), np.nan
+ else:
+ return np.dtype(np.bool_), None
+ elif 'category' in upcast_classes:
+ return np.dtype(np.object_), np.nan
+ elif 'datetimetz' in upcast_classes:
+ # GH-25014. We use NaT instead of iNaT, since this eventually
+ # ends up in DatetimeArray.take, which does not allow iNaT.
+ dtype = upcast_classes['datetimetz']
+ return dtype[0], tslibs.NaT
+ elif 'datetime' in upcast_classes:
+ return np.dtype('M8[ns]'), tslibs.iNaT
+ elif 'timedelta' in upcast_classes:
+ return np.dtype('m8[ns]'), tslibs.iNaT
+ else: # pragma
+ try:
+ g = np.find_common_type(upcast_classes, [])
+ except TypeError:
+ # At least one is an ExtensionArray
+ return np.dtype(np.object_), np.nan
+ else:
+ if is_float_dtype(g):
+ return g, g.type(np.nan)
+ elif is_numeric_dtype(g):
+ if has_none_blocks:
+ return np.float64, np.nan
+ else:
+ return g, None
+
+ msg = "invalid dtype determination in get_concat_dtype"
+ raise AssertionError(msg)
+
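+# Illustrative sketch (not part of the upstream pandas source): mixing an
+# int64 join unit with a float64 join unit upcasts to float64 and fills with
+# NaN. Commented out so nothing executes at import time.
+#
+#   >>> from pandas.core.internals.blocks import make_block
+#   >>> int_blk = make_block(np.array([[1, 2]]), placement=[0])
+#   >>> flt_blk = make_block(np.array([[1.5, 2.5]]), placement=[0])
+#   >>> units = [JoinUnit(int_blk, (1, 2)), JoinUnit(flt_blk, (1, 2))]
+#   >>> get_empty_dtype_and_na(units)
+#   (dtype('float64'), nan)
+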
+
+def is_uniform_join_units(join_units):
+ """
+ Check if the join units consist of blocks of uniform type that can
+ be concatenated using Block.concat_same_type instead of the generic
+ concatenate_join_units (which uses `_concat._concat_compat`).
+
+ """
+ return (
+ # all blocks need to have the same type
+ all(type(ju.block) is type(join_units[0].block) for ju in join_units) and # noqa
+ # no blocks that would get missing values (can lead to type upcasts)
+ # unless we're an extension dtype.
+ all(not ju.is_na or ju.block.is_extension for ju in join_units) and
+ # no blocks with indexers (as then the dimensions do not fit)
+ all(not ju.indexers for ju in join_units) and
+ # disregard Panels
+ all(ju.block.ndim <= 2 for ju in join_units) and
+ # only use this path when there is something to concatenate
+ len(join_units) > 1)
+
+
+def is_uniform_reindex(join_units):
+ return (
+ # TODO: should this be ju.block._can_hold_na?
+ all(ju.block and ju.block.is_extension for ju in join_units) and
+ len({ju.block.dtype.name for ju in join_units}) == 1
+ )
+
+
+def trim_join_unit(join_unit, length):
+ """
+ Reduce join_unit's shape along item axis to length.
+
+ Extra items that didn't fit are returned as a separate block.
+ """
+
+ if 0 not in join_unit.indexers:
+ extra_indexers = join_unit.indexers
+
+ if join_unit.block is None:
+ extra_block = None
+ else:
+ extra_block = join_unit.block.getitem_block(slice(length, None))
+ join_unit.block = join_unit.block.getitem_block(slice(length))
+ else:
+ extra_block = join_unit.block
+
+ extra_indexers = copy.copy(join_unit.indexers)
+ extra_indexers[0] = extra_indexers[0][length:]
+ join_unit.indexers[0] = join_unit.indexers[0][:length]
+
+ extra_shape = (join_unit.shape[0] - length,) + join_unit.shape[1:]
+ join_unit.shape = (length,) + join_unit.shape[1:]
+
+ return JoinUnit(block=extra_block, indexers=extra_indexers,
+ shape=extra_shape)
+
+
+def combine_concat_plans(plans, concat_axis):
+ """
+ Combine multiple concatenation plans into one.
+
+    The input join units may be trimmed in place.
+ """
+ if len(plans) == 1:
+ for p in plans[0]:
+ yield p[0], [p[1]]
+
+ elif concat_axis == 0:
+ offset = 0
+ for plan in plans:
+ last_plc = None
+
+ for plc, unit in plan:
+ yield plc.add(offset), [unit]
+ last_plc = plc
+
+ if last_plc is not None:
+ offset += last_plc.as_slice.stop
+
+ else:
+ num_ended = [0]
+
+ def _next_or_none(seq):
+ retval = next(seq, None)
+ if retval is None:
+ num_ended[0] += 1
+ return retval
+
+ plans = list(map(iter, plans))
+ next_items = list(map(_next_or_none, plans))
+
+ while num_ended[0] != len(next_items):
+ if num_ended[0] > 0:
+ raise ValueError("Plan shapes are not aligned")
+
+ placements, units = zip(*next_items)
+
+ lengths = list(map(len, placements))
+ min_len, max_len = min(lengths), max(lengths)
+
+ if min_len == max_len:
+ yield placements[0], units
+ next_items[:] = map(_next_or_none, plans)
+ else:
+ yielded_placement = None
+ yielded_units = [None] * len(next_items)
+ for i, (plc, unit) in enumerate(next_items):
+ yielded_units[i] = unit
+ if len(plc) > min_len:
+ # trim_join_unit updates unit in place, so only
+ # placement needs to be sliced to skip min_len.
+ next_items[i] = (plc[min_len:],
+ trim_join_unit(unit, min_len))
+ else:
+ yielded_placement = plc
+ next_items[i] = _next_or_none(plans[i])
+
+ yield yielded_placement, yielded_units
diff --git a/contrib/python/pandas/py2/pandas/core/internals/construction.py b/contrib/python/pandas/py2/pandas/core/internals/construction.py
new file mode 100644
index 00000000000..c05a9a0f8f3
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/core/internals/construction.py
@@ -0,0 +1,721 @@
+"""
+Functions for preparing various inputs passed to the DataFrame or Series
+constructors before passing them to a BlockManager.
+"""
+from collections import OrderedDict
+
+import numpy as np
+import numpy.ma as ma
+
+from pandas._libs import lib
+from pandas._libs.tslibs import IncompatibleFrequency
+import pandas.compat as compat
+from pandas.compat import (
+ get_range_parameters, lmap, lrange, raise_with_traceback, range)
+
+from pandas.core.dtypes.cast import (
+ construct_1d_arraylike_from_scalar, construct_1d_ndarray_preserving_na,
+ construct_1d_object_array_from_listlike, infer_dtype_from_scalar,
+ maybe_cast_to_datetime, maybe_cast_to_integer_array, maybe_castable,
+ maybe_convert_platform, maybe_infer_to_datetimelike, maybe_upcast)
+from pandas.core.dtypes.common import (
+ is_categorical_dtype, is_datetime64tz_dtype, is_dtype_equal,
+ is_extension_array_dtype, is_extension_type, is_float_dtype,
+ is_integer_dtype, is_iterator, is_list_like, is_object_dtype, pandas_dtype)
+from pandas.core.dtypes.generic import (
+ ABCDataFrame, ABCDatetimeIndex, ABCIndexClass, ABCPandasArray,
+ ABCPeriodIndex, ABCSeries, ABCTimedeltaIndex)
+from pandas.core.dtypes.missing import isna
+
+from pandas.core import algorithms, common as com
+from pandas.core.arrays import Categorical, ExtensionArray, period_array
+from pandas.core.index import (
+ Index, _get_objs_combined_axis, _union_indexes, ensure_index)
+from pandas.core.indexes import base as ibase
+from pandas.core.internals import (
+ create_block_manager_from_arrays, create_block_manager_from_blocks)
+from pandas.core.internals.arrays import extract_array
+
+# ---------------------------------------------------------------------
+# BlockManager Interface
+
+
+def arrays_to_mgr(arrays, arr_names, index, columns, dtype=None):
+ """
+ Segregate Series based on type and coerce into matrices.
+
+ Needs to handle a lot of exceptional cases.
+ """
+ # figure out the index, if necessary
+ if index is None:
+ index = extract_index(arrays)
+ else:
+ index = ensure_index(index)
+
+ # don't force copy because getting jammed in an ndarray anyway
+ arrays = _homogenize(arrays, index, dtype)
+
+ # from BlockManager perspective
+ axes = [ensure_index(columns), index]
+
+ return create_block_manager_from_arrays(arrays, arr_names, axes)
+
+
+def masked_rec_array_to_mgr(data, index, columns, dtype, copy):
+ """
+ Extract from a masked rec array and create the manager.
+ """
+
+ # essentially process a record array then fill it
+ fill_value = data.fill_value
+ fdata = ma.getdata(data)
+ if index is None:
+ index = get_names_from_index(fdata)
+ if index is None:
+ index = ibase.default_index(len(data))
+ index = ensure_index(index)
+
+ if columns is not None:
+ columns = ensure_index(columns)
+ arrays, arr_columns = to_arrays(fdata, columns)
+
+ # fill if needed
+ new_arrays = []
+ for fv, arr, col in zip(fill_value, arrays, arr_columns):
+ mask = ma.getmaskarray(data[col])
+ if mask.any():
+ arr, fv = maybe_upcast(arr, fill_value=fv, copy=True)
+ arr[mask] = fv
+ new_arrays.append(arr)
+
+ # create the manager
+ arrays, arr_columns = reorder_arrays(new_arrays, arr_columns, columns)
+ if columns is None:
+ columns = arr_columns
+
+ mgr = arrays_to_mgr(arrays, arr_columns, index, columns, dtype)
+
+ if copy:
+ mgr = mgr.copy()
+ return mgr
+
+
+# ---------------------------------------------------------------------
+# DataFrame Constructor Interface
+
+def init_ndarray(values, index, columns, dtype=None, copy=False):
+ # input must be a ndarray, list, Series, index
+
+ if isinstance(values, ABCSeries):
+ if columns is None:
+ if values.name is not None:
+ columns = [values.name]
+ if index is None:
+ index = values.index
+ else:
+ values = values.reindex(index)
+
+ # zero len case (GH #2234)
+ if not len(values) and columns is not None and len(columns):
+ values = np.empty((0, 1), dtype=object)
+
+ # we could have a categorical type passed or coerced to 'category'
+ # recast this to an arrays_to_mgr
+ if (is_categorical_dtype(getattr(values, 'dtype', None)) or
+ is_categorical_dtype(dtype)):
+
+ if not hasattr(values, 'dtype'):
+ values = prep_ndarray(values, copy=copy)
+ values = values.ravel()
+ elif copy:
+ values = values.copy()
+
+ index, columns = _get_axes(len(values), 1, index, columns)
+ return arrays_to_mgr([values], columns, index, columns,
+ dtype=dtype)
+ elif (is_datetime64tz_dtype(values) or
+ is_extension_array_dtype(values)):
+ # GH#19157
+ if columns is None:
+ columns = [0]
+ return arrays_to_mgr([values], columns, index, columns,
+ dtype=dtype)
+
+ # by definition an array here
+ # the dtypes will be coerced to a single dtype
+ values = prep_ndarray(values, copy=copy)
+
+ if dtype is not None:
+ if not is_dtype_equal(values.dtype, dtype):
+ try:
+ values = values.astype(dtype)
+ except Exception as orig:
+ e = ValueError("failed to cast to '{dtype}' (Exception "
+ "was: {orig})".format(dtype=dtype,
+ orig=orig))
+ raise_with_traceback(e)
+
+ index, columns = _get_axes(*values.shape, index=index, columns=columns)
+ values = values.T
+
+ # if we don't have a dtype specified, then try to convert objects
+ # on the entire block; this is to convert if we have datetimelike's
+ # embedded in an object type
+ if dtype is None and is_object_dtype(values):
+ values = maybe_infer_to_datetimelike(values)
+
+ return create_block_manager_from_blocks([values], [columns, index])
+
+
+def init_dict(data, index, columns, dtype=None):
+ """
+ Segregate Series based on type and coerce into matrices.
+ Needs to handle a lot of exceptional cases.
+ """
+ if columns is not None:
+ from pandas.core.series import Series
+ arrays = Series(data, index=columns, dtype=object)
+ data_names = arrays.index
+
+ missing = arrays.isnull()
+ if index is None:
+ # GH10856
+ # raise ValueError if only scalars in dict
+ index = extract_index(arrays[~missing])
+ else:
+ index = ensure_index(index)
+
+ # no obvious "empty" int column
+ if missing.any() and not is_integer_dtype(dtype):
+ if dtype is None or np.issubdtype(dtype, np.flexible):
+ # GH#1783
+ nan_dtype = object
+ else:
+ nan_dtype = dtype
+ val = construct_1d_arraylike_from_scalar(np.nan, len(index),
+ nan_dtype)
+ arrays.loc[missing] = [val] * missing.sum()
+
+ else:
+
+ for key in data:
+ if (isinstance(data[key], ABCDatetimeIndex) and
+ data[key].tz is not None):
+ # GH#24096 need copy to be deep for datetime64tz case
+ # TODO: See if we can avoid these copies
+ data[key] = data[key].copy(deep=True)
+
+ keys = com.dict_keys_to_ordered_list(data)
+ columns = data_names = Index(keys)
+ arrays = [data[k] for k in keys]
+
+ return arrays_to_mgr(arrays, data_names, index, columns, dtype=dtype)
+
+
+# ---------------------------------------------------------------------
+
+def prep_ndarray(values, copy=True):
+ if not isinstance(values, (np.ndarray, ABCSeries, Index)):
+ if len(values) == 0:
+ return np.empty((0, 0), dtype=object)
+
+ def convert(v):
+ return maybe_convert_platform(v)
+
+ # we could have a 1-dim or 2-dim list here
+ # this is equiv of np.asarray, but does object conversion
+ # and platform dtype preservation
+ try:
+ if is_list_like(values[0]) or hasattr(values[0], 'len'):
+ values = np.array([convert(v) for v in values])
+ elif isinstance(values[0], np.ndarray) and values[0].ndim == 0:
+ # GH#21861
+ values = np.array([convert(v) for v in values])
+ else:
+ values = convert(values)
+ except (ValueError, TypeError):
+ values = convert(values)
+
+ else:
+
+ # drop subclass info, do not copy data
+ values = np.asarray(values)
+ if copy:
+ values = values.copy()
+
+ if values.ndim == 1:
+ values = values.reshape((values.shape[0], 1))
+ elif values.ndim != 2:
+ raise ValueError('Must pass 2-d input')
+
+ return values
+
+
+def _homogenize(data, index, dtype=None):
+ oindex = None
+ homogenized = []
+
+ for val in data:
+ if isinstance(val, ABCSeries):
+ if dtype is not None:
+ val = val.astype(dtype)
+ if val.index is not index:
+ # Forces alignment. No need to copy data since we
+ # are putting it into an ndarray later
+ val = val.reindex(index, copy=False)
+ else:
+ if isinstance(val, dict):
+ if oindex is None:
+ oindex = index.astype('O')
+
+ if isinstance(index, (ABCDatetimeIndex, ABCTimedeltaIndex)):
+ val = com.dict_compat(val)
+ else:
+ val = dict(val)
+ val = lib.fast_multiget(val, oindex.values, default=np.nan)
+ val = sanitize_array(val, index, dtype=dtype, copy=False,
+ raise_cast_failure=False)
+
+ homogenized.append(val)
+
+ return homogenized
+
+
+def extract_index(data):
+ index = None
+ if len(data) == 0:
+ index = Index([])
+ elif len(data) > 0:
+ raw_lengths = []
+ indexes = []
+
+ have_raw_arrays = False
+ have_series = False
+ have_dicts = False
+
+ for val in data:
+ if isinstance(val, ABCSeries):
+ have_series = True
+ indexes.append(val.index)
+ elif isinstance(val, dict):
+ have_dicts = True
+ indexes.append(list(val.keys()))
+ elif is_list_like(val) and getattr(val, 'ndim', 1) == 1:
+ have_raw_arrays = True
+ raw_lengths.append(len(val))
+
+ if not indexes and not raw_lengths:
+ raise ValueError('If using all scalar values, you must pass'
+ ' an index')
+
+ if have_series or have_dicts:
+ index = _union_indexes(indexes)
+
+ if have_raw_arrays:
+ lengths = list(set(raw_lengths))
+ if len(lengths) > 1:
+ raise ValueError('arrays must all be same length')
+
+ if have_dicts:
+ raise ValueError('Mixing dicts with non-Series may lead to '
+ 'ambiguous ordering.')
+
+ if have_series:
+ if lengths[0] != len(index):
+ msg = ('array length {length} does not match index '
+ 'length {idx_len}'
+ .format(length=lengths[0], idx_len=len(index)))
+ raise ValueError(msg)
+ else:
+ index = ibase.default_index(lengths[0])
+
+ return ensure_index(index)
+
+
+def reorder_arrays(arrays, arr_columns, columns):
+ # reorder according to the columns
+ if (columns is not None and len(columns) and arr_columns is not None and
+ len(arr_columns)):
+ indexer = ensure_index(arr_columns).get_indexer(columns)
+ arr_columns = ensure_index([arr_columns[i] for i in indexer])
+ arrays = [arrays[i] for i in indexer]
+ return arrays, arr_columns
+
+
+def get_names_from_index(data):
+ has_some_name = any(getattr(s, 'name', None) is not None for s in data)
+ if not has_some_name:
+ return ibase.default_index(len(data))
+
+ index = lrange(len(data))
+ count = 0
+ for i, s in enumerate(data):
+ n = getattr(s, 'name', None)
+ if n is not None:
+ index[i] = n
+ else:
+ index[i] = 'Unnamed {count}'.format(count=count)
+ count += 1
+
+ return index
+
+
+def _get_axes(N, K, index, columns):
+ # helper to create the axes as indexes
+ # return axes or defaults
+
+ if index is None:
+ index = ibase.default_index(N)
+ else:
+ index = ensure_index(index)
+
+ if columns is None:
+ columns = ibase.default_index(K)
+ else:
+ columns = ensure_index(columns)
+ return index, columns
+
+
+# ---------------------------------------------------------------------
+# Conversion of Inputs to Arrays
+
+def to_arrays(data, columns, coerce_float=False, dtype=None):
+ """
+ Return list of arrays, columns.
+ """
+ if isinstance(data, ABCDataFrame):
+ if columns is not None:
+ arrays = [data._ixs(i, axis=1).values
+ for i, col in enumerate(data.columns) if col in columns]
+ else:
+ columns = data.columns
+ arrays = [data._ixs(i, axis=1).values for i in range(len(columns))]
+
+ return arrays, columns
+
+ if not len(data):
+ if isinstance(data, np.ndarray):
+ columns = data.dtype.names
+ if columns is not None:
+ return [[]] * len(columns), columns
+ return [], [] # columns if columns is not None else []
+ if isinstance(data[0], (list, tuple)):
+ return _list_to_arrays(data, columns, coerce_float=coerce_float,
+ dtype=dtype)
+ elif isinstance(data[0], compat.Mapping):
+ return _list_of_dict_to_arrays(data, columns,
+ coerce_float=coerce_float, dtype=dtype)
+ elif isinstance(data[0], ABCSeries):
+ return _list_of_series_to_arrays(data, columns,
+ coerce_float=coerce_float,
+ dtype=dtype)
+ elif isinstance(data[0], Categorical):
+ if columns is None:
+ columns = ibase.default_index(len(data))
+ return data, columns
+ elif (isinstance(data, (np.ndarray, ABCSeries, Index)) and
+ data.dtype.names is not None):
+
+ columns = list(data.dtype.names)
+ arrays = [data[k] for k in columns]
+ return arrays, columns
+ else:
+ # last ditch effort
+ data = lmap(tuple, data)
+ return _list_to_arrays(data, columns, coerce_float=coerce_float,
+ dtype=dtype)
+
+
+def _list_to_arrays(data, columns, coerce_float=False, dtype=None):
+ if len(data) > 0 and isinstance(data[0], tuple):
+ content = list(lib.to_object_array_tuples(data).T)
+ else:
+ # list of lists
+ content = list(lib.to_object_array(data).T)
+ return _convert_object_array(content, columns, dtype=dtype,
+ coerce_float=coerce_float)
+
+
+def _list_of_series_to_arrays(data, columns, coerce_float=False, dtype=None):
+ if columns is None:
+ columns = _get_objs_combined_axis(data, sort=False)
+
+ indexer_cache = {}
+
+ aligned_values = []
+ for s in data:
+ index = getattr(s, 'index', None)
+ if index is None:
+ index = ibase.default_index(len(s))
+
+ if id(index) in indexer_cache:
+ indexer = indexer_cache[id(index)]
+ else:
+ indexer = indexer_cache[id(index)] = index.get_indexer(columns)
+
+ values = com.values_from_object(s)
+ aligned_values.append(algorithms.take_1d(values, indexer))
+
+ values = np.vstack(aligned_values)
+
+ if values.dtype == np.object_:
+ content = list(values.T)
+ return _convert_object_array(content, columns, dtype=dtype,
+ coerce_float=coerce_float)
+ else:
+ return values.T, columns
+
+
+def _list_of_dict_to_arrays(data, columns, coerce_float=False, dtype=None):
+ if columns is None:
+ gen = (list(x.keys()) for x in data)
+ sort = not any(isinstance(d, OrderedDict) for d in data)
+ columns = lib.fast_unique_multiple_list_gen(gen, sort=sort)
+
+ # assure that they are of the base dict class and not of derived
+ # classes
+ data = [(type(d) is dict) and d or dict(d) for d in data]
+
+ content = list(lib.dicts_to_array(data, list(columns)).T)
+ return _convert_object_array(content, columns, dtype=dtype,
+ coerce_float=coerce_float)
+
+
+def _convert_object_array(content, columns, coerce_float=False, dtype=None):
+ if columns is None:
+ columns = ibase.default_index(len(content))
+ else:
+ if len(columns) != len(content): # pragma: no cover
+ # caller's responsibility to check for this...
+ raise AssertionError('{col:d} columns passed, passed data had '
+ '{con} columns'.format(col=len(columns),
+ con=len(content)))
+
+ # provide soft conversion of object dtypes
+ def convert(arr):
+ if dtype != object and dtype != np.object:
+ arr = lib.maybe_convert_objects(arr, try_float=coerce_float)
+ arr = maybe_cast_to_datetime(arr, dtype)
+ return arr
+
+ arrays = [convert(arr) for arr in content]
+
+ return arrays, columns
+
+
+# ---------------------------------------------------------------------
+# Series-Based
+
+def sanitize_index(data, index, copy=False):
+ """
+    Sanitize an index type to return an ndarray of the underlying values;
+    pass a non-Index through unchanged.
+ """
+
+ if index is None:
+ return data
+
+ if len(data) != len(index):
+ raise ValueError('Length of values does not match length of index')
+
+ if isinstance(data, ABCIndexClass) and not copy:
+ pass
+ elif isinstance(data, (ABCPeriodIndex, ABCDatetimeIndex)):
+ data = data._values
+ if copy:
+ data = data.copy()
+
+ elif isinstance(data, np.ndarray):
+
+ # coerce datetimelike types
+ if data.dtype.kind in ['M', 'm']:
+ data = sanitize_array(data, index, copy=copy)
+
+ return data
+
+
+def sanitize_array(data, index, dtype=None, copy=False,
+ raise_cast_failure=False):
+ """
+ Sanitize input data to an ndarray, copy if specified, coerce to the
+ dtype if specified.
+ """
+ if dtype is not None:
+ dtype = pandas_dtype(dtype)
+
+ if isinstance(data, ma.MaskedArray):
+ mask = ma.getmaskarray(data)
+ if mask.any():
+ data, fill_value = maybe_upcast(data, copy=True)
+ data.soften_mask() # set hardmask False if it was True
+ data[mask] = fill_value
+ else:
+ data = data.copy()
+
+ data = extract_array(data, extract_numpy=True)
+
+ # GH#846
+ if isinstance(data, np.ndarray):
+
+ if dtype is not None:
+ subarr = np.array(data, copy=False)
+
+ # possibility of nan -> garbage
+ if is_float_dtype(data.dtype) and is_integer_dtype(dtype):
+ try:
+ subarr = _try_cast(data, True, dtype, copy,
+ True)
+ except ValueError:
+ if copy:
+ subarr = data.copy()
+ else:
+ subarr = _try_cast(data, True, dtype, copy, raise_cast_failure)
+ elif isinstance(data, Index):
+ # don't coerce Index types
+ # e.g. indexes can have different conversions (so don't fast path
+ # them)
+ # GH#6140
+ subarr = sanitize_index(data, index, copy=copy)
+ else:
+
+            # we will try to copy by definition here
+ subarr = _try_cast(data, True, dtype, copy, raise_cast_failure)
+
+ elif isinstance(data, ExtensionArray):
+ if isinstance(data, ABCPandasArray):
+ # We don't want to let people put our PandasArray wrapper
+ # (the output of Series/Index.array), into a Series. So
+ # we explicitly unwrap it here.
+ subarr = data.to_numpy()
+ else:
+ subarr = data
+
+        # everything else in this block must also handle ndarrays,
+        # because we've unwrapped PandasArray into an ndarray.
+
+ if dtype is not None:
+ subarr = data.astype(dtype)
+
+ if copy:
+ subarr = data.copy()
+ return subarr
+
+ elif isinstance(data, (list, tuple)) and len(data) > 0:
+ if dtype is not None:
+ try:
+ subarr = _try_cast(data, False, dtype, copy,
+ raise_cast_failure)
+ except Exception:
+ if raise_cast_failure: # pragma: no cover
+ raise
+ subarr = np.array(data, dtype=object, copy=copy)
+ subarr = lib.maybe_convert_objects(subarr)
+
+ else:
+ subarr = maybe_convert_platform(data)
+
+ subarr = maybe_cast_to_datetime(subarr, dtype)
+
+ elif isinstance(data, range):
+ # GH#16804
+ start, stop, step = get_range_parameters(data)
+ arr = np.arange(start, stop, step, dtype='int64')
+ subarr = _try_cast(arr, False, dtype, copy, raise_cast_failure)
+ else:
+ subarr = _try_cast(data, False, dtype, copy, raise_cast_failure)
+
+ # scalar like, GH
+ if getattr(subarr, 'ndim', 0) == 0:
+ if isinstance(data, list): # pragma: no cover
+ subarr = np.array(data, dtype=object)
+ elif index is not None:
+ value = data
+
+ # figure out the dtype from the value (upcast if necessary)
+ if dtype is None:
+ dtype, value = infer_dtype_from_scalar(value)
+ else:
+ # need to possibly convert the value here
+ value = maybe_cast_to_datetime(value, dtype)
+
+ subarr = construct_1d_arraylike_from_scalar(
+ value, len(index), dtype)
+
+ else:
+ return subarr.item()
+
+ # the result that we want
+ elif subarr.ndim == 1:
+ if index is not None:
+
+ # a 1-element ndarray
+ if len(subarr) != len(index) and len(subarr) == 1:
+ subarr = construct_1d_arraylike_from_scalar(
+ subarr[0], len(index), subarr.dtype)
+
+ elif subarr.ndim > 1:
+ if isinstance(data, np.ndarray):
+ raise Exception('Data must be 1-dimensional')
+ else:
+ subarr = com.asarray_tuplesafe(data, dtype=dtype)
+
+    # This is to prevent mixed-type Series from getting all cast to
+    # NumPy string type, e.g. NaN --> '-1#IND'.
+ if issubclass(subarr.dtype.type, compat.string_types):
+ # GH#16605
+ # If not empty convert the data to dtype
+ # GH#19853: If data is a scalar, subarr has already the result
+ if not lib.is_scalar(data):
+ if not np.all(isna(data)):
+ data = np.array(data, dtype=dtype, copy=False)
+ subarr = np.array(data, dtype=object, copy=copy)
+
+ if is_object_dtype(subarr.dtype) and dtype != 'object':
+ inferred = lib.infer_dtype(subarr, skipna=False)
+ if inferred == 'period':
+ try:
+ subarr = period_array(subarr)
+ except IncompatibleFrequency:
+ pass
+
+ return subarr
+
+
+def _try_cast(arr, take_fast_path, dtype, copy, raise_cast_failure):
+
+ # perf shortcut as this is the most common case
+ if take_fast_path:
+ if maybe_castable(arr) and not copy and dtype is None:
+ return arr
+
+ try:
+        # GH#15832: Check if we are requesting a numeric dtype and
+ # that we can convert the data to the requested dtype.
+ if is_integer_dtype(dtype):
+ subarr = maybe_cast_to_integer_array(arr, dtype)
+
+ subarr = maybe_cast_to_datetime(arr, dtype)
+ # Take care in creating object arrays (but iterators are not
+ # supported):
+ if is_object_dtype(dtype) and (is_list_like(subarr) and
+ not (is_iterator(subarr) or
+ isinstance(subarr, np.ndarray))):
+ subarr = construct_1d_object_array_from_listlike(subarr)
+ elif not is_extension_type(subarr):
+ subarr = construct_1d_ndarray_preserving_na(subarr, dtype,
+ copy=copy)
+ except (ValueError, TypeError):
+ if is_categorical_dtype(dtype):
+ # We *do* allow casting to categorical, since we know
+ # that Categorical is the only array type for 'category'.
+ subarr = Categorical(arr, dtype.categories,
+ ordered=dtype.ordered)
+ elif is_extension_array_dtype(dtype):
+ # create an extension array from its dtype
+ array_type = dtype.construct_array_type()._from_sequence
+ subarr = array_type(arr, dtype=dtype, copy=copy)
+ elif dtype is not None and raise_cast_failure:
+ raise
+ else:
+ subarr = np.array(arr, dtype=object, copy=copy)
+ return subarr
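+
+# Illustrative sketch (not part of the upstream pandas source): sanitize_array
+# is the Series-construction workhorse. It coerces list-likes to an ndarray
+# and broadcasts scalars against the index. Commented out so nothing runs at
+# import time.
+#
+#   >>> sanitize_array([1, 2, 3], index=None)
+#   array([1, 2, 3])
+#   >>> sanitize_array(3.0, index=ibase.default_index(2))
+#   array([3., 3.])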
diff --git a/contrib/python/pandas/py2/pandas/core/internals/managers.py b/contrib/python/pandas/py2/pandas/core/internals/managers.py
new file mode 100644
index 00000000000..5725b809902
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/core/internals/managers.py
@@ -0,0 +1,2065 @@
+# -*- coding: utf-8 -*-
+from collections import defaultdict
+from functools import partial
+import itertools
+import operator
+import re
+
+import numpy as np
+
+from pandas._libs import internals as libinternals, lib
+from pandas.compat import map, range, zip
+from pandas.util._validators import validate_bool_kwarg
+
+from pandas.core.dtypes.cast import (
+ find_common_type, infer_dtype_from_scalar, maybe_convert_objects,
+ maybe_promote)
+from pandas.core.dtypes.common import (
+ _NS_DTYPE, is_datetimelike_v_numeric, is_extension_array_dtype,
+ is_extension_type, is_list_like, is_numeric_v_string_like, is_scalar)
+import pandas.core.dtypes.concat as _concat
+from pandas.core.dtypes.generic import ABCExtensionArray, ABCSeries
+from pandas.core.dtypes.missing import isna
+
+import pandas.core.algorithms as algos
+from pandas.core.arrays.sparse import _maybe_to_sparse
+from pandas.core.base import PandasObject
+from pandas.core.index import Index, MultiIndex, ensure_index
+from pandas.core.indexing import maybe_convert_indices
+
+from pandas.io.formats.printing import pprint_thing
+
+from .blocks import (
+ Block, CategoricalBlock, DatetimeTZBlock, ExtensionBlock,
+ ObjectValuesExtensionBlock, _extend_blocks, _merge_blocks, _safe_reshape,
+ get_block_type, make_block)
+from .concat import ( # all for concatenate_block_managers
+ combine_concat_plans, concatenate_join_units, get_mgr_concatenation_plan,
+ is_uniform_join_units)
+
+# TODO: flexible with index=None and/or items=None
+
+
+class BlockManager(PandasObject):
+ """
+ Core internal data structure to implement DataFrame, Series, Panel, etc.
+
+    Manage a bunch of labeled 2D mixed-type ndarrays. Essentially it's a
+    lightweight blocked set of labeled data to be manipulated by the
+    DataFrame public API class.
+
+ Attributes
+ ----------
+ shape
+ ndim
+ axes
+ values
+ items
+
+ Methods
+ -------
+ set_axis(axis, new_labels)
+ copy(deep=True)
+
+ get_dtype_counts
+ get_ftype_counts
+ get_dtypes
+ get_ftypes
+
+ apply(func, axes, block_filter_fn)
+
+ get_bool_data
+ get_numeric_data
+
+ get_slice(slice_like, axis)
+ get(label)
+ iget(loc)
+
+ take(indexer, axis)
+ reindex_axis(new_labels, axis)
+ reindex_indexer(new_labels, indexer, axis)
+
+ delete(label)
+ insert(loc, label, value)
+ set(label, value)
+
+ Parameters
+ ----------
+
+
+ Notes
+ -----
+ This is *not* a public API class
+ """
+ __slots__ = ['axes', 'blocks', '_ndim', '_shape', '_known_consolidated',
+ '_is_consolidated', '_blknos', '_blklocs']
+
+ def __init__(self, blocks, axes, do_integrity_check=True):
+ self.axes = [ensure_index(ax) for ax in axes]
+ self.blocks = tuple(blocks)
+
+ for block in blocks:
+ if block.is_sparse:
+ if len(block.mgr_locs) != 1:
+ raise AssertionError("Sparse block refers to multiple "
+ "items")
+ else:
+ if self.ndim != block.ndim:
+ raise AssertionError(
+ 'Number of Block dimensions ({block}) must equal '
+ 'number of axes ({self})'.format(block=block.ndim,
+ self=self.ndim))
+
+ if do_integrity_check:
+ self._verify_integrity()
+
+ self._consolidate_check()
+
+ self._rebuild_blknos_and_blklocs()
+
+ def make_empty(self, axes=None):
+ """ return an empty BlockManager with the items axis of len 0 """
+ if axes is None:
+ axes = [ensure_index([])] + [ensure_index(a)
+ for a in self.axes[1:]]
+
+ # preserve dtype if possible
+ if self.ndim == 1:
+ blocks = np.array([], dtype=self.array_dtype)
+ else:
+ blocks = []
+ return self.__class__(blocks, axes)
+
+ def __nonzero__(self):
+ return True
+
+ # Python3 compat
+ __bool__ = __nonzero__
+
+ @property
+ def shape(self):
+ return tuple(len(ax) for ax in self.axes)
+
+ @property
+ def ndim(self):
+ return len(self.axes)
+
+ def set_axis(self, axis, new_labels):
+ new_labels = ensure_index(new_labels)
+ old_len = len(self.axes[axis])
+ new_len = len(new_labels)
+
+ if new_len != old_len:
+ raise ValueError(
+ 'Length mismatch: Expected axis has {old} elements, new '
+ 'values have {new} elements'.format(old=old_len, new=new_len))
+
+ self.axes[axis] = new_labels
+
+ def rename_axis(self, mapper, axis, copy=True, level=None):
+ """
+ Rename one of axes.
+
+ Parameters
+ ----------
+ mapper : unary callable
+ axis : int
+ copy : boolean, default True
+ level : int, default None
+ """
+ obj = self.copy(deep=copy)
+ obj.set_axis(axis, _transform_index(self.axes[axis], mapper, level))
+ return obj
+
+ @property
+ def _is_single_block(self):
+ if self.ndim == 1:
+ return True
+
+ if len(self.blocks) != 1:
+ return False
+
+ blk = self.blocks[0]
+ return (blk.mgr_locs.is_slice_like and
+ blk.mgr_locs.as_slice == slice(0, len(self), 1))
+
+ def _rebuild_blknos_and_blklocs(self):
+ """
+ Update mgr._blknos / mgr._blklocs.
+ """
+ new_blknos = np.empty(self.shape[0], dtype=np.int64)
+ new_blklocs = np.empty(self.shape[0], dtype=np.int64)
+ new_blknos.fill(-1)
+ new_blklocs.fill(-1)
+
+ for blkno, blk in enumerate(self.blocks):
+ rl = blk.mgr_locs
+ new_blknos[rl.indexer] = blkno
+ new_blklocs[rl.indexer] = np.arange(len(rl))
+
+ if (new_blknos == -1).any():
+ raise AssertionError("Gaps in blk ref_locs")
+
+ self._blknos = new_blknos
+ self._blklocs = new_blklocs
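+        # Illustrative sketch (hypothetical manager): with two blocks whose
+        # mgr_locs are [0, 2] and [1], the rebuilt maps would be
+        #   _blknos  -> [0, 1, 0]   (which block holds each item)
+        #   _blklocs -> [0, 0, 1]   (position of each item inside its block)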
+
+ @property
+ def items(self):
+ return self.axes[0]
+
+ def _get_counts(self, f):
+ """ return a dict of the counts of the function in BlockManager """
+ self._consolidate_inplace()
+ counts = dict()
+ for b in self.blocks:
+ v = f(b)
+ counts[v] = counts.get(v, 0) + b.shape[0]
+ return counts
+
+ def get_dtype_counts(self):
+ return self._get_counts(lambda b: b.dtype.name)
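+        # e.g. (illustrative) a manager holding five float64 items and one
+        # object item would report {'float64': 5, 'object': 1}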
+
+ def get_ftype_counts(self):
+ return self._get_counts(lambda b: b.ftype)
+
+ def get_dtypes(self):
+ dtypes = np.array([blk.dtype for blk in self.blocks])
+ return algos.take_1d(dtypes, self._blknos, allow_fill=False)
+
+ def get_ftypes(self):
+ ftypes = np.array([blk.ftype for blk in self.blocks])
+ return algos.take_1d(ftypes, self._blknos, allow_fill=False)
+
+ def __getstate__(self):
+ block_values = [b.values for b in self.blocks]
+ block_items = [self.items[b.mgr_locs.indexer] for b in self.blocks]
+ axes_array = [ax for ax in self.axes]
+
+ extra_state = {
+ '0.14.1': {
+ 'axes': axes_array,
+ 'blocks': [dict(values=b.values, mgr_locs=b.mgr_locs.indexer)
+ for b in self.blocks]
+ }
+ }
+
+ # First three elements of the state are to maintain forward
+ # compatibility with 0.13.1.
+ return axes_array, block_values, block_items, extra_state
+
+ def __setstate__(self, state):
+ def unpickle_block(values, mgr_locs):
+ return make_block(values, placement=mgr_locs)
+
+ if (isinstance(state, tuple) and len(state) >= 4 and
+ '0.14.1' in state[3]):
+ state = state[3]['0.14.1']
+ self.axes = [ensure_index(ax) for ax in state['axes']]
+ self.blocks = tuple(unpickle_block(b['values'], b['mgr_locs'])
+ for b in state['blocks'])
+ else:
+ # discard anything after 3rd, support beta pickling format for a
+ # little while longer
+ ax_arrays, bvalues, bitems = state[:3]
+
+ self.axes = [ensure_index(ax) for ax in ax_arrays]
+
+ if len(bitems) == 1 and self.axes[0].equals(bitems[0]):
+ # This is a workaround for pre-0.14.1 pickles that didn't
+ # support unpickling multi-block frames/panels with non-unique
+ # columns/items, because given a manager with items ["a", "b",
+ # "a"] there's no way of knowing which block's "a" is where.
+ #
+ # Single-block case can be supported under the assumption that
+ # block items corresponded to manager items 1-to-1.
+ all_mgr_locs = [slice(0, len(bitems[0]))]
+ else:
+ all_mgr_locs = [self.axes[0].get_indexer(blk_items)
+ for blk_items in bitems]
+
+ self.blocks = tuple(
+ unpickle_block(values, mgr_locs)
+ for values, mgr_locs in zip(bvalues, all_mgr_locs))
+
+ self._post_setstate()
+
+ def _post_setstate(self):
+ self._is_consolidated = False
+ self._known_consolidated = False
+ self._rebuild_blknos_and_blklocs()
+
+ def __len__(self):
+ return len(self.items)
+
+ def __unicode__(self):
+ output = pprint_thing(self.__class__.__name__)
+ for i, ax in enumerate(self.axes):
+ if i == 0:
+ output += u'\nItems: {ax}'.format(ax=ax)
+ else:
+ output += u'\nAxis {i}: {ax}'.format(i=i, ax=ax)
+
+ for block in self.blocks:
+ output += u'\n{block}'.format(block=pprint_thing(block))
+ return output
+
+ def _verify_integrity(self):
+ mgr_shape = self.shape
+ tot_items = sum(len(x.mgr_locs) for x in self.blocks)
+ for block in self.blocks:
+ if block._verify_integrity and block.shape[1:] != mgr_shape[1:]:
+ construction_error(tot_items, block.shape[1:], self.axes)
+ if len(self.items) != tot_items:
+ raise AssertionError('Number of manager items must equal union of '
+ 'block items\n# manager items: {0}, # '
+ 'tot_items: {1}'.format(
+ len(self.items), tot_items))
+
+ def apply(self, f, axes=None, filter=None, do_integrity_check=False,
+ consolidate=True, **kwargs):
+ """
+ iterate over the blocks, collect and create a new block manager
+
+ Parameters
+ ----------
+ f : the callable or function name to operate on at the block level
+ axes : optional (if not supplied, use self.axes)
+        filter : list, if supplied, only call the block if any of its
+            items are in the filter
+ do_integrity_check : boolean, default False. Do the block manager
+ integrity check
+ consolidate: boolean, default True. Join together blocks having same
+ dtype
+
+ Returns
+ -------
+ Block Manager (new object)
+
+ """
+
+ result_blocks = []
+
+ # filter kwarg is used in replace-* family of methods
+ if filter is not None:
+ filter_locs = set(self.items.get_indexer_for(filter))
+ if len(filter_locs) == len(self.items):
+ # All items are included, as if there were no filtering
+ filter = None
+ else:
+ kwargs['filter'] = filter_locs
+
+ if consolidate:
+ self._consolidate_inplace()
+
+ if f == 'where':
+ align_copy = True
+ if kwargs.get('align', True):
+ align_keys = ['other', 'cond']
+ else:
+ align_keys = ['cond']
+ elif f == 'putmask':
+ align_copy = False
+ if kwargs.get('align', True):
+ align_keys = ['new', 'mask']
+ else:
+ align_keys = ['mask']
+ elif f == 'fillna':
+ # fillna internally does putmask, maybe it's better to do this
+ # at mgr, not block level?
+ align_copy = False
+ align_keys = ['value']
+ else:
+ align_keys = []
+
+ # TODO(EA): may interfere with ExtensionBlock.setitem for blocks
+ # with a .values attribute.
+ aligned_args = {k: kwargs[k]
+ for k in align_keys
+ if hasattr(kwargs[k], 'values') and
+ not isinstance(kwargs[k], ABCExtensionArray)}
+
+ for b in self.blocks:
+ if filter is not None:
+ if not b.mgr_locs.isin(filter_locs).any():
+ result_blocks.append(b)
+ continue
+
+ if aligned_args:
+ b_items = self.items[b.mgr_locs.indexer]
+
+ for k, obj in aligned_args.items():
+ axis = getattr(obj, '_info_axis_number', 0)
+ kwargs[k] = obj.reindex(b_items, axis=axis,
+ copy=align_copy)
+
+ applied = getattr(b, f)(**kwargs)
+ result_blocks = _extend_blocks(applied, result_blocks)
+
+ if len(result_blocks) == 0:
+ return self.make_empty(axes or self.axes)
+ bm = self.__class__(result_blocks, axes or self.axes,
+ do_integrity_check=do_integrity_check)
+ bm._consolidate_inplace()
+ return bm
+
+ def quantile(self, axis=0, consolidate=True, transposed=False,
+ interpolation='linear', qs=None, numeric_only=None):
+ """
+ Iterate over blocks applying quantile reduction.
+ This routine is intended for reduction type operations and
+ will do inference on the generated blocks.
+
+ Parameters
+ ----------
+ axis: reduction axis, default 0
+ consolidate: boolean, default True. Join together blocks having same
+ dtype
+ transposed: boolean, default False
+ we are holding transposed data
+ interpolation : type of interpolation, default 'linear'
+ qs : a scalar or list of the quantiles to be computed
+ numeric_only : ignored
+
+ Returns
+ -------
+ Block Manager (new object)
+ """
+
+ # Series dispatches to DataFrame for quantile, which allows us to
+ # simplify some of the code here and in the blocks
+ assert self.ndim >= 2
+
+ if consolidate:
+ self._consolidate_inplace()
+
+ def get_axe(block, qs, axes):
+ from pandas import Float64Index
+ if is_list_like(qs):
+ ax = Float64Index(qs)
+ elif block.ndim == 1:
+ ax = Float64Index([qs])
+ else:
+ ax = axes[0]
+ return ax
+
+ axes, blocks = [], []
+ for b in self.blocks:
+ block = b.quantile(axis=axis, qs=qs, interpolation=interpolation)
+
+ axe = get_axe(b, qs, axes=self.axes)
+
+ axes.append(axe)
+ blocks.append(block)
+
+ # note that some DatetimeTZ, Categorical are always ndim==1
+ ndim = {b.ndim for b in blocks}
+ assert 0 not in ndim, ndim
+
+ if 2 in ndim:
+
+ new_axes = list(self.axes)
+
+ # multiple blocks that are reduced
+ if len(blocks) > 1:
+ new_axes[1] = axes[0]
+
+ # reset the placement to the original
+ for b, sb in zip(blocks, self.blocks):
+ b.mgr_locs = sb.mgr_locs
+
+ else:
+ new_axes[axis] = Index(np.concatenate(
+ [ax.values for ax in axes]))
+
+ if transposed:
+ new_axes = new_axes[::-1]
+ blocks = [b.make_block(b.values.T,
+ placement=np.arange(b.shape[1])
+ ) for b in blocks]
+
+ return self.__class__(blocks, new_axes)
+
+ # single block, i.e. ndim == {1}
+ values = _concat._concat_compat([b.values for b in blocks])
+
+ # compute the orderings of our original data
+ if len(self.blocks) > 1:
+
+ indexer = np.empty(len(self.axes[0]), dtype=np.intp)
+ i = 0
+ for b in self.blocks:
+ for j in b.mgr_locs:
+ indexer[j] = i
+ i = i + 1
+
+ values = values.take(indexer)
+
+ return SingleBlockManager(
+ [make_block(values,
+ ndim=1,
+ placement=np.arange(len(values)))],
+ axes[0])
+
+ def isna(self, func, **kwargs):
+ return self.apply('apply', func=func, **kwargs)
+
+ def where(self, **kwargs):
+ return self.apply('where', **kwargs)
+
+ def setitem(self, **kwargs):
+ return self.apply('setitem', **kwargs)
+
+ def putmask(self, **kwargs):
+ return self.apply('putmask', **kwargs)
+
+ def diff(self, **kwargs):
+ return self.apply('diff', **kwargs)
+
+ def interpolate(self, **kwargs):
+ return self.apply('interpolate', **kwargs)
+
+ def shift(self, **kwargs):
+ return self.apply('shift', **kwargs)
+
+ def fillna(self, **kwargs):
+ return self.apply('fillna', **kwargs)
+
+ def downcast(self, **kwargs):
+ return self.apply('downcast', **kwargs)
+
+ def astype(self, dtype, **kwargs):
+ return self.apply('astype', dtype=dtype, **kwargs)
+
+ def convert(self, **kwargs):
+ return self.apply('convert', **kwargs)
+
+ def replace(self, **kwargs):
+ return self.apply('replace', **kwargs)
+
+ def replace_list(self, src_list, dest_list, inplace=False, regex=False):
+ """ do a list replace """
+
+ inplace = validate_bool_kwarg(inplace, 'inplace')
+
+ # figure out our mask a-priori to avoid repeated replacements
+ values = self.as_array()
+
+ def comp(s, regex=False):
+ """
+            Generate a bool array by performing an equality check, or by
+            performing an element-wise regular expression match
+ """
+ if isna(s):
+ return isna(values)
+ if hasattr(s, 'asm8'):
+ return _compare_or_regex_search(maybe_convert_objects(values),
+ getattr(s, 'asm8'), regex)
+ return _compare_or_regex_search(values, s, regex)
+
+ masks = [comp(s, regex) for i, s in enumerate(src_list)]
+
+ result_blocks = []
+ src_len = len(src_list) - 1
+ for blk in self.blocks:
+
+            # it's possible to get multiple result blocks here
+ # replace ALWAYS will return a list
+ rb = [blk if inplace else blk.copy()]
+ for i, (s, d) in enumerate(zip(src_list, dest_list)):
+ new_rb = []
+ for b in rb:
+ m = masks[i][b.mgr_locs.indexer]
+ convert = i == src_len
+ result = b._replace_coerce(mask=m, to_replace=s, value=d,
+ inplace=inplace,
+ convert=convert, regex=regex)
+ if m.any():
+ new_rb = _extend_blocks(result, new_rb)
+ else:
+ new_rb.append(b)
+ rb = new_rb
+ result_blocks.extend(rb)
+
+ bm = self.__class__(result_blocks, self.axes)
+ bm._consolidate_inplace()
+ return bm
+
+ def reshape_nd(self, axes, **kwargs):
+ """ a 2d-nd reshape operation on a BlockManager """
+ return self.apply('reshape_nd', axes=axes, **kwargs)
+
+ def is_consolidated(self):
+ """
+        Return True if the blocks have been consolidated, i.e. no two
+        blocks share the same ftype
+ """
+ if not self._known_consolidated:
+ self._consolidate_check()
+ return self._is_consolidated
+
+ def _consolidate_check(self):
+ ftypes = [blk.ftype for blk in self.blocks]
+ self._is_consolidated = len(ftypes) == len(set(ftypes))
+ self._known_consolidated = True
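+        # e.g. (illustrative) two separate dense float64 blocks share the
+        # ftype 'float64:dense', so the manager is not consolidated until
+        # _consolidate_inplace() merges them into a single block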
+
+ @property
+ def is_mixed_type(self):
+ # Warning, consolidation needs to get checked upstairs
+ self._consolidate_inplace()
+ return len(self.blocks) > 1
+
+ @property
+ def is_numeric_mixed_type(self):
+ # Warning, consolidation needs to get checked upstairs
+ self._consolidate_inplace()
+ return all(block.is_numeric for block in self.blocks)
+
+ @property
+ def is_datelike_mixed_type(self):
+ # Warning, consolidation needs to get checked upstairs
+ self._consolidate_inplace()
+ return any(block.is_datelike for block in self.blocks)
+
+ @property
+ def any_extension_types(self):
+ """Whether any of the blocks in this manager are extension blocks"""
+ return any(block.is_extension for block in self.blocks)
+
+ @property
+ def is_view(self):
+ """ return a boolean if we are a single block and are a view """
+ if len(self.blocks) == 1:
+ return self.blocks[0].is_view
+
+ # It is technically possible to figure out which blocks are views
+ # e.g. [ b.values.base is not None for b in self.blocks ]
+ # but then we have the case of possibly some blocks being a view
+ # and some blocks not. setting in theory is possible on the non-view
+ # blocks w/o causing a SettingWithCopy raise/warn. But this is a bit
+ # complicated
+
+ return False
+
+ def get_bool_data(self, copy=False):
+ """
+ Parameters
+ ----------
+ copy : boolean, default False
+ Whether to copy the blocks
+ """
+ self._consolidate_inplace()
+ return self.combine([b for b in self.blocks if b.is_bool], copy)
+
+ def get_numeric_data(self, copy=False):
+ """
+ Parameters
+ ----------
+ copy : boolean, default False
+ Whether to copy the blocks
+ """
+ self._consolidate_inplace()
+ return self.combine([b for b in self.blocks if b.is_numeric], copy)
+
+ def combine(self, blocks, copy=True):
+ """ return a new manager with the blocks """
+ if len(blocks) == 0:
+ return self.make_empty()
+
+ # FIXME: optimization potential
+ indexer = np.sort(np.concatenate([b.mgr_locs.as_array
+ for b in blocks]))
+ inv_indexer = lib.get_reverse_indexer(indexer, self.shape[0])
+
+ new_blocks = []
+ for b in blocks:
+ b = b.copy(deep=copy)
+ b.mgr_locs = algos.take_1d(inv_indexer, b.mgr_locs.as_array,
+ axis=0, allow_fill=False)
+ new_blocks.append(b)
+
+ axes = list(self.axes)
+ axes[0] = self.items.take(indexer)
+
+ return self.__class__(new_blocks, axes, do_integrity_check=False)
+
+ def get_slice(self, slobj, axis=0):
+ if axis >= self.ndim:
+ raise IndexError("Requested axis not found in manager")
+
+ if axis == 0:
+ new_blocks = self._slice_take_blocks_ax0(slobj)
+ else:
+ slicer = [slice(None)] * (axis + 1)
+ slicer[axis] = slobj
+ slicer = tuple(slicer)
+ new_blocks = [blk.getitem_block(slicer) for blk in self.blocks]
+
+ new_axes = list(self.axes)
+ new_axes[axis] = new_axes[axis][slobj]
+
+ bm = self.__class__(new_blocks, new_axes, do_integrity_check=False)
+ bm._consolidate_inplace()
+ return bm
+
+ def __contains__(self, item):
+ return item in self.items
+
+ @property
+ def nblocks(self):
+ return len(self.blocks)
+
+ def copy(self, deep=True):
+ """
+ Make deep or shallow copy of BlockManager
+
+ Parameters
+ ----------
+        deep : boolean or string, default True
+ If False, return shallow copy (do not copy data)
+ If 'all', copy data and a deep copy of the index
+
+ Returns
+ -------
+ copy : BlockManager
+ """
+ # this preserves the notion of view copying of axes
+ if deep:
+ if deep == 'all':
+ copy = lambda ax: ax.copy(deep=True)
+ else:
+ copy = lambda ax: ax.view()
+ new_axes = [copy(ax) for ax in self.axes]
+ else:
+ new_axes = list(self.axes)
+ return self.apply('copy', axes=new_axes, deep=deep,
+ do_integrity_check=False)
+
+ def as_array(self, transpose=False, items=None):
+ """Convert the blockmanager data into an numpy array.
+
+ Parameters
+ ----------
+ transpose : boolean, default False
+ If True, transpose the return array
+ items : list of strings or None
+ Names of block items that will be included in the returned
+ array. ``None`` means that all block items will be used
+
+ Returns
+ -------
+ arr : ndarray
+ """
+ if len(self.blocks) == 0:
+ arr = np.empty(self.shape, dtype=float)
+ return arr.transpose() if transpose else arr
+
+ if items is not None:
+ mgr = self.reindex_axis(items, axis=0)
+ else:
+ mgr = self
+
+ if self._is_single_block and mgr.blocks[0].is_datetimetz:
+ # TODO(Block.get_values): Make DatetimeTZBlock.get_values
+ # always be object dtype. Some callers seem to want the
+ # DatetimeArray (previously DTI)
+ arr = mgr.blocks[0].get_values(dtype=object)
+ elif self._is_single_block or not self.is_mixed_type:
+ arr = np.asarray(mgr.blocks[0].get_values())
+ else:
+ arr = mgr._interleave()
+
+ return arr.transpose() if transpose else arr
+
+ def _interleave(self):
+ """
+ Return ndarray from blocks with specified item order
+ Items must be contained in the blocks
+ """
+ from pandas.core.dtypes.common import is_sparse
+ dtype = _interleaved_dtype(self.blocks)
+
+ # TODO: https://github.com/pandas-dev/pandas/issues/22791
+ # Give EAs some input on what happens here. Sparse needs this.
+ if is_sparse(dtype):
+ dtype = dtype.subtype
+ elif is_extension_array_dtype(dtype):
+ dtype = 'object'
+
+ result = np.empty(self.shape, dtype=dtype)
+
+ itemmask = np.zeros(self.shape[0])
+
+ for blk in self.blocks:
+ rl = blk.mgr_locs
+ result[rl.indexer] = blk.get_values(dtype)
+ itemmask[rl.indexer] = 1
+
+ if not itemmask.all():
+ raise AssertionError('Some items were not contained in blocks')
+
+ return result
+
+ def to_dict(self, copy=True):
+ """
+ Return a dict of str(dtype) -> BlockManager
+
+ Parameters
+ ----------
+ copy : boolean, default True
+
+ Returns
+ -------
+ values : a dict of dtype -> BlockManager
+
+ Notes
+ -----
+ This consolidates based on str(dtype)
+ """
+ self._consolidate_inplace()
+
+ bd = {}
+ for b in self.blocks:
+ bd.setdefault(str(b.dtype), []).append(b)
+
+ return {dtype: self.combine(blocks, copy=copy)
+ for dtype, blocks in bd.items()}
+
+ def xs(self, key, axis=1, copy=True, takeable=False):
+ if axis < 1:
+ raise AssertionError(
+ 'Can only take xs across axis >= 1, got {ax}'.format(ax=axis))
+
+ # take by position
+ if takeable:
+ loc = key
+ else:
+ loc = self.axes[axis].get_loc(key)
+
+ slicer = [slice(None, None) for _ in range(self.ndim)]
+ slicer[axis] = loc
+ slicer = tuple(slicer)
+
+ new_axes = list(self.axes)
+
+ # could be an array indexer!
+ if isinstance(loc, (slice, np.ndarray)):
+ new_axes[axis] = new_axes[axis][loc]
+ else:
+ new_axes.pop(axis)
+
+ new_blocks = []
+ if len(self.blocks) > 1:
+ # we must copy here as we are mixed type
+ for blk in self.blocks:
+ newb = make_block(values=blk.values[slicer],
+ klass=blk.__class__,
+ placement=blk.mgr_locs)
+ new_blocks.append(newb)
+ elif len(self.blocks) == 1:
+ block = self.blocks[0]
+ vals = block.values[slicer]
+ if copy:
+ vals = vals.copy()
+ new_blocks = [make_block(values=vals,
+ placement=block.mgr_locs,
+ klass=block.__class__)]
+
+ return self.__class__(new_blocks, new_axes)
+
+ def fast_xs(self, loc):
+ """
+        get a cross section for a given location in the
+        items; handle duplicates
+
+        return the result, which *could* be a view in the case of a
+        single block
+ """
+ if len(self.blocks) == 1:
+ return self.blocks[0].iget((slice(None), loc))
+
+ items = self.items
+
+ # non-unique (GH4726)
+ if not items.is_unique:
+ result = self._interleave()
+ if self.ndim == 2:
+ result = result.T
+ return result[loc]
+
+ # unique
+ dtype = _interleaved_dtype(self.blocks)
+
+ n = len(items)
+ if is_extension_array_dtype(dtype):
+ # we'll eventually construct an ExtensionArray.
+ result = np.empty(n, dtype=object)
+ else:
+ result = np.empty(n, dtype=dtype)
+
+ for blk in self.blocks:
+ # Such assignment may incorrectly coerce NaT to None
+ # result[blk.mgr_locs] = blk._slice((slice(None), loc))
+ for i, rl in enumerate(blk.mgr_locs):
+ result[rl] = blk._try_coerce_result(blk.iget((i, loc)))
+
+ if is_extension_array_dtype(dtype):
+ result = dtype.construct_array_type()._from_sequence(
+ result, dtype=dtype
+ )
+
+ return result
+
+ def consolidate(self):
+ """
+ Join together blocks having same dtype
+
+ Returns
+ -------
+ y : BlockManager
+ """
+ if self.is_consolidated():
+ return self
+
+ bm = self.__class__(self.blocks, self.axes)
+ bm._is_consolidated = False
+ bm._consolidate_inplace()
+ return bm
+
+ def _consolidate_inplace(self):
+ if not self.is_consolidated():
+ self.blocks = tuple(_consolidate(self.blocks))
+ self._is_consolidated = True
+ self._known_consolidated = True
+ self._rebuild_blknos_and_blklocs()
+
+ def get(self, item, fastpath=True):
+ """
+ Return values for selected item (ndarray or BlockManager).
+ """
+ if self.items.is_unique:
+
+ if not isna(item):
+ loc = self.items.get_loc(item)
+ else:
+ indexer = np.arange(len(self.items))[isna(self.items)]
+
+ # allow a single nan location indexer
+ if not is_scalar(indexer):
+ if len(indexer) == 1:
+ loc = indexer.item()
+ else:
+ raise ValueError("cannot label index with a null key")
+
+ return self.iget(loc, fastpath=fastpath)
+ else:
+
+ if isna(item):
+ raise TypeError("cannot label index with a null key")
+
+ indexer = self.items.get_indexer_for([item])
+ return self.reindex_indexer(new_axis=self.items[indexer],
+ indexer=indexer, axis=0,
+ allow_dups=True)
+
+ def iget(self, i, fastpath=True):
+ """
+ Return the data as a SingleBlockManager if fastpath=True and possible
+
+        Otherwise return as an ndarray
+ """
+ block = self.blocks[self._blknos[i]]
+ values = block.iget(self._blklocs[i])
+ if not fastpath or not block._box_to_block_values or values.ndim != 1:
+ return values
+
+        # fastpath shortcut for selecting a single dim from a 2-dim BM
+ return SingleBlockManager(
+ [block.make_block_same_class(values,
+ placement=slice(0, len(values)),
+ ndim=1)],
+ self.axes[1])
+
+ def delete(self, item):
+ """
+ Delete selected item (items if non-unique) in-place.
+ """
+ indexer = self.items.get_loc(item)
+
+ is_deleted = np.zeros(self.shape[0], dtype=np.bool_)
+ is_deleted[indexer] = True
+ ref_loc_offset = -is_deleted.cumsum()
+
+ is_blk_deleted = [False] * len(self.blocks)
+
+ if isinstance(indexer, int):
+ affected_start = indexer
+ else:
+ affected_start = is_deleted.nonzero()[0][0]
+
+ for blkno, _ in _fast_count_smallints(self._blknos[affected_start:]):
+ blk = self.blocks[blkno]
+ bml = blk.mgr_locs
+ blk_del = is_deleted[bml.indexer].nonzero()[0]
+
+ if len(blk_del) == len(bml):
+ is_blk_deleted[blkno] = True
+ continue
+ elif len(blk_del) != 0:
+ blk.delete(blk_del)
+ bml = blk.mgr_locs
+
+ blk.mgr_locs = bml.add(ref_loc_offset[bml.indexer])
+
+ # FIXME: use Index.delete as soon as it uses fastpath=True
+ self.axes[0] = self.items[~is_deleted]
+ self.blocks = tuple(b for blkno, b in enumerate(self.blocks)
+ if not is_blk_deleted[blkno])
+ self._shape = None
+ self._rebuild_blknos_and_blklocs()
+
+ def set(self, item, value):
+ """
+ Set new item in-place. Does not consolidate. Adds new Block if not
+ contained in the current set of items
+ """
+ # FIXME: refactor, clearly separate broadcasting & zip-like assignment
+ # can prob also fix the various if tests for sparse/categorical
+
+ # TODO(EA): Remove an is_extension_ when all extension types satisfy
+ # the interface
+ value_is_extension_type = (is_extension_type(value) or
+ is_extension_array_dtype(value))
+
+        # categorical/sparse/datetimetz
+ if value_is_extension_type:
+
+ def value_getitem(placement):
+ return value
+ else:
+ if value.ndim == self.ndim - 1:
+ value = _safe_reshape(value, (1,) + value.shape)
+
+ def value_getitem(placement):
+ return value
+ else:
+
+ def value_getitem(placement):
+ return value[placement.indexer]
+
+ if value.shape[1:] != self.shape[1:]:
+ raise AssertionError('Shape of new values must be compatible '
+ 'with manager shape')
+
+ try:
+ loc = self.items.get_loc(item)
+ except KeyError:
+ # This item wasn't present, just insert at end
+ self.insert(len(self.items), item, value)
+ return
+
+ if isinstance(loc, int):
+ loc = [loc]
+
+ blknos = self._blknos[loc]
+ blklocs = self._blklocs[loc].copy()
+
+ unfit_mgr_locs = []
+ unfit_val_locs = []
+ removed_blknos = []
+ for blkno, val_locs in libinternals.get_blkno_placements(blknos,
+ self.nblocks,
+ group=True):
+ blk = self.blocks[blkno]
+ blk_locs = blklocs[val_locs.indexer]
+ if blk.should_store(value):
+ blk.set(blk_locs, value_getitem(val_locs))
+ else:
+ unfit_mgr_locs.append(blk.mgr_locs.as_array[blk_locs])
+ unfit_val_locs.append(val_locs)
+
+ # If all block items are unfit, schedule the block for removal.
+ if len(val_locs) == len(blk.mgr_locs):
+ removed_blknos.append(blkno)
+ else:
+ self._blklocs[blk.mgr_locs.indexer] = -1
+ blk.delete(blk_locs)
+ self._blklocs[blk.mgr_locs.indexer] = np.arange(len(blk))
+
+ if len(removed_blknos):
+ # Remove blocks & update blknos accordingly
+ is_deleted = np.zeros(self.nblocks, dtype=np.bool_)
+ is_deleted[removed_blknos] = True
+
+ new_blknos = np.empty(self.nblocks, dtype=np.int64)
+ new_blknos.fill(-1)
+ new_blknos[~is_deleted] = np.arange(self.nblocks -
+ len(removed_blknos))
+ self._blknos = algos.take_1d(new_blknos, self._blknos, axis=0,
+ allow_fill=False)
+ self.blocks = tuple(blk for i, blk in enumerate(self.blocks)
+ if i not in set(removed_blknos))
+
+ if unfit_val_locs:
+ unfit_mgr_locs = np.concatenate(unfit_mgr_locs)
+ unfit_count = len(unfit_mgr_locs)
+
+ new_blocks = []
+ if value_is_extension_type:
+ # This code (ab-)uses the fact that sparse blocks contain only
+ # one item.
+ new_blocks.extend(
+ make_block(values=value.copy(), ndim=self.ndim,
+ placement=slice(mgr_loc, mgr_loc + 1))
+ for mgr_loc in unfit_mgr_locs)
+
+ self._blknos[unfit_mgr_locs] = (np.arange(unfit_count) +
+ len(self.blocks))
+ self._blklocs[unfit_mgr_locs] = 0
+
+ else:
+ # unfit_val_locs contains BlockPlacement objects
+ unfit_val_items = unfit_val_locs[0].append(unfit_val_locs[1:])
+
+ new_blocks.append(
+ make_block(values=value_getitem(unfit_val_items),
+ ndim=self.ndim, placement=unfit_mgr_locs))
+
+ self._blknos[unfit_mgr_locs] = len(self.blocks)
+ self._blklocs[unfit_mgr_locs] = np.arange(unfit_count)
+
+ self.blocks += tuple(new_blocks)
+
+ # Newly created block's dtype may already be present.
+ self._known_consolidated = False
+
+ def insert(self, loc, item, value, allow_duplicates=False):
+ """
+ Insert item at selected position.
+
+ Parameters
+ ----------
+ loc : int
+ item : hashable
+ value : array_like
+ allow_duplicates: bool
+ If False, trying to insert non-unique item will raise
+
+ """
+ if not allow_duplicates and item in self.items:
+ # Should this be a different kind of error??
+ raise ValueError('cannot insert {}, already exists'.format(item))
+
+ if not isinstance(loc, int):
+ raise TypeError("loc must be int")
+
+ # insert to the axis; this could possibly raise a TypeError
+ new_axis = self.items.insert(loc, item)
+
+ block = make_block(values=value, ndim=self.ndim,
+ placement=slice(loc, loc + 1))
+
+ for blkno, count in _fast_count_smallints(self._blknos[loc:]):
+ blk = self.blocks[blkno]
+ if count == len(blk.mgr_locs):
+ blk.mgr_locs = blk.mgr_locs.add(1)
+ else:
+ new_mgr_locs = blk.mgr_locs.as_array.copy()
+ new_mgr_locs[new_mgr_locs >= loc] += 1
+ blk.mgr_locs = new_mgr_locs
+
+ if loc == self._blklocs.shape[0]:
+ # np.append is a lot faster, let's use it if we can.
+ self._blklocs = np.append(self._blklocs, 0)
+ self._blknos = np.append(self._blknos, len(self.blocks))
+ else:
+ self._blklocs = np.insert(self._blklocs, loc, 0)
+ self._blknos = np.insert(self._blknos, loc, len(self.blocks))
+
+ self.axes[0] = new_axis
+ self.blocks += (block,)
+ self._shape = None
+
+ self._known_consolidated = False
+
+ if len(self.blocks) > 100:
+ self._consolidate_inplace()
+
+ def reindex_axis(self, new_index, axis, method=None, limit=None,
+ fill_value=None, copy=True):
+ """
+ Conform block manager to new index.
+ """
+ new_index = ensure_index(new_index)
+ new_index, indexer = self.axes[axis].reindex(new_index, method=method,
+ limit=limit)
+
+ return self.reindex_indexer(new_index, indexer, axis=axis,
+ fill_value=fill_value, copy=copy)
+
+ def reindex_indexer(self, new_axis, indexer, axis, fill_value=None,
+ allow_dups=False, copy=True):
+ """
+ Parameters
+ ----------
+ new_axis : Index
+        indexer : ndarray of int64 or None
+            pandas-indexer with -1's only
+        axis : int
+        fill_value : object
+        allow_dups : bool
+ """
+ if indexer is None:
+ if new_axis is self.axes[axis] and not copy:
+ return self
+
+ result = self.copy(deep=copy)
+ result.axes = list(self.axes)
+ result.axes[axis] = new_axis
+ return result
+
+ self._consolidate_inplace()
+
+ # some axes don't allow reindexing with dups
+ if not allow_dups:
+ self.axes[axis]._can_reindex(indexer)
+
+ if axis >= self.ndim:
+ raise IndexError("Requested axis not found in manager")
+
+ if axis == 0:
+ new_blocks = self._slice_take_blocks_ax0(indexer,
+ fill_tuple=(fill_value,))
+ else:
+ new_blocks = [blk.take_nd(indexer, axis=axis, fill_tuple=(
+ fill_value if fill_value is not None else blk.fill_value,))
+ for blk in self.blocks]
+
+ new_axes = list(self.axes)
+ new_axes[axis] = new_axis
+ return self.__class__(new_blocks, new_axes)
+
+ def _slice_take_blocks_ax0(self, slice_or_indexer, fill_tuple=None):
+ """
+ Slice/take blocks along axis=0.
+
+ Overloaded for SingleBlock
+
+ Returns
+ -------
+ new_blocks : list of Block
+
+ """
+
+ allow_fill = fill_tuple is not None
+
+ sl_type, slobj, sllen = _preprocess_slice_or_indexer(
+ slice_or_indexer, self.shape[0], allow_fill=allow_fill)
+
+ if self._is_single_block:
+ blk = self.blocks[0]
+
+ if sl_type in ('slice', 'mask'):
+ return [blk.getitem_block(slobj, new_mgr_locs=slice(0, sllen))]
+ elif not allow_fill or self.ndim == 1:
+ if allow_fill and fill_tuple[0] is None:
+ _, fill_value = maybe_promote(blk.dtype)
+ fill_tuple = (fill_value, )
+
+ return [blk.take_nd(slobj, axis=0,
+ new_mgr_locs=slice(0, sllen),
+ fill_tuple=fill_tuple)]
+
+ if sl_type in ('slice', 'mask'):
+ blknos = self._blknos[slobj]
+ blklocs = self._blklocs[slobj]
+ else:
+ blknos = algos.take_1d(self._blknos, slobj, fill_value=-1,
+ allow_fill=allow_fill)
+ blklocs = algos.take_1d(self._blklocs, slobj, fill_value=-1,
+ allow_fill=allow_fill)
+
+ # When filling blknos, make sure blknos is updated before appending to
+ # blocks list, that way new blkno is exactly len(blocks).
+ #
+ # FIXME: mgr_groupby_blknos must return mgr_locs in ascending order,
+ # pytables serialization will break otherwise.
+ blocks = []
+ for blkno, mgr_locs in libinternals.get_blkno_placements(blknos,
+ self.nblocks,
+ group=True):
+ if blkno == -1:
+ # If we've got here, fill_tuple was not None.
+ fill_value = fill_tuple[0]
+
+ blocks.append(self._make_na_block(placement=mgr_locs,
+ fill_value=fill_value))
+ else:
+ blk = self.blocks[blkno]
+
+ # Otherwise, slicing along items axis is necessary.
+ if not blk._can_consolidate:
+ # A non-consolidatable block, it's easy, because there's
+ # only one item and each mgr loc is a copy of that single
+ # item.
+ for mgr_loc in mgr_locs:
+ newblk = blk.copy(deep=True)
+ newblk.mgr_locs = slice(mgr_loc, mgr_loc + 1)
+ blocks.append(newblk)
+
+ else:
+ blocks.append(blk.take_nd(blklocs[mgr_locs.indexer],
+ axis=0, new_mgr_locs=mgr_locs,
+ fill_tuple=None))
+
+ return blocks
+
+ def _make_na_block(self, placement, fill_value=None):
+ # TODO: infer dtypes other than float64 from fill_value
+
+ if fill_value is None:
+ fill_value = np.nan
+ block_shape = list(self.shape)
+ block_shape[0] = len(placement)
+
+ dtype, fill_value = infer_dtype_from_scalar(fill_value)
+ block_values = np.empty(block_shape, dtype=dtype)
+ block_values.fill(fill_value)
+ return make_block(block_values, placement=placement)
+
+ def take(self, indexer, axis=1, verify=True, convert=True):
+ """
+ Take items along any axis.
+ """
+ self._consolidate_inplace()
+ indexer = (np.arange(indexer.start, indexer.stop, indexer.step,
+ dtype='int64')
+ if isinstance(indexer, slice)
+ else np.asanyarray(indexer, dtype='int64'))
+
+ n = self.shape[axis]
+ if convert:
+ indexer = maybe_convert_indices(indexer, n)
+
+ if verify:
+ if ((indexer == -1) | (indexer >= n)).any():
+ raise Exception('Indices must be nonzero and less than '
+ 'the axis length')
+
+ new_labels = self.axes[axis].take(indexer)
+ return self.reindex_indexer(new_axis=new_labels, indexer=indexer,
+ axis=axis, allow_dups=True)
+
+ def merge(self, other, lsuffix='', rsuffix=''):
+ # We assume at this point that the axes of self and other match.
+ # This is only called from Panel.join, which reindexes prior
+ # to calling to ensure this assumption holds.
+ l, r = items_overlap_with_suffix(left=self.items, lsuffix=lsuffix,
+ right=other.items, rsuffix=rsuffix)
+ new_items = _concat_indexes([l, r])
+
+ new_blocks = [blk.copy(deep=False) for blk in self.blocks]
+
+ offset = self.shape[0]
+ for blk in other.blocks:
+ blk = blk.copy(deep=False)
+ blk.mgr_locs = blk.mgr_locs.add(offset)
+ new_blocks.append(blk)
+
+ new_axes = list(self.axes)
+ new_axes[0] = new_items
+
+ return self.__class__(_consolidate(new_blocks), new_axes)
+
+ def equals(self, other):
+ self_axes, other_axes = self.axes, other.axes
+ if len(self_axes) != len(other_axes):
+ return False
+ if not all(ax1.equals(ax2) for ax1, ax2 in zip(self_axes, other_axes)):
+ return False
+ self._consolidate_inplace()
+ other._consolidate_inplace()
+ if len(self.blocks) != len(other.blocks):
+ return False
+
+ # canonicalize block order, using a tuple combining the type
+ # name and then mgr_locs because there might be unconsolidated
+ # blocks (say, Categorical) which can only be distinguished by
+ # the iteration order
+ def canonicalize(block):
+ return (block.dtype.name, block.mgr_locs.as_array.tolist())
+
+ self_blocks = sorted(self.blocks, key=canonicalize)
+ other_blocks = sorted(other.blocks, key=canonicalize)
+ return all(block.equals(oblock)
+ for block, oblock in zip(self_blocks, other_blocks))
+
+ def unstack(self, unstacker_func, fill_value):
+ """Return a blockmanager with all blocks unstacked.
+
+ Parameters
+ ----------
+ unstacker_func : callable
+ A (partially-applied) ``pd.core.reshape._Unstacker`` class.
+ fill_value : Any
+ fill_value for newly introduced missing values.
+
+ Returns
+ -------
+ unstacked : BlockManager
+ """
+ n_rows = self.shape[-1]
+ dummy = unstacker_func(np.empty((0, 0)), value_columns=self.items)
+ new_columns = dummy.get_new_columns()
+ new_index = dummy.get_new_index()
+ new_blocks = []
+ columns_mask = []
+
+ for blk in self.blocks:
+ blocks, mask = blk._unstack(
+ partial(unstacker_func,
+ value_columns=self.items[blk.mgr_locs.indexer]),
+ new_columns,
+ n_rows,
+ fill_value
+ )
+
+ new_blocks.extend(blocks)
+ columns_mask.extend(mask)
+
+ new_columns = new_columns[columns_mask]
+
+ bm = BlockManager(new_blocks, [new_columns, new_index])
+ return bm
+
+
+class SingleBlockManager(BlockManager):
+ """ manage a single block with """
+
+ ndim = 1
+ _is_consolidated = True
+ _known_consolidated = True
+ __slots__ = ()
+
+ def __init__(self, block, axis, do_integrity_check=False, fastpath=False):
+
+ if isinstance(axis, list):
+ if len(axis) != 1:
+ raise ValueError("cannot create SingleBlockManager with more "
+ "than 1 axis")
+ axis = axis[0]
+
+ # passed from constructor, single block, single axis
+ if fastpath:
+ self.axes = [axis]
+ if isinstance(block, list):
+
+ # empty block
+ if len(block) == 0:
+ block = [np.array([])]
+ elif len(block) != 1:
+ raise ValueError('Cannot create SingleBlockManager with '
+ 'more than 1 block')
+ block = block[0]
+ else:
+ self.axes = [ensure_index(axis)]
+
+ # create the block here
+ if isinstance(block, list):
+
+ # provide consolidation to the interleaved_dtype
+ if len(block) > 1:
+ dtype = _interleaved_dtype(block)
+ block = [b.astype(dtype) for b in block]
+ block = _consolidate(block)
+
+ if len(block) != 1:
+ raise ValueError('Cannot create SingleBlockManager with '
+ 'more than 1 block')
+ block = block[0]
+
+ if not isinstance(block, Block):
+ block = make_block(block, placement=slice(0, len(axis)), ndim=1)
+
+ self.blocks = [block]
+
+ def _post_setstate(self):
+ pass
+
+ @property
+ def _block(self):
+ return self.blocks[0]
+
+ @property
+ def _values(self):
+ return self._block.values
+
+ @property
+ def _blknos(self):
+ """ compat with BlockManager """
+ return None
+
+ @property
+ def _blklocs(self):
+ """ compat with BlockManager """
+ return None
+
+ def get_slice(self, slobj, axis=0):
+ if axis >= self.ndim:
+ raise IndexError("Requested axis not found in manager")
+
+ return self.__class__(self._block._slice(slobj),
+ self.index[slobj], fastpath=True)
+
+ @property
+ def index(self):
+ return self.axes[0]
+
+ def convert(self, **kwargs):
+ """ convert the whole block as one """
+ kwargs['by_item'] = False
+ return self.apply('convert', **kwargs)
+
+ @property
+ def dtype(self):
+ return self._block.dtype
+
+ @property
+ def array_dtype(self):
+ return self._block.array_dtype
+
+ @property
+ def ftype(self):
+ return self._block.ftype
+
+ def get_dtype_counts(self):
+ return {self.dtype.name: 1}
+
+ def get_ftype_counts(self):
+ return {self.ftype: 1}
+
+ def get_dtypes(self):
+ return np.array([self._block.dtype])
+
+ def get_ftypes(self):
+ return np.array([self._block.ftype])
+
+ def external_values(self):
+ return self._block.external_values()
+
+ def internal_values(self):
+ return self._block.internal_values()
+
+ def formatting_values(self):
+ """Return the internal values used by the DataFrame/SeriesFormatter"""
+ return self._block.formatting_values()
+
+ def get_values(self):
+ """ return a dense type view """
+ return np.array(self._block.to_dense(), copy=False)
+
+ @property
+ def asobject(self):
+ """
+        return an object dtype array. datetime/timedelta-like values are boxed
+ to Timestamp/Timedelta instances.
+ """
+ return self._block.get_values(dtype=object)
+
+ @property
+ def _can_hold_na(self):
+ return self._block._can_hold_na
+
+ def is_consolidated(self):
+ return True
+
+ def _consolidate_check(self):
+ pass
+
+ def _consolidate_inplace(self):
+ pass
+
+ def delete(self, item):
+ """
+ Delete single item from SingleBlockManager.
+
+ Ensures that self.blocks doesn't become empty.
+ """
+ loc = self.items.get_loc(item)
+ self._block.delete(loc)
+ self.axes[0] = self.axes[0].delete(loc)
+
+ def fast_xs(self, loc):
+ """
+ fast path for getting a cross-section
+ return a view of the data
+ """
+ return self._block.values[loc]
+
+ def concat(self, to_concat, new_axis):
+ """
+ Concatenate a list of SingleBlockManagers into a single
+ SingleBlockManager.
+
+ Used for pd.concat of Series objects with axis=0.
+
+ Parameters
+ ----------
+ to_concat : list of SingleBlockManagers
+ new_axis : Index of the result
+
+ Returns
+ -------
+ SingleBlockManager
+
+ """
+ non_empties = [x for x in to_concat if len(x) > 0]
+
+ # check if all series are of the same block type:
+ if len(non_empties) > 0:
+ blocks = [obj.blocks[0] for obj in non_empties]
+ if len({b.dtype for b in blocks}) == 1:
+ new_block = blocks[0].concat_same_type(blocks)
+ else:
+ values = [x.values for x in blocks]
+ values = _concat._concat_compat(values)
+ new_block = make_block(
+ values, placement=slice(0, len(values), 1))
+ else:
+ values = [x._block.values for x in to_concat]
+ values = _concat._concat_compat(values)
+ new_block = make_block(
+ values, placement=slice(0, len(values), 1))
+
+ mgr = SingleBlockManager(new_block, new_axis)
+ return mgr
+
+
+# --------------------------------------------------------------------
+# Constructor Helpers
+
+def create_block_manager_from_blocks(blocks, axes):
+ try:
+ if len(blocks) == 1 and not isinstance(blocks[0], Block):
+ # if blocks[0] is of length 0, return empty blocks
+ if not len(blocks[0]):
+ blocks = []
+ else:
+ # It's OK if a single block is passed as values, its placement
+                # is basically "all items", but if there are many, don't bother
+ # converting, it's an error anyway.
+ blocks = [make_block(values=blocks[0],
+ placement=slice(0, len(axes[0])))]
+
+ mgr = BlockManager(blocks, axes)
+ mgr._consolidate_inplace()
+ return mgr
+
+ except (ValueError) as e:
+ blocks = [getattr(b, 'values', b) for b in blocks]
+ tot_items = sum(b.shape[0] for b in blocks)
+ construction_error(tot_items, blocks[0].shape[1:], axes, e)
+
+
+def create_block_manager_from_arrays(arrays, names, axes):
+
+ try:
+ blocks = form_blocks(arrays, names, axes)
+ mgr = BlockManager(blocks, axes)
+ mgr._consolidate_inplace()
+ return mgr
+ except ValueError as e:
+ construction_error(len(arrays), arrays[0].shape, axes, e)
+
+
+def construction_error(tot_items, block_shape, axes, e=None):
+ """ raise a helpful message about our construction """
+ passed = tuple(map(int, [tot_items] + list(block_shape)))
+ # Correcting the user facing error message during dataframe construction
+ if len(passed) <= 2:
+ passed = passed[::-1]
+
+ implied = tuple(len(ax) for ax in axes)
+ # Correcting the user facing error message during dataframe construction
+ if len(implied) <= 2:
+ implied = implied[::-1]
+
+ if passed == implied and e is not None:
+ raise e
+ if block_shape[0] == 0:
+ raise ValueError("Empty data passed with indices specified.")
+ raise ValueError("Shape of passed values is {0}, indices imply {1}".format(
+ passed, implied))
+
+
+# -----------------------------------------------------------------------
+
+def form_blocks(arrays, names, axes):
+ # put "leftover" items in float bucket, where else?
+ # generalize?
+ items_dict = defaultdict(list)
+ extra_locs = []
+
+ names_idx = ensure_index(names)
+ if names_idx.equals(axes[0]):
+ names_indexer = np.arange(len(names_idx))
+ else:
+ assert names_idx.intersection(axes[0]).is_unique
+ names_indexer = names_idx.get_indexer_for(axes[0])
+
+ for i, name_idx in enumerate(names_indexer):
+ if name_idx == -1:
+ extra_locs.append(i)
+ continue
+
+ k = names[name_idx]
+ v = arrays[name_idx]
+
+ block_type = get_block_type(v)
+ items_dict[block_type.__name__].append((i, k, v))
+
+ blocks = []
+ if len(items_dict['FloatBlock']):
+ float_blocks = _multi_blockify(items_dict['FloatBlock'])
+ blocks.extend(float_blocks)
+
+ if len(items_dict['ComplexBlock']):
+ complex_blocks = _multi_blockify(items_dict['ComplexBlock'])
+ blocks.extend(complex_blocks)
+
+ if len(items_dict['TimeDeltaBlock']):
+ timedelta_blocks = _multi_blockify(items_dict['TimeDeltaBlock'])
+ blocks.extend(timedelta_blocks)
+
+ if len(items_dict['IntBlock']):
+ int_blocks = _multi_blockify(items_dict['IntBlock'])
+ blocks.extend(int_blocks)
+
+ if len(items_dict['DatetimeBlock']):
+ datetime_blocks = _simple_blockify(items_dict['DatetimeBlock'],
+ _NS_DTYPE)
+ blocks.extend(datetime_blocks)
+
+ if len(items_dict['DatetimeTZBlock']):
+ dttz_blocks = [make_block(array,
+ klass=DatetimeTZBlock,
+ placement=[i])
+ for i, _, array in items_dict['DatetimeTZBlock']]
+ blocks.extend(dttz_blocks)
+
+ if len(items_dict['BoolBlock']):
+ bool_blocks = _simple_blockify(items_dict['BoolBlock'], np.bool_)
+ blocks.extend(bool_blocks)
+
+ if len(items_dict['ObjectBlock']) > 0:
+ object_blocks = _simple_blockify(items_dict['ObjectBlock'], np.object_)
+ blocks.extend(object_blocks)
+
+ if len(items_dict['SparseBlock']) > 0:
+ sparse_blocks = _sparse_blockify(items_dict['SparseBlock'])
+ blocks.extend(sparse_blocks)
+
+ if len(items_dict['CategoricalBlock']) > 0:
+ cat_blocks = [make_block(array, klass=CategoricalBlock, placement=[i])
+ for i, _, array in items_dict['CategoricalBlock']]
+ blocks.extend(cat_blocks)
+
+ if len(items_dict['ExtensionBlock']):
+
+ external_blocks = [
+ make_block(array, klass=ExtensionBlock, placement=[i])
+ for i, _, array in items_dict['ExtensionBlock']
+ ]
+
+ blocks.extend(external_blocks)
+
+ if len(items_dict['ObjectValuesExtensionBlock']):
+ external_blocks = [
+ make_block(array, klass=ObjectValuesExtensionBlock, placement=[i])
+ for i, _, array in items_dict['ObjectValuesExtensionBlock']
+ ]
+
+ blocks.extend(external_blocks)
+
+ if len(extra_locs):
+ shape = (len(extra_locs),) + tuple(len(x) for x in axes[1:])
+
+ # empty items -> dtype object
+ block_values = np.empty(shape, dtype=object)
+ block_values.fill(np.nan)
+
+ na_block = make_block(block_values, placement=extra_locs)
+ blocks.append(na_block)
+
+ return blocks
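+    # Rough sketch of the grouping (illustrative): given one float64 array,
+    # one int64 array and one object array, form_blocks would produce three
+    # blocks -- a FloatBlock, an IntBlock and an ObjectBlock -- each carrying
+    # the positions of its items as placement.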
+
+
+def _simple_blockify(tuples, dtype):
+ """ return a single array of a block that has a single dtype; if dtype is
+ not None, coerce to this dtype
+ """
+ values, placement = _stack_arrays(tuples, dtype)
+
+ # CHECK DTYPE?
+ if dtype is not None and values.dtype != dtype: # pragma: no cover
+ values = values.astype(dtype)
+
+ block = make_block(values, placement=placement)
+ return [block]
+
+
+def _multi_blockify(tuples, dtype=None):
+ """ return an array of blocks that potentially have different dtypes """
+
+ # group by dtype
+ grouper = itertools.groupby(tuples, lambda x: x[2].dtype)
+
+ new_blocks = []
+ for dtype, tup_block in grouper:
+
+ values, placement = _stack_arrays(list(tup_block), dtype)
+
+ block = make_block(values, placement=placement)
+ new_blocks.append(block)
+
+ return new_blocks
+
+
+def _sparse_blockify(tuples, dtype=None):
+ """ return an array of blocks that potentially have different dtypes (and
+ are sparse)
+ """
+
+ new_blocks = []
+ for i, names, array in tuples:
+ array = _maybe_to_sparse(array)
+ block = make_block(array, placement=[i])
+ new_blocks.append(block)
+
+ return new_blocks
+
+
+def _stack_arrays(tuples, dtype):
+
+ # fml
+ def _asarray_compat(x):
+ if isinstance(x, ABCSeries):
+ return x._values
+ else:
+ return np.asarray(x)
+
+ def _shape_compat(x):
+ if isinstance(x, ABCSeries):
+ return len(x),
+ else:
+ return x.shape
+
+ placement, names, arrays = zip(*tuples)
+
+ first = arrays[0]
+ shape = (len(arrays),) + _shape_compat(first)
+
+ stacked = np.empty(shape, dtype=dtype)
+ for i, arr in enumerate(arrays):
+ stacked[i] = _asarray_compat(arr)
+
+ return stacked, placement
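+    # A rough usage sketch (illustrative only, names are hypothetical):
+    #   tuples = [(0, 'a', np.array([1, 2])), (1, 'b', np.array([3, 4]))]
+    #   _stack_arrays(tuples, np.int64)
+    # would return a (2, 2) stacked array [[1, 2], [3, 4]] together with
+    # the placement tuple (0, 1).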
+
+
+def _interleaved_dtype(blocks):
+ # type: (List[Block]) -> Optional[Union[np.dtype, ExtensionDtype]]
+ """Find the common dtype for `blocks`.
+
+ Parameters
+ ----------
+ blocks : List[Block]
+
+ Returns
+ -------
+ dtype : Optional[Union[np.dtype, ExtensionDtype]]
+ None is returned when `blocks` is empty.
+ """
+ if not len(blocks):
+ return None
+
+ return find_common_type([b.dtype for b in blocks])
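+    # For example (illustrative, assuming one int64 and one float64 block),
+    # find_common_type([np.dtype('int64'), np.dtype('float64')]) gives
+    # float64, so the interleaved result would be upcast to float64.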
+
+
+def _consolidate(blocks):
+ """
+ Merge blocks having same dtype, exclude non-consolidating blocks
+ """
+
+ # sort by _can_consolidate, dtype
+ gkey = lambda x: x._consolidate_key
+ grouper = itertools.groupby(sorted(blocks, key=gkey), gkey)
+
+ new_blocks = []
+ for (_can_consolidate, dtype), group_blocks in grouper:
+ merged_blocks = _merge_blocks(list(group_blocks), dtype=dtype,
+ _can_consolidate=_can_consolidate)
+ new_blocks = _extend_blocks(merged_blocks, new_blocks)
+ return new_blocks
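+    # Sketch of the effect (illustrative only): given three blocks with
+    # dtypes [float64, float64, object], the two float64 blocks are merged
+    # into one while the object block is kept as-is, so the result holds
+    # two blocks.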
+
+
+def _compare_or_regex_search(a, b, regex=False):
+ """
+ Compare two array_like inputs of the same shape or two scalar values
+
+ Calls operator.eq or re.search, depending on regex argument. If regex is
+ True, perform an element-wise regex matching.
+
+ Parameters
+ ----------
+ a : array_like or scalar
+ b : array_like or scalar
+ regex : bool, default False
+
+ Returns
+ -------
+ mask : array_like of bool
+ """
+ if not regex:
+ op = lambda x: operator.eq(x, b)
+ else:
+ op = np.vectorize(lambda x: bool(re.search(b, x)) if isinstance(x, str)
+ else False)
+
+ is_a_array = isinstance(a, np.ndarray)
+ is_b_array = isinstance(b, np.ndarray)
+
+    # numpy deprecation warning when comparing i8 vs integer values
+ if is_datetimelike_v_numeric(a, b):
+ result = False
+
+ # numpy deprecation warning if comparing numeric vs string-like
+ elif is_numeric_v_string_like(a, b):
+ result = False
+ else:
+ result = op(a)
+
+ if is_scalar(result) and (is_a_array or is_b_array):
+ type_names = [type(a).__name__, type(b).__name__]
+
+ if is_a_array:
+ type_names[0] = 'ndarray(dtype={dtype})'.format(dtype=a.dtype)
+
+ if is_b_array:
+ type_names[1] = 'ndarray(dtype={dtype})'.format(dtype=b.dtype)
+
+ raise TypeError(
+ "Cannot compare types {a!r} and {b!r}".format(a=type_names[0],
+ b=type_names[1]))
+ return result
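+    # A small usage sketch (illustrative only):
+    #   _compare_or_regex_search(np.array(['ab', 'cd']), 'b', regex=True)
+    # would give array([True, False]), while with regex=False the same call
+    # compares for equality and gives array([False, False]).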
+
+
+def _concat_indexes(indexes):
+ return indexes[0].append(indexes[1:])
+
+
+def items_overlap_with_suffix(left, lsuffix, right, rsuffix):
+ """
+ If two indices overlap, add suffixes to overlapping entries.
+
+ If corresponding suffix is empty, the entry is simply converted to string.
+
+ """
+ to_rename = left.intersection(right)
+ if len(to_rename) == 0:
+ return left, right
+ else:
+ if not lsuffix and not rsuffix:
+ raise ValueError('columns overlap but no suffix specified: '
+ '{rename}'.format(rename=to_rename))
+
+ def lrenamer(x):
+ if x in to_rename:
+ return '{x}{lsuffix}'.format(x=x, lsuffix=lsuffix)
+ return x
+
+ def rrenamer(x):
+ if x in to_rename:
+ return '{x}{rsuffix}'.format(x=x, rsuffix=rsuffix)
+ return x
+
+ return (_transform_index(left, lrenamer),
+ _transform_index(right, rrenamer))
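+    # For example (illustrative): with left=Index(['a', 'b']),
+    # right=Index(['b', 'c']), lsuffix='_x' and rsuffix='_y', only the
+    # overlapping label 'b' is renamed, giving
+    # (Index(['a', 'b_x']), Index(['b_y', 'c'])).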
+
+
+def _transform_index(index, func, level=None):
+ """
+ Apply function to all values found in index.
+
+ This includes transforming multiindex entries separately.
+ Only apply function to one level of the MultiIndex if level is specified.
+
+ """
+ if isinstance(index, MultiIndex):
+ if level is not None:
+ items = [tuple(func(y) if i == level else y
+ for i, y in enumerate(x)) for x in index]
+ else:
+ items = [tuple(func(y) for y in x) for x in index]
+ return MultiIndex.from_tuples(items, names=index.names)
+ else:
+ items = [func(x) for x in index]
+ return Index(items, name=index.name, tupleize_cols=False)
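+    # For a flat index this is essentially a map, e.g. (illustrative)
+    # _transform_index(Index(['a', 'b']), str.upper) -> Index(['A', 'B']);
+    # for a MultiIndex the function is applied per tuple element, optionally
+    # restricted to a single level.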
+
+
+def _fast_count_smallints(arr):
+ """Faster version of set(arr) for sequences of small numbers."""
+ counts = np.bincount(arr.astype(np.int_))
+ nz = counts.nonzero()[0]
+ return np.c_[nz, counts[nz]]
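+    # Illustrative example: _fast_count_smallints(np.array([0, 0, 2, 2, 2]))
+    # returns array([[0, 2],
+    #                [2, 3]]), i.e. value 0 occurs twice and value 2 occurs
+    # three times.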
+
+
+def _preprocess_slice_or_indexer(slice_or_indexer, length, allow_fill):
+ if isinstance(slice_or_indexer, slice):
+ return ('slice', slice_or_indexer,
+ libinternals.slice_len(slice_or_indexer, length))
+ elif (isinstance(slice_or_indexer, np.ndarray) and
+ slice_or_indexer.dtype == np.bool_):
+ return 'mask', slice_or_indexer, slice_or_indexer.sum()
+ else:
+ indexer = np.asanyarray(slice_or_indexer, dtype=np.int64)
+ if not allow_fill:
+ indexer = maybe_convert_indices(indexer, length)
+ return 'fancy', indexer, len(indexer)
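+    # Rough examples of the three branches (illustrative only):
+    #   slice(0, 5)                      -> ('slice', slice(0, 5), 5)
+    #   np.array([True, False, True])    -> ('mask', <the mask>, 2)
+    #   [4, 1, 3]                        -> ('fancy', array([4, 1, 3]), 3)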
+
+
+def concatenate_block_managers(mgrs_indexers, axes, concat_axis, copy):
+ """
+ Concatenate block managers into one.
+
+ Parameters
+ ----------
+ mgrs_indexers : list of (BlockManager, {axis: indexer,...}) tuples
+ axes : list of Index
+ concat_axis : int
+ copy : bool
+
+ """
+ concat_plans = [get_mgr_concatenation_plan(mgr, indexers)
+ for mgr, indexers in mgrs_indexers]
+ concat_plan = combine_concat_plans(concat_plans, concat_axis)
+ blocks = []
+
+ for placement, join_units in concat_plan:
+
+ if len(join_units) == 1 and not join_units[0].indexers:
+ b = join_units[0].block
+ values = b.values
+ if copy:
+ values = values.copy()
+ elif not copy:
+ values = values.view()
+ b = b.make_block_same_class(values, placement=placement)
+ elif is_uniform_join_units(join_units):
+ b = join_units[0].block.concat_same_type(
+ [ju.block for ju in join_units], placement=placement)
+ else:
+ b = make_block(
+ concatenate_join_units(join_units, concat_axis, copy=copy),
+ placement=placement)
+ blocks.append(b)
+
+ return BlockManager(blocks, axes)
diff --git a/contrib/python/pandas/py2/pandas/core/missing.py b/contrib/python/pandas/py2/pandas/core/missing.py
new file mode 100644
index 00000000000..15538b81966
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/core/missing.py
@@ -0,0 +1,748 @@
+"""
+Routines for filling missing data
+"""
+from distutils.version import LooseVersion
+import operator
+
+import numpy as np
+
+from pandas._libs import algos, lib
+from pandas.compat import range, string_types
+
+from pandas.core.dtypes.cast import infer_dtype_from_array
+from pandas.core.dtypes.common import (
+ ensure_float64, is_datetime64_dtype, is_datetime64tz_dtype, is_float_dtype,
+ is_integer, is_integer_dtype, is_numeric_v_string_like, is_scalar,
+ is_timedelta64_dtype, needs_i8_conversion)
+from pandas.core.dtypes.missing import isna
+
+
+def mask_missing(arr, values_to_mask):
+ """
+ Return a masking array of same size/shape as arr
+ with entries equaling any member of values_to_mask set to True
+ """
+ dtype, values_to_mask = infer_dtype_from_array(values_to_mask)
+
+ try:
+ values_to_mask = np.array(values_to_mask, dtype=dtype)
+
+ except Exception:
+ values_to_mask = np.array(values_to_mask, dtype=object)
+
+ na_mask = isna(values_to_mask)
+ nonna = values_to_mask[~na_mask]
+
+ mask = None
+ for x in nonna:
+ if mask is None:
+
+ # numpy elementwise comparison warning
+ if is_numeric_v_string_like(arr, x):
+ mask = False
+ else:
+ mask = arr == x
+
+ # if x is a string and arr is not, then we get False and we must
+ # expand the mask to size arr.shape
+ if is_scalar(mask):
+ mask = np.zeros(arr.shape, dtype=bool)
+ else:
+
+ # numpy elementwise comparison warning
+ if is_numeric_v_string_like(arr, x):
+ mask |= False
+ else:
+ mask |= arr == x
+
+ if na_mask.any():
+ if mask is None:
+ mask = isna(arr)
+ else:
+ mask |= isna(arr)
+
+ # GH 21977
+ if mask is None:
+ mask = np.zeros(arr.shape, dtype=bool)
+
+ return mask
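+    # Illustrative example:
+    #   mask_missing(np.array([1., 2., np.nan]), [2., np.nan])
+    # yields array([False, True, True]) -- the 2.0 matches by equality and
+    # the NaN is picked up via the isna() branch.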
+
+
+def clean_fill_method(method, allow_nearest=False):
+ # asfreq is compat for resampling
+ if method in [None, 'asfreq']:
+ return None
+
+ if isinstance(method, string_types):
+ method = method.lower()
+ if method == 'ffill':
+ method = 'pad'
+ elif method == 'bfill':
+ method = 'backfill'
+
+ valid_methods = ['pad', 'backfill']
+ expecting = 'pad (ffill) or backfill (bfill)'
+ if allow_nearest:
+ valid_methods.append('nearest')
+ expecting = 'pad (ffill), backfill (bfill) or nearest'
+ if method not in valid_methods:
+ msg = ('Invalid fill method. Expecting {expecting}. Got {method}'
+ .format(expecting=expecting, method=method))
+ raise ValueError(msg)
+ return method
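+    # For example, clean_fill_method('ffill') normalizes to 'pad' and
+    # clean_fill_method('bfill') to 'backfill'; clean_fill_method('nearest')
+    # is only accepted when allow_nearest=True.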
+
+
+def clean_interp_method(method, **kwargs):
+ order = kwargs.get('order')
+ valid = ['linear', 'time', 'index', 'values', 'nearest', 'zero', 'slinear',
+ 'quadratic', 'cubic', 'barycentric', 'polynomial', 'krogh',
+ 'piecewise_polynomial', 'pchip', 'akima', 'spline',
+ 'from_derivatives']
+ if method in ('spline', 'polynomial') and order is None:
+ raise ValueError("You must specify the order of the spline or "
+ "polynomial.")
+ if method not in valid:
+ raise ValueError("method must be one of {valid}. Got '{method}' "
+ "instead.".format(valid=valid, method=method))
+
+ return method
+
+
+def interpolate_1d(xvalues, yvalues, method='linear', limit=None,
+ limit_direction='forward', limit_area=None, fill_value=None,
+ bounds_error=False, order=None, **kwargs):
+ """
+ Logic for the 1-d interpolation. The result should be 1-d, inputs
+ xvalues and yvalues will each be 1-d arrays of the same length.
+
+ Bounds_error is currently hardcoded to False since non-scipy ones don't
+    take it as an argument.
+ """
+ # Treat the original, non-scipy methods first.
+
+ invalid = isna(yvalues)
+ valid = ~invalid
+
+ if not valid.any():
+ # have to call np.asarray(xvalues) since xvalues could be an Index
+ # which can't be mutated
+ result = np.empty_like(np.asarray(xvalues), dtype=np.float64)
+ result.fill(np.nan)
+ return result
+
+ if valid.all():
+ return yvalues
+
+ if method == 'time':
+ if not getattr(xvalues, 'is_all_dates', None):
+ # if not issubclass(xvalues.dtype.type, np.datetime64):
+ raise ValueError('time-weighted interpolation only works '
+ 'on Series or DataFrames with a '
+ 'DatetimeIndex')
+ method = 'values'
+
+ valid_limit_directions = ['forward', 'backward', 'both']
+ limit_direction = limit_direction.lower()
+ if limit_direction not in valid_limit_directions:
+ msg = ('Invalid limit_direction: expecting one of {valid!r}, '
+ 'got {invalid!r}.')
+ raise ValueError(msg.format(valid=valid_limit_directions,
+ invalid=limit_direction))
+
+ if limit_area is not None:
+ valid_limit_areas = ['inside', 'outside']
+ limit_area = limit_area.lower()
+ if limit_area not in valid_limit_areas:
+ raise ValueError('Invalid limit_area: expecting one of {}, got '
+ '{}.'.format(valid_limit_areas, limit_area))
+
+ # default limit is unlimited GH #16282
+ if limit is None:
+ # limit = len(xvalues)
+ pass
+ elif not is_integer(limit):
+ raise ValueError('Limit must be an integer')
+ elif limit < 1:
+ raise ValueError('Limit must be greater than 0')
+
+ from pandas import Series
+ ys = Series(yvalues)
+
+ # These are sets of index pointers to invalid values... i.e. {0, 1, etc...
+ all_nans = set(np.flatnonzero(invalid))
+ start_nans = set(range(ys.first_valid_index()))
+ end_nans = set(range(1 + ys.last_valid_index(), len(valid)))
+ mid_nans = all_nans - start_nans - end_nans
+
+ # Like the sets above, preserve_nans contains indices of invalid values,
+ # but in this case, it is the final set of indices that need to be
+ # preserved as NaN after the interpolation.
+
+ # For example if limit_direction='forward' then preserve_nans will
+ # contain indices of NaNs at the beginning of the series, and NaNs that
+    # are more than 'limit' away from the prior non-NaN.
+
+ # set preserve_nans based on direction using _interp_limit
+ if limit_direction == 'forward':
+ preserve_nans = start_nans | set(_interp_limit(invalid, limit, 0))
+ elif limit_direction == 'backward':
+ preserve_nans = end_nans | set(_interp_limit(invalid, 0, limit))
+ else:
+ # both directions... just use _interp_limit
+ preserve_nans = set(_interp_limit(invalid, limit, limit))
+
+ # if limit_area is set, add either mid or outside indices
+ # to preserve_nans GH #16284
+ if limit_area == 'inside':
+ # preserve NaNs on the outside
+ preserve_nans |= start_nans | end_nans
+ elif limit_area == 'outside':
+ # preserve NaNs on the inside
+ preserve_nans |= mid_nans
+
+ # sort preserve_nans and convert to list
+ preserve_nans = sorted(preserve_nans)
+
+ xvalues = getattr(xvalues, 'values', xvalues)
+ yvalues = getattr(yvalues, 'values', yvalues)
+ result = yvalues.copy()
+
+ if method in ['linear', 'time', 'index', 'values']:
+ if method in ('values', 'index'):
+ inds = np.asarray(xvalues)
+ # hack for DatetimeIndex, #1646
+ if needs_i8_conversion(inds.dtype.type):
+ inds = inds.view(np.int64)
+ if inds.dtype == np.object_:
+ inds = lib.maybe_convert_objects(inds)
+ else:
+ inds = xvalues
+ result[invalid] = np.interp(inds[invalid], inds[valid], yvalues[valid])
+ result[preserve_nans] = np.nan
+ return result
+
+ sp_methods = ['nearest', 'zero', 'slinear', 'quadratic', 'cubic',
+ 'barycentric', 'krogh', 'spline', 'polynomial',
+ 'from_derivatives', 'piecewise_polynomial', 'pchip', 'akima']
+
+ if method in sp_methods:
+ inds = np.asarray(xvalues)
+ # hack for DatetimeIndex, #1646
+ if issubclass(inds.dtype.type, np.datetime64):
+ inds = inds.view(np.int64)
+ result[invalid] = _interpolate_scipy_wrapper(inds[valid],
+ yvalues[valid],
+ inds[invalid],
+ method=method,
+ fill_value=fill_value,
+ bounds_error=bounds_error,
+ order=order, **kwargs)
+ result[preserve_nans] = np.nan
+ return result
+
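+ # Editor's sketch (not part of the original pandas source): how the limit
+ # machinery above is expected to behave on a small 1-d input.
+ #
+ #   xvalues = np.arange(5)
+ #   yvalues = np.array([np.nan, 1.0, np.nan, np.nan, 4.0])
+ #   interpolate_1d(xvalues, yvalues, method='linear', limit=1)
+ #   # the leading NaN is preserved (limit_direction defaults to 'forward'),
+ #   # the first interior NaN is filled with 2.0, and the second interior
+ #   # NaN stays NaN because it exceeds limit=1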
+
+def _interpolate_scipy_wrapper(x, y, new_x, method, fill_value=None,
+ bounds_error=False, order=None, **kwargs):
+ """
+ Passed off to scipy.interpolate.interp1d; `method` is scipy's `kind`.
+ Returns an array interpolated at new_x. Add any new methods to
+ the list in _clean_interp_method.
+ """
+ try:
+ from scipy import interpolate
+ # TODO: Why is DatetimeIndex being imported here?
+ from pandas import DatetimeIndex # noqa
+ except ImportError:
+ raise ImportError('{method} interpolation requires SciPy'
+ .format(method=method))
+
+ new_x = np.asarray(new_x)
+
+ # ignores some kwargs that could be passed along.
+ alt_methods = {
+ 'barycentric': interpolate.barycentric_interpolate,
+ 'krogh': interpolate.krogh_interpolate,
+ 'from_derivatives': _from_derivatives,
+ 'piecewise_polynomial': _from_derivatives,
+ }
+
+ if getattr(x, 'is_all_dates', False):
+ # GH 5975, scipy.interp1d can't handle datetime64s
+ x, new_x = x._values.astype('i8'), new_x.astype('i8')
+
+ if method == 'pchip':
+ try:
+ alt_methods['pchip'] = interpolate.pchip_interpolate
+ except AttributeError:
+ raise ImportError("Your version of Scipy does not support "
+ "PCHIP interpolation.")
+ elif method == 'akima':
+ try:
+ from scipy.interpolate import Akima1DInterpolator # noqa
+ alt_methods['akima'] = _akima_interpolate
+ except ImportError:
+ raise ImportError("Your version of Scipy does not support "
+ "Akima interpolation.")
+
+ interp1d_methods = ['nearest', 'zero', 'slinear', 'quadratic', 'cubic',
+ 'polynomial']
+ if method in interp1d_methods:
+ if method == 'polynomial':
+ method = order
+ terp = interpolate.interp1d(x, y, kind=method, fill_value=fill_value,
+ bounds_error=bounds_error)
+ new_y = terp(new_x)
+ elif method == 'spline':
+ # GH #10633
+ if not order:
+ raise ValueError("order needs to be specified and greater than 0")
+ terp = interpolate.UnivariateSpline(x, y, k=order, **kwargs)
+ new_y = terp(new_x)
+ else:
+ # GH 7295: need to be able to write for some reason
+ # in some circumstances: check all three
+ if not x.flags.writeable:
+ x = x.copy()
+ if not y.flags.writeable:
+ y = y.copy()
+ if not new_x.flags.writeable:
+ new_x = new_x.copy()
+ method = alt_methods[method]
+ new_y = method(x, y, new_x, **kwargs)
+ return new_y
+
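+ # Editor's sketch (assumes scipy is installed; not part of the original
+ # module): the wrapper above is normally reached via interpolate_1d, but it
+ # can be exercised directly, e.g.
+ #
+ #   x = np.array([0.0, 1.0, 2.0, 3.0])
+ #   y = np.array([0.0, 1.0, 8.0, 27.0])
+ #   _interpolate_scipy_wrapper(x, y, np.array([1.5, 2.5]), method='cubic')
+ #
+ # which returns the spline values at 1.5 and 2.5; method='polynomial' also
+ # requires `order`, which is forwarded to interp1d as its `kind`.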
+
+def _from_derivatives(xi, yi, x, order=None, der=0, extrapolate=False):
+ """
+ Convenience function for interpolate.BPoly.from_derivatives
+
+ Construct a piecewise polynomial in the Bernstein basis, compatible
+ with the specified values and derivatives at breakpoints.
+
+ Parameters
+ ----------
+ xi : array_like
+ sorted 1D array of x-coordinates
+ yi : array_like or list of array-likes
+ yi[i][j] is the j-th derivative known at xi[i]
+ order : None or int or array_like of ints. Default: None.
+ Specifies the degree of local polynomials. If not None, some
+ derivatives are ignored.
+ der : int or list
+ How many derivatives to extract; None for all potentially nonzero
+ derivatives (that is a number equal to the number of points), or a
+ list of derivatives to extract. This number includes the function
+ value as 0th derivative.
+ extrapolate : bool, optional
+ Whether to extrapolate to out-of-bounds points based on first and last
+ intervals, or to return NaNs. Default: False.
+
+ See Also
+ --------
+ scipy.interpolate.BPoly.from_derivatives
+
+ Returns
+ -------
+ y : scalar or array_like
+ The result, of length R or length M or M by R.
+
+ """
+ import scipy
+ from scipy import interpolate
+
+ if LooseVersion(scipy.__version__) < LooseVersion('0.18.0'):
+ try:
+ method = interpolate.piecewise_polynomial_interpolate
+ return method(xi, yi.reshape(-1, 1), x,
+ orders=order, der=der)
+ except AttributeError:
+ pass
+
+ # return the method for compat with scipy version & backwards compat
+ method = interpolate.BPoly.from_derivatives
+ m = method(xi, yi.reshape(-1, 1),
+ orders=order, extrapolate=extrapolate)
+
+ return m(x)
+
+
+def _akima_interpolate(xi, yi, x, der=0, axis=0):
+ """
+ Convenience function for akima interpolation.
+ xi and yi are arrays of values used to approximate some function f,
+ with ``yi = f(xi)``.
+
+ See `Akima1DInterpolator` for details.
+
+ Parameters
+ ----------
+ xi : array_like
+ A sorted list of x-coordinates, of length N.
+ yi : array_like
+ A 1-D array of real values. `yi`'s length along the interpolation
+ axis must be equal to the length of `xi`. If N-D array, use axis
+ parameter to select correct axis.
+ x : scalar or array_like
+ Of length M.
+ der : int or list, optional
+ How many derivatives to extract; None for all potentially
+ nonzero derivatives (that is a number equal to the number
+ of points), or a list of derivatives to extract. This number
+ includes the function value as 0th derivative.
+ axis : int, optional
+ Axis in the yi array corresponding to the x-coordinate values.
+
+ See Also
+ --------
+ scipy.interpolate.Akima1DInterpolator
+
+ Returns
+ -------
+ y : scalar or array_like
+ The result, of length R or length M or M by R.
+
+ """
+ from scipy import interpolate
+ try:
+ P = interpolate.Akima1DInterpolator(xi, yi, axis=axis)
+ except TypeError:
+ # Scipy earlier than 0.17.0 missing axis
+ P = interpolate.Akima1DInterpolator(xi, yi)
+ if der == 0:
+ return P(x)
+ elif interpolate._isscalar(der):
+ return P(x, der=der)
+ else:
+ return [P(x, nu) for nu in der]
+
+
+def interpolate_2d(values, method='pad', axis=0, limit=None, fill_value=None,
+ dtype=None):
+ """ Perform an actual interpolation of values; values will be made 2-d if
+ needed. Fills in place and returns the result.
+ """
+
+ transf = (lambda x: x) if axis == 0 else (lambda x: x.T)
+
+ # reshape a 1 dim if needed
+ ndim = values.ndim
+ if values.ndim == 1:
+ if axis != 0: # pragma: no cover
+ raise AssertionError("cannot interpolate on a ndim == 1 with "
+ "axis != 0")
+ values = values.reshape(tuple((1,) + values.shape))
+
+ if fill_value is None:
+ mask = None
+ else: # todo create faster fill func without masking
+ mask = mask_missing(transf(values), fill_value)
+
+ method = clean_fill_method(method)
+ if method == 'pad':
+ values = transf(pad_2d(
+ transf(values), limit=limit, mask=mask, dtype=dtype))
+ else:
+ values = transf(backfill_2d(
+ transf(values), limit=limit, mask=mask, dtype=dtype))
+
+ # reshape back
+ if ndim == 1:
+ values = values[0]
+
+ return values
+
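+ # Editor's sketch (not part of the original source): despite the name,
+ # interpolate_2d only forward/backward fills, e.g.
+ #
+ #   vals = np.array([1.0, np.nan, np.nan, 4.0])
+ #   interpolate_2d(vals, method='pad', limit=1)
+ #   # fills at most one consecutive NaN: [1.0, 1.0, nan, 4.0]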
+
+def _cast_values_for_fillna(values, dtype):
+ """
+ Cast values to a dtype that algos.pad and algos.backfill can handle.
+ """
+ # TODO: for int-dtypes we make a copy, but for everything else this
+ # alters the values in-place. Is this intentional?
+
+ if (is_datetime64_dtype(dtype) or is_datetime64tz_dtype(dtype) or
+ is_timedelta64_dtype(dtype)):
+ values = values.view(np.int64)
+
+ elif is_integer_dtype(values):
+ # NB: this check needs to come after the datetime64 check above
+ values = ensure_float64(values)
+
+ return values
+
+
+def _fillna_prep(values, mask=None, dtype=None):
+ # boilerplate for pad_1d, backfill_1d, pad_2d, backfill_2d
+ if dtype is None:
+ dtype = values.dtype
+
+ if mask is None:
+ # This needs to occur before datetime/timedeltas are cast to int64
+ mask = isna(values)
+
+ values = _cast_values_for_fillna(values, dtype)
+
+ mask = mask.view(np.uint8)
+ return values, mask
+
+
+def pad_1d(values, limit=None, mask=None, dtype=None):
+ values, mask = _fillna_prep(values, mask, dtype)
+ algos.pad_inplace(values, mask, limit=limit)
+ return values
+
+
+def backfill_1d(values, limit=None, mask=None, dtype=None):
+ values, mask = _fillna_prep(values, mask, dtype)
+ algos.backfill_inplace(values, mask, limit=limit)
+ return values
+
+
+def pad_2d(values, limit=None, mask=None, dtype=None):
+ values, mask = _fillna_prep(values, mask, dtype)
+
+ if np.all(values.shape):
+ algos.pad_2d_inplace(values, mask, limit=limit)
+ else:
+ # for test coverage
+ pass
+ return values
+
+
+def backfill_2d(values, limit=None, mask=None, dtype=None):
+ values, mask = _fillna_prep(values, mask, dtype)
+
+ if np.all(values.shape):
+ algos.backfill_2d_inplace(values, mask, limit=limit)
+ else:
+ # for test coverage
+ pass
+ return values
+
+
+_fill_methods = {'pad': pad_1d, 'backfill': backfill_1d}
+
+
+def get_fill_func(method):
+ method = clean_fill_method(method)
+ return _fill_methods[method]
+
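+ # Editor's sketch (not part of the original source): get_fill_func('pad')
+ # resolves to pad_1d above, which fills in place for NaN-able dtypes, e.g.
+ #
+ #   arr = np.array([1.0, np.nan, np.nan])
+ #   pad_1d(arr, limit=1)          # -> [1.0, 1.0, nan]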
+
+def clean_reindex_fill_method(method):
+ return clean_fill_method(method, allow_nearest=True)
+
+
+def fill_zeros(result, x, y, name, fill):
+ """
+ If this is a reversed op, then flip x, y.
+
+ If we have an integer value (or array) in y and it contains 0's, fill
+ them with the fill value and return the result.
+
+ Mask the NaNs from x.
+ """
+ if fill is None or is_float_dtype(result):
+ return result
+
+ if name.startswith(('r', '__r')):
+ x, y = y, x
+
+ is_variable_type = (hasattr(y, 'dtype') or hasattr(y, 'type'))
+ is_scalar_type = is_scalar(y)
+
+ if not is_variable_type and not is_scalar_type:
+ return result
+
+ if is_scalar_type:
+ y = np.array(y)
+
+ if is_integer_dtype(y):
+
+ if (y == 0).any():
+
+ # GH 7325, mask and nans must be broadcastable (also: PR 9308)
+ # Raveling and then reshaping makes np.putmask faster
+ mask = ((y == 0) & ~np.isnan(result)).ravel()
+
+ shape = result.shape
+ result = result.astype('float64', copy=False).ravel()
+
+ np.putmask(result, mask, fill)
+
+ # if we have a fill of inf, then sign it correctly
+ # (GH 6178 and PR 9308)
+ if np.isinf(fill):
+ signs = y if name.startswith(('r', '__r')) else x
+ signs = np.sign(signs.astype('float', copy=False))
+ negative_inf_mask = (signs.ravel() < 0) & mask
+ np.putmask(result, negative_inf_mask, -fill)
+
+ if "floordiv" in name: # (PR 9308)
+ nan_mask = ((y == 0) & (x == 0)).ravel()
+ np.putmask(result, nan_mask, np.nan)
+
+ result = result.reshape(shape)
+
+ return result
+
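+ # Editor's sketch (not part of the original source): for an integer floordiv
+ # whose raw result contains zeros from a zero divisor,
+ #
+ #   x = np.array([1, -1, 2])
+ #   y = np.array([0, 0, 1])
+ #   raw = np.array([0, 0, 2])     # what integer floordiv by zero yields
+ #   fill_zeros(raw, x, y, '__floordiv__', np.inf)
+ #   # -> [inf, -inf, 2.0]: zero-divisor positions get a correctly signed inf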
+
+def mask_zero_div_zero(x, y, result, copy=False):
+ """
+ Set results of 0 / 0 or 0 // 0 to np.nan, regardless of the dtypes
+ of the numerator or the denominator.
+
+ Parameters
+ ----------
+ x : ndarray
+ y : ndarray
+ result : ndarray
+ copy : bool (default False)
+ Whether to always create a new array or try to fill in the existing
+ array if possible.
+
+ Returns
+ -------
+ filled_result : ndarray
+
+ Examples
+ --------
+ >>> x = np.array([1, 0, -1], dtype=np.int64)
+ >>> y = 0 # int 0; numpy behavior is different with float
+ >>> result = x / y
+ >>> result # raw numpy result does not fill division by zero
+ array([0, 0, 0])
+ >>> mask_zero_div_zero(x, y, result)
+ array([ inf, nan, -inf])
+ """
+ if is_scalar(y):
+ y = np.array(y)
+
+ zmask = y == 0
+ if zmask.any():
+ shape = result.shape
+
+ nan_mask = (zmask & (x == 0)).ravel()
+ neginf_mask = (zmask & (x < 0)).ravel()
+ posinf_mask = (zmask & (x > 0)).ravel()
+
+ if nan_mask.any() or neginf_mask.any() or posinf_mask.any():
+ # Fill negative/0 with -inf, positive/0 with +inf, 0/0 with NaN
+ result = result.astype('float64', copy=copy).ravel()
+
+ np.putmask(result, nan_mask, np.nan)
+ np.putmask(result, posinf_mask, np.inf)
+ np.putmask(result, neginf_mask, -np.inf)
+
+ result = result.reshape(shape)
+
+ return result
+
+
+def dispatch_missing(op, left, right, result):
+ """
+ Fill nulls caused by division by zero, casting to a different dtype
+ if necessary.
+
+ Parameters
+ ----------
+ op : function (operator.add, operator.div, ...)
+ left : object (Index for non-reversed ops)
+ right : object (Index for reversed ops)
+ result : ndarray
+
+ Returns
+ -------
+ result : ndarray
+ """
+ opstr = '__{opname}__'.format(opname=op.__name__).replace('____', '__')
+ if op in [operator.truediv, operator.floordiv,
+ getattr(operator, 'div', None)]:
+ result = mask_zero_div_zero(left, right, result)
+ elif op is operator.mod:
+ result = fill_zeros(result, left, right, opstr, np.nan)
+ elif op is divmod:
+ res0 = mask_zero_div_zero(left, right, result[0])
+ res1 = fill_zeros(result[1], left, right, opstr, np.nan)
+ result = (res0, res1)
+ return result
+
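+ # Editor's sketch (not part of the original source): integer floor division
+ # by zero routes through mask_zero_div_zero above, e.g. (ignoring the numpy
+ # RuntimeWarning)
+ #
+ #   left = np.array([1, 0, -1])
+ #   right = np.array([0, 0, 0])
+ #   dispatch_missing(operator.floordiv, left, right, left // right)
+ #   # -> [inf, nan, -inf]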
+
+def _interp_limit(invalid, fw_limit, bw_limit):
+ """
+ Get indexers of values that won't be filled
+ because they exceed the limits.
+
+ Parameters
+ ----------
+ invalid : boolean ndarray
+ fw_limit : int or None
+ forward limit to index
+ bw_limit : int or None
+ backward limit to index
+
+ Returns
+ -------
+ set of indexers
+
+ Notes
+ -----
+ This is equivalent to the more readable, but slower
+
+ .. code-block:: python
+
+ def _interp_limit(invalid, fw_limit, bw_limit):
+ for x in np.where(invalid)[0]:
+ if invalid[max(0, x - fw_limit):x + bw_limit + 1].all():
+ yield x
+ """
+ # handle forward first; the backward direction is the same except
+ # 1. operate on the reversed array
+ # 2. subtract the returned indices from N - 1
+ N = len(invalid)
+ f_idx = set()
+ b_idx = set()
+
+ def inner(invalid, limit):
+ limit = min(limit, N)
+ windowed = _rolling_window(invalid, limit + 1).all(1)
+ idx = (set(np.where(windowed)[0] + limit) |
+ set(np.where((~invalid[:limit + 1]).cumsum() == 0)[0]))
+ return idx
+
+ if fw_limit is not None:
+
+ if fw_limit == 0:
+ f_idx = set(np.where(invalid)[0])
+ else:
+ f_idx = inner(invalid, fw_limit)
+
+ if bw_limit is not None:
+
+ if bw_limit == 0:
+ # then we don't even need to care about backwards
+ # just use forwards
+ return f_idx
+ else:
+ b_idx = list(inner(invalid[::-1], bw_limit))
+ b_idx = set(N - 1 - np.asarray(b_idx))
+ if fw_limit == 0:
+ return b_idx
+
+ return f_idx & b_idx
+
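+ # Editor's sketch (not part of the original source):
+ #
+ #   invalid = np.array([False, True, True, False])
+ #   _interp_limit(invalid, 1, 0)   # -> {2}: with a forward limit of 1 the
+ #                                  #    second consecutive NaN is not filled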
+
+def _rolling_window(a, window):
+ """
+ [True, True, False, True, False], 2 ->
+
+ [
+ [True, True],
+ [True, False],
+ [False, True],
+ [True, False],
+ ]
+ """
+ # https://stackoverflow.com/a/6811241
+ shape = a.shape[:-1] + (a.shape[-1] - window + 1, window)
+ strides = a.strides + (a.strides[-1],)
+ return np.lib.stride_tricks.as_strided(a, shape=shape, strides=strides)
diff --git a/contrib/python/pandas/py2/pandas/core/nanops.py b/contrib/python/pandas/py2/pandas/core/nanops.py
new file mode 100644
index 00000000000..86c3c380636
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/core/nanops.py
@@ -0,0 +1,1272 @@
+from distutils.version import LooseVersion
+import functools
+import itertools
+import operator
+import warnings
+
+import numpy as np
+
+from pandas._libs import iNaT, lib, tslibs
+import pandas.compat as compat
+
+from pandas.core.dtypes.cast import _int64_max, maybe_upcast_putmask
+from pandas.core.dtypes.common import (
+ _get_dtype, is_any_int_dtype, is_bool_dtype, is_complex, is_complex_dtype,
+ is_datetime64_dtype, is_datetime64tz_dtype, is_datetime_or_timedelta_dtype,
+ is_float, is_float_dtype, is_integer, is_integer_dtype, is_numeric_dtype,
+ is_object_dtype, is_scalar, is_timedelta64_dtype, pandas_dtype)
+from pandas.core.dtypes.dtypes import DatetimeTZDtype
+from pandas.core.dtypes.missing import isna, na_value_for_dtype, notna
+
+import pandas.core.common as com
+from pandas.core.config import get_option
+
+_BOTTLENECK_INSTALLED = False
+_MIN_BOTTLENECK_VERSION = '1.0.0'
+
+try:
+ import bottleneck as bn
+ ver = bn.__version__
+ _BOTTLENECK_INSTALLED = (LooseVersion(ver) >=
+ LooseVersion(_MIN_BOTTLENECK_VERSION))
+
+ if not _BOTTLENECK_INSTALLED:
+ warnings.warn(
+ "The installed version of bottleneck {ver} is not supported "
+ "in pandas and will not be used\nThe minimum supported "
+ "version is {min_ver}\n".format(
+ ver=ver, min_ver=_MIN_BOTTLENECK_VERSION), UserWarning)
+
+except ImportError: # pragma: no cover
+ pass
+
+
+_USE_BOTTLENECK = False
+
+
+def set_use_bottleneck(v=True):
+ # set/unset to use bottleneck
+ global _USE_BOTTLENECK
+ if _BOTTLENECK_INSTALLED:
+ _USE_BOTTLENECK = v
+
+
+set_use_bottleneck(get_option('compute.use_bottleneck'))
+
+
+class disallow(object):
+
+ def __init__(self, *dtypes):
+ super(disallow, self).__init__()
+ self.dtypes = tuple(pandas_dtype(dtype).type for dtype in dtypes)
+
+ def check(self, obj):
+ return hasattr(obj, 'dtype') and issubclass(obj.dtype.type,
+ self.dtypes)
+
+ def __call__(self, f):
+ @functools.wraps(f)
+ def _f(*args, **kwargs):
+ obj_iter = itertools.chain(args, compat.itervalues(kwargs))
+ if any(self.check(obj) for obj in obj_iter):
+ msg = 'reduction operation {name!r} not allowed for this dtype'
+ raise TypeError(msg.format(name=f.__name__.replace('nan', '')))
+ try:
+ with np.errstate(invalid='ignore'):
+ return f(*args, **kwargs)
+ except ValueError as e:
+ # we want to transform an object array
+ # ValueError message to the more typical TypeError
+ # e.g. this is normally a disallowed function on
+ # object arrays that contain strings
+ if is_object_dtype(args[0]):
+ raise TypeError(e)
+ raise
+
+ return _f
+
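+ # Editor's note (not part of the original source): the decorator above is
+ # used further down as, e.g.,
+ #
+ #   @disallow('M8')
+ #   def nansum(values, ...): ...
+ #
+ # so reducing datetime64 ('M8') data with nansum raises
+ # TypeError("reduction operation 'sum' not allowed for this dtype").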
+
+class bottleneck_switch(object):
+
+ def __init__(self, **kwargs):
+ self.kwargs = kwargs
+
+ def __call__(self, alt):
+ bn_name = alt.__name__
+
+ try:
+ bn_func = getattr(bn, bn_name)
+ except (AttributeError, NameError): # pragma: no cover
+ bn_func = None
+
+ @functools.wraps(alt)
+ def f(values, axis=None, skipna=True, **kwds):
+ if len(self.kwargs) > 0:
+ for k, v in compat.iteritems(self.kwargs):
+ if k not in kwds:
+ kwds[k] = v
+ try:
+ if values.size == 0 and kwds.get('min_count') is None:
+ # We are empty, returning NA for our type
+ # Only applies for the default `min_count` of None
+ # since that affects how empty arrays are handled.
+ # TODO(GH-18976) update all the nanops methods to
+ # correctly handle empty inputs and remove this check.
+ # It *may* just be `var`
+ return _na_for_min_count(values, axis)
+
+ if (_USE_BOTTLENECK and skipna and
+ _bn_ok_dtype(values.dtype, bn_name)):
+ result = bn_func(values, axis=axis, **kwds)
+
+ # prefer to treat inf/-inf as NA, but must compute the func
+ # twice :(
+ if _has_infs(result):
+ result = alt(values, axis=axis, skipna=skipna, **kwds)
+ else:
+ result = alt(values, axis=axis, skipna=skipna, **kwds)
+ except Exception:
+ try:
+ result = alt(values, axis=axis, skipna=skipna, **kwds)
+ except ValueError as e:
+ # we want to transform an object array
+ # ValueError message to the more typical TypeError
+ # e.g. this is normally a disallowed function on
+ # object arrays that contain strings
+
+ if is_object_dtype(values):
+ raise TypeError(e)
+ raise
+
+ return result
+
+ return f
+
+
+def _bn_ok_dtype(dt, name):
+ # Bottleneck chokes on datetime64
+ if (not is_object_dtype(dt) and
+ not (is_datetime_or_timedelta_dtype(dt) or
+ is_datetime64tz_dtype(dt))):
+
+ # GH 15507
+ # bottleneck does not properly upcast during the sum
+ # so can overflow
+
+ # GH 9422
+ # further we also want to preserve NaN when all elements
+ # are NaN, unlike bottleneck/numpy which consider this
+ # to be 0
+ if name in ['nansum', 'nanprod']:
+ return False
+
+ return True
+ return False
+
+
+def _has_infs(result):
+ if isinstance(result, np.ndarray):
+ if result.dtype == 'f8':
+ return lib.has_infs_f8(result.ravel())
+ elif result.dtype == 'f4':
+ return lib.has_infs_f4(result.ravel())
+ try:
+ return np.isinf(result).any()
+ except (TypeError, NotImplementedError):
+ # if it doesn't support infs, then it can't have infs
+ return False
+
+
+def _get_fill_value(dtype, fill_value=None, fill_value_typ=None):
+ """ return the correct fill value for the dtype of the values """
+ if fill_value is not None:
+ return fill_value
+ if _na_ok_dtype(dtype):
+ if fill_value_typ is None:
+ return np.nan
+ else:
+ if fill_value_typ == '+inf':
+ return np.inf
+ else:
+ return -np.inf
+ else:
+ if fill_value_typ is None:
+ return tslibs.iNaT
+ else:
+ if fill_value_typ == '+inf':
+ # need the max int here
+ return _int64_max
+ else:
+ return tslibs.iNaT
+
+
+def _get_values(values, skipna, fill_value=None, fill_value_typ=None,
+ isfinite=False, copy=True, mask=None):
+ """ Utility to get the values view, mask, and dtype.
+ If necessary, copy and mask using the specified fill_value;
+ copy=True will force the copy.
+ """
+
+ if is_datetime64tz_dtype(values):
+ # com.values_from_object returns M8[ns] dtype instead of tz-aware,
+ # so this case must be handled separately from the rest
+ dtype = values.dtype
+ values = getattr(values, "_values", values)
+ else:
+ values = com.values_from_object(values)
+ dtype = values.dtype
+
+ if mask is None:
+ if isfinite:
+ mask = _isfinite(values)
+ else:
+ mask = isna(values)
+
+ if is_datetime_or_timedelta_dtype(values) or is_datetime64tz_dtype(values):
+ # changing timedelta64/datetime64 to int64 needs to happen after
+ # finding `mask` above
+ values = getattr(values, "asi8", values)
+ values = values.view(np.int64)
+
+ dtype_ok = _na_ok_dtype(dtype)
+
+ # get our fill value (in case we need to provide an alternative
+ # dtype for it)
+ fill_value = _get_fill_value(dtype, fill_value=fill_value,
+ fill_value_typ=fill_value_typ)
+
+ if skipna:
+ if copy:
+ values = values.copy()
+ if dtype_ok:
+ np.putmask(values, mask, fill_value)
+
+ # promote if needed
+ else:
+ values, changed = maybe_upcast_putmask(values, mask, fill_value)
+
+ elif copy:
+ values = values.copy()
+
+ # return a platform independent precision dtype
+ dtype_max = dtype
+ if is_integer_dtype(dtype) or is_bool_dtype(dtype):
+ dtype_max = np.int64
+ elif is_float_dtype(dtype):
+ dtype_max = np.float64
+
+ return values, mask, dtype, dtype_max, fill_value
+
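+ # Editor's sketch (not part of the original source): for a float array with
+ # one NaN and skipna=True,
+ #
+ #   vals = np.array([1.0, np.nan, 3.0])
+ #   values, mask, _, _, fill = _get_values(vals, True, fill_value_typ='+inf')
+ #   # values -> [1.0, inf, 3.0], mask -> [False, True, False], fill -> inf,
+ #   # i.e. the NaN is masked out so a subsequent .min() ignores it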
+
+def _isfinite(values):
+ if is_datetime_or_timedelta_dtype(values):
+ return isna(values)
+ if (is_complex_dtype(values) or is_float_dtype(values) or
+ is_integer_dtype(values) or is_bool_dtype(values)):
+ return ~np.isfinite(values)
+ return ~np.isfinite(values.astype('float64'))
+
+
+def _na_ok_dtype(dtype):
+ # TODO: what about datetime64tz? PeriodDtype?
+ return not issubclass(dtype.type,
+ (np.integer, np.timedelta64, np.datetime64))
+
+
+def _wrap_results(result, dtype, fill_value=None):
+ """ wrap our results if needed """
+
+ if is_datetime64_dtype(dtype) or is_datetime64tz_dtype(dtype):
+ if fill_value is None:
+ # GH#24293
+ fill_value = iNaT
+ if not isinstance(result, np.ndarray):
+ tz = getattr(dtype, 'tz', None)
+ assert not isna(fill_value), "Expected non-null fill_value"
+ if result == fill_value:
+ result = np.nan
+ result = tslibs.Timestamp(result, tz=tz)
+ else:
+ result = result.view(dtype)
+ elif is_timedelta64_dtype(dtype):
+ if not isinstance(result, np.ndarray):
+ if result == fill_value:
+ result = np.nan
+
+ # raise if we have a timedelta64[ns] which is too large
+ if np.fabs(result) > _int64_max:
+ raise ValueError("overflow in timedelta operation")
+
+ result = tslibs.Timedelta(result, unit='ns')
+ else:
+ result = result.astype('i8').view(dtype)
+
+ return result
+
+
+def _na_for_min_count(values, axis):
+ """Return the missing value for `values`
+
+ Parameters
+ ----------
+ values : ndarray
+ axis : int or None
+ axis for the reduction
+
+ Returns
+ -------
+ result : scalar or ndarray
+ For 1-D values, returns a scalar of the correct missing type.
+ For 2-D values, returns a 1-D array where each element is missing.
+ """
+ # we either return np.nan or pd.NaT
+ if is_numeric_dtype(values):
+ values = values.astype('float64')
+ fill_value = na_value_for_dtype(values.dtype)
+
+ if values.ndim == 1:
+ return fill_value
+ else:
+ result_shape = (values.shape[:axis] +
+ values.shape[axis + 1:])
+ result = np.empty(result_shape, dtype=values.dtype)
+ result.fill(fill_value)
+ return result
+
+
+def nanany(values, axis=None, skipna=True, mask=None):
+ """
+ Check if any elements along an axis evaluate to True.
+
+ Parameters
+ ----------
+ values : ndarray
+ axis : int, optional
+ skipna : bool, default True
+ mask : ndarray[bool], optional
+ nan-mask if known
+
+ Returns
+ -------
+ result : bool
+
+ Examples
+ --------
+ >>> import pandas.core.nanops as nanops
+ >>> s = pd.Series([1, 2])
+ >>> nanops.nanany(s)
+ True
+
+ >>> import pandas.core.nanops as nanops
+ >>> s = pd.Series([np.nan])
+ >>> nanops.nanany(s)
+ False
+ """
+ values, mask, dtype, _, _ = _get_values(values, skipna, False, copy=skipna,
+ mask=mask)
+ return values.any(axis)
+
+
+def nanall(values, axis=None, skipna=True, mask=None):
+ """
+ Check if all elements along an axis evaluate to True.
+
+ Parameters
+ ----------
+ values : ndarray
+ axis: int, optional
+ skipna : bool, default True
+ mask : ndarray[bool], optional
+ nan-mask if known
+
+ Returns
+ -------
+ result : bool
+
+ Examples
+ --------
+ >>> import pandas.core.nanops as nanops
+ >>> s = pd.Series([1, 2, np.nan])
+ >>> nanops.nanall(s)
+ True
+
+ >>> import pandas.core.nanops as nanops
+ >>> s = pd.Series([1, 0])
+ >>> nanops.nanall(s)
+ False
+ """
+ values, mask, dtype, _, _ = _get_values(values, skipna, True, copy=skipna,
+ mask=mask)
+ return values.all(axis)
+
+
+@disallow('M8')
+def nansum(values, axis=None, skipna=True, min_count=0, mask=None):
+ """
+ Sum the elements along an axis ignoring NaNs
+
+ Parameters
+ ----------
+ values : ndarray[dtype]
+ axis: int, optional
+ skipna : bool, default True
+ min_count: int, default 0
+ mask : ndarray[bool], optional
+ nan-mask if known
+
+ Returns
+ -------
+ result : dtype
+
+ Examples
+ --------
+ >>> import pandas.core.nanops as nanops
+ >>> s = pd.Series([1, 2, np.nan])
+ >>> nanops.nansum(s)
+ 3.0
+ """
+ values, mask, dtype, dtype_max, _ = _get_values(values,
+ skipna, 0, mask=mask)
+ dtype_sum = dtype_max
+ if is_float_dtype(dtype):
+ dtype_sum = dtype
+ elif is_timedelta64_dtype(dtype):
+ dtype_sum = np.float64
+ the_sum = values.sum(axis, dtype=dtype_sum)
+ the_sum = _maybe_null_out(the_sum, axis, mask, min_count=min_count)
+
+ return _wrap_results(the_sum, dtype)
+
+
+@disallow('M8', DatetimeTZDtype)
+@bottleneck_switch()
+def nanmean(values, axis=None, skipna=True, mask=None):
+ """
+ Compute the mean of the elements along an axis, ignoring NaNs
+
+ Parameters
+ ----------
+ values : ndarray
+ axis: int, optional
+ skipna : bool, default True
+ mask : ndarray[bool], optional
+ nan-mask if known
+
+ Returns
+ -------
+ result : float
+ Unless input is a float array, in which case use the same
+ precision as the input array.
+
+ Examples
+ --------
+ >>> import pandas.core.nanops as nanops
+ >>> s = pd.Series([1, 2, np.nan])
+ >>> nanops.nanmean(s)
+ 1.5
+ """
+ values, mask, dtype, dtype_max, _ = _get_values(
+ values, skipna, 0, mask=mask)
+ dtype_sum = dtype_max
+ dtype_count = np.float64
+ if (is_integer_dtype(dtype) or is_timedelta64_dtype(dtype) or
+ is_datetime64_dtype(dtype) or is_datetime64tz_dtype(dtype)):
+ dtype_sum = np.float64
+ elif is_float_dtype(dtype):
+ dtype_sum = dtype
+ dtype_count = dtype
+ count = _get_counts(mask, axis, dtype=dtype_count)
+ the_sum = _ensure_numeric(values.sum(axis, dtype=dtype_sum))
+
+ if axis is not None and getattr(the_sum, 'ndim', False):
+ with np.errstate(all="ignore"):
+ # suppress division by zero warnings
+ the_mean = the_sum / count
+ ct_mask = count == 0
+ if ct_mask.any():
+ the_mean[ct_mask] = np.nan
+ else:
+ the_mean = the_sum / count if count > 0 else np.nan
+
+ return _wrap_results(the_mean, dtype)
+
+
+@disallow('M8')
+@bottleneck_switch()
+def nanmedian(values, axis=None, skipna=True, mask=None):
+ """
+ Parameters
+ ----------
+ values : ndarray
+ axis: int, optional
+ skipna : bool, default True
+ mask : ndarray[bool], optional
+ nan-mask if known
+
+ Returns
+ -------
+ result : float
+ Unless input is a float array, in which case use the same
+ precision as the input array.
+
+ Examples
+ --------
+ >>> import pandas.core.nanops as nanops
+ >>> s = pd.Series([1, np.nan, 2, 2])
+ >>> nanops.nanmedian(s)
+ 2.0
+ """
+ def get_median(x):
+ mask = notna(x)
+ if not skipna and not mask.all():
+ return np.nan
+ return np.nanmedian(x[mask])
+
+ values, mask, dtype, dtype_max, _ = _get_values(values, skipna, mask=mask)
+ if not is_float_dtype(values):
+ values = values.astype('f8')
+ values[mask] = np.nan
+
+ if axis is None:
+ values = values.ravel()
+
+ notempty = values.size
+
+ # an array from a frame
+ if values.ndim > 1:
+
+ # there's a non-empty array to apply over otherwise numpy raises
+ if notempty:
+ if not skipna:
+ return _wrap_results(
+ np.apply_along_axis(get_median, axis, values), dtype)
+
+ # fastpath for the skipna case
+ return _wrap_results(np.nanmedian(values, axis), dtype)
+
+ # must return the correct shape, but median is not defined for the
+ # empty set so return nans of shape "everything but the passed axis"
+ # since "axis" is where the reduction would occur if we had a nonempty
+ # array
+ shp = np.array(values.shape)
+ dims = np.arange(values.ndim)
+ ret = np.empty(shp[dims != axis])
+ ret.fill(np.nan)
+ return _wrap_results(ret, dtype)
+
+ # otherwise return a scalar value
+ return _wrap_results(get_median(values) if notempty else np.nan, dtype)
+
+
+def _get_counts_nanvar(mask, axis, ddof, dtype=float):
+ dtype = _get_dtype(dtype)
+ count = _get_counts(mask, axis, dtype=dtype)
+ d = count - dtype.type(ddof)
+
+ # always return NaN, never inf
+ if is_scalar(count):
+ if count <= ddof:
+ count = np.nan
+ d = np.nan
+ else:
+ mask2 = count <= ddof
+ if mask2.any():
+ np.putmask(d, mask2, np.nan)
+ np.putmask(count, mask2, np.nan)
+ return count, d
+
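+ # Editor's sketch (not part of the original source):
+ #
+ #   mask = np.array([False, True, False, False])
+ #   _get_counts_nanvar(mask, axis=None, ddof=1)   # -> (3.0, 2.0)
+ #
+ # and both become NaN whenever count <= ddof, so var/std never return inf.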
+
+@disallow('M8')
+@bottleneck_switch(ddof=1)
+def nanstd(values, axis=None, skipna=True, ddof=1, mask=None):
+ """
+ Compute the standard deviation along given axis while ignoring NaNs
+
+ Parameters
+ ----------
+ values : ndarray
+ axis: int, optional
+ skipna : bool, default True
+ ddof : int, default 1
+ Delta Degrees of Freedom. The divisor used in calculations is N - ddof,
+ where N represents the number of elements.
+ mask : ndarray[bool], optional
+ nan-mask if known
+
+ Returns
+ -------
+ result : float
+ Unless input is a float array, in which case use the same
+ precision as the input array.
+
+ Examples
+ --------
+ >>> import pandas.core.nanops as nanops
+ >>> s = pd.Series([1, np.nan, 2, 3])
+ >>> nanops.nanstd(s)
+ 1.0
+ """
+ result = np.sqrt(nanvar(values, axis=axis, skipna=skipna, ddof=ddof,
+ mask=mask))
+ return _wrap_results(result, values.dtype)
+
+
+@disallow('M8')
+@bottleneck_switch(ddof=1)
+def nanvar(values, axis=None, skipna=True, ddof=1, mask=None):
+ """
+ Compute the variance along given axis while ignoring NaNs
+
+ Parameters
+ ----------
+ values : ndarray
+ axis: int, optional
+ skipna : bool, default True
+ ddof : int, default 1
+ Delta Degrees of Freedom. The divisor used in calculations is N - ddof,
+ where N represents the number of elements.
+ mask : ndarray[bool], optional
+ nan-mask if known
+
+ Returns
+ -------
+ result : float
+ Unless input is a float array, in which case use the same
+ precision as the input array.
+
+ Examples
+ --------
+ >>> import pandas.core.nanops as nanops
+ >>> s = pd.Series([1, np.nan, 2, 3])
+ >>> nanops.nanvar(s)
+ 1.0
+ """
+ values = com.values_from_object(values)
+ dtype = values.dtype
+ if mask is None:
+ mask = isna(values)
+ if is_any_int_dtype(values):
+ values = values.astype('f8')
+ values[mask] = np.nan
+
+ if is_float_dtype(values):
+ count, d = _get_counts_nanvar(mask, axis, ddof, values.dtype)
+ else:
+ count, d = _get_counts_nanvar(mask, axis, ddof)
+
+ if skipna:
+ values = values.copy()
+ np.putmask(values, mask, 0)
+
+ # xref GH10242
+ # Compute variance via two-pass algorithm, which is stable against
+ # cancellation errors and relatively accurate for small numbers of
+ # observations.
+ #
+ # See https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance
+ avg = _ensure_numeric(values.sum(axis=axis, dtype=np.float64)) / count
+ if axis is not None:
+ avg = np.expand_dims(avg, axis)
+ sqr = _ensure_numeric((avg - values) ** 2)
+ np.putmask(sqr, mask, 0)
+ result = sqr.sum(axis=axis, dtype=np.float64) / d
+
+ # Return variance as np.float64 (the datatype used in the accumulator),
+ # unless we were dealing with a float array, in which case use the same
+ # precision as the original values array.
+ if is_float_dtype(dtype):
+ result = result.astype(dtype)
+ return _wrap_results(result, values.dtype)
+
+
+@disallow('M8', 'm8')
+def nansem(values, axis=None, skipna=True, ddof=1, mask=None):
+ """
+ Compute the standard error of the mean along the given axis, ignoring NaNs
+
+ Parameters
+ ----------
+ values : ndarray
+ axis: int, optional
+ skipna : bool, default True
+ ddof : int, default 1
+ Delta Degrees of Freedom. The divisor used in calculations is N - ddof,
+ where N represents the number of elements.
+ mask : ndarray[bool], optional
+ nan-mask if known
+
+ Returns
+ -------
+ result : float64
+ Unless input is a float array, in which case use the same
+ precision as the input array.
+
+ Examples
+ --------
+ >>> import pandas.core.nanops as nanops
+ >>> s = pd.Series([1, np.nan, 2, 3])
+ >>> nanops.nansem(s)
+ 0.5773502691896258
+ """
+
+ # This checks if non-numeric-like data is passed with numeric_only=False
+ # and raises a TypeError otherwise
+ nanvar(values, axis, skipna, ddof=ddof, mask=mask)
+
+ if mask is None:
+ mask = isna(values)
+ if not is_float_dtype(values.dtype):
+ values = values.astype('f8')
+ count, _ = _get_counts_nanvar(mask, axis, ddof, values.dtype)
+ var = nanvar(values, axis, skipna, ddof=ddof)
+
+ return np.sqrt(var) / np.sqrt(count)
+
+
+def _nanminmax(meth, fill_value_typ):
+ @bottleneck_switch()
+ def reduction(values, axis=None, skipna=True, mask=None):
+
+ values, mask, dtype, dtype_max, fill_value = _get_values(
+ values, skipna, fill_value_typ=fill_value_typ, mask=mask)
+
+ if ((axis is not None and values.shape[axis] == 0) or
+ values.size == 0):
+ try:
+ result = getattr(values, meth)(axis, dtype=dtype_max)
+ result.fill(np.nan)
+ except (AttributeError, TypeError,
+ ValueError, np.core._internal.AxisError):
+ result = np.nan
+ else:
+ result = getattr(values, meth)(axis)
+
+ result = _wrap_results(result, dtype, fill_value)
+ return _maybe_null_out(result, axis, mask)
+
+ reduction.__name__ = 'nan' + meth
+ return reduction
+
+
+nanmin = _nanminmax('min', fill_value_typ='+inf')
+nanmax = _nanminmax('max', fill_value_typ='-inf')
+
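+ # Editor's sketch (not part of the original source): the generated
+ # reductions behave like, e.g.,
+ #
+ #   nanmin(np.array([3.0, np.nan, 1.0]))   # -> 1.0
+ #   nanmax(np.array([np.nan, np.nan]))     # -> nan (all-NA input)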
+
+@disallow('O')
+def nanargmax(values, axis=None, skipna=True, mask=None):
+ """
+ Parameters
+ ----------
+ values : ndarray
+ axis: int, optional
+ skipna : bool, default True
+ mask : ndarray[bool], optional
+ nan-mask if known
+
+ Returns
+ -------
+ result : int
+ The index of max value in specified axis or -1 in the NA case
+
+ Examples
+ --------
+ >>> import pandas.core.nanops as nanops
+ >>> s = pd.Series([1, 2, 3, np.nan, 4])
+ >>> nanops.nanargmax(s)
+ 4
+ """
+ values, mask, dtype, _, _ = _get_values(
+ values, skipna, fill_value_typ='-inf', mask=mask)
+ result = values.argmax(axis)
+ result = _maybe_arg_null_out(result, axis, mask, skipna)
+ return result
+
+
+@disallow('O')
+def nanargmin(values, axis=None, skipna=True, mask=None):
+ """
+ Parameters
+ ----------
+ values : ndarray
+ axis: int, optional
+ skipna : bool, default True
+ mask : ndarray[bool], optional
+ nan-mask if known
+
+ Returns
+ -------
+ result : int
+ The index of min value in specified axis or -1 in the NA case
+
+ Examples
+ --------
+ >>> import pandas.core.nanops as nanops
+ >>> s = pd.Series([1, 2, 3, np.nan, 4])
+ >>> nanops.nanargmin(s)
+ 0
+ """
+ values, mask, dtype, _, _ = _get_values(
+ values, skipna, fill_value_typ='+inf', mask=mask)
+ result = values.argmin(axis)
+ result = _maybe_arg_null_out(result, axis, mask, skipna)
+ return result
+
+
+@disallow('M8', 'm8')
+def nanskew(values, axis=None, skipna=True, mask=None):
+ """ Compute the sample skewness.
+
+ The statistic computed here is the adjusted Fisher-Pearson standardized
+ moment coefficient G1. The algorithm computes this coefficient directly
+ from the second and third central moment.
+
+ Parameters
+ ----------
+ values : ndarray
+ axis: int, optional
+ skipna : bool, default True
+ mask : ndarray[bool], optional
+ nan-mask if known
+
+ Returns
+ -------
+ result : float64
+ Unless input is a float array, in which case use the same
+ precision as the input array.
+
+ Examples
+ --------
+ >>> import pandas.core.nanops as nanops
+ >>> s = pd.Series([1,np.nan, 1, 2])
+ >>> nanops.nanskew(s)
+ 1.7320508075688787
+ """
+ values = com.values_from_object(values)
+ if mask is None:
+ mask = isna(values)
+ if not is_float_dtype(values.dtype):
+ values = values.astype('f8')
+ count = _get_counts(mask, axis)
+ else:
+ count = _get_counts(mask, axis, dtype=values.dtype)
+
+ if skipna:
+ values = values.copy()
+ np.putmask(values, mask, 0)
+
+ mean = values.sum(axis, dtype=np.float64) / count
+ if axis is not None:
+ mean = np.expand_dims(mean, axis)
+
+ adjusted = values - mean
+ if skipna:
+ np.putmask(adjusted, mask, 0)
+ adjusted2 = adjusted ** 2
+ adjusted3 = adjusted2 * adjusted
+ m2 = adjusted2.sum(axis, dtype=np.float64)
+ m3 = adjusted3.sum(axis, dtype=np.float64)
+
+ # floating point error
+ #
+ # #18044 in _libs/windows.pyx calc_skew follows this behavior
+ # to fix the fperr and treat m2 < 1e-14 as zero
+ m2 = _zero_out_fperr(m2)
+ m3 = _zero_out_fperr(m3)
+
+ with np.errstate(invalid='ignore', divide='ignore'):
+ result = (count * (count - 1) ** 0.5 / (count - 2)) * (m3 / m2 ** 1.5)
+
+ dtype = values.dtype
+ if is_float_dtype(dtype):
+ result = result.astype(dtype)
+
+ if isinstance(result, np.ndarray):
+ result = np.where(m2 == 0, 0, result)
+ result[count < 3] = np.nan
+ return result
+ else:
+ result = 0 if m2 == 0 else result
+ if count < 3:
+ return np.nan
+ return result
+
+
+@disallow('M8', 'm8')
+def nankurt(values, axis=None, skipna=True, mask=None):
+ """
+ Compute the sample excess kurtosis
+
+ The statistic computed here is the adjusted Fisher-Pearson standardized
+ moment coefficient G2, computed directly from the second and fourth
+ central moment.
+
+ Parameters
+ ----------
+ values : ndarray
+ axis: int, optional
+ skipna : bool, default True
+ mask : ndarray[bool], optional
+ nan-mask if known
+
+ Returns
+ -------
+ result : float64
+ Unless input is a float array, in which case use the same
+ precision as the input array.
+
+ Examples
+ --------
+ >>> import pandas.core.nanops as nanops
+ >>> s = pd.Series([1,np.nan, 1, 3, 2])
+ >>> nanops.nankurt(s)
+ -1.2892561983471076
+ """
+ values = com.values_from_object(values)
+ if mask is None:
+ mask = isna(values)
+ if not is_float_dtype(values.dtype):
+ values = values.astype('f8')
+ count = _get_counts(mask, axis)
+ else:
+ count = _get_counts(mask, axis, dtype=values.dtype)
+
+ if skipna:
+ values = values.copy()
+ np.putmask(values, mask, 0)
+
+ mean = values.sum(axis, dtype=np.float64) / count
+ if axis is not None:
+ mean = np.expand_dims(mean, axis)
+
+ adjusted = values - mean
+ if skipna:
+ np.putmask(adjusted, mask, 0)
+ adjusted2 = adjusted ** 2
+ adjusted4 = adjusted2 ** 2
+ m2 = adjusted2.sum(axis, dtype=np.float64)
+ m4 = adjusted4.sum(axis, dtype=np.float64)
+
+ with np.errstate(invalid='ignore', divide='ignore'):
+ adj = 3 * (count - 1) ** 2 / ((count - 2) * (count - 3))
+ numer = count * (count + 1) * (count - 1) * m4
+ denom = (count - 2) * (count - 3) * m2 ** 2
+
+ # floating point error
+ #
+ # #18044 in _libs/windows.pyx calc_kurt follows this behavior
+ # to fix the fperr and treat denom < 1e-14 as zero
+ numer = _zero_out_fperr(numer)
+ denom = _zero_out_fperr(denom)
+
+ if not isinstance(denom, np.ndarray):
+ # if ``denom`` is a scalar, check these corner cases first before
+ # doing division
+ if count < 4:
+ return np.nan
+ if denom == 0:
+ return 0
+
+ with np.errstate(invalid='ignore', divide='ignore'):
+ result = numer / denom - adj
+
+ dtype = values.dtype
+ if is_float_dtype(dtype):
+ result = result.astype(dtype)
+
+ if isinstance(result, np.ndarray):
+ result = np.where(denom == 0, 0, result)
+ result[count < 4] = np.nan
+
+ return result
+
+
+@disallow('M8', 'm8')
+def nanprod(values, axis=None, skipna=True, min_count=0, mask=None):
+ """
+ Parameters
+ ----------
+ values : ndarray[dtype]
+ axis: int, optional
+ skipna : bool, default True
+ min_count: int, default 0
+ mask : ndarray[bool], optional
+ nan-mask if known
+
+ Returns
+ -------
+ result : dtype
+ The product of all elements on a given axis (NaNs are treated as 1).
+
+ Examples
+ --------
+ >>> import pandas.core.nanops as nanops
+ >>> s = pd.Series([1, 2, 3, np.nan])
+ >>> nanops.nanprod(s)
+ 6.0
+ """
+ if mask is None:
+ mask = isna(values)
+ if skipna and not is_any_int_dtype(values):
+ values = values.copy()
+ values[mask] = 1
+ result = values.prod(axis)
+ return _maybe_null_out(result, axis, mask, min_count=min_count)
+
+
+def _maybe_arg_null_out(result, axis, mask, skipna):
+ # helper function for nanargmin/nanargmax
+ if axis is None or not getattr(result, 'ndim', False):
+ if skipna:
+ if mask.all():
+ result = -1
+ else:
+ if mask.any():
+ result = -1
+ else:
+ if skipna:
+ na_mask = mask.all(axis)
+ else:
+ na_mask = mask.any(axis)
+ if na_mask.any():
+ result[na_mask] = -1
+ return result
+
+
+def _get_counts(mask, axis, dtype=float):
+ dtype = _get_dtype(dtype)
+ if axis is None:
+ return dtype.type(mask.size - mask.sum())
+
+ count = mask.shape[axis] - mask.sum(axis)
+ if is_scalar(count):
+ return dtype.type(count)
+ try:
+ return count.astype(dtype)
+ except AttributeError:
+ return np.array(count, dtype=dtype)
+
+
+def _maybe_null_out(result, axis, mask, min_count=1):
+ if axis is not None and getattr(result, 'ndim', False):
+ null_mask = (mask.shape[axis] - mask.sum(axis) - min_count) < 0
+ if np.any(null_mask):
+ if is_numeric_dtype(result):
+ if np.iscomplexobj(result):
+ result = result.astype('c16')
+ else:
+ result = result.astype('f8')
+ result[null_mask] = np.nan
+ else:
+ # GH12941, use None to auto cast null
+ result[null_mask] = None
+ elif result is not tslibs.NaT:
+ null_mask = mask.size - mask.sum()
+ if null_mask < min_count:
+ result = np.nan
+
+ return result
+
+
+def _zero_out_fperr(arg):
+ # #18044 reference this behavior to fix rolling skew/kurt issue
+ if isinstance(arg, np.ndarray):
+ with np.errstate(invalid='ignore'):
+ return np.where(np.abs(arg) < 1e-14, 0, arg)
+ else:
+ return arg.dtype.type(0) if np.abs(arg) < 1e-14 else arg
+
+
+@disallow('M8', 'm8')
+def nancorr(a, b, method='pearson', min_periods=None):
+ """
+ a, b: ndarrays
+ """
+ if len(a) != len(b):
+ raise AssertionError('Operands to nancorr must have same size')
+
+ if min_periods is None:
+ min_periods = 1
+
+ valid = notna(a) & notna(b)
+ if not valid.all():
+ a = a[valid]
+ b = b[valid]
+
+ if len(a) < min_periods:
+ return np.nan
+
+ f = get_corr_func(method)
+ return f(a, b)
+
+
+def get_corr_func(method):
+ if method in ['kendall', 'spearman']:
+ from scipy.stats import kendalltau, spearmanr
+ elif callable(method):
+ return method
+
+ def _pearson(a, b):
+ return np.corrcoef(a, b)[0, 1]
+
+ def _kendall(a, b):
+ rs = kendalltau(a, b)
+ if isinstance(rs, tuple):
+ return rs[0]
+ return rs
+
+ def _spearman(a, b):
+ return spearmanr(a, b)[0]
+
+ _cor_methods = {
+ 'pearson': _pearson,
+ 'kendall': _kendall,
+ 'spearman': _spearman
+ }
+ return _cor_methods[method]
+
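+ # Editor's sketch (not part of the original source):
+ #
+ #   f = get_corr_func('pearson')
+ #   f(np.array([1.0, 2.0, 3.0]), np.array([2.0, 4.0, 6.0]))   # -> 1.0
+ #
+ # 'kendall' and 'spearman' import from scipy.stats, and a callable method
+ # is returned unchanged.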
+
+@disallow('M8', 'm8')
+def nancov(a, b, min_periods=None):
+ if len(a) != len(b):
+ raise AssertionError('Operands to nancov must have same size')
+
+ if min_periods is None:
+ min_periods = 1
+
+ valid = notna(a) & notna(b)
+ if not valid.all():
+ a = a[valid]
+ b = b[valid]
+
+ if len(a) < min_periods:
+ return np.nan
+
+ return np.cov(a, b)[0, 1]
+
+
+def _ensure_numeric(x):
+ if isinstance(x, np.ndarray):
+ if is_integer_dtype(x) or is_bool_dtype(x):
+ x = x.astype(np.float64)
+ elif is_object_dtype(x):
+ try:
+ x = x.astype(np.complex128)
+ except (TypeError, ValueError):
+ x = x.astype(np.float64)
+ else:
+ if not np.any(x.imag):
+ x = x.real
+ elif not (is_float(x) or is_integer(x) or is_complex(x)):
+ try:
+ x = float(x)
+ except Exception:
+ try:
+ x = complex(x)
+ except Exception:
+ raise TypeError('Could not convert {value!s} to numeric'
+ .format(value=x))
+ return x
+
+# NA-friendly array comparisons
+
+
+def make_nancomp(op):
+ def f(x, y):
+ xmask = isna(x)
+ ymask = isna(y)
+ mask = xmask | ymask
+
+ with np.errstate(all='ignore'):
+ result = op(x, y)
+
+ if mask.any():
+ if is_bool_dtype(result):
+ result = result.astype('O')
+ np.putmask(result, mask, np.nan)
+
+ return result
+
+ return f
+
+
+nangt = make_nancomp(operator.gt)
+nange = make_nancomp(operator.ge)
+nanlt = make_nancomp(operator.lt)
+nanle = make_nancomp(operator.le)
+naneq = make_nancomp(operator.eq)
+nanne = make_nancomp(operator.ne)
+
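+ # Editor's sketch (not part of the original source):
+ #
+ #   nangt(np.array([1.0, np.nan]), np.array([0.0, 0.0]))
+ #   # -> array([True, nan], dtype=object): positions where either side is NA
+ #   #    are masked back to NaN after the raw comparison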
+
+def _nanpercentile_1d(values, mask, q, na_value, interpolation):
+ """
+ Wrapper for np.percentile that skips missing values, specialized to
+ the 1-dimensional case.
+
+ Parameters
+ ----------
+ values : array over which to find quantiles
+ mask : ndarray[bool]
+ locations in values that should be considered missing
+ q : scalar or array of quantile indices to find
+ na_value : scalar
+ value to return for empty or all-null values
+ interpolation : str
+
+ Returns
+ -------
+ quantiles : scalar or array
+ """
+ # mask is Union[ExtensionArray, ndarray]
+ values = values[~mask]
+
+ if len(values) == 0:
+ if lib.is_scalar(q):
+ return na_value
+ else:
+ return np.array([na_value] * len(q),
+ dtype=values.dtype)
+
+ return np.percentile(values, q, interpolation=interpolation)
+
+
+def nanpercentile(values, q, axis, na_value, mask, ndim, interpolation):
+ """
+ Wrapper for np.percentile that skips missing values.
+
+ Parameters
+ ----------
+ values : array over which to find quantiles
+ q : scalar or array of quantile indices to find
+ axis : {0, 1}
+ na_value : scalar
+ value to return for empty or all-null values
+ mask : ndarray[bool]
+ locations in values that should be considered missing
+ ndim : {1, 2}
+ interpolation : str
+
+ Returns
+ -------
+ quantiles : scalar or array
+ """
+ if not lib.is_scalar(mask) and mask.any():
+ if ndim == 1:
+ return _nanpercentile_1d(values, mask, q, na_value,
+ interpolation=interpolation)
+ else:
+ # for nonconsolidatable blocks mask is 1D, but values 2D
+ if mask.ndim < values.ndim:
+ mask = mask.reshape(values.shape)
+ if axis == 0:
+ values = values.T
+ mask = mask.T
+ result = [_nanpercentile_1d(val, m, q, na_value,
+ interpolation=interpolation)
+ for (val, m) in zip(list(values), list(mask))]
+ result = np.array(result, dtype=values.dtype, copy=False).T
+ return result
+ else:
+ return np.percentile(values, q, axis=axis, interpolation=interpolation)
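+
+ # Editor's sketch (not part of the original source): with a 2-d block and a
+ # matching mask,
+ #
+ #   vals = np.array([[1.0, np.nan, 3.0]])
+ #   nanpercentile(vals, q=50, axis=1, na_value=np.nan, mask=np.isnan(vals),
+ #                 ndim=2, interpolation='linear')
+ #   # -> array([2.0]): the NaN is dropped before np.percentile is applied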
diff --git a/contrib/python/pandas/py2/pandas/core/ops.py b/contrib/python/pandas/py2/pandas/core/ops.py
new file mode 100644
index 00000000000..10cebc6f94b
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/core/ops.py
@@ -0,0 +1,2309 @@
+"""
+Arithmetic operations for PandasObjects
+
+This is not a public API.
+"""
+# necessary to enforce truediv in Python 2.X
+from __future__ import division
+
+import datetime
+import operator
+import textwrap
+import warnings
+
+import numpy as np
+
+from pandas._libs import algos as libalgos, lib, ops as libops
+import pandas.compat as compat
+from pandas.compat import bind_method
+from pandas.errors import NullFrequencyError
+from pandas.util._decorators import Appender
+
+from pandas.core.dtypes.cast import (
+ construct_1d_object_array_from_listlike, find_common_type,
+ maybe_upcast_putmask)
+from pandas.core.dtypes.common import (
+ ensure_object, is_bool_dtype, is_categorical_dtype, is_datetime64_dtype,
+ is_datetime64tz_dtype, is_datetimelike_v_numeric, is_extension_array_dtype,
+ is_integer_dtype, is_list_like, is_object_dtype, is_period_dtype,
+ is_scalar, is_timedelta64_dtype, needs_i8_conversion)
+from pandas.core.dtypes.generic import (
+ ABCDataFrame, ABCIndex, ABCIndexClass, ABCPanel, ABCSeries, ABCSparseArray,
+ ABCSparseSeries)
+from pandas.core.dtypes.missing import isna, notna
+
+import pandas as pd
+import pandas.core.common as com
+import pandas.core.missing as missing
+
+# -----------------------------------------------------------------------------
+# Ops Wrapping Utilities
+
+
+def get_op_result_name(left, right):
+ """
+ Find the appropriate name to pin to an operation result. This result
+ should always be either an Index or a Series.
+
+ Parameters
+ ----------
+ left : {Series, Index}
+ right : object
+
+ Returns
+ -------
+ name : object
+ Usually a string
+ """
+ # `left` is always a pd.Series when called from within ops
+ if isinstance(right, (ABCSeries, pd.Index)):
+ name = _maybe_match_name(left, right)
+ else:
+ name = left.name
+ return name
+
+
+def _maybe_match_name(a, b):
+ """
+ Try to find a name to attach to the result of an operation between
+ a and b. If only one of these has a `name` attribute, return that
+ name. Otherwise return a consensus name if they match or None if
+ they have different names.
+
+ Parameters
+ ----------
+ a : object
+ b : object
+
+ Returns
+ -------
+ name : str or None
+
+ See Also
+ --------
+ pandas.core.common.consensus_name_attr
+ """
+ a_has = hasattr(a, 'name')
+ b_has = hasattr(b, 'name')
+ if a_has and b_has:
+ if a.name == b.name:
+ return a.name
+ else:
+ # TODO: what if they both have np.nan for their names?
+ return None
+ elif a_has:
+ return a.name
+ elif b_has:
+ return b.name
+ return None
+
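+ # Editor's sketch (not part of the original source):
+ #
+ #   _maybe_match_name(pd.Series([1], name='a'), pd.Series([2], name='a'))
+ #   # -> 'a'
+ #   _maybe_match_name(pd.Series([1], name='a'), pd.Series([2], name='b'))
+ #   # -> None (names disagree, so the result gets no name)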
+
+def maybe_upcast_for_op(obj):
+ """
+ Cast non-pandas objects to pandas types to unify behavior of arithmetic
+ and comparison operations.
+
+ Parameters
+ ----------
+ obj: object
+
+ Returns
+ -------
+ out : object
+
+ Notes
+ -----
+ Be careful to call this *after* determining the `name` attribute to be
+ attached to the result of the arithmetic operation.
+ """
+ if type(obj) is datetime.timedelta:
+ # GH#22390 cast up to Timedelta to rely on Timedelta
+ # implementation; otherwise operation against numeric-dtype
+ # raises TypeError
+ return pd.Timedelta(obj)
+ elif isinstance(obj, np.timedelta64) and not isna(obj):
+ # In particular non-nanosecond timedelta64 needs to be cast to
+ # nanoseconds, or else we get undesired behavior like
+ # np.timedelta64(3, 'D') / 2 == np.timedelta64(1, 'D')
+ # The isna check is to avoid casting timedelta64("NaT"), which would
+ # return NaT and incorrectly be treated as a datetime-NaT.
+ return pd.Timedelta(obj)
+ elif isinstance(obj, np.ndarray) and is_timedelta64_dtype(obj):
+ # GH#22390 Unfortunately we need to special-case right-hand
+ # timedelta64 dtypes because numpy casts integer dtypes to
+ # timedelta64 when operating with timedelta64
+ return pd.TimedeltaIndex(obj)
+ return obj
+
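+ # Editor's sketch (not part of the original source):
+ #
+ #   maybe_upcast_for_op(np.timedelta64(3, 'D'))    # -> pd.Timedelta('3 days')
+ #   maybe_upcast_for_op(np.timedelta64('NaT'))     # unchanged (isna check)
+ #   maybe_upcast_for_op(5)                         # unchanged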
+
+# -----------------------------------------------------------------------------
+# Reversed Operations not available in the stdlib operator module.
+# Defining these instead of using lambdas allows us to reference them by name.
+
+def radd(left, right):
+ return right + left
+
+
+def rsub(left, right):
+ return right - left
+
+
+def rmul(left, right):
+ return right * left
+
+
+def rdiv(left, right):
+ return right / left
+
+
+def rtruediv(left, right):
+ return right / left
+
+
+def rfloordiv(left, right):
+ return right // left
+
+
+def rmod(left, right):
+ # check if right is a string, as % is the string formatting
+ # operation; raise a TypeError in that case, otherwise perform the op
+ if isinstance(right, compat.string_types):
+ raise TypeError("{typ} cannot perform the operation mod".format(
+ typ=type(left).__name__))
+
+ return right % left
+
+
+def rdivmod(left, right):
+ return divmod(right, left)
+
+
+def rpow(left, right):
+ return right ** left
+
+
+def rand_(left, right):
+ return operator.and_(right, left)
+
+
+def ror_(left, right):
+ return operator.or_(right, left)
+
+
+def rxor(left, right):
+ return operator.xor(right, left)
+
+
+# -----------------------------------------------------------------------------
+
+def make_invalid_op(name):
+ """
+ Return a binary method that always raises a TypeError.
+
+ Parameters
+ ----------
+ name : str
+
+ Returns
+ -------
+ invalid_op : function
+ """
+ def invalid_op(self, other=None):
+ raise TypeError("cannot perform {name} with this index type: "
+ "{typ}".format(name=name, typ=type(self).__name__))
+
+ invalid_op.__name__ = name
+ return invalid_op
+
+
+def _gen_eval_kwargs(name):
+ """
+ Find the keyword arguments to pass to numexpr for the given operation.
+
+ Parameters
+ ----------
+ name : str
+
+ Returns
+ -------
+ eval_kwargs : dict
+
+ Examples
+ --------
+ >>> _gen_eval_kwargs("__add__")
+ {}
+
+ >>> _gen_eval_kwargs("rtruediv")
+ {'reversed': True, 'truediv': True}
+ """
+ kwargs = {}
+
+ # Series and Panel appear to only pass __add__, __radd__, ...
+ # but DataFrame gets both these dunder names _and_ non-dunder names
+ # add, radd, ...
+ name = name.replace('__', '')
+
+ if name.startswith('r'):
+ if name not in ['radd', 'rand', 'ror', 'rxor']:
+ # Exclude commutative operations
+ kwargs['reversed'] = True
+
+ if name in ['truediv', 'rtruediv']:
+ kwargs['truediv'] = True
+
+ if name in ['ne']:
+ kwargs['masker'] = True
+
+ return kwargs
+
+
+def _gen_fill_zeros(name):
+ """
+ Find the appropriate fill value to use when filling in undefined values
+ in the results of the given operation caused by operating on
+ (generally dividing by) zero.
+
+ Parameters
+ ----------
+ name : str
+
+ Returns
+ -------
+ fill_value : {None, np.nan, np.inf}
+ """
+ name = name.strip('__')
+ if 'div' in name:
+ # truediv, floordiv, div, and reversed variants
+ fill_value = np.inf
+ elif 'mod' in name:
+ # mod, rmod
+ fill_value = np.nan
+ else:
+ fill_value = None
+ return fill_value
+
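+ # Editor's sketch (not part of the original source):
+ #
+ #   _gen_fill_zeros('__floordiv__')   # -> np.inf  ('div' in the name)
+ #   _gen_fill_zeros('rmod')           # -> np.nan  ('mod' in the name)
+ #   _gen_fill_zeros('__add__')        # -> None    (nothing to fill)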
+
+def _get_frame_op_default_axis(name):
+ """
+ Only DataFrame cares about default_axis, specifically:
+ special methods have default_axis=None and flex methods
+ have default_axis='columns'.
+
+ Parameters
+ ----------
+ name : str
+
+ Returns
+ -------
+ default_axis: str or None
+ """
+ if name.replace('__r', '__') in ['__and__', '__or__', '__xor__']:
+ # bool methods
+ return 'columns'
+ elif name.startswith('__'):
+ # __add__, __mul__, ...
+ return None
+ else:
+ # add, mul, ...
+ return 'columns'
+
+
+def _get_opstr(op, cls):
+ """
+ Find the operation string, if any, to pass to numexpr for this
+ operation.
+
+ Parameters
+ ----------
+ op : binary operator
+ cls : class
+
+ Returns
+ -------
+ op_str : string or None
+ """
+ # numexpr is available for non-sparse classes
+ subtyp = getattr(cls, '_subtyp', '')
+ use_numexpr = 'sparse' not in subtyp
+
+ if not use_numexpr:
+ # if we're not using numexpr, then don't pass a str_rep
+ return None
+
+ return {operator.add: '+',
+ radd: '+',
+ operator.mul: '*',
+ rmul: '*',
+ operator.sub: '-',
+ rsub: '-',
+ operator.truediv: '/',
+ rtruediv: '/',
+ operator.floordiv: '//',
+ rfloordiv: '//',
+ operator.mod: None, # TODO: Why None for mod but '%' for rmod?
+ rmod: '%',
+ operator.pow: '**',
+ rpow: '**',
+ operator.eq: '==',
+ operator.ne: '!=',
+ operator.le: '<=',
+ operator.lt: '<',
+ operator.ge: '>=',
+ operator.gt: '>',
+ operator.and_: '&',
+ rand_: '&',
+ operator.or_: '|',
+ ror_: '|',
+ operator.xor: '^',
+ rxor: '^',
+ divmod: None,
+ rdivmod: None}[op]
+
+
+def _get_op_name(op, special):
+ """
+ Find the name to attach to this method according to conventions
+ for special and non-special methods.
+
+ Parameters
+ ----------
+ op : binary operator
+ special : bool
+
+ Returns
+ -------
+ op_name : str
+ """
+ opname = op.__name__.strip('_')
+ if special:
+ opname = '__{opname}__'.format(opname=opname)
+ return opname
+
+
+# -----------------------------------------------------------------------------
+# Docstring Generation and Templates
+
+_op_descriptions = {
+ # Arithmetic Operators
+ 'add': {'op': '+',
+ 'desc': 'Addition',
+ 'reverse': 'radd'},
+ 'sub': {'op': '-',
+ 'desc': 'Subtraction',
+ 'reverse': 'rsub'},
+ 'mul': {'op': '*',
+ 'desc': 'Multiplication',
+ 'reverse': 'rmul',
+ 'df_examples': None},
+ 'mod': {'op': '%',
+ 'desc': 'Modulo',
+ 'reverse': 'rmod'},
+ 'pow': {'op': '**',
+ 'desc': 'Exponential power',
+ 'reverse': 'rpow',
+ 'df_examples': None},
+ 'truediv': {'op': '/',
+ 'desc': 'Floating division',
+ 'reverse': 'rtruediv',
+ 'df_examples': None},
+ 'floordiv': {'op': '//',
+ 'desc': 'Integer division',
+ 'reverse': 'rfloordiv',
+ 'df_examples': None},
+ 'divmod': {'op': 'divmod',
+ 'desc': 'Integer division and modulo',
+ 'reverse': 'rdivmod',
+ 'df_examples': None},
+
+ # Comparison Operators
+ 'eq': {'op': '==',
+ 'desc': 'Equal to',
+ 'reverse': None},
+ 'ne': {'op': '!=',
+ 'desc': 'Not equal to',
+ 'reverse': None},
+ 'lt': {'op': '<',
+ 'desc': 'Less than',
+ 'reverse': None},
+ 'le': {'op': '<=',
+ 'desc': 'Less than or equal to',
+ 'reverse': None},
+ 'gt': {'op': '>',
+ 'desc': 'Greater than',
+ 'reverse': None},
+ 'ge': {'op': '>=',
+ 'desc': 'Greater than or equal to',
+ 'reverse': None}
+}
+
+_op_names = list(_op_descriptions.keys())
+for key in _op_names:
+ _op_descriptions[key]['reversed'] = False
+ reverse_op = _op_descriptions[key]['reverse']
+ if reverse_op is not None:
+ _op_descriptions[reverse_op] = _op_descriptions[key].copy()
+ _op_descriptions[reverse_op]['reversed'] = True
+ _op_descriptions[reverse_op]['reverse'] = key
+
+_flex_doc_SERIES = """
+{desc} of series and other, element-wise (binary operator `{op_name}`).
+
+Equivalent to ``{equiv}``, but with support to substitute a fill_value for
+missing data in one of the inputs.
+
+Parameters
+----------
+other : Series or scalar value
+fill_value : None or float value, default None (NaN)
+ Fill existing missing (NaN) values, and any new element needed for
+ successful Series alignment, with this value before computation.
+ If data in both corresponding Series locations is missing
+ the result will be missing
+level : int or name
+ Broadcast across a level, matching Index values on the
+ passed MultiIndex level
+
+Returns
+-------
+result : Series
+
+See Also
+--------
+Series.{reverse}
+
+Examples
+--------
+>>> a = pd.Series([1, 1, 1, np.nan], index=['a', 'b', 'c', 'd'])
+>>> a
+a 1.0
+b 1.0
+c 1.0
+d NaN
+dtype: float64
+>>> b = pd.Series([1, np.nan, 1, np.nan], index=['a', 'b', 'd', 'e'])
+>>> b
+a 1.0
+b NaN
+d 1.0
+e NaN
+dtype: float64
+>>> a.add(b, fill_value=0)
+a 2.0
+b 1.0
+c 1.0
+d 1.0
+e NaN
+dtype: float64
+"""
+
+_arith_doc_FRAME = """
+Binary operator %s with support to substitute a fill_value for missing data in
+one of the inputs
+
+Parameters
+----------
+other : Series, DataFrame, or constant
+axis : {0, 1, 'index', 'columns'}
+ For Series input, axis to match Series index on
+fill_value : None or float value, default None
+ Fill existing missing (NaN) values, and any new element needed for
+ successful DataFrame alignment, with this value before computation.
+ If data in both corresponding DataFrame locations is missing
+ the result will be missing
+level : int or name
+ Broadcast across a level, matching Index values on the
+ passed MultiIndex level
+
+Returns
+-------
+result : DataFrame
+
+Notes
+-----
+Mismatched indices will be unioned together
+"""
+
+_flex_doc_FRAME = """
+{desc} of dataframe and other, element-wise (binary operator `{op_name}`).
+
+Equivalent to ``{equiv}``, but with support to substitute a fill_value
+for missing data in one of the inputs. With reverse version, `{reverse}`.
+
+Among flexible wrappers (`add`, `sub`, `mul`, `div`, `mod`, `pow`) to
+arithmetic operators: `+`, `-`, `*`, `/`, `//`, `%`, `**`.
+
+Parameters
+----------
+other : scalar, sequence, Series, or DataFrame
+ Any single or multiple element data structure, or list-like object.
+axis : {{0 or 'index', 1 or 'columns'}}
+ Whether to compare by the index (0 or 'index') or columns
+ (1 or 'columns'). For Series input, axis to match Series index on.
+level : int or label
+ Broadcast across a level, matching Index values on the
+ passed MultiIndex level.
+fill_value : float or None, default None
+ Fill existing missing (NaN) values, and any new element needed for
+ successful DataFrame alignment, with this value before computation.
+ If data in both corresponding DataFrame locations is missing
+ the result will be missing.
+
+Returns
+-------
+DataFrame
+ Result of the arithmetic operation.
+
+See Also
+--------
+DataFrame.add : Add DataFrames.
+DataFrame.sub : Subtract DataFrames.
+DataFrame.mul : Multiply DataFrames.
+DataFrame.div : Divide DataFrames (float division).
+DataFrame.truediv : Divide DataFrames (float division).
+DataFrame.floordiv : Divide DataFrames (integer division).
+DataFrame.mod : Calculate modulo (remainder after division).
+DataFrame.pow : Calculate exponential power.
+
+Notes
+-----
+Mismatched indices will be unioned together.
+
+Examples
+--------
+>>> df = pd.DataFrame({{'angles': [0, 3, 4],
+... 'degrees': [360, 180, 360]}},
+... index=['circle', 'triangle', 'rectangle'])
+>>> df
+ angles degrees
+circle 0 360
+triangle 3 180
+rectangle 4 360
+
+Add a scalar with the operator version, which returns the same
+results.
+
+>>> df + 1
+ angles degrees
+circle 1 361
+triangle 4 181
+rectangle 5 361
+
+>>> df.add(1)
+ angles degrees
+circle 1 361
+triangle 4 181
+rectangle 5 361
+
+Divide by a constant with the reverse version.
+
+>>> df.div(10)
+ angles degrees
+circle 0.0 36.0
+triangle 0.3 18.0
+rectangle 0.4 36.0
+
+>>> df.rdiv(10)
+ angles degrees
+circle inf 0.027778
+triangle 3.333333 0.055556
+rectangle 2.500000 0.027778
+
+Subtract a list and a Series by axis with the operator version.
+
+>>> df - [1, 2]
+ angles degrees
+circle -1 358
+triangle 2 178
+rectangle 3 358
+
+>>> df.sub([1, 2], axis='columns')
+ angles degrees
+circle -1 358
+triangle 2 178
+rectangle 3 358
+
+>>> df.sub(pd.Series([1, 1, 1], index=['circle', 'triangle', 'rectangle']),
+... axis='index')
+ angles degrees
+circle -1 359
+triangle 2 179
+rectangle 3 359
+
+Multiply by a DataFrame of different shape with the operator version.
+
+>>> other = pd.DataFrame({{'angles': [0, 3, 4]}},
+... index=['circle', 'triangle', 'rectangle'])
+>>> other
+ angles
+circle 0
+triangle 3
+rectangle 4
+
+>>> df * other
+ angles degrees
+circle 0 NaN
+triangle 9 NaN
+rectangle 16 NaN
+
+>>> df.mul(other, fill_value=0)
+ angles degrees
+circle 0 0.0
+triangle 9 0.0
+rectangle 16 0.0
+
+Divide by a DataFrame with a MultiIndex, by level.
+
+>>> df_multindex = pd.DataFrame({{'angles': [0, 3, 4, 4, 5, 6],
+... 'degrees': [360, 180, 360, 360, 540, 720]}},
+... index=[['A', 'A', 'A', 'B', 'B', 'B'],
+... ['circle', 'triangle', 'rectangle',
+... 'square', 'pentagon', 'hexagon']])
+>>> df_multindex
+ angles degrees
+A circle 0 360
+ triangle 3 180
+ rectangle 4 360
+B square 4 360
+ pentagon 5 540
+ hexagon 6 720
+
+>>> df.div(df_multindex, level=1, fill_value=0)
+ angles degrees
+A circle NaN 1.0
+ triangle 1.0 1.0
+ rectangle 1.0 1.0
+B square 0.0 0.0
+ pentagon 0.0 0.0
+ hexagon 0.0 0.0
+"""
+
+_flex_comp_doc_FRAME = """
+{desc} of dataframe and other, element-wise (binary operator `{op_name}`).
+
+Among flexible wrappers (`eq`, `ne`, `le`, `lt`, `ge`, `gt`) to comparison
+operators.
+
+Equivalent to `==`, `!=`, `<=`, `<`, `>=`, `>` with support to choose axis
+(rows or columns) and level for comparison.
+
+Parameters
+----------
+other : scalar, sequence, Series, or DataFrame
+ Any single or multiple element data structure, or list-like object.
+axis : {{0 or 'index', 1 or 'columns'}}, default 'columns'
+ Whether to compare by the index (0 or 'index') or columns
+ (1 or 'columns').
+level : int or label
+ Broadcast across a level, matching Index values on the passed
+ MultiIndex level.
+
+Returns
+-------
+DataFrame of bool
+ Result of the comparison.
+
+See Also
+--------
+DataFrame.eq : Compare DataFrames for equality elementwise.
+DataFrame.ne : Compare DataFrames for inequality elementwise.
+DataFrame.le : Compare DataFrames for less than inequality
+ or equality elementwise.
+DataFrame.lt : Compare DataFrames for strictly less than
+ inequality elementwise.
+DataFrame.ge : Compare DataFrames for greater than inequality
+ or equality elementwise.
+DataFrame.gt : Compare DataFrames for strictly greater than
+ inequality elementwise.
+
+Notes
+-----
+Mismatched indices will be unioned together.
+`NaN` values are considered different (i.e. `NaN` != `NaN`).
+
+Examples
+--------
+>>> df = pd.DataFrame({{'cost': [250, 150, 100],
+... 'revenue': [100, 250, 300]}},
+... index=['A', 'B', 'C'])
+>>> df
+ cost revenue
+A 250 100
+B 150 250
+C 100 300
+
+Comparison with a scalar, using either the operator or method:
+
+>>> df == 100
+ cost revenue
+A False True
+B False False
+C True False
+
+>>> df.eq(100)
+ cost revenue
+A False True
+B False False
+C True False
+
+When `other` is a :class:`Series`, the columns of a DataFrame are aligned
+with the index of `other` and broadcast:
+
+>>> df != pd.Series([100, 250], index=["cost", "revenue"])
+ cost revenue
+A True True
+B True False
+C False True
+
+Use the method to control the broadcast axis:
+
+>>> df.ne(pd.Series([100, 300], index=["A", "D"]), axis='index')
+ cost revenue
+A True False
+B True True
+C True True
+D True True
+
+When comparing to an arbitrary sequence, the number of columns must
+match the number of elements in `other`:
+
+>>> df == [250, 100]
+ cost revenue
+A True True
+B False False
+C False False
+
+Use the method to control the axis:
+
+>>> df.eq([250, 250, 100], axis='index')
+ cost revenue
+A True False
+B False True
+C True False
+
+Compare to a DataFrame of different shape.
+
+>>> other = pd.DataFrame({{'revenue': [300, 250, 100, 150]}},
+... index=['A', 'B', 'C', 'D'])
+>>> other
+ revenue
+A 300
+B 250
+C 100
+D 150
+
+>>> df.gt(other)
+ cost revenue
+A False False
+B False False
+C False True
+D False False
+
+Compare to a DataFrame with a MultiIndex, by level.
+
+>>> df_multindex = pd.DataFrame({{'cost': [250, 150, 100, 150, 300, 220],
+... 'revenue': [100, 250, 300, 200, 175, 225]}},
+... index=[['Q1', 'Q1', 'Q1', 'Q2', 'Q2', 'Q2'],
+... ['A', 'B', 'C', 'A', 'B', 'C']])
+>>> df_multindex
+ cost revenue
+Q1 A 250 100
+ B 150 250
+ C 100 300
+Q2 A 150 200
+ B 300 175
+ C 220 225
+
+>>> df.le(df_multindex, level=1)
+ cost revenue
+Q1 A True True
+ B True True
+ C True True
+Q2 A False True
+ B True False
+ C True False
+"""
+
+_flex_doc_PANEL = """
+{desc} of series and other, element-wise (binary operator `{op_name}`).
+Equivalent to ``{equiv}``.
+
+Parameters
+----------
+other : DataFrame or Panel
+axis : {{items, major_axis, minor_axis}}
+ Axis to broadcast over
+
+Returns
+-------
+Panel
+
+See Also
+--------
+Panel.{reverse}
+"""
+
+
+_agg_doc_PANEL = """
+Wrapper method for {op_name}
+
+Parameters
+----------
+other : DataFrame or Panel
+axis : {{items, major_axis, minor_axis}}
+ Axis to broadcast over
+
+Returns
+-------
+Panel
+"""
+
+
+def _make_flex_doc(op_name, typ):
+ """
+    Make the appropriate substitutions for the given operation and class
+    ``typ`` into one of the _flex_doc_* templates and return the docstring
+    to attach to a generated method.
+
+ Parameters
+ ----------
+ op_name : str {'__add__', '__sub__', ... '__eq__', '__ne__', ...}
+    typ : str {'series', 'dataframe', 'panel'}
+
+ Returns
+ -------
+ doc : str
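+
+    Examples
+    --------
+    Illustrative doctest:
+
+    >>> doc = _make_flex_doc('add', 'series')
+    >>> doc.splitlines()[1]
+    'Addition of series and other, element-wise (binary operator `add`).'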
+ """
+ op_name = op_name.replace('__', '')
+ op_desc = _op_descriptions[op_name]
+
+ if op_desc['reversed']:
+ equiv = 'other ' + op_desc['op'] + ' ' + typ
+ else:
+ equiv = typ + ' ' + op_desc['op'] + ' other'
+
+ if typ == 'series':
+ base_doc = _flex_doc_SERIES
+ doc = base_doc.format(desc=op_desc['desc'], op_name=op_name,
+ equiv=equiv, reverse=op_desc['reverse'])
+ elif typ == 'dataframe':
+ base_doc = _flex_doc_FRAME
+ doc = base_doc.format(desc=op_desc['desc'], op_name=op_name,
+ equiv=equiv, reverse=op_desc['reverse'])
+ elif typ == 'panel':
+ base_doc = _flex_doc_PANEL
+ doc = base_doc.format(desc=op_desc['desc'], op_name=op_name,
+ equiv=equiv, reverse=op_desc['reverse'])
+ else:
+ raise AssertionError('Invalid typ argument.')
+ return doc
+
+
+# -----------------------------------------------------------------------------
+# Masking NA values and fallbacks for operations numpy does not support
+
+def fill_binop(left, right, fill_value):
+ """
+ If a non-None fill_value is given, replace null entries in left and right
+ with this value, but only in positions where _one_ of left/right is null,
+ not both.
+
+ Parameters
+ ----------
+ left : array-like
+ right : array-like
+ fill_value : object
+
+ Returns
+ -------
+ left : array-like
+ right : array-like
+
+ Notes
+ -----
+ Makes copies if fill_value is not None
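+
+    Examples
+    --------
+    Illustrative sketch with arbitrary values:
+
+    >>> left = pd.Series([1.0, np.nan, 3.0])
+    >>> right = pd.Series([np.nan, np.nan, 30.0])
+    >>> left, right = fill_binop(left, right, fill_value=0)
+    >>> left.tolist()
+    [1.0, nan, 3.0]
+    >>> right.tolist()  # only position 0, null on just one side, is filled
+    [0.0, nan, 30.0]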
+ """
+ # TODO: can we make a no-copy implementation?
+ if fill_value is not None:
+ left_mask = isna(left)
+ right_mask = isna(right)
+ left = left.copy()
+ right = right.copy()
+
+ # one but not both
+ mask = left_mask ^ right_mask
+ left[left_mask & mask] = fill_value
+ right[right_mask & mask] = fill_value
+ return left, right
+
+
+def mask_cmp_op(x, y, op, allowed_types):
+ """
+ Apply the function `op` to only non-null points in x and y.
+
+ Parameters
+ ----------
+ x : array-like
+ y : array-like
+ op : binary operation
+ allowed_types : class or tuple of classes
+
+ Returns
+ -------
+ result : ndarray[bool]
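+
+    Examples
+    --------
+    Illustrative sketch with arbitrary values:
+
+    >>> x = np.array([1.0, np.nan, 3.0])
+    >>> mask_cmp_op(x, 2.0, operator.gt, (np.ndarray,)).tolist()
+    [False, False, True]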
+ """
+ # TODO: Can we make the allowed_types arg unnecessary?
+ xrav = x.ravel()
+ result = np.empty(x.size, dtype=bool)
+ if isinstance(y, allowed_types):
+ yrav = y.ravel()
+ mask = notna(xrav) & notna(yrav)
+ result[mask] = op(np.array(list(xrav[mask])),
+ np.array(list(yrav[mask])))
+ else:
+ mask = notna(xrav)
+ result[mask] = op(np.array(list(xrav[mask])), y)
+
+ if op == operator.ne: # pragma: no cover
+ np.putmask(result, ~mask, True)
+ else:
+ np.putmask(result, ~mask, False)
+ result = result.reshape(x.shape)
+ return result
+
+
+def masked_arith_op(x, y, op):
+ """
+ If the given arithmetic operation fails, attempt it again on
+ only the non-null elements of the input array(s).
+
+ Parameters
+ ----------
+ x : np.ndarray
+ y : np.ndarray, Series, Index
+ op : binary operator
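+
+    Examples
+    --------
+    Illustrative sketch with arbitrary values:
+
+    >>> x = np.array([1.0, np.nan, 4.0])
+    >>> y = np.array([2.0, 2.0, 0.0])
+    >>> masked_arith_op(x, y, operator.truediv).tolist()
+    [0.5, nan, inf]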
+ """
+ # For Series `x` is 1D so ravel() is a no-op; calling it anyway makes
+ # the logic valid for both Series and DataFrame ops.
+ xrav = x.ravel()
+ assert isinstance(x, (np.ndarray, ABCSeries)), type(x)
+ if isinstance(y, (np.ndarray, ABCSeries, ABCIndexClass)):
+ dtype = find_common_type([x.dtype, y.dtype])
+ result = np.empty(x.size, dtype=dtype)
+
+ # PeriodIndex.ravel() returns int64 dtype, so we have
+ # to work around that case. See GH#19956
+ yrav = y if is_period_dtype(y) else y.ravel()
+ mask = notna(xrav) & notna(yrav)
+
+ if yrav.shape != mask.shape:
+ # FIXME: GH#5284, GH#5035, GH#19448
+ # Without specifically raising here we get mismatched
+ # errors in Py3 (TypeError) vs Py2 (ValueError)
+            # Note: Only an issue in the DataFrame case
+ raise ValueError('Cannot broadcast operands together.')
+
+ if mask.any():
+ with np.errstate(all='ignore'):
+ result[mask] = op(xrav[mask],
+ com.values_from_object(yrav[mask]))
+
+ else:
+ assert is_scalar(y), type(y)
+ assert isinstance(x, np.ndarray), type(x)
+ # mask is only meaningful for x
+ result = np.empty(x.size, dtype=x.dtype)
+ mask = notna(xrav)
+
+ # 1 ** np.nan is 1. So we have to unmask those.
+ if op == pow:
+ mask = np.where(x == 1, False, mask)
+ elif op == rpow:
+ mask = np.where(y == 1, False, mask)
+
+ if mask.any():
+ with np.errstate(all='ignore'):
+ result[mask] = op(xrav[mask], y)
+
+ result, changed = maybe_upcast_putmask(result, ~mask, np.nan)
+ result = result.reshape(x.shape) # 2D compat
+ return result
+
+
+def invalid_comparison(left, right, op):
+ """
+ If a comparison has mismatched types and is not necessarily meaningful,
+ follow python3 conventions by:
+
+ - returning all-False for equality
+ - returning all-True for inequality
+ - raising TypeError otherwise
+
+ Parameters
+ ----------
+ left : array-like
+ right : scalar, array-like
+ op : operator.{eq, ne, lt, le, gt}
+
+ Raises
+ ------
+ TypeError : on inequality comparisons
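+
+    Examples
+    --------
+    Illustrative sketch:
+
+    >>> arr = np.arange(3)
+    >>> invalid_comparison(arr, 'foo', operator.eq).tolist()
+    [False, False, False]
+    >>> invalid_comparison(arr, 'foo', operator.ne).tolist()
+    [True, True, True]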
+ """
+ if op is operator.eq:
+ res_values = np.zeros(left.shape, dtype=bool)
+ elif op is operator.ne:
+ res_values = np.ones(left.shape, dtype=bool)
+ else:
+ raise TypeError("Invalid comparison between dtype={dtype} and {typ}"
+ .format(dtype=left.dtype, typ=type(right).__name__))
+ return res_values
+
+
+# -----------------------------------------------------------------------------
+# Dispatch logic
+
+def should_series_dispatch(left, right, op):
+ """
+ Identify cases where a DataFrame operation should dispatch to its
+ Series counterpart.
+
+ Parameters
+ ----------
+ left : DataFrame
+ right : DataFrame
+ op : binary operator
+
+ Returns
+ -------
+ override : bool
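+
+    Examples
+    --------
+    Illustrative sketch; ``df1`` holds mixed dtypes while ``df2`` is
+    homogeneous, so only the first pair dispatches to Series ops.
+
+    >>> df1 = pd.DataFrame({'A': [1, 2], 'B': ['x', 'y']})
+    >>> df2 = pd.DataFrame({'A': [1, 2], 'B': [3, 4]})
+    >>> should_series_dispatch(df1, df2, operator.add)
+    True
+    >>> should_series_dispatch(df2, df2, operator.add)
+    False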
+ """
+ if left._is_mixed_type or right._is_mixed_type:
+ return True
+
+ if not len(left.columns) or not len(right.columns):
+ # ensure obj.dtypes[0] exists for each obj
+ return False
+
+ ldtype = left.dtypes.iloc[0]
+ rdtype = right.dtypes.iloc[0]
+
+ if ((is_timedelta64_dtype(ldtype) and is_integer_dtype(rdtype)) or
+ (is_timedelta64_dtype(rdtype) and is_integer_dtype(ldtype))):
+        # numpy casts integer dtypes to timedelta64 in this scenario
+ return True
+
+ if is_datetime64_dtype(ldtype) and is_object_dtype(rdtype):
+ # in particular case where right is an array of DateOffsets
+ return True
+
+ return False
+
+
+def dispatch_to_series(left, right, func, str_rep=None, axis=None):
+ """
+ Evaluate the frame operation func(left, right) by evaluating
+ column-by-column, dispatching to the Series implementation.
+
+ Parameters
+ ----------
+ left : DataFrame
+ right : scalar or DataFrame
+ func : arithmetic or comparison operator
+ str_rep : str or None, default None
+ axis : {None, 0, 1, "index", "columns"}
+
+ Returns
+ -------
+ DataFrame
+ """
+ # Note: we use iloc to access columns for compat with cases
+ # with non-unique columns.
+ import pandas.core.computation.expressions as expressions
+
+ right = lib.item_from_zerodim(right)
+ if lib.is_scalar(right) or np.ndim(right) == 0:
+
+ def column_op(a, b):
+ return {i: func(a.iloc[:, i], b)
+ for i in range(len(a.columns))}
+
+ elif isinstance(right, ABCDataFrame):
+ assert right._indexed_same(left)
+
+ def column_op(a, b):
+ return {i: func(a.iloc[:, i], b.iloc[:, i])
+ for i in range(len(a.columns))}
+
+ elif isinstance(right, ABCSeries) and axis == "columns":
+ # We only get here if called via left._combine_match_columns,
+ # in which case we specifically want to operate row-by-row
+ assert right.index.equals(left.columns)
+
+ def column_op(a, b):
+ return {i: func(a.iloc[:, i], b.iloc[i])
+ for i in range(len(a.columns))}
+
+ elif isinstance(right, ABCSeries):
+ assert right.index.equals(left.index) # Handle other cases later
+
+ def column_op(a, b):
+ return {i: func(a.iloc[:, i], b)
+ for i in range(len(a.columns))}
+
+ else:
+ # Remaining cases have less-obvious dispatch rules
+ raise NotImplementedError(right)
+
+ new_data = expressions.evaluate(column_op, str_rep, left, right)
+
+ result = left._constructor(new_data, index=left.index, copy=False)
+ # Pin columns instead of passing to constructor for compat with
+ # non-unique columns case
+ result.columns = left.columns
+ return result
+
+
+def dispatch_to_index_op(op, left, right, index_class):
+ """
+ Wrap Series left in the given index_class to delegate the operation op
+ to the index implementation. DatetimeIndex and TimedeltaIndex perform
+ type checking, timezone handling, overflow checks, etc.
+
+ Parameters
+ ----------
+ op : binary operator (operator.add, operator.sub, ...)
+ left : Series
+ right : object
+ index_class : DatetimeIndex or TimedeltaIndex
+
+ Returns
+ -------
+ result : object, usually DatetimeIndex, TimedeltaIndex, or Series
+ """
+ left_idx = index_class(left)
+
+ # avoid accidentally allowing integer add/sub. For datetime64[tz] dtypes,
+ # left_idx may inherit a freq from a cached DatetimeIndex.
+ # See discussion in GH#19147.
+ if getattr(left_idx, 'freq', None) is not None:
+ left_idx = left_idx._shallow_copy(freq=None)
+ try:
+ result = op(left_idx, right)
+ except NullFrequencyError:
+ # DatetimeIndex and TimedeltaIndex with freq == None raise ValueError
+ # on add/sub of integers (or int-like). We re-raise as a TypeError.
+ raise TypeError('incompatible type for a datetime/timedelta '
+ 'operation [{name}]'.format(name=op.__name__))
+ return result
+
+
+def dispatch_to_extension_op(op, left, right):
+ """
+ Assume that left or right is a Series backed by an ExtensionArray,
+ apply the operator defined by op.
+ """
+
+ # The op calls will raise TypeError if the op is not defined
+ # on the ExtensionArray
+
+ # unbox Series and Index to arrays
+ if isinstance(left, (ABCSeries, ABCIndexClass)):
+ new_left = left._values
+ else:
+ new_left = left
+
+ if isinstance(right, (ABCSeries, ABCIndexClass)):
+ new_right = right._values
+ else:
+ new_right = right
+
+ res_values = op(new_left, new_right)
+ res_name = get_op_result_name(left, right)
+
+ if op.__name__ in ['divmod', 'rdivmod']:
+ return _construct_divmod_result(
+ left, res_values, left.index, res_name)
+
+ return _construct_result(left, res_values, left.index, res_name)
+
+
+# -----------------------------------------------------------------------------
+# Functions that add arithmetic methods to objects, given arithmetic factory
+# methods
+
+def _get_method_wrappers(cls):
+ """
+ Find the appropriate operation-wrappers to use when defining flex/special
+ arithmetic, boolean, and comparison operations with the given class.
+
+ Parameters
+ ----------
+ cls : class
+
+ Returns
+ -------
+ arith_flex : function or None
+ comp_flex : function or None
+ arith_special : function
+ comp_special : function
+ bool_special : function
+
+ Notes
+ -----
+ None is only returned for SparseArray
+ """
+ if issubclass(cls, ABCSparseSeries):
+ # Be sure to catch this before ABCSeries and ABCSparseArray,
+        # as they will both see SparseSeries as a subclass
+ arith_flex = _flex_method_SERIES
+ comp_flex = _flex_method_SERIES
+ arith_special = _arith_method_SPARSE_SERIES
+ comp_special = _arith_method_SPARSE_SERIES
+ bool_special = _bool_method_SERIES
+ # TODO: I don't think the functions defined by bool_method are tested
+ elif issubclass(cls, ABCSeries):
+ # Just Series; SparseSeries is caught above
+ arith_flex = _flex_method_SERIES
+ comp_flex = _flex_method_SERIES
+ arith_special = _arith_method_SERIES
+ comp_special = _comp_method_SERIES
+ bool_special = _bool_method_SERIES
+ elif issubclass(cls, ABCSparseArray):
+ arith_flex = None
+ comp_flex = None
+ arith_special = _arith_method_SPARSE_ARRAY
+ comp_special = _arith_method_SPARSE_ARRAY
+ bool_special = _arith_method_SPARSE_ARRAY
+ elif issubclass(cls, ABCPanel):
+ arith_flex = _flex_method_PANEL
+ comp_flex = _comp_method_PANEL
+ arith_special = _arith_method_PANEL
+ comp_special = _comp_method_PANEL
+ bool_special = _arith_method_PANEL
+ elif issubclass(cls, ABCDataFrame):
+ # Same for DataFrame and SparseDataFrame
+ arith_flex = _arith_method_FRAME
+ comp_flex = _flex_comp_method_FRAME
+ arith_special = _arith_method_FRAME
+ comp_special = _comp_method_FRAME
+ bool_special = _arith_method_FRAME
+ return arith_flex, comp_flex, arith_special, comp_special, bool_special
+
+
+def _create_methods(cls, arith_method, comp_method, bool_method, special):
+ # creates actual methods based upon arithmetic, comp and bool method
+ # constructors.
+
+ have_divmod = issubclass(cls, ABCSeries)
+ # divmod is available for Series and SparseSeries
+
+ # yapf: disable
+ new_methods = dict(
+ add=arith_method(cls, operator.add, special),
+ radd=arith_method(cls, radd, special),
+ sub=arith_method(cls, operator.sub, special),
+ mul=arith_method(cls, operator.mul, special),
+ truediv=arith_method(cls, operator.truediv, special),
+ floordiv=arith_method(cls, operator.floordiv, special),
+ # Causes a floating point exception in the tests when numexpr enabled,
+ # so for now no speedup
+ mod=arith_method(cls, operator.mod, special),
+ pow=arith_method(cls, operator.pow, special),
+ # not entirely sure why this is necessary, but previously was included
+ # so it's here to maintain compatibility
+ rmul=arith_method(cls, rmul, special),
+ rsub=arith_method(cls, rsub, special),
+ rtruediv=arith_method(cls, rtruediv, special),
+ rfloordiv=arith_method(cls, rfloordiv, special),
+ rpow=arith_method(cls, rpow, special),
+ rmod=arith_method(cls, rmod, special))
+ # yapf: enable
+ new_methods['div'] = new_methods['truediv']
+ new_methods['rdiv'] = new_methods['rtruediv']
+ if have_divmod:
+ # divmod doesn't have an op that is supported by numexpr
+ new_methods['divmod'] = arith_method(cls, divmod, special)
+ new_methods['rdivmod'] = arith_method(cls, rdivmod, special)
+
+ new_methods.update(dict(
+ eq=comp_method(cls, operator.eq, special),
+ ne=comp_method(cls, operator.ne, special),
+ lt=comp_method(cls, operator.lt, special),
+ gt=comp_method(cls, operator.gt, special),
+ le=comp_method(cls, operator.le, special),
+ ge=comp_method(cls, operator.ge, special)))
+
+ if bool_method:
+ new_methods.update(
+ dict(and_=bool_method(cls, operator.and_, special),
+ or_=bool_method(cls, operator.or_, special),
+ # For some reason ``^`` wasn't used in original.
+ xor=bool_method(cls, operator.xor, special),
+ rand_=bool_method(cls, rand_, special),
+ ror_=bool_method(cls, ror_, special),
+ rxor=bool_method(cls, rxor, special)))
+
+ if special:
+ dunderize = lambda x: '__{name}__'.format(name=x.strip('_'))
+ else:
+ dunderize = lambda x: x
+ new_methods = {dunderize(k): v for k, v in new_methods.items()}
+ return new_methods
+
+
+def add_methods(cls, new_methods):
+ for name, method in new_methods.items():
+ # For most methods, if we find that the class already has a method
+ # of the same name, it is OK to over-write it. The exception is
+ # inplace methods (__iadd__, __isub__, ...) for SparseArray, which
+ # retain the np.ndarray versions.
+ force = not (issubclass(cls, ABCSparseArray) and
+ name.startswith('__i'))
+ if force or name not in cls.__dict__:
+ bind_method(cls, name, method)
+
+
+# ----------------------------------------------------------------------
+# Arithmetic
+def add_special_arithmetic_methods(cls):
+ """
+ Adds the full suite of special arithmetic methods (``__add__``,
+ ``__sub__``, etc.) to the class.
+
+ Parameters
+ ----------
+ cls : class
+ special methods will be defined and pinned to this class
+ """
+ _, _, arith_method, comp_method, bool_method = _get_method_wrappers(cls)
+ new_methods = _create_methods(cls, arith_method, comp_method, bool_method,
+ special=True)
+    # inplace operators (I feel like these should get passed an `inplace=True`
+    # argument or just be removed)
+
+ def _wrap_inplace_method(method):
+ """
+ return an inplace wrapper for this method
+ """
+
+ def f(self, other):
+ result = method(self, other)
+
+ # this makes sure that we are aligned like the input
+ # we are updating inplace so we want to ignore is_copy
+ self._update_inplace(result.reindex_like(self, copy=False)._data,
+ verify_is_copy=False)
+
+ return self
+
+ f.__name__ = "__i{name}__".format(name=method.__name__.strip("__"))
+ return f
+
+ new_methods.update(
+ dict(__iadd__=_wrap_inplace_method(new_methods["__add__"]),
+ __isub__=_wrap_inplace_method(new_methods["__sub__"]),
+ __imul__=_wrap_inplace_method(new_methods["__mul__"]),
+ __itruediv__=_wrap_inplace_method(new_methods["__truediv__"]),
+ __ifloordiv__=_wrap_inplace_method(new_methods["__floordiv__"]),
+ __imod__=_wrap_inplace_method(new_methods["__mod__"]),
+ __ipow__=_wrap_inplace_method(new_methods["__pow__"])))
+ if not compat.PY3:
+ new_methods["__idiv__"] = _wrap_inplace_method(new_methods["__div__"])
+
+ new_methods.update(
+ dict(__iand__=_wrap_inplace_method(new_methods["__and__"]),
+ __ior__=_wrap_inplace_method(new_methods["__or__"]),
+ __ixor__=_wrap_inplace_method(new_methods["__xor__"])))
+
+ add_methods(cls, new_methods=new_methods)
+
+
+def add_flex_arithmetic_methods(cls):
+ """
+ Adds the full suite of flex arithmetic methods (``pow``, ``mul``, ``add``)
+ to the class.
+
+ Parameters
+ ----------
+ cls : class
+ flex methods will be defined and pinned to this class
+ """
+ flex_arith_method, flex_comp_method, _, _, _ = _get_method_wrappers(cls)
+ new_methods = _create_methods(cls, flex_arith_method,
+ flex_comp_method, bool_method=None,
+ special=False)
+ new_methods.update(dict(multiply=new_methods['mul'],
+ subtract=new_methods['sub'],
+ divide=new_methods['div']))
+ # opt out of bool flex methods for now
+ assert not any(kname in new_methods for kname in ('ror_', 'rxor', 'rand_'))
+
+ add_methods(cls, new_methods=new_methods)
+
+
+# -----------------------------------------------------------------------------
+# Series
+
+def _align_method_SERIES(left, right, align_asobject=False):
+ """ align lhs and rhs Series """
+
+    # TODO: Unlike _align_method_FRAME, list, tuple and ndarray are not
+    # coerced here because Series has inconsistencies described in #13637
+
+ if isinstance(right, ABCSeries):
+ # avoid repeated alignment
+ if not left.index.equals(right.index):
+
+ if align_asobject:
+ # to keep original value's dtype for bool ops
+ left = left.astype(object)
+ right = right.astype(object)
+
+ left, right = left.align(right, copy=False)
+
+ return left, right
+
+
+def _construct_result(left, result, index, name, dtype=None):
+ """
+ If the raw op result has a non-None name (e.g. it is an Index object) and
+ the name argument is None, then passing name to the constructor will
+ not be enough; we still need to override the name attribute.
+ """
+ out = left._constructor(result, index=index, dtype=dtype)
+
+ out.name = name
+ return out
+
+
+def _construct_divmod_result(left, result, index, name, dtype=None):
+ """divmod returns a tuple of like indexed series instead of a single series.
+ """
+ constructor = left._constructor
+ return (
+ constructor(result[0], index=index, name=name, dtype=dtype),
+ constructor(result[1], index=index, name=name, dtype=dtype),
+ )
+
+
+def _arith_method_SERIES(cls, op, special):
+ """
+ Wrapper function for Series arithmetic operations, to avoid
+ code duplication.
+ """
+ str_rep = _get_opstr(op, cls)
+ op_name = _get_op_name(op, special)
+ eval_kwargs = _gen_eval_kwargs(op_name)
+ fill_zeros = _gen_fill_zeros(op_name)
+ construct_result = (_construct_divmod_result
+ if op in [divmod, rdivmod] else _construct_result)
+
+ def na_op(x, y):
+ import pandas.core.computation.expressions as expressions
+ try:
+ result = expressions.evaluate(op, str_rep, x, y, **eval_kwargs)
+ except TypeError:
+ result = masked_arith_op(x, y, op)
+
+ result = missing.fill_zeros(result, x, y, op_name, fill_zeros)
+ return result
+
+ def safe_na_op(lvalues, rvalues):
+ """
+        Return the result of evaluating na_op on the passed-in values.
+
+        Try coercion to object dtype if the native types are not compatible.
+
+ Parameters
+ ----------
+ lvalues : array-like
+ rvalues : array-like
+
+ Raises
+ ------
+ TypeError: invalid operation
+ """
+ try:
+ with np.errstate(all='ignore'):
+ return na_op(lvalues, rvalues)
+ except Exception:
+ if is_object_dtype(lvalues):
+ return libalgos.arrmap_object(lvalues,
+ lambda x: op(x, rvalues))
+ raise
+
+ def wrapper(left, right):
+ if isinstance(right, ABCDataFrame):
+ return NotImplemented
+
+ left, right = _align_method_SERIES(left, right)
+ res_name = get_op_result_name(left, right)
+ right = maybe_upcast_for_op(right)
+
+ if is_categorical_dtype(left):
+ raise TypeError("{typ} cannot perform the operation "
+ "{op}".format(typ=type(left).__name__, op=str_rep))
+
+ elif is_datetime64_dtype(left) or is_datetime64tz_dtype(left):
+ # Give dispatch_to_index_op a chance for tests like
+ # test_dt64_series_add_intlike, which the index dispatching handles
+ # specifically.
+ result = dispatch_to_index_op(op, left, right, pd.DatetimeIndex)
+ return construct_result(left, result,
+ index=left.index, name=res_name,
+ dtype=result.dtype)
+
+ elif (is_extension_array_dtype(left) or
+ (is_extension_array_dtype(right) and not is_scalar(right))):
+ # GH#22378 disallow scalar to exclude e.g. "category", "Int64"
+ return dispatch_to_extension_op(op, left, right)
+
+ elif is_timedelta64_dtype(left):
+ result = dispatch_to_index_op(op, left, right, pd.TimedeltaIndex)
+ return construct_result(left, result,
+ index=left.index, name=res_name)
+
+ elif is_timedelta64_dtype(right):
+ # We should only get here with non-scalar or timedelta64('NaT')
+ # values for right
+ # Note: we cannot use dispatch_to_index_op because
+ # that may incorrectly raise TypeError when we
+ # should get NullFrequencyError
+ result = op(pd.Index(left), right)
+ return construct_result(left, result,
+ index=left.index, name=res_name,
+ dtype=result.dtype)
+
+ lvalues = left.values
+ rvalues = right
+ if isinstance(rvalues, ABCSeries):
+ rvalues = rvalues.values
+
+ result = safe_na_op(lvalues, rvalues)
+ return construct_result(left, result,
+ index=left.index, name=res_name, dtype=None)
+
+ wrapper.__name__ = op_name
+ return wrapper
+
+
+def _comp_method_OBJECT_ARRAY(op, x, y):
+ if isinstance(y, list):
+ y = construct_1d_object_array_from_listlike(y)
+ if isinstance(y, (np.ndarray, ABCSeries, ABCIndex)):
+ if not is_object_dtype(y.dtype):
+ y = y.astype(np.object_)
+
+ if isinstance(y, (ABCSeries, ABCIndex)):
+ y = y.values
+
+ result = libops.vec_compare(x, y, op)
+ else:
+ result = libops.scalar_compare(x, y, op)
+ return result
+
+
+def _comp_method_SERIES(cls, op, special):
+ """
+    Wrapper function for Series comparison operations, to avoid
+    code duplication.
+ """
+ op_name = _get_op_name(op, special)
+ masker = _gen_eval_kwargs(op_name).get('masker', False)
+
+ def na_op(x, y):
+ # TODO:
+        # should have guarantees on what x, y can be type-wise
+ # Extension Dtypes are not called here
+
+ # Checking that cases that were once handled here are no longer
+ # reachable.
+ assert not (is_categorical_dtype(y) and not is_scalar(y))
+
+ if is_object_dtype(x.dtype):
+ result = _comp_method_OBJECT_ARRAY(op, x, y)
+
+ elif is_datetimelike_v_numeric(x, y):
+ return invalid_comparison(x, y, op)
+
+ else:
+
+ # we want to compare like types
+ # we only want to convert to integer like if
+ # we are not NotImplemented, otherwise
+ # we would allow datetime64 (but viewed as i8) against
+ # integer comparisons
+
+ # we have a datetime/timedelta and may need to convert
+ assert not needs_i8_conversion(x)
+ mask = None
+ if not is_scalar(y) and needs_i8_conversion(y):
+ mask = isna(x) | isna(y)
+ y = y.view('i8')
+ x = x.view('i8')
+
+ method = getattr(x, op_name, None)
+ if method is not None:
+ with np.errstate(all='ignore'):
+ result = method(y)
+ if result is NotImplemented:
+ return invalid_comparison(x, y, op)
+ else:
+ result = op(x, y)
+
+ if mask is not None and mask.any():
+ result[mask] = masker
+
+ return result
+
+ def wrapper(self, other, axis=None):
+ # Validate the axis parameter
+ if axis is not None:
+ self._get_axis_number(axis)
+
+ res_name = get_op_result_name(self, other)
+
+ if isinstance(other, list):
+ # TODO: same for tuples?
+ other = np.asarray(other)
+
+ if isinstance(other, ABCDataFrame): # pragma: no cover
+ # Defer to DataFrame implementation; fail early
+ return NotImplemented
+
+ elif isinstance(other, ABCSeries) and not self._indexed_same(other):
+ raise ValueError("Can only compare identically-labeled "
+ "Series objects")
+
+ elif is_categorical_dtype(self):
+ # Dispatch to Categorical implementation; pd.CategoricalIndex
+ # behavior is non-canonical GH#19513
+ res_values = dispatch_to_index_op(op, self, other, pd.Categorical)
+ return self._constructor(res_values, index=self.index,
+ name=res_name)
+
+ elif is_datetime64_dtype(self) or is_datetime64tz_dtype(self):
+ # Dispatch to DatetimeIndex to ensure identical
+ # Series/Index behavior
+ if (isinstance(other, datetime.date) and
+ not isinstance(other, datetime.datetime)):
+ # https://github.com/pandas-dev/pandas/issues/21152
+ # Compatibility for difference between Series comparison w/
+ # datetime and date
+ msg = (
+ "Comparing Series of datetimes with 'datetime.date'. "
+ "Currently, the 'datetime.date' is coerced to a "
+ "datetime. In the future pandas will not coerce, "
+ "and {future}. "
+ "To retain the current behavior, "
+ "convert the 'datetime.date' to a datetime with "
+ "'pd.Timestamp'."
+ )
+
+ if op in {operator.lt, operator.le, operator.gt, operator.ge}:
+ future = "a TypeError will be raised"
+ else:
+ future = (
+ "'the values will not compare equal to the "
+ "'datetime.date'"
+ )
+ msg = '\n'.join(textwrap.wrap(msg.format(future=future)))
+ warnings.warn(msg, FutureWarning, stacklevel=2)
+ other = pd.Timestamp(other)
+
+ res_values = dispatch_to_index_op(op, self, other,
+ pd.DatetimeIndex)
+
+ return self._constructor(res_values, index=self.index,
+ name=res_name)
+
+ elif is_timedelta64_dtype(self):
+ res_values = dispatch_to_index_op(op, self, other,
+ pd.TimedeltaIndex)
+ return self._constructor(res_values, index=self.index,
+ name=res_name)
+
+ elif (is_extension_array_dtype(self) or
+ (is_extension_array_dtype(other) and not is_scalar(other))):
+ # Note: the `not is_scalar(other)` condition rules out
+ # e.g. other == "category"
+ return dispatch_to_extension_op(op, self, other)
+
+ elif isinstance(other, ABCSeries):
+ # By this point we have checked that self._indexed_same(other)
+ res_values = na_op(self.values, other.values)
+ # rename is needed in case res_name is None and res_values.name
+ # is not.
+ return self._constructor(res_values, index=self.index,
+ name=res_name).rename(res_name)
+
+ elif isinstance(other, (np.ndarray, pd.Index)):
+ # do not check length of zerodim array
+ # as it will broadcast
+ if other.ndim != 0 and len(self) != len(other):
+ raise ValueError('Lengths must match to compare')
+
+ res_values = na_op(self.values, np.asarray(other))
+ result = self._constructor(res_values, index=self.index)
+ # rename is needed in case res_name is None and self.name
+ # is not.
+ return result.__finalize__(self).rename(res_name)
+
+ elif is_scalar(other) and isna(other):
+ # numpy does not like comparisons vs None
+ if op is operator.ne:
+ res_values = np.ones(len(self), dtype=bool)
+ else:
+ res_values = np.zeros(len(self), dtype=bool)
+ return self._constructor(res_values, index=self.index,
+ name=res_name, dtype='bool')
+
+ else:
+ values = self.get_values()
+
+ with np.errstate(all='ignore'):
+ res = na_op(values, other)
+ if is_scalar(res):
+ raise TypeError('Could not compare {typ} type with Series'
+ .format(typ=type(other)))
+
+ # always return a full value series here
+ res_values = com.values_from_object(res)
+ return self._constructor(res_values, index=self.index,
+ name=res_name, dtype='bool')
+
+ wrapper.__name__ = op_name
+ return wrapper
+
+
+def _bool_method_SERIES(cls, op, special):
+ """
+    Wrapper function for Series boolean operations, to avoid
+    code duplication.
+ """
+ op_name = _get_op_name(op, special)
+
+ def na_op(x, y):
+ try:
+ result = op(x, y)
+ except TypeError:
+ assert not isinstance(y, (list, ABCSeries, ABCIndexClass))
+ if isinstance(y, np.ndarray):
+ # bool-bool dtype operations should be OK, should not get here
+ assert not (is_bool_dtype(x) and is_bool_dtype(y))
+ x = ensure_object(x)
+ y = ensure_object(y)
+ result = libops.vec_binop(x, y, op)
+ else:
+ # let null fall thru
+ assert lib.is_scalar(y)
+ if not isna(y):
+ y = bool(y)
+ try:
+ result = libops.scalar_binop(x, y, op)
+ except (TypeError, ValueError, AttributeError,
+ OverflowError, NotImplementedError):
+ raise TypeError("cannot compare a dtyped [{dtype}] array "
+ "with a scalar of type [{typ}]"
+ .format(dtype=x.dtype,
+ typ=type(y).__name__))
+
+ return result
+
+ fill_int = lambda x: x.fillna(0)
+ fill_bool = lambda x: x.fillna(False).astype(bool)
+
+ def wrapper(self, other):
+ is_self_int_dtype = is_integer_dtype(self.dtype)
+
+ self, other = _align_method_SERIES(self, other, align_asobject=True)
+ res_name = get_op_result_name(self, other)
+
+ if isinstance(other, ABCDataFrame):
+ # Defer to DataFrame implementation; fail early
+ return NotImplemented
+
+ elif isinstance(other, (ABCSeries, ABCIndexClass)):
+ is_other_int_dtype = is_integer_dtype(other.dtype)
+ other = fill_int(other) if is_other_int_dtype else fill_bool(other)
+
+ ovalues = other.values
+ finalizer = lambda x: x
+
+ else:
+ # scalars, list, tuple, np.array
+ is_other_int_dtype = is_integer_dtype(np.asarray(other))
+ if is_list_like(other) and not isinstance(other, np.ndarray):
+ # TODO: Can we do this before the is_integer_dtype check?
+ # could the is_integer_dtype check be checking the wrong
+ # thing? e.g. other = [[0, 1], [2, 3], [4, 5]]?
+ other = construct_1d_object_array_from_listlike(other)
+
+ ovalues = other
+ finalizer = lambda x: x.__finalize__(self)
+
+ # For int vs int `^`, `|`, `&` are bitwise operators and return
+ # integer dtypes. Otherwise these are boolean ops
+ filler = (fill_int if is_self_int_dtype and is_other_int_dtype
+ else fill_bool)
+ res_values = na_op(self.values, ovalues)
+ unfilled = self._constructor(res_values,
+ index=self.index, name=res_name)
+ filled = filler(unfilled)
+ return finalizer(filled)
+
+ wrapper.__name__ = op_name
+ return wrapper
+
+
+def _flex_method_SERIES(cls, op, special):
+ name = _get_op_name(op, special)
+ doc = _make_flex_doc(name, 'series')
+
+ @Appender(doc)
+ def flex_wrapper(self, other, level=None, fill_value=None, axis=0):
+ # validate axis
+ if axis is not None:
+ self._get_axis_number(axis)
+ if isinstance(other, ABCSeries):
+ return self._binop(other, op, level=level, fill_value=fill_value)
+ elif isinstance(other, (np.ndarray, list, tuple)):
+ if len(other) != len(self):
+ raise ValueError('Lengths must be equal')
+ other = self._constructor(other, self.index)
+ return self._binop(other, op, level=level, fill_value=fill_value)
+ else:
+ if fill_value is not None:
+ self = self.fillna(fill_value)
+
+ return self._constructor(op(self, other),
+ self.index).__finalize__(self)
+
+ flex_wrapper.__name__ = name
+ return flex_wrapper
+
+
+# -----------------------------------------------------------------------------
+# DataFrame
+
+
+def _combine_series_frame(self, other, func, fill_value=None, axis=None,
+ level=None):
+ """
+ Apply binary operator `func` to self, other using alignment and fill
+ conventions determined by the fill_value, axis, and level kwargs.
+
+ Parameters
+ ----------
+ self : DataFrame
+ other : Series
+ func : binary operator
+ fill_value : object, default None
+ axis : {0, 1, 'columns', 'index', None}, default None
+ level : int or None, default None
+
+ Returns
+ -------
+ result : DataFrame
+ """
+ if fill_value is not None:
+ raise NotImplementedError("fill_value {fill} not supported."
+ .format(fill=fill_value))
+
+ if axis is not None:
+ axis = self._get_axis_number(axis)
+ if axis == 0:
+ return self._combine_match_index(other, func, level=level)
+ else:
+ return self._combine_match_columns(other, func, level=level)
+ else:
+ if not len(other):
+ return self * np.nan
+
+ if not len(self):
+            # Ambiguous case; use _series so this works with DataFrame
+ return self._constructor(data=self._series, index=self.index,
+ columns=self.columns)
+
+ # default axis is columns
+ return self._combine_match_columns(other, func, level=level)
+
+
+def _align_method_FRAME(left, right, axis):
+ """ convert rhs to meet lhs dims if input is list, tuple or np.ndarray """
+
+ def to_series(right):
+ msg = ('Unable to coerce to Series, length must be {req_len}: '
+ 'given {given_len}')
+ if axis is not None and left._get_axis_name(axis) == 'index':
+ if len(left.index) != len(right):
+ raise ValueError(msg.format(req_len=len(left.index),
+ given_len=len(right)))
+ right = left._constructor_sliced(right, index=left.index)
+ else:
+ if len(left.columns) != len(right):
+ raise ValueError(msg.format(req_len=len(left.columns),
+ given_len=len(right)))
+ right = left._constructor_sliced(right, index=left.columns)
+ return right
+
+ if isinstance(right, np.ndarray):
+
+ if right.ndim == 1:
+ right = to_series(right)
+
+ elif right.ndim == 2:
+ if right.shape == left.shape:
+ right = left._constructor(right, index=left.index,
+ columns=left.columns)
+
+ elif right.shape[0] == left.shape[0] and right.shape[1] == 1:
+ # Broadcast across columns
+ right = np.broadcast_to(right, left.shape)
+ right = left._constructor(right,
+ index=left.index,
+ columns=left.columns)
+
+ elif right.shape[1] == left.shape[1] and right.shape[0] == 1:
+ # Broadcast along rows
+ right = to_series(right[0, :])
+
+ else:
+ raise ValueError("Unable to coerce to DataFrame, shape "
+ "must be {req_shape}: given {given_shape}"
+ .format(req_shape=left.shape,
+ given_shape=right.shape))
+
+ elif right.ndim > 2:
+ raise ValueError('Unable to coerce to Series/DataFrame, dim '
+ 'must be <= 2: {dim}'.format(dim=right.shape))
+
+ elif (is_list_like(right) and
+ not isinstance(right, (ABCSeries, ABCDataFrame))):
+ # GH17901
+ right = to_series(right)
+
+ return right
+
+
+def _arith_method_FRAME(cls, op, special):
+ str_rep = _get_opstr(op, cls)
+ op_name = _get_op_name(op, special)
+ eval_kwargs = _gen_eval_kwargs(op_name)
+ fill_zeros = _gen_fill_zeros(op_name)
+ default_axis = _get_frame_op_default_axis(op_name)
+
+ def na_op(x, y):
+ import pandas.core.computation.expressions as expressions
+
+ try:
+ result = expressions.evaluate(op, str_rep, x, y, **eval_kwargs)
+ except TypeError:
+ result = masked_arith_op(x, y, op)
+
+ result = missing.fill_zeros(result, x, y, op_name, fill_zeros)
+
+ return result
+
+ if op_name in _op_descriptions:
+ # i.e. include "add" but not "__add__"
+ doc = _make_flex_doc(op_name, 'dataframe')
+ else:
+ doc = _arith_doc_FRAME % op_name
+
+ @Appender(doc)
+ def f(self, other, axis=default_axis, level=None, fill_value=None):
+
+ other = _align_method_FRAME(self, other, axis)
+
+ if isinstance(other, ABCDataFrame):
+ # Another DataFrame
+ pass_op = op if should_series_dispatch(self, other, op) else na_op
+ return self._combine_frame(other, pass_op, fill_value, level)
+ elif isinstance(other, ABCSeries):
+ # For these values of `axis`, we end up dispatching to Series op,
+ # so do not want the masked op.
+ pass_op = op if axis in [0, "columns", None] else na_op
+ return _combine_series_frame(self, other, pass_op,
+ fill_value=fill_value, axis=axis,
+ level=level)
+ else:
+ if fill_value is not None:
+ self = self.fillna(fill_value)
+
+ assert np.ndim(other) == 0
+ return self._combine_const(other, op)
+
+ f.__name__ = op_name
+
+ return f
+
+
+def _flex_comp_method_FRAME(cls, op, special):
+ str_rep = _get_opstr(op, cls)
+ op_name = _get_op_name(op, special)
+ default_axis = _get_frame_op_default_axis(op_name)
+
+ def na_op(x, y):
+ try:
+ with np.errstate(invalid='ignore'):
+ result = op(x, y)
+ except TypeError:
+ result = mask_cmp_op(x, y, op, (np.ndarray, ABCSeries))
+ return result
+
+ doc = _flex_comp_doc_FRAME.format(op_name=op_name,
+ desc=_op_descriptions[op_name]['desc'])
+
+ @Appender(doc)
+ def f(self, other, axis=default_axis, level=None):
+
+ other = _align_method_FRAME(self, other, axis)
+
+ if isinstance(other, ABCDataFrame):
+ # Another DataFrame
+ if not self._indexed_same(other):
+ self, other = self.align(other, 'outer',
+ level=level, copy=False)
+ return dispatch_to_series(self, other, na_op, str_rep)
+
+ elif isinstance(other, ABCSeries):
+ return _combine_series_frame(self, other, na_op,
+ fill_value=None, axis=axis,
+ level=level)
+ else:
+ assert np.ndim(other) == 0, other
+ return self._combine_const(other, na_op)
+
+ f.__name__ = op_name
+
+ return f
+
+
+def _comp_method_FRAME(cls, func, special):
+ str_rep = _get_opstr(func, cls)
+ op_name = _get_op_name(func, special)
+
+ @Appender('Wrapper for comparison method {name}'.format(name=op_name))
+ def f(self, other):
+
+ other = _align_method_FRAME(self, other, axis=None)
+
+ if isinstance(other, ABCDataFrame):
+ # Another DataFrame
+ if not self._indexed_same(other):
+ raise ValueError('Can only compare identically-labeled '
+ 'DataFrame objects')
+ return dispatch_to_series(self, other, func, str_rep)
+
+ elif isinstance(other, ABCSeries):
+ return _combine_series_frame(self, other, func,
+ fill_value=None, axis=None,
+ level=None)
+ else:
+
+ # straight boolean comparisons we want to allow all columns
+ # (regardless of dtype to pass thru) See #4537 for discussion.
+ res = self._combine_const(other, func)
+ return res.fillna(True).astype(bool)
+
+ f.__name__ = op_name
+
+ return f
+
+
+# -----------------------------------------------------------------------------
+# Panel
+
+def _arith_method_PANEL(cls, op, special):
+    # works only for scalar `other`
+ op_name = _get_op_name(op, special)
+
+ def f(self, other):
+ if not is_scalar(other):
+ raise ValueError('Simple arithmetic with {name} can only be '
+ 'done with scalar values'
+ .format(name=self._constructor.__name__))
+
+ return self._combine(other, op)
+
+ f.__name__ = op_name
+ return f
+
+
+def _comp_method_PANEL(cls, op, special):
+ str_rep = _get_opstr(op, cls)
+ op_name = _get_op_name(op, special)
+
+ def na_op(x, y):
+ import pandas.core.computation.expressions as expressions
+
+ try:
+ result = expressions.evaluate(op, str_rep, x, y)
+ except TypeError:
+ result = mask_cmp_op(x, y, op, np.ndarray)
+ return result
+
+ @Appender('Wrapper for comparison method {name}'.format(name=op_name))
+ def f(self, other, axis=None):
+ # Validate the axis parameter
+ if axis is not None:
+ self._get_axis_number(axis)
+
+ if isinstance(other, self._constructor):
+ return self._compare_constructor(other, na_op)
+ elif isinstance(other, (self._constructor_sliced, ABCDataFrame,
+ ABCSeries)):
+ raise Exception("input needs alignment for this object [{object}]"
+ .format(object=self._constructor))
+ else:
+ return self._combine_const(other, na_op)
+
+ f.__name__ = op_name
+
+ return f
+
+
+def _flex_method_PANEL(cls, op, special):
+ str_rep = _get_opstr(op, cls)
+ op_name = _get_op_name(op, special)
+ eval_kwargs = _gen_eval_kwargs(op_name)
+ fill_zeros = _gen_fill_zeros(op_name)
+
+ def na_op(x, y):
+ import pandas.core.computation.expressions as expressions
+
+ try:
+ result = expressions.evaluate(op, str_rep, x, y,
+ errors='raise',
+ **eval_kwargs)
+ except TypeError:
+ result = op(x, y)
+
+            # handles the discrepancy between numpy and numexpr on
+            # division/mod by 0; though, given that these are generally
+            # (always?) non-scalars, I'm not sure whether it's worth it
+            # at the moment
+ result = missing.fill_zeros(result, x, y, op_name, fill_zeros)
+ return result
+
+ if op_name in _op_descriptions:
+ doc = _make_flex_doc(op_name, 'panel')
+ else:
+        # docstring substitutions
+ doc = _agg_doc_PANEL.format(op_name=op_name)
+
+ @Appender(doc)
+ def f(self, other, axis=0):
+ return self._combine(other, na_op, axis=axis)
+
+ f.__name__ = op_name
+ return f
+
+
+# -----------------------------------------------------------------------------
+# Sparse
+
+def _cast_sparse_series_op(left, right, opname):
+ """
+ For SparseSeries operation, coerce to float64 if the result is expected
+ to have NaN or inf values
+
+ Parameters
+ ----------
+ left : SparseArray
+ right : SparseArray
+ opname : str
+
+ Returns
+ -------
+ left : SparseArray
+ right : SparseArray
+ """
+ from pandas.core.sparse.api import SparseDtype
+
+ opname = opname.strip('_')
+
+ # TODO: This should be moved to the array?
+ if is_integer_dtype(left) and is_integer_dtype(right):
+ # series coerces to float64 if result should have NaN/inf
+ if opname in ('floordiv', 'mod') and (right.values == 0).any():
+ left = left.astype(SparseDtype(np.float64, left.fill_value))
+ right = right.astype(SparseDtype(np.float64, right.fill_value))
+ elif opname in ('rfloordiv', 'rmod') and (left.values == 0).any():
+ left = left.astype(SparseDtype(np.float64, left.fill_value))
+ right = right.astype(SparseDtype(np.float64, right.fill_value))
+
+ return left, right
+
+
+def _arith_method_SPARSE_SERIES(cls, op, special):
+ """
+    Wrapper function for SparseSeries arithmetic operations, to avoid
+    code duplication.
+ """
+ op_name = _get_op_name(op, special)
+
+ def wrapper(self, other):
+ if isinstance(other, ABCDataFrame):
+ return NotImplemented
+ elif isinstance(other, ABCSeries):
+ if not isinstance(other, ABCSparseSeries):
+ other = other.to_sparse(fill_value=self.fill_value)
+ return _sparse_series_op(self, other, op, op_name)
+ elif is_scalar(other):
+ with np.errstate(all='ignore'):
+ new_values = op(self.values, other)
+ return self._constructor(new_values,
+ index=self.index,
+ name=self.name)
+ else: # pragma: no cover
+ raise TypeError('operation with {other} not supported'
+ .format(other=type(other)))
+
+ wrapper.__name__ = op_name
+ return wrapper
+
+
+def _sparse_series_op(left, right, op, name):
+ left, right = left.align(right, join='outer', copy=False)
+ new_index = left.index
+ new_name = get_op_result_name(left, right)
+
+ from pandas.core.arrays.sparse import _sparse_array_op
+ lvalues, rvalues = _cast_sparse_series_op(left.values, right.values, name)
+ result = _sparse_array_op(lvalues, rvalues, op, name)
+ return left._constructor(result, index=new_index, name=new_name)
+
+
+def _arith_method_SPARSE_ARRAY(cls, op, special):
+ """
+ Wrapper function for Series arithmetic operations, to avoid
+ code duplication.
+ """
+ op_name = _get_op_name(op, special)
+
+ def wrapper(self, other):
+ from pandas.core.arrays.sparse.array import (
+ SparseArray, _sparse_array_op, _wrap_result, _get_fill)
+ if isinstance(other, np.ndarray):
+ if len(self) != len(other):
+ raise AssertionError("length mismatch: {self} vs. {other}"
+ .format(self=len(self), other=len(other)))
+ if not isinstance(other, SparseArray):
+ dtype = getattr(other, 'dtype', None)
+ other = SparseArray(other, fill_value=self.fill_value,
+ dtype=dtype)
+ return _sparse_array_op(self, other, op, op_name)
+ elif is_scalar(other):
+ with np.errstate(all='ignore'):
+ fill = op(_get_fill(self), np.asarray(other))
+ result = op(self.sp_values, other)
+
+ return _wrap_result(op_name, result, self.sp_index, fill)
+ else: # pragma: no cover
+ raise TypeError('operation with {other} not supported'
+ .format(other=type(other)))
+
+ wrapper.__name__ = op_name
+ return wrapper
diff --git a/contrib/python/pandas/py2/pandas/core/panel.py b/contrib/python/pandas/py2/pandas/core/panel.py
new file mode 100644
index 00000000000..540192d1a59
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/core/panel.py
@@ -0,0 +1,1588 @@
+"""
+Contains data structures designed for manipulating panel (3-dimensional) data
+"""
+# pylint: disable=E1103,W0231,W0212,W0621
+from __future__ import division
+
+import warnings
+
+import numpy as np
+
+import pandas.compat as compat
+from pandas.compat import OrderedDict, map, range, u, zip
+from pandas.compat.numpy import function as nv
+from pandas.util._decorators import Appender, Substitution, deprecate_kwarg
+from pandas.util._validators import validate_axis_style_args
+
+from pandas.core.dtypes.cast import (
+ cast_scalar_to_array, infer_dtype_from_scalar, maybe_cast_item)
+from pandas.core.dtypes.common import (
+ is_integer, is_list_like, is_scalar, is_string_like)
+from pandas.core.dtypes.missing import notna
+
+import pandas.core.common as com
+from pandas.core.frame import DataFrame
+from pandas.core.generic import NDFrame, _shared_docs
+from pandas.core.index import (
+ Index, MultiIndex, _get_objs_combined_axis, ensure_index)
+import pandas.core.indexes.base as ibase
+from pandas.core.indexing import maybe_droplevels
+from pandas.core.internals import (
+ BlockManager, create_block_manager_from_arrays,
+ create_block_manager_from_blocks)
+import pandas.core.ops as ops
+from pandas.core.reshape.util import cartesian_product
+from pandas.core.series import Series
+
+from pandas.io.formats.printing import pprint_thing
+
+_shared_doc_kwargs = dict(
+ axes='items, major_axis, minor_axis',
+ klass="Panel",
+ axes_single_arg="{0, 1, 2, 'items', 'major_axis', 'minor_axis'}",
+ optional_mapper='', optional_axis='', optional_labels='')
+_shared_doc_kwargs['args_transpose'] = (
+ "three positional arguments: each one of\n{ax_single}".format(
+ ax_single=_shared_doc_kwargs['axes_single_arg']))
+
+
+def _ensure_like_indices(time, panels):
+ """
+ Makes sure that time and panels are conformable.
+ """
+ n_time = len(time)
+ n_panel = len(panels)
+ u_panels = np.unique(panels) # this sorts!
+ u_time = np.unique(time)
+ if len(u_time) == n_time:
+ time = np.tile(u_time, len(u_panels))
+ if len(u_panels) == n_panel:
+ panels = np.repeat(u_panels, len(u_time))
+ return time, panels
+
+
+def panel_index(time, panels, names=None):
+ """
+ Returns a multi-index suitable for a panel-like DataFrame.
+
+ Parameters
+ ----------
+ time : array-like
+ Time index, does not have to repeat
+ panels : array-like
+ Panel index, does not have to repeat
+ names : list, optional
+ List containing the names of the indices
+
+ Returns
+ -------
+ multi_index : MultiIndex
+ Time index is the first level, the panels are the second level.
+
+ Examples
+ --------
+ >>> years = range(1960,1963)
+ >>> panels = ['A', 'B', 'C']
+ >>> panel_idx = panel_index(years, panels)
+ >>> panel_idx
+ MultiIndex([(1960, 'A'), (1961, 'A'), (1962, 'A'), (1960, 'B'),
+ (1961, 'B'), (1962, 'B'), (1960, 'C'), (1961, 'C'),
+ (1962, 'C')], dtype=object)
+
+ or
+
+ >>> years = np.repeat(range(1960,1963), 3)
+ >>> panels = np.tile(['A', 'B', 'C'], 3)
+ >>> panel_idx = panel_index(years, panels)
+ >>> panel_idx
+ MultiIndex([(1960, 'A'), (1960, 'B'), (1960, 'C'), (1961, 'A'),
+ (1961, 'B'), (1961, 'C'), (1962, 'A'), (1962, 'B'),
+ (1962, 'C')], dtype=object)
+ """
+ if names is None:
+ names = ['time', 'panel']
+ time, panels = _ensure_like_indices(time, panels)
+ return MultiIndex.from_arrays([time, panels], sortorder=None, names=names)
+
+
+class Panel(NDFrame):
+ """
+ Represents wide format panel data, stored as 3-dimensional array.
+
+ .. deprecated:: 0.20.0
+ The recommended way to represent 3-D data are with a MultiIndex on a
+ DataFrame via the :attr:`~Panel.to_frame()` method or with the
+ `xarray package <http://xarray.pydata.org/en/stable/>`__.
+ Pandas provides a :attr:`~Panel.to_xarray()` method to automate this
+ conversion.
+
+ Parameters
+ ----------
+ data : ndarray (items x major x minor), or dict of DataFrames
+ items : Index or array-like
+ axis=0
+ major_axis : Index or array-like
+ axis=1
+ minor_axis : Index or array-like
+ axis=2
+ copy : boolean, default False
+ Copy data from inputs. Only affects DataFrame / 2d ndarray input
+ dtype : dtype, default None
+ Data type to force, otherwise infer
+ """
+
+ @property
+ def _constructor(self):
+ return type(self)
+
+ _constructor_sliced = DataFrame
+
+ def __init__(self, data=None, items=None, major_axis=None, minor_axis=None,
+ copy=False, dtype=None):
+ # deprecation GH13563
+ warnings.warn("\nPanel is deprecated and will be removed in a "
+ "future version.\nThe recommended way to represent "
+ "these types of 3-dimensional data are with a "
+ "MultiIndex on a DataFrame, via the "
+ "Panel.to_frame() method\n"
+ "Alternatively, you can use the xarray package "
+ "http://xarray.pydata.org/en/stable/.\n"
+ "Pandas provides a `.to_xarray()` method to help "
+ "automate this conversion.\n",
+ FutureWarning, stacklevel=3)
+
+ self._init_data(data=data, items=items, major_axis=major_axis,
+ minor_axis=minor_axis, copy=copy, dtype=dtype)
+
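+    # A minimal usage sketch (illustrative; assumes ``import numpy as np``
+    # and ``import pandas as pd``; random data, hence the doctest skips):
+    #
+    #     >>> wp = pd.Panel(np.random.rand(2, 3, 4),
+    #     ...               items=['a', 'b'])                # doctest: +SKIP
+    #     >>> wp.shape                                       # doctest: +SKIP
+    #     (2, 3, 4)
+    #     >>> wp.to_frame().columns.tolist()                 # doctest: +SKIP
+    #     ['a', 'b']
+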
+ def _init_data(self, data, copy, dtype, **kwargs):
+ """
+ Generate ND initialization; axes are passed
+ as required objects to __init__.
+ """
+ if data is None:
+ data = {}
+ if dtype is not None:
+ dtype = self._validate_dtype(dtype)
+
+ passed_axes = [kwargs.pop(a, None) for a in self._AXIS_ORDERS]
+
+ if kwargs:
+ raise TypeError('_init_data() got an unexpected keyword '
+ 'argument "{0}"'.format(list(kwargs.keys())[0]))
+
+ axes = None
+ if isinstance(data, BlockManager):
+ if com._any_not_none(*passed_axes):
+ axes = [x if x is not None else y
+ for x, y in zip(passed_axes, data.axes)]
+ mgr = data
+ elif isinstance(data, dict):
+ mgr = self._init_dict(data, passed_axes, dtype=dtype)
+ copy = False
+ dtype = None
+ elif isinstance(data, (np.ndarray, list)):
+ mgr = self._init_matrix(data, passed_axes, dtype=dtype, copy=copy)
+ copy = False
+ dtype = None
+ elif is_scalar(data) and com._all_not_none(*passed_axes):
+ values = cast_scalar_to_array([len(x) for x in passed_axes],
+ data, dtype=dtype)
+ mgr = self._init_matrix(values, passed_axes, dtype=values.dtype,
+ copy=False)
+ copy = False
+ else: # pragma: no cover
+ raise ValueError('Panel constructor not properly called!')
+
+ NDFrame.__init__(self, mgr, axes=axes, copy=copy, dtype=dtype)
+
+ def _init_dict(self, data, axes, dtype=None):
+ haxis = axes.pop(self._info_axis_number)
+
+ # prefilter if haxis passed
+ if haxis is not None:
+ haxis = ensure_index(haxis)
+ data = OrderedDict((k, v)
+ for k, v in compat.iteritems(data)
+ if k in haxis)
+ else:
+ keys = com.dict_keys_to_ordered_list(data)
+ haxis = Index(keys)
+
+ for k, v in compat.iteritems(data):
+ if isinstance(v, dict):
+ data[k] = self._constructor_sliced(v)
+
+ # extract axis for remaining axes & create the slicemap
+ raxes = [self._extract_axis(self, data, axis=i) if a is None else a
+ for i, a in enumerate(axes)]
+ raxes_sm = self._extract_axes_for_slice(self, raxes)
+
+ # shallow copy
+ arrays = []
+ haxis_shape = [len(a) for a in raxes]
+ for h in haxis:
+ v = values = data.get(h)
+ if v is None:
+ values = np.empty(haxis_shape, dtype=dtype)
+ values.fill(np.nan)
+ elif isinstance(v, self._constructor_sliced):
+ d = raxes_sm.copy()
+ d['copy'] = False
+ v = v.reindex(**d)
+ if dtype is not None:
+ v = v.astype(dtype)
+ values = v.values
+ arrays.append(values)
+
+ return self._init_arrays(arrays, haxis, [haxis] + raxes)
+
+ def _init_arrays(self, arrays, arr_names, axes):
+ return create_block_manager_from_arrays(arrays, arr_names, axes)
+
+ @classmethod
+ def from_dict(cls, data, intersect=False, orient='items', dtype=None):
+ """
+ Construct Panel from dict of DataFrame objects.
+
+ Parameters
+ ----------
+ data : dict
+ {field : DataFrame}
+ intersect : boolean
+ Intersect indexes of input DataFrames
+ orient : {'items', 'minor'}, default 'items'
+ The "orientation" of the data. If the keys of the passed dict
+ should be the items of the result panel, pass 'items'
+ (default). Otherwise if the columns of the values of the passed
+ DataFrame objects should be the items (which in the case of
+ mixed-dtype data you should do), instead pass 'minor'
+ dtype : dtype, default None
+ Data type to force, otherwise infer
+
+ Returns
+ -------
+ Panel
+ """
+ from collections import defaultdict
+
+ orient = orient.lower()
+ if orient == 'minor':
+ new_data = defaultdict(OrderedDict)
+ for col, df in compat.iteritems(data):
+ for item, s in compat.iteritems(df):
+ new_data[item][col] = s
+ data = new_data
+ elif orient != 'items': # pragma: no cover
+ raise ValueError('Orientation must be one of {items, minor}.')
+
+ d = cls._homogenize_dict(cls, data, intersect=intersect, dtype=dtype)
+ ks = list(d['data'].keys())
+ if not isinstance(d['data'], OrderedDict):
+ ks = list(sorted(ks))
+ d[cls._info_axis_name] = Index(ks)
+ return cls(**d)
+
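+    # Sketch of ``from_dict`` with the default ``orient='items'``
+    # (illustrative; assumes ``pd``/``np`` imports; random data):
+    #
+    #     >>> dfs = {'x': pd.DataFrame(np.random.rand(3, 2)),
+    #     ...        'y': pd.DataFrame(np.random.rand(3, 2))}
+    #     >>> wp = pd.Panel.from_dict(dfs)                   # doctest: +SKIP
+    #     >>> wp.items                                       # doctest: +SKIP
+    #     Index(['x', 'y'], dtype='object')
+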
+ def __getitem__(self, key):
+ key = com.apply_if_callable(key, self)
+
+ if isinstance(self._info_axis, MultiIndex):
+ return self._getitem_multilevel(key)
+ if not (is_list_like(key) or isinstance(key, slice)):
+ return super(Panel, self).__getitem__(key)
+ return self.loc[key]
+
+ def _getitem_multilevel(self, key):
+ info = self._info_axis
+ loc = info.get_loc(key)
+ if isinstance(loc, (slice, np.ndarray)):
+ new_index = info[loc]
+ result_index = maybe_droplevels(new_index, key)
+ slices = [loc] + [slice(None)] * (self._AXIS_LEN - 1)
+ new_values = self.values[slices]
+
+ d = self._construct_axes_dict(self._AXIS_ORDERS[1:])
+ d[self._info_axis_name] = result_index
+ result = self._constructor(new_values, **d)
+ return result
+ else:
+ return self._get_item_cache(key)
+
+ def _init_matrix(self, data, axes, dtype=None, copy=False):
+ values = self._prep_ndarray(self, data, copy=copy)
+
+ if dtype is not None:
+ try:
+ values = values.astype(dtype)
+ except Exception:
+ raise ValueError('failed to cast to '
+ '{datatype}'.format(datatype=dtype))
+
+ shape = values.shape
+ fixed_axes = []
+ for i, ax in enumerate(axes):
+ if ax is None:
+ ax = ibase.default_index(shape[i])
+ else:
+ ax = ensure_index(ax)
+ fixed_axes.append(ax)
+
+ return create_block_manager_from_blocks([values], fixed_axes)
+
+ # ----------------------------------------------------------------------
+ # Comparison methods
+
+ def _compare_constructor(self, other, func):
+ if not self._indexed_same(other):
+ raise Exception('Can only compare identically-labeled '
+ 'same type objects')
+
+ new_data = {col: func(self[col], other[col])
+ for col in self._info_axis}
+
+ d = self._construct_axes_dict(copy=False)
+ return self._constructor(data=new_data, **d)
+
+ # ----------------------------------------------------------------------
+ # Magic methods
+
+ def __unicode__(self):
+ """
+ Return a string representation for a particular Panel.
+
+ Invoked by unicode(df) in py2 only.
+ Yields a Unicode String in both py2/py3.
+ """
+
+ class_name = str(self.__class__)
+
+ dims = u('Dimensions: {dimensions}'.format(dimensions=' x '.join(
+ ["{shape} ({axis})".format(shape=shape, axis=axis) for axis, shape
+ in zip(self._AXIS_ORDERS, self.shape)])))
+
+ def axis_pretty(a):
+ v = getattr(self, a)
+ if len(v) > 0:
+ return u('{ax} axis: {x} to {y}'.format(ax=a.capitalize(),
+ x=pprint_thing(v[0]),
+ y=pprint_thing(v[-1])))
+ else:
+ return u('{ax} axis: None'.format(ax=a.capitalize()))
+
+ output = '\n'.join(
+ [class_name, dims] + [axis_pretty(a) for a in self._AXIS_ORDERS])
+ return output
+
+ def _get_plane_axes_index(self, axis):
+ """
+        Get the names of my plane axes for the given axis: the two
+        remaining axis names, returned as the (index, columns) pair for
+        the DataFrame slices along that axis.
+ """
+ axis_name = self._get_axis_name(axis)
+
+ if axis_name == 'major_axis':
+ index = 'minor_axis'
+ columns = 'items'
+        elif axis_name == 'minor_axis':
+ index = 'major_axis'
+ columns = 'items'
+ elif axis_name == 'items':
+ index = 'major_axis'
+ columns = 'minor_axis'
+
+ return index, columns
+
+ def _get_plane_axes(self, axis):
+ """
+        Get my plane axes for the given axis: the two remaining axis
+        Index objects, returned as the (index, columns) pair for the
+        DataFrame slices along that axis.
+ """
+ return [self._get_axis(axi)
+ for axi in self._get_plane_axes_index(axis)]
+
+ fromDict = from_dict
+
+ def to_sparse(self, *args, **kwargs):
+ """
+ NOT IMPLEMENTED: do not call this method, as sparsifying is not
+ supported for Panel objects and will raise an error.
+
+ Convert to SparsePanel.
+ """
+ raise NotImplementedError("sparsifying is not supported "
+ "for Panel objects")
+
+ def to_excel(self, path, na_rep='', engine=None, **kwargs):
+ """
+ Write each DataFrame in Panel to a separate excel sheet.
+
+ Parameters
+ ----------
+ path : string or ExcelWriter object
+ File path or existing ExcelWriter
+ na_rep : string, default ''
+ Missing data representation
+ engine : string, default None
+ write engine to use - you can also set this via the options
+ ``io.excel.xlsx.writer``, ``io.excel.xls.writer``, and
+ ``io.excel.xlsm.writer``.
+
+ Other Parameters
+ ----------------
+ float_format : string, default None
+ Format string for floating point numbers
+ cols : sequence, optional
+ Columns to write
+ header : boolean or list of string, default True
+ Write out column names. If a list of string is given it is
+ assumed to be aliases for the column names
+ index : boolean, default True
+ Write row names (index)
+ index_label : string or sequence, default None
+ Column label for index column(s) if desired. If None is given, and
+ `header` and `index` are True, then the index names are used. A
+ sequence should be given if the DataFrame uses MultiIndex.
+ startrow : upper left cell row to dump data frame
+ startcol : upper left cell column to dump data frame
+
+ Notes
+ -----
+ Keyword arguments (and na_rep) are passed to the ``to_excel`` method
+ for each DataFrame written.
+ """
+ from pandas.io.excel import ExcelWriter
+
+ if isinstance(path, compat.string_types):
+ writer = ExcelWriter(path, engine=engine)
+ else:
+ writer = path
+ kwargs['na_rep'] = na_rep
+
+ for item, df in self.iteritems():
+ name = str(item)
+ df.to_excel(writer, name, **kwargs)
+ writer.save()
+
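+    # Sketch: each item is written to its own sheet (illustrative; assumes
+    # ``pd``/``np`` imports and an Excel engine such as openpyxl):
+    #
+    #     >>> wp = pd.Panel(np.random.rand(2, 3, 4),
+    #     ...               items=['first', 'second'])       # doctest: +SKIP
+    #     >>> wp.to_excel('panel.xlsx')                      # doctest: +SKIP
+    #     # -> workbook with sheets 'first' and 'second'
+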
+ def as_matrix(self):
+ self._consolidate_inplace()
+ return self._data.as_array()
+
+ # ----------------------------------------------------------------------
+ # Getting and setting elements
+
+ def get_value(self, *args, **kwargs):
+ """
+ Quickly retrieve single value at (item, major, minor) location.
+
+ .. deprecated:: 0.21.0
+
+ Please use .at[] or .iat[] accessors.
+
+ Parameters
+ ----------
+ item : item label (panel item)
+ major : major axis label (panel item row)
+ minor : minor axis label (panel item column)
+ takeable : interpret the passed labels as indexers, default False
+
+ Returns
+ -------
+ value : scalar value
+ """
+ warnings.warn("get_value is deprecated and will be removed "
+ "in a future release. Please use "
+ ".at[] or .iat[] accessors instead", FutureWarning,
+ stacklevel=2)
+ return self._get_value(*args, **kwargs)
+
+ def _get_value(self, *args, **kwargs):
+ nargs = len(args)
+ nreq = self._AXIS_LEN
+
+ # require an arg for each axis
+ if nargs != nreq:
+ raise TypeError('There must be an argument for each axis, you gave'
+ ' {0} args, but {1} are required'.format(nargs,
+ nreq))
+ takeable = kwargs.pop('takeable', None)
+
+ if kwargs:
+ raise TypeError('get_value() got an unexpected keyword '
+ 'argument "{0}"'.format(list(kwargs.keys())[0]))
+
+ if takeable is True:
+ lower = self._iget_item_cache(args[0])
+ else:
+ lower = self._get_item_cache(args[0])
+
+ return lower._get_value(*args[1:], takeable=takeable)
+ _get_value.__doc__ = get_value.__doc__
+
+ def set_value(self, *args, **kwargs):
+ """
+ Quickly set single value at (item, major, minor) location.
+
+ .. deprecated:: 0.21.0
+
+ Please use .at[] or .iat[] accessors.
+
+ Parameters
+ ----------
+ item : item label (panel item)
+ major : major axis label (panel item row)
+ minor : minor axis label (panel item column)
+ value : scalar
+ takeable : interpret the passed labels as indexers, default False
+
+ Returns
+ -------
+ panel : Panel
+ If label combo is contained, will be reference to calling Panel,
+ otherwise a new object
+ """
+ warnings.warn("set_value is deprecated and will be removed "
+ "in a future release. Please use "
+ ".at[] or .iat[] accessors instead", FutureWarning,
+ stacklevel=2)
+ return self._set_value(*args, **kwargs)
+
+ def _set_value(self, *args, **kwargs):
+ # require an arg for each axis and the value
+ nargs = len(args)
+ nreq = self._AXIS_LEN + 1
+
+ if nargs != nreq:
+ raise TypeError('There must be an argument for each axis plus the '
+ 'value provided, you gave {0} args, but {1} are '
+ 'required'.format(nargs, nreq))
+ takeable = kwargs.pop('takeable', None)
+
+ if kwargs:
+ raise TypeError('set_value() got an unexpected keyword '
+ 'argument "{0}"'.format(list(kwargs.keys())[0]))
+
+ try:
+ if takeable is True:
+ lower = self._iget_item_cache(args[0])
+ else:
+ lower = self._get_item_cache(args[0])
+
+ lower._set_value(*args[1:], takeable=takeable)
+ return self
+ except KeyError:
+ axes = self._expand_axes(args)
+ d = self._construct_axes_dict_from(self, axes, copy=False)
+ result = self.reindex(**d)
+ args = list(args)
+ likely_dtype, args[-1] = infer_dtype_from_scalar(args[-1])
+ made_bigger = not np.array_equal(axes[0], self._info_axis)
+ # how to make this logic simpler?
+ if made_bigger:
+ maybe_cast_item(result, args[0], likely_dtype)
+
+ return result._set_value(*args)
+ _set_value.__doc__ = set_value.__doc__
+
+ def _box_item_values(self, key, values):
+ if self.ndim == values.ndim:
+ result = self._constructor(values)
+
+ # a dup selection will yield a full ndim
+ if result._get_axis(0).is_unique:
+ result = result[key]
+
+ return result
+
+ d = self._construct_axes_dict_for_slice(self._AXIS_ORDERS[1:])
+ return self._constructor_sliced(values, **d)
+
+ def __setitem__(self, key, value):
+ key = com.apply_if_callable(key, self)
+ shape = tuple(self.shape)
+ if isinstance(value, self._constructor_sliced):
+ value = value.reindex(
+ **self._construct_axes_dict_for_slice(self._AXIS_ORDERS[1:]))
+ mat = value.values
+ elif isinstance(value, np.ndarray):
+ if value.shape != shape[1:]:
+ raise ValueError('shape of value must be {0}, shape of given '
+ 'object was {1}'.format(
+ shape[1:], tuple(map(int, value.shape))))
+ mat = np.asarray(value)
+ elif is_scalar(value):
+ mat = cast_scalar_to_array(shape[1:], value)
+ else:
+ raise TypeError('Cannot set item of '
+ 'type: {dtype!s}'.format(dtype=type(value)))
+
+ mat = mat.reshape(tuple([1]) + shape[1:])
+ NDFrame._set_item(self, key, mat)
+
+ def _unpickle_panel_compat(self, state): # pragma: no cover
+ """
+ Unpickle the panel.
+ """
+ from pandas.io.pickle import _unpickle_array
+
+ _unpickle = _unpickle_array
+ vals, items, major, minor = state
+
+ items = _unpickle(items)
+ major = _unpickle(major)
+ minor = _unpickle(minor)
+ values = _unpickle(vals)
+ wp = Panel(values, items, major, minor)
+ self._data = wp._data
+
+ def conform(self, frame, axis='items'):
+ """
+ Conform input DataFrame to align with chosen axis pair.
+
+ Parameters
+ ----------
+ frame : DataFrame
+ axis : {'items', 'major', 'minor'}
+
+ Axis the input corresponds to. E.g., if axis='major', then
+ the frame's columns would be items, and the index would be
+ values of the minor axis
+
+ Returns
+ -------
+ DataFrame
+ """
+ axes = self._get_plane_axes(axis)
+ return frame.reindex(**self._extract_axes_for_slice(self, axes))
+
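+    # Sketch: align a DataFrame to the plane of the two remaining axes
+    # (illustrative; assumes ``pd``/``np`` imports; random data):
+    #
+    #     >>> wp = pd.Panel(np.random.rand(2, 3, 4))         # doctest: +SKIP
+    #     >>> df = pd.DataFrame(np.random.rand(3, 4))        # doctest: +SKIP
+    #     >>> wp.conform(df, axis='items').shape             # doctest: +SKIP
+    #     (3, 4)
+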
+ def head(self, n=5):
+ raise NotImplementedError
+
+ def tail(self, n=5):
+ raise NotImplementedError
+
+ def round(self, decimals=0, *args, **kwargs):
+ """
+ Round each value in Panel to a specified number of decimal places.
+
+ .. versionadded:: 0.18.0
+
+ Parameters
+ ----------
+ decimals : int
+ Number of decimal places to round to (default: 0).
+ If decimals is negative, it specifies the number of
+ positions to the left of the decimal point.
+
+ Returns
+ -------
+ Panel object
+
+ See Also
+ --------
+ numpy.around
+ """
+ nv.validate_round(args, kwargs)
+
+ if is_integer(decimals):
+ result = np.apply_along_axis(np.round, 0, self.values)
+ return self._wrap_result(result, axis=0)
+ raise TypeError("decimals must be an integer")
+
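+    # Sketch with the default ``decimals=0`` (illustrative; assumes
+    # ``pd``/``np`` imports; random data):
+    #
+    #     >>> wp = pd.Panel(np.random.rand(2, 2, 2) * 10)    # doctest: +SKIP
+    #     >>> (wp.round().values % 1 == 0).all()             # doctest: +SKIP
+    #     True
+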
+ def _needs_reindex_multi(self, axes, method, level):
+ """
+ Don't allow a multi reindex on Panel or above ndim.
+ """
+ return False
+
+ def align(self, other, **kwargs):
+ raise NotImplementedError
+
+ def dropna(self, axis=0, how='any', inplace=False):
+ """
+        Drop 2D slices from the panel, holding the passed axis constant.
+
+ Parameters
+ ----------
+ axis : int, default 0
+ Axis to hold constant. E.g. axis=1 will drop major_axis entries
+ having a certain amount of NA data
+ how : {'all', 'any'}, default 'any'
+ 'any': one or more values are NA in the DataFrame along the
+ axis. For 'all' they all must be.
+ inplace : bool, default False
+ If True, do operation inplace and return None.
+
+ Returns
+ -------
+ dropped : Panel
+ """
+ axis = self._get_axis_number(axis)
+
+ values = self.values
+ mask = notna(values)
+
+ for ax in reversed(sorted(set(range(self._AXIS_LEN)) - {axis})):
+ mask = mask.sum(ax)
+
+ per_slice = np.prod(values.shape[:axis] + values.shape[axis + 1:])
+
+ if how == 'all':
+ cond = mask > 0
+ else:
+ cond = mask == per_slice
+
+ new_ax = self._get_axis(axis)[cond]
+ result = self.reindex_axis(new_ax, axis=axis)
+ if inplace:
+ self._update_inplace(result)
+ else:
+ return result
+
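+    # Sketch: drop major-axis entries containing any NA (illustrative;
+    # assumes ``pd``/``np`` imports):
+    #
+    #     >>> arr = np.random.rand(2, 3, 4)                  # doctest: +SKIP
+    #     >>> arr[0, 1, :] = np.nan                          # doctest: +SKIP
+    #     >>> pd.Panel(arr).dropna(axis=1, how='any').shape  # doctest: +SKIP
+    #     (2, 2, 4)
+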
+ def _combine(self, other, func, axis=0):
+ if isinstance(other, Panel):
+ return self._combine_panel(other, func)
+ elif isinstance(other, DataFrame):
+ return self._combine_frame(other, func, axis=axis)
+ elif is_scalar(other):
+ return self._combine_const(other, func)
+ else:
+ raise NotImplementedError(
+ "{otype!s} is not supported in combine operation with "
+ "{selftype!s}".format(otype=type(other), selftype=type(self)))
+
+ def _combine_const(self, other, func):
+ with np.errstate(all='ignore'):
+ new_values = func(self.values, other)
+ d = self._construct_axes_dict()
+ return self._constructor(new_values, **d)
+
+ def _combine_frame(self, other, func, axis=0):
+ index, columns = self._get_plane_axes(axis)
+ axis = self._get_axis_number(axis)
+
+ other = other.reindex(index=index, columns=columns)
+
+ with np.errstate(all='ignore'):
+ if axis == 0:
+ new_values = func(self.values, other.values)
+ elif axis == 1:
+ new_values = func(self.values.swapaxes(0, 1), other.values.T)
+ new_values = new_values.swapaxes(0, 1)
+ elif axis == 2:
+ new_values = func(self.values.swapaxes(0, 2), other.values)
+ new_values = new_values.swapaxes(0, 2)
+
+ return self._constructor(new_values, self.items, self.major_axis,
+ self.minor_axis)
+
+ def _combine_panel(self, other, func):
+ items = self.items.union(other.items)
+ major = self.major_axis.union(other.major_axis)
+ minor = self.minor_axis.union(other.minor_axis)
+
+ # could check that everything's the same size, but forget it
+ this = self.reindex(items=items, major=major, minor=minor)
+ other = other.reindex(items=items, major=major, minor=minor)
+
+ with np.errstate(all='ignore'):
+ result_values = func(this.values, other.values)
+
+ return self._constructor(result_values, items, major, minor)
+
+ def major_xs(self, key):
+ """
+ Return slice of panel along major axis.
+
+ Parameters
+ ----------
+ key : object
+ Major axis label
+
+ Returns
+ -------
+ y : DataFrame
+ index -> minor axis, columns -> items
+
+ Notes
+ -----
+ major_xs is only for getting, not setting values.
+
+ MultiIndex Slicers is a generic way to get/set values on any level or
+ levels and is a superset of major_xs functionality, see
+ :ref:`MultiIndex Slicers <advanced.mi_slicers>`
+ """
+ return self.xs(key, axis=self._AXIS_LEN - 2)
+
+ def minor_xs(self, key):
+ """
+ Return slice of panel along minor axis.
+
+ Parameters
+ ----------
+ key : object
+ Minor axis label
+
+ Returns
+ -------
+ y : DataFrame
+ index -> major axis, columns -> items
+
+ Notes
+ -----
+ minor_xs is only for getting, not setting values.
+
+ MultiIndex Slicers is a generic way to get/set values on any level or
+ levels and is a superset of minor_xs functionality, see
+ :ref:`MultiIndex Slicers <advanced.mi_slicers>`
+ """
+ return self.xs(key, axis=self._AXIS_LEN - 1)
+
+ def xs(self, key, axis=1):
+ """
+ Return slice of panel along selected axis.
+
+ Parameters
+ ----------
+ key : object
+ Label
+        axis : {'items', 'major', 'minor'}, default 1/'major'
+
+ Returns
+ -------
+ y : ndim(self)-1
+
+ Notes
+ -----
+ xs is only for getting, not setting values.
+
+ MultiIndex Slicers is a generic way to get/set values on any level or
+ levels and is a superset of xs functionality, see
+ :ref:`MultiIndex Slicers <advanced.mi_slicers>`
+ """
+ axis = self._get_axis_number(axis)
+ if axis == 0:
+ return self[key]
+
+ self._consolidate_inplace()
+ axis_number = self._get_axis_number(axis)
+ new_data = self._data.xs(key, axis=axis_number, copy=False)
+ result = self._construct_return_type(new_data)
+ copy = new_data.is_mixed_type
+ result._set_is_copy(self, copy=copy)
+ return result
+
+ _xs = xs
+
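+    # Sketch of the cross-section helpers (illustrative; assumes
+    # ``pd``/``np`` imports; random data):
+    #
+    #     >>> wp = pd.Panel(np.random.rand(2, 3, 4),
+    #     ...               items=['a', 'b'])                # doctest: +SKIP
+    #     >>> wp.xs('a', axis='items').shape                 # doctest: +SKIP
+    #     (3, 4)
+    #     >>> wp.major_xs(0).shape    # minor axis x items   # doctest: +SKIP
+    #     (4, 2)
+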
+ def _ixs(self, i, axis=0):
+ """
+ Parameters
+ ----------
+ i : int, slice, or sequence of integers
+ axis : int
+ """
+
+ ax = self._get_axis(axis)
+ key = ax[i]
+
+ # xs cannot handle a non-scalar key, so just reindex here
+        # if we have a multi-index and a single tuple, then it's a reduction
+ # (GH 7516)
+ if not (isinstance(ax, MultiIndex) and isinstance(key, tuple)):
+ if is_list_like(key):
+ indexer = {self._get_axis_name(axis): key}
+ return self.reindex(**indexer)
+
+ # a reduction
+ if axis == 0:
+ values = self._data.iget(i)
+ return self._box_item_values(key, values)
+
+ # xs by position
+ self._consolidate_inplace()
+ new_data = self._data.xs(i, axis=axis, copy=True, takeable=True)
+ return self._construct_return_type(new_data)
+
+ def groupby(self, function, axis='major'):
+ """
+ Group data on given axis, returning GroupBy object.
+
+ Parameters
+ ----------
+ function : callable
+            Mapping function for the chosen axis
+ axis : {'major', 'minor', 'items'}, default 'major'
+
+ Returns
+ -------
+ grouped : PanelGroupBy
+ """
+ from pandas.core.groupby import PanelGroupBy
+ axis = self._get_axis_number(axis)
+ return PanelGroupBy(self, function, axis=axis)
+
+ def to_frame(self, filter_observations=True):
+ """
+ Transform wide format into long (stacked) format as DataFrame whose
+ columns are the Panel's items and whose index is a MultiIndex formed
+ of the Panel's major and minor axes.
+
+ Parameters
+ ----------
+ filter_observations : boolean, default True
+ Drop (major, minor) pairs without a complete set of observations
+ across all the items
+
+ Returns
+ -------
+ y : DataFrame
+ """
+ _, N, K = self.shape
+
+ if filter_observations:
+ # shaped like the return DataFrame
+ mask = notna(self.values).all(axis=0)
+ # size = mask.sum()
+ selector = mask.ravel()
+ else:
+ # size = N * K
+ selector = slice(None, None)
+
+ data = {item: self[item].values.ravel()[selector]
+ for item in self.items}
+
+ def construct_multi_parts(idx, n_repeat, n_shuffle=1):
+ # Replicates and shuffles MultiIndex, returns individual attributes
+ codes = [np.repeat(x, n_repeat) for x in idx.codes]
+ # Assumes that each label is divisible by n_shuffle
+ codes = [x.reshape(n_shuffle, -1).ravel(order='F')
+ for x in codes]
+ codes = [x[selector] for x in codes]
+ levels = idx.levels
+ names = idx.names
+ return codes, levels, names
+
+ def construct_index_parts(idx, major=True):
+ levels = [idx]
+ if major:
+ codes = [np.arange(N).repeat(K)[selector]]
+ names = idx.name or 'major'
+ else:
+ codes = np.arange(K).reshape(1, K)[np.zeros(N, dtype=int)]
+ codes = [codes.ravel()[selector]]
+ names = idx.name or 'minor'
+ names = [names]
+ return codes, levels, names
+
+ if isinstance(self.major_axis, MultiIndex):
+ major_codes, major_levels, major_names = construct_multi_parts(
+ self.major_axis, n_repeat=K)
+ else:
+ major_codes, major_levels, major_names = construct_index_parts(
+ self.major_axis)
+
+ if isinstance(self.minor_axis, MultiIndex):
+ minor_codes, minor_levels, minor_names = construct_multi_parts(
+ self.minor_axis, n_repeat=N, n_shuffle=K)
+ else:
+ minor_codes, minor_levels, minor_names = construct_index_parts(
+ self.minor_axis, major=False)
+
+ levels = major_levels + minor_levels
+ codes = major_codes + minor_codes
+ names = major_names + minor_names
+
+ index = MultiIndex(levels=levels, codes=codes, names=names,
+ verify_integrity=False)
+
+ return DataFrame(data, index=index, columns=self.items)
+
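+    # Sketch: wide panel -> long DataFrame with a (major, minor) MultiIndex
+    # (illustrative; assumes ``pd``/``np`` imports; random data):
+    #
+    #     >>> wp = pd.Panel(np.random.rand(2, 3, 4),
+    #     ...               items=['a', 'b'])                # doctest: +SKIP
+    #     >>> stacked = wp.to_frame()                        # doctest: +SKIP
+    #     >>> stacked.shape, stacked.index.names             # doctest: +SKIP
+    #     ((12, 2), FrozenList(['major', 'minor']))
+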
+ def apply(self, func, axis='major', **kwargs):
+ """
+ Applies function along axis (or axes) of the Panel.
+
+ Parameters
+ ----------
+ func : function
+ Function to apply to each combination of 'other' axes
+ e.g. if axis = 'items', the combination of major_axis/minor_axis
+ will each be passed as a Series; if axis = ('items', 'major'),
+ DataFrames of items & major axis will be passed
+ axis : {'items', 'minor', 'major'}, or {0, 1, 2}, or a tuple with two
+ axes
+ Additional keyword arguments will be passed as keywords to the function
+
+ Returns
+ -------
+ result : Panel, DataFrame, or Series
+
+ Examples
+ --------
+
+ Returns a Panel with the square root of each element
+
+ >>> p = pd.Panel(np.random.rand(4, 3, 2)) # doctest: +SKIP
+ >>> p.apply(np.sqrt)
+
+ Equivalent to p.sum(1), returning a DataFrame
+
+ >>> p.apply(lambda x: x.sum(), axis=1) # doctest: +SKIP
+
+ Equivalent to previous:
+
+ >>> p.apply(lambda x: x.sum(), axis='major') # doctest: +SKIP
+
+ Return the shapes of each DataFrame over axis 2 (i.e the shapes of
+ items x major), as a Series
+
+ >>> p.apply(lambda x: x.shape, axis=(0,1)) # doctest: +SKIP
+ """
+
+ if kwargs and not isinstance(func, np.ufunc):
+ f = lambda x: func(x, **kwargs)
+ else:
+ f = func
+
+ # 2d-slabs
+ if isinstance(axis, (tuple, list)) and len(axis) == 2:
+ return self._apply_2d(f, axis=axis)
+
+ axis = self._get_axis_number(axis)
+
+ # try ufunc like
+ if isinstance(f, np.ufunc):
+ try:
+ with np.errstate(all='ignore'):
+ result = np.apply_along_axis(func, axis, self.values)
+ return self._wrap_result(result, axis=axis)
+ except (AttributeError):
+ pass
+
+ # 1d
+ return self._apply_1d(f, axis=axis)
+
+ def _apply_1d(self, func, axis):
+
+ axis_name = self._get_axis_name(axis)
+ ndim = self.ndim
+ values = self.values
+
+ # iter thru the axes
+ slice_axis = self._get_axis(axis)
+ slice_indexer = [0] * (ndim - 1)
+ indexer = np.zeros(ndim, 'O')
+ indlist = list(range(ndim))
+ indlist.remove(axis)
+ indexer[axis] = slice(None, None)
+ indexer.put(indlist, slice_indexer)
+ planes = [self._get_axis(axi) for axi in indlist]
+ shape = np.array(self.shape).take(indlist)
+
+ # all the iteration points
+ points = cartesian_product(planes)
+
+ results = []
+ for i in range(np.prod(shape)):
+
+ # construct the object
+ pts = tuple(p[i] for p in points)
+ indexer.put(indlist, slice_indexer)
+
+ obj = Series(values[tuple(indexer)], index=slice_axis, name=pts)
+ result = func(obj)
+
+ results.append(result)
+
+ # increment the indexer
+ slice_indexer[-1] += 1
+ n = -1
+ while (slice_indexer[n] >= shape[n]) and (n > (1 - ndim)):
+ slice_indexer[n - 1] += 1
+ slice_indexer[n] = 0
+ n -= 1
+
+ # empty object
+ if not len(results):
+ return self._constructor(**self._construct_axes_dict())
+
+ # same ndim as current
+ if isinstance(results[0], Series):
+ arr = np.vstack([r.values for r in results])
+ arr = arr.T.reshape(tuple([len(slice_axis)] + list(shape)))
+ tranp = np.array([axis] + indlist).argsort()
+ arr = arr.transpose(tuple(list(tranp)))
+ return self._constructor(arr, **self._construct_axes_dict())
+
+ # ndim-1 shape
+ results = np.array(results).reshape(shape)
+ if results.ndim == 2 and axis_name != self._info_axis_name:
+ results = results.T
+ planes = planes[::-1]
+ return self._construct_return_type(results, planes)
+
+ def _apply_2d(self, func, axis):
+ """
+ Handle 2-d slices, equiv to iterating over the other axis.
+ """
+ ndim = self.ndim
+ axis = [self._get_axis_number(a) for a in axis]
+
+ # construct slabs, in 2-d this is a DataFrame result
+ indexer_axis = list(range(ndim))
+ for a in axis:
+ indexer_axis.remove(a)
+ indexer_axis = indexer_axis[0]
+
+ slicer = [slice(None, None)] * ndim
+ ax = self._get_axis(indexer_axis)
+
+ results = []
+ for i, e in enumerate(ax):
+ slicer[indexer_axis] = i
+ sliced = self.iloc[tuple(slicer)]
+
+ obj = func(sliced)
+ results.append((e, obj))
+
+ return self._construct_return_type(dict(results))
+
+ def _reduce(self, op, name, axis=0, skipna=True, numeric_only=None,
+ filter_type=None, **kwds):
+ if numeric_only:
+ raise NotImplementedError('Panel.{0} does not implement '
+ 'numeric_only.'.format(name))
+
+ if axis is None and filter_type == 'bool':
+ # labels = None
+ # constructor = None
+ axis_number = None
+ axis_name = None
+ else:
+ # TODO: Make other agg func handle axis=None properly
+ axis = self._get_axis_number(axis)
+ # labels = self._get_agg_axis(axis)
+ # constructor = self._constructor
+ axis_name = self._get_axis_name(axis)
+ axis_number = self._get_axis_number(axis_name)
+
+ f = lambda x: op(x, axis=axis_number, skipna=skipna, **kwds)
+
+ with np.errstate(all='ignore'):
+ result = f(self.values)
+
+ if axis is None and filter_type == 'bool':
+ return np.bool_(result)
+ axes = self._get_plane_axes(axis_name)
+ if result.ndim == 2 and axis_name != self._info_axis_name:
+ result = result.T
+
+ return self._construct_return_type(result, axes)
+
+ def _construct_return_type(self, result, axes=None):
+ """
+ Return the type for the ndim of the result.
+ """
+ ndim = getattr(result, 'ndim', None)
+
+ # need to assume they are the same
+ if ndim is None:
+ if isinstance(result, dict):
+ ndim = getattr(list(compat.itervalues(result))[0], 'ndim', 0)
+
+ # have a dict, so top-level is +1 dim
+ if ndim != 0:
+ ndim += 1
+
+ # scalar
+ if ndim == 0:
+ return Series(result)
+
+ # same as self
+ elif self.ndim == ndim:
+ # return the construction dictionary for these axes
+ if axes is None:
+ return self._constructor(result)
+ return self._constructor(result, **self._construct_axes_dict())
+
+ # sliced
+ elif self.ndim == ndim + 1:
+ if axes is None:
+ return self._constructor_sliced(result)
+ return self._constructor_sliced(
+ result, **self._extract_axes_for_slice(self, axes))
+
+ raise ValueError('invalid _construct_return_type [self->{self}] '
+ '[result->{result}]'.format(self=self, result=result))
+
+ def _wrap_result(self, result, axis):
+ axis = self._get_axis_name(axis)
+ axes = self._get_plane_axes(axis)
+ if result.ndim == 2 and axis != self._info_axis_name:
+ result = result.T
+
+ return self._construct_return_type(result, axes)
+
+ @Substitution(**_shared_doc_kwargs)
+ @Appender(NDFrame.reindex.__doc__)
+ def reindex(self, *args, **kwargs):
+ major = kwargs.pop("major", None)
+ minor = kwargs.pop('minor', None)
+
+ if major is not None:
+ if kwargs.get("major_axis"):
+ raise TypeError("Cannot specify both 'major' and 'major_axis'")
+ kwargs['major_axis'] = major
+ if minor is not None:
+ if kwargs.get("minor_axis"):
+ raise TypeError("Cannot specify both 'minor' and 'minor_axis'")
+
+ kwargs['minor_axis'] = minor
+ axes = validate_axis_style_args(self, args, kwargs, 'labels',
+ 'reindex')
+ kwargs.update(axes)
+ kwargs.pop('axis', None)
+ kwargs.pop('labels', None)
+
+ with warnings.catch_warnings():
+ warnings.simplefilter("ignore", FutureWarning)
+ # do not warn about constructing Panel when reindexing
+ result = super(Panel, self).reindex(**kwargs)
+ return result
+
+ @Substitution(**_shared_doc_kwargs)
+ @Appender(NDFrame.rename.__doc__)
+ def rename(self, items=None, major_axis=None, minor_axis=None, **kwargs):
+ major_axis = (major_axis if major_axis is not None else
+ kwargs.pop('major', None))
+ minor_axis = (minor_axis if minor_axis is not None else
+ kwargs.pop('minor', None))
+ return super(Panel, self).rename(items=items, major_axis=major_axis,
+ minor_axis=minor_axis, **kwargs)
+
+ @Appender(_shared_docs['reindex_axis'] % _shared_doc_kwargs)
+ def reindex_axis(self, labels, axis=0, method=None, level=None, copy=True,
+ limit=None, fill_value=np.nan):
+ return super(Panel, self).reindex_axis(labels=labels, axis=axis,
+ method=method, level=level,
+ copy=copy, limit=limit,
+ fill_value=fill_value)
+
+ @Substitution(**_shared_doc_kwargs)
+ @Appender(NDFrame.transpose.__doc__)
+ def transpose(self, *args, **kwargs):
+ # check if a list of axes was passed in instead as a
+ # single *args element
+ if (len(args) == 1 and hasattr(args[0], '__iter__') and
+ not is_string_like(args[0])):
+ axes = args[0]
+ else:
+ axes = args
+
+ if 'axes' in kwargs and axes:
+ raise TypeError("transpose() got multiple values for "
+ "keyword argument 'axes'")
+ elif not axes:
+ axes = kwargs.pop('axes', ())
+
+ return super(Panel, self).transpose(*axes, **kwargs)
+
+ @Substitution(**_shared_doc_kwargs)
+ @Appender(NDFrame.fillna.__doc__)
+ def fillna(self, value=None, method=None, axis=None, inplace=False,
+ limit=None, downcast=None, **kwargs):
+ return super(Panel, self).fillna(value=value, method=method, axis=axis,
+ inplace=inplace, limit=limit,
+ downcast=downcast, **kwargs)
+
+ def count(self, axis='major'):
+ """
+ Return number of observations over requested axis.
+
+ Parameters
+ ----------
+ axis : {'items', 'major', 'minor'} or {0, 1, 2}
+
+ Returns
+ -------
+ count : DataFrame
+ """
+ i = self._get_axis_number(axis)
+
+ values = self.values
+ mask = np.isfinite(values)
+ result = mask.sum(axis=i, dtype='int64')
+
+ return self._wrap_result(result, axis)
+
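+    # Sketch: non-NA counts along the major axis come back as a
+    # minor-axis x items DataFrame (illustrative; assumes ``pd``/``np``):
+    #
+    #     >>> arr = np.random.rand(2, 3, 4)                  # doctest: +SKIP
+    #     >>> arr[0, :, 0] = np.nan                          # doctest: +SKIP
+    #     >>> pd.Panel(arr).count(axis='major').shape        # doctest: +SKIP
+    #     (4, 2)
+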
+ def shift(self, periods=1, freq=None, axis='major'):
+ """
+ Shift index by desired number of periods with an optional time freq.
+
+ The shifted data will not include the dropped periods and the
+ shifted axis will be smaller than the original. This is different
+ from the behavior of DataFrame.shift()
+
+ Parameters
+ ----------
+ periods : int
+ Number of periods to move, can be positive or negative
+ freq : DateOffset, timedelta, or time rule string, optional
+ axis : {'items', 'major', 'minor'} or {0, 1, 2}
+
+ Returns
+ -------
+ shifted : Panel
+ """
+ if freq:
+ return self.tshift(periods, freq, axis=axis)
+
+ return super(Panel, self).slice_shift(periods, axis=axis)
+
+ def tshift(self, periods=1, freq=None, axis='major'):
+ return super(Panel, self).tshift(periods, freq, axis)
+
+ def join(self, other, how='left', lsuffix='', rsuffix=''):
+ """
+        Join items with other Panel on the major and minor axes.
+
+ Parameters
+ ----------
+ other : Panel or list of Panels
+ Index should be similar to one of the columns in this one
+ how : {'left', 'right', 'outer', 'inner'}
+ How to handle indexes of the two objects. Default: 'left'
+ for joining on index, None otherwise
+ * left: use calling frame's index
+ * right: use input frame's index
+ * outer: form union of indexes
+ * inner: use intersection of indexes
+ lsuffix : string
+ Suffix to use from left frame's overlapping columns
+ rsuffix : string
+ Suffix to use from right frame's overlapping columns
+
+ Returns
+ -------
+ joined : Panel
+ """
+ from pandas.core.reshape.concat import concat
+
+ if isinstance(other, Panel):
+ join_major, join_minor = self._get_join_index(other, how)
+ this = self.reindex(major=join_major, minor=join_minor)
+ other = other.reindex(major=join_major, minor=join_minor)
+ merged_data = this._data.merge(other._data, lsuffix, rsuffix)
+ return self._constructor(merged_data)
+ else:
+ if lsuffix or rsuffix:
+ raise ValueError('Suffixes not supported when passing '
+ 'multiple panels')
+
+ if how == 'left':
+ how = 'outer'
+ join_axes = [self.major_axis, self.minor_axis]
+ elif how == 'right':
+ raise ValueError('Right join not supported with multiple '
+ 'panels')
+ else:
+ join_axes = None
+
+ return concat([self] + list(other), axis=0, join=how,
+ join_axes=join_axes, verify_integrity=True)
+
+ @deprecate_kwarg(old_arg_name='raise_conflict', new_arg_name='errors',
+ mapping={False: 'ignore', True: 'raise'})
+ def update(self, other, join='left', overwrite=True, filter_func=None,
+ errors='ignore'):
+ """
+ Modify Panel in place using non-NA values from other Panel.
+
+ May also use object coercible to Panel. Will align on items.
+
+ Parameters
+ ----------
+ other : Panel, or object coercible to Panel
+            The object from which the caller will be updated.
+ join : {'left', 'right', 'outer', 'inner'}, default 'left'
+ How individual DataFrames are joined.
+ overwrite : bool, default True
+ If True then overwrite values for common keys in the calling Panel.
+ filter_func : callable(1d-array) -> 1d-array<bool>, default None
+ Can choose to replace values other than NA. Return True for values
+ that should be updated.
+ errors : {'raise', 'ignore'}, default 'ignore'
+            If 'raise', will raise an error if a DataFrame and other both
+            contain non-NA data in the same place.
+
+ .. versionchanged :: 0.24.0
+ Changed from `raise_conflict=False|True`
+ to `errors='ignore'|'raise'`.
+
+ See Also
+ --------
+ DataFrame.update : Similar method for DataFrames.
+ dict.update : Similar method for dictionaries.
+ """
+
+ if not isinstance(other, self._constructor):
+ other = self._constructor(other)
+
+ axis_name = self._info_axis_name
+ axis_values = self._info_axis
+ other = other.reindex(**{axis_name: axis_values})
+
+ for frame in axis_values:
+ self[frame].update(other[frame], join=join, overwrite=overwrite,
+ filter_func=filter_func, errors=errors)
+
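+    # Sketch: in-place update aligned on items (illustrative; assumes
+    # ``pd``/``np`` imports; random data):
+    #
+    #     >>> wp = pd.Panel(np.random.rand(2, 3, 4))         # doctest: +SKIP
+    #     >>> wp.update(wp * 10)   # modifies ``wp`` in place  # doctest: +SKIP
+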
+ def _get_join_index(self, other, how):
+ if how == 'left':
+ join_major, join_minor = self.major_axis, self.minor_axis
+ elif how == 'right':
+ join_major, join_minor = other.major_axis, other.minor_axis
+ elif how == 'inner':
+ join_major = self.major_axis.intersection(other.major_axis)
+ join_minor = self.minor_axis.intersection(other.minor_axis)
+ elif how == 'outer':
+ join_major = self.major_axis.union(other.major_axis)
+ join_minor = self.minor_axis.union(other.minor_axis)
+ return join_major, join_minor
+
+ # miscellaneous data creation
+ @staticmethod
+ def _extract_axes(self, data, axes, **kwargs):
+ """
+ Return a list of the axis indices.
+ """
+ return [self._extract_axis(self, data, axis=i, **kwargs)
+ for i, a in enumerate(axes)]
+
+ @staticmethod
+ def _extract_axes_for_slice(self, axes):
+ """
+ Return the slice dictionary for these axes.
+ """
+ return {self._AXIS_SLICEMAP[i]: a for i, a in
+ zip(self._AXIS_ORDERS[self._AXIS_LEN - len(axes):], axes)}
+
+ @staticmethod
+ def _prep_ndarray(self, values, copy=True):
+ if not isinstance(values, np.ndarray):
+ values = np.asarray(values)
+ # NumPy strings are a pain, convert to object
+ if issubclass(values.dtype.type, compat.string_types):
+ values = np.array(values, dtype=object, copy=True)
+ else:
+ if copy:
+ values = values.copy()
+ if values.ndim != self._AXIS_LEN:
+ raise ValueError("The number of dimensions required is {0}, "
+ "but the number of dimensions of the "
+ "ndarray given was {1}".format(self._AXIS_LEN,
+ values.ndim))
+ return values
+
+ @staticmethod
+ def _homogenize_dict(self, frames, intersect=True, dtype=None):
+ """
+ Conform set of _constructor_sliced-like objects to either
+ an intersection of indices / columns or a union.
+
+ Parameters
+ ----------
+ frames : dict
+ intersect : boolean, default True
+
+ Returns
+ -------
+ dict of aligned results & indices
+ """
+
+ result = dict()
+        # caller may pass a dict or an OrderedDict; preserve that type
+ if isinstance(frames, OrderedDict):
+ result = OrderedDict()
+
+ adj_frames = OrderedDict()
+ for k, v in compat.iteritems(frames):
+ if isinstance(v, dict):
+ adj_frames[k] = self._constructor_sliced(v)
+ else:
+ adj_frames[k] = v
+
+ axes = self._AXIS_ORDERS[1:]
+ axes_dict = {a: ax for a, ax in zip(axes, self._extract_axes(
+ self, adj_frames, axes, intersect=intersect))}
+
+ reindex_dict = {self._AXIS_SLICEMAP[a]: axes_dict[a] for a in axes}
+ reindex_dict['copy'] = False
+ for key, frame in compat.iteritems(adj_frames):
+ if frame is not None:
+ result[key] = frame.reindex(**reindex_dict)
+ else:
+ result[key] = None
+
+ axes_dict['data'] = result
+ axes_dict['dtype'] = dtype
+ return axes_dict
+
+ @staticmethod
+ def _extract_axis(self, data, axis=0, intersect=False):
+
+ index = None
+ if len(data) == 0:
+ index = Index([])
+ elif len(data) > 0:
+ raw_lengths = []
+
+ have_raw_arrays = False
+ have_frames = False
+
+ for v in data.values():
+ if isinstance(v, self._constructor_sliced):
+ have_frames = True
+ elif v is not None:
+ have_raw_arrays = True
+ raw_lengths.append(v.shape[axis])
+
+ if have_frames:
+ # we want the "old" behavior here, of sorting only
+ # 1. we're doing a union (intersect=False)
+ # 2. the indices are not aligned.
+ index = _get_objs_combined_axis(data.values(), axis=axis,
+ intersect=intersect, sort=None)
+
+ if have_raw_arrays:
+ lengths = list(set(raw_lengths))
+ if len(lengths) > 1:
+ raise ValueError('ndarrays must match shape on '
+ 'axis {ax}'.format(ax=axis))
+
+ if have_frames:
+ if lengths[0] != len(index):
+ raise AssertionError('Length of data and index must match')
+ else:
+ index = Index(np.arange(lengths[0]))
+
+ if index is None:
+ index = Index([])
+
+ return ensure_index(index)
+
+ def sort_values(self, *args, **kwargs):
+ """
+ NOT IMPLEMENTED: do not call this method, as sorting values is not
+ supported for Panel objects and will raise an error.
+ """
+ super(Panel, self).sort_values(*args, **kwargs)
+
+
+Panel._setup_axes(axes=['items', 'major_axis', 'minor_axis'], info_axis=0,
+ stat_axis=1, aliases={'major': 'major_axis',
+ 'minor': 'minor_axis'},
+ slicers={'major_axis': 'index',
+ 'minor_axis': 'columns'},
+ docs={})
+
+ops.add_special_arithmetic_methods(Panel)
+ops.add_flex_arithmetic_methods(Panel)
+Panel._add_numeric_operations()
diff --git a/contrib/python/pandas/py2/pandas/core/resample.py b/contrib/python/pandas/py2/pandas/core/resample.py
new file mode 100644
index 00000000000..7723827ff47
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/core/resample.py
@@ -0,0 +1,1766 @@
+import copy
+from datetime import timedelta
+from textwrap import dedent
+import warnings
+
+import numpy as np
+
+from pandas._libs import lib
+from pandas._libs.tslibs import NaT, Timestamp
+from pandas._libs.tslibs.frequencies import is_subperiod, is_superperiod
+from pandas._libs.tslibs.period import IncompatibleFrequency
+import pandas.compat as compat
+from pandas.compat.numpy import function as nv
+from pandas.errors import AbstractMethodError
+from pandas.util._decorators import Appender, Substitution
+
+from pandas.core.dtypes.generic import ABCDataFrame, ABCSeries
+
+import pandas as pd
+import pandas.core.algorithms as algos
+from pandas.core.generic import _shared_docs
+from pandas.core.groupby.base import GroupByMixin
+from pandas.core.groupby.generic import PanelGroupBy, SeriesGroupBy
+from pandas.core.groupby.groupby import (
+ GroupBy, _GroupBy, _pipe_template, groupby)
+from pandas.core.groupby.grouper import Grouper
+from pandas.core.groupby.ops import BinGrouper
+from pandas.core.indexes.datetimes import DatetimeIndex, date_range
+from pandas.core.indexes.period import PeriodIndex
+from pandas.core.indexes.timedeltas import TimedeltaIndex, timedelta_range
+
+from pandas.tseries.frequencies import to_offset
+from pandas.tseries.offsets import DateOffset, Day, Nano, Tick
+
+_shared_docs_kwargs = dict()
+
+
+class Resampler(_GroupBy):
+
+ """
+ Class for resampling datetimelike data, a groupby-like operation.
+ See aggregate, transform, and apply functions on this object.
+
+ It's easiest to use obj.resample(...) to use Resampler.
+
+ Parameters
+ ----------
+ obj : pandas object
+ groupby : a TimeGrouper object
+ axis : int, default 0
+ kind : str or None
+        'period', 'timestamp' to override default index treatment
+
+ Returns
+ -------
+ a Resampler of the appropriate type
+
+ Notes
+ -----
+ After resampling, see aggregate, apply, and transform functions.
+ """
+
+ # to the groupby descriptor
+ _attributes = ['freq', 'axis', 'closed', 'label', 'convention',
+ 'loffset', 'base', 'kind']
+
+ def __init__(self, obj, groupby=None, axis=0, kind=None, **kwargs):
+ self.groupby = groupby
+ self.keys = None
+ self.sort = True
+ self.axis = axis
+ self.kind = kind
+ self.squeeze = False
+ self.group_keys = True
+ self.as_index = True
+ self.exclusions = set()
+ self.binner = None
+ self.grouper = None
+
+ if self.groupby is not None:
+ self.groupby._set_grouper(self._convert_obj(obj), sort=True)
+
+ def __unicode__(self):
+ """
+        Provide a nice str repr of our resampler object.
+ """
+ attrs = ["{k}={v}".format(k=k, v=getattr(self.groupby, k))
+ for k in self._attributes if
+ getattr(self.groupby, k, None) is not None]
+ return "{klass} [{attrs}]".format(klass=self.__class__.__name__,
+ attrs=', '.join(attrs))
+
+ def __getattr__(self, attr):
+ if attr in self._internal_names_set:
+ return object.__getattribute__(self, attr)
+ if attr in self._attributes:
+ return getattr(self.groupby, attr)
+ if attr in self.obj:
+ return self[attr]
+
+ return object.__getattribute__(self, attr)
+
+ def __iter__(self):
+ """
+ Resampler iterator.
+
+ Returns
+ -------
+ Generator yielding sequence of (name, subsetted object)
+ for each group
+
+ See Also
+ --------
+ GroupBy.__iter__
+ """
+ self._set_binner()
+ return super(Resampler, self).__iter__()
+
+ @property
+ def obj(self):
+ return self.groupby.obj
+
+ @property
+ def ax(self):
+ return self.groupby.ax
+
+ @property
+ def _typ(self):
+ """
+ Masquerade for compat as a Series or a DataFrame.
+ """
+ if isinstance(self._selected_obj, pd.Series):
+ return 'series'
+ return 'dataframe'
+
+ @property
+ def _from_selection(self):
+ """
+ Is the resampling from a DataFrame column or MultiIndex level.
+ """
+ # upsampling and PeriodIndex resampling do not work
+ # with selection, this state used to catch and raise an error
+ return (self.groupby is not None and
+ (self.groupby.key is not None or
+ self.groupby.level is not None))
+
+ def _convert_obj(self, obj):
+ """
+        Provide any conversions needed to handle the object correctly.
+
+ Parameters
+ ----------
+ obj : the object to be resampled
+
+ Returns
+ -------
+ obj : converted object
+ """
+ obj = obj._consolidate()
+ return obj
+
+ def _get_binner_for_time(self):
+ raise AbstractMethodError(self)
+
+ def _set_binner(self):
+ """
+ Setup our binners.
+
+ Cache these as we are an immutable object
+ """
+ if self.binner is None:
+ self.binner, self.grouper = self._get_binner()
+
+ def _get_binner(self):
+ """
+ Create the BinGrouper, assume that self.set_grouper(obj)
+ has already been called.
+ """
+
+ binner, bins, binlabels = self._get_binner_for_time()
+ bin_grouper = BinGrouper(bins, binlabels, indexer=self.groupby.indexer)
+ return binner, bin_grouper
+
+ def _assure_grouper(self):
+ """
+ Make sure that we are creating our binner & grouper.
+ """
+ self._set_binner()
+
+ @Substitution(klass='Resampler',
+ versionadded='.. versionadded:: 0.23.0',
+ examples="""
+ >>> df = pd.DataFrame({'A': [1, 2, 3, 4]},
+ ... index=pd.date_range('2012-08-02', periods=4))
+ >>> df
+ A
+ 2012-08-02 1
+ 2012-08-03 2
+ 2012-08-04 3
+ 2012-08-05 4
+
+ To get the difference between each 2-day period's maximum and minimum
+ value in one pass, you can do
+
+ >>> df.resample('2D').pipe(lambda x: x.max() - x.min())
+ A
+ 2012-08-02 1
+ 2012-08-04 1
+ """)
+ @Appender(_pipe_template)
+ def pipe(self, func, *args, **kwargs):
+ return super(Resampler, self).pipe(func, *args, **kwargs)
+
+ _agg_see_also_doc = dedent("""
+ See Also
+ --------
+ pandas.DataFrame.groupby.aggregate
+ pandas.DataFrame.resample.transform
+ pandas.DataFrame.aggregate
+ """)
+
+ _agg_examples_doc = dedent("""
+ Examples
+ --------
+ >>> s = pd.Series([1,2,3,4,5],
+ index=pd.date_range('20130101', periods=5,freq='s'))
+ 2013-01-01 00:00:00 1
+ 2013-01-01 00:00:01 2
+ 2013-01-01 00:00:02 3
+ 2013-01-01 00:00:03 4
+ 2013-01-01 00:00:04 5
+ Freq: S, dtype: int64
+
+ >>> r = s.resample('2s')
+ DatetimeIndexResampler [freq=<2 * Seconds>, axis=0, closed=left,
+ label=left, convention=start, base=0]
+
+ >>> r.agg(np.sum)
+ 2013-01-01 00:00:00 3
+ 2013-01-01 00:00:02 7
+ 2013-01-01 00:00:04 5
+ Freq: 2S, dtype: int64
+
+ >>> r.agg(['sum','mean','max'])
+ sum mean max
+ 2013-01-01 00:00:00 3 1.5 2
+ 2013-01-01 00:00:02 7 3.5 4
+ 2013-01-01 00:00:04 5 5.0 5
+
+ >>> r.agg({'result' : lambda x: x.mean() / x.std(),
+ 'total' : np.sum})
+ total result
+ 2013-01-01 00:00:00 3 2.121320
+ 2013-01-01 00:00:02 7 4.949747
+ 2013-01-01 00:00:04 5 NaN
+ """)
+
+ @Substitution(see_also=_agg_see_also_doc,
+ examples=_agg_examples_doc,
+ versionadded='',
+ klass='DataFrame',
+ axis='')
+ @Appender(_shared_docs['aggregate'])
+ def aggregate(self, func, *args, **kwargs):
+
+ self._set_binner()
+ result, how = self._aggregate(func, *args, **kwargs)
+ if result is None:
+ how = func
+ grouper = None
+ result = self._groupby_and_aggregate(how,
+ grouper,
+ *args,
+ **kwargs)
+
+ result = self._apply_loffset(result)
+ return result
+
+ agg = aggregate
+ apply = aggregate
+
+ def transform(self, arg, *args, **kwargs):
+ """
+ Call function producing a like-indexed Series on each group and return
+ a Series with the transformed values.
+
+ Parameters
+ ----------
+        arg : function
+ To apply to each group. Should return a Series with the same index
+
+ Returns
+ -------
+ transformed : Series
+
+ Examples
+ --------
+ >>> resampled.transform(lambda x: (x - x.mean()) / x.std())
+ """
+ return self._selected_obj.groupby(self.groupby).transform(
+ arg, *args, **kwargs)
+
+ def _downsample(self, f):
+ raise AbstractMethodError(self)
+
+ def _upsample(self, f, limit=None, fill_value=None):
+ raise AbstractMethodError(self)
+
+ def _gotitem(self, key, ndim, subset=None):
+ """
+ Sub-classes to define. Return a sliced object.
+
+ Parameters
+ ----------
+ key : string / list of selections
+ ndim : 1,2
+ requested ndim of result
+ subset : object, default None
+ subset to act on
+ """
+ self._set_binner()
+ grouper = self.grouper
+ if subset is None:
+ subset = self.obj
+ grouped = groupby(subset, by=None, grouper=grouper, axis=self.axis)
+
+ # try the key selection
+ try:
+ return grouped[key]
+ except KeyError:
+ return grouped
+
+ def _groupby_and_aggregate(self, how, grouper=None, *args, **kwargs):
+ """
+ Re-evaluate the obj with a groupby aggregation.
+ """
+
+ if grouper is None:
+ self._set_binner()
+ grouper = self.grouper
+
+ obj = self._selected_obj
+
+ try:
+ grouped = groupby(obj, by=None, grouper=grouper, axis=self.axis)
+ except TypeError:
+
+ # panel grouper
+ grouped = PanelGroupBy(obj, grouper=grouper, axis=self.axis)
+
+ try:
+ if isinstance(obj, ABCDataFrame) and compat.callable(how):
+ # Check if the function is reducing or not.
+ result = grouped._aggregate_item_by_item(how, *args, **kwargs)
+ else:
+ result = grouped.aggregate(how, *args, **kwargs)
+ except Exception:
+
+ # we have a non-reducing function
+ # try to evaluate
+ result = grouped.apply(how, *args, **kwargs)
+
+ result = self._apply_loffset(result)
+ return self._wrap_result(result)
+
+ def _apply_loffset(self, result):
+ """
+ If loffset is set, offset the result index.
+
+ This is NOT an idempotent routine, it will be applied
+ exactly once to the result.
+
+ Parameters
+ ----------
+ result : Series or DataFrame
+ the result of resample
+ """
+
+ needs_offset = (
+ isinstance(self.loffset, (DateOffset, timedelta,
+ np.timedelta64)) and
+ isinstance(result.index, DatetimeIndex) and
+ len(result.index) > 0
+ )
+
+ if needs_offset:
+ result.index = result.index + self.loffset
+
+ self.loffset = None
+ return result
+
+ def _get_resampler_for_grouping(self, groupby, **kwargs):
+ """
+ Return the correct class for resampling with groupby.
+ """
+ return self._resampler_for_grouping(self, groupby=groupby, **kwargs)
+
+ def _wrap_result(self, result):
+ """
+ Potentially wrap any results.
+ """
+ if isinstance(result, ABCSeries) and self._selection is not None:
+ result.name = self._selection
+
+ if isinstance(result, ABCSeries) and result.empty:
+ obj = self.obj
+ if isinstance(obj.index, PeriodIndex):
+ result.index = obj.index.asfreq(self.freq)
+ else:
+ result.index = obj.index._shallow_copy(freq=self.freq)
+ result.name = getattr(obj, 'name', None)
+
+ return result
+
+ def pad(self, limit=None):
+ """
+ Forward fill the values.
+
+ Parameters
+ ----------
+ limit : integer, optional
+ limit of how many values to fill
+
+ Returns
+ -------
+ an upsampled Series
+
+ See Also
+ --------
+ Series.fillna
+ DataFrame.fillna
+ """
+ return self._upsample('pad', limit=limit)
+ ffill = pad
+
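+    # Sketch on a small hourly series (illustrative; assumes ``pd`` imported;
+    # mirrors the ``backfill`` example below, forward-filling instead):
+    #
+    #     >>> s = pd.Series([1, 2, 3],
+    #     ...               index=pd.date_range('20180101',
+    #     ...                                   periods=3, freq='h'))
+    #     >>> s.resample('30min').pad()
+    #     2018-01-01 00:00:00    1
+    #     2018-01-01 00:30:00    1
+    #     2018-01-01 01:00:00    2
+    #     2018-01-01 01:30:00    2
+    #     2018-01-01 02:00:00    3
+    #     Freq: 30T, dtype: int64
+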
+ def nearest(self, limit=None):
+ """
+ Resample by using the nearest value.
+
+ When resampling data, missing values may appear (e.g., when the
+ resampling frequency is higher than the original frequency).
+ The `nearest` method will replace ``NaN`` values that appeared in
+ the resampled data with the value from the nearest member of the
+ sequence, based on the index value.
+ Missing values that existed in the original data will not be modified.
+ If `limit` is given, fill only this many values in each direction for
+ each of the original values.
+
+ Parameters
+ ----------
+ limit : int, optional
+ Limit of how many values to fill.
+
+ .. versionadded:: 0.21.0
+
+ Returns
+ -------
+ Series or DataFrame
+ An upsampled Series or DataFrame with ``NaN`` values filled with
+ their nearest value.
+
+ See Also
+ --------
+ backfill : Backward fill the new missing values in the resampled data.
+ pad : Forward fill ``NaN`` values.
+
+ Examples
+ --------
+ >>> s = pd.Series([1, 2],
+ ... index=pd.date_range('20180101',
+ ... periods=2,
+ ... freq='1h'))
+ >>> s
+ 2018-01-01 00:00:00 1
+ 2018-01-01 01:00:00 2
+ Freq: H, dtype: int64
+
+ >>> s.resample('15min').nearest()
+ 2018-01-01 00:00:00 1
+ 2018-01-01 00:15:00 1
+ 2018-01-01 00:30:00 2
+ 2018-01-01 00:45:00 2
+ 2018-01-01 01:00:00 2
+ Freq: 15T, dtype: int64
+
+ Limit the number of upsampled values imputed by the nearest:
+
+ >>> s.resample('15min').nearest(limit=1)
+ 2018-01-01 00:00:00 1.0
+ 2018-01-01 00:15:00 1.0
+ 2018-01-01 00:30:00 NaN
+ 2018-01-01 00:45:00 2.0
+ 2018-01-01 01:00:00 2.0
+ Freq: 15T, dtype: float64
+ """
+ return self._upsample('nearest', limit=limit)
+
+ def backfill(self, limit=None):
+ """
+ Backward fill the new missing values in the resampled data.
+
+ In statistics, imputation is the process of replacing missing data with
+ substituted values [1]_. When resampling data, missing values may
+ appear (e.g., when the resampling frequency is higher than the original
+ frequency). The backward fill will replace NaN values that appeared in
+ the resampled data with the next value in the original sequence.
+ Missing values that existed in the original data will not be modified.
+
+ Parameters
+ ----------
+ limit : integer, optional
+ Limit of how many values to fill.
+
+ Returns
+ -------
+ Series, DataFrame
+ An upsampled Series or DataFrame with backward filled NaN values.
+
+ See Also
+ --------
+ bfill : Alias of backfill.
+ fillna : Fill NaN values using the specified method, which can be
+ 'backfill'.
+ nearest : Fill NaN values with nearest neighbor starting from center.
+ pad : Forward fill NaN values.
+ pandas.Series.fillna : Fill NaN values in the Series using the
+ specified method, which can be 'backfill'.
+ pandas.DataFrame.fillna : Fill NaN values in the DataFrame using the
+ specified method, which can be 'backfill'.
+
+ References
+ ----------
+ .. [1] https://en.wikipedia.org/wiki/Imputation_(statistics)
+
+ Examples
+ --------
+
+ Resampling a Series:
+
+ >>> s = pd.Series([1, 2, 3],
+ ... index=pd.date_range('20180101', periods=3, freq='h'))
+ >>> s
+ 2018-01-01 00:00:00 1
+ 2018-01-01 01:00:00 2
+ 2018-01-01 02:00:00 3
+ Freq: H, dtype: int64
+
+ >>> s.resample('30min').backfill()
+ 2018-01-01 00:00:00 1
+ 2018-01-01 00:30:00 2
+ 2018-01-01 01:00:00 2
+ 2018-01-01 01:30:00 3
+ 2018-01-01 02:00:00 3
+ Freq: 30T, dtype: int64
+
+ >>> s.resample('15min').backfill(limit=2)
+ 2018-01-01 00:00:00 1.0
+ 2018-01-01 00:15:00 NaN
+ 2018-01-01 00:30:00 2.0
+ 2018-01-01 00:45:00 2.0
+ 2018-01-01 01:00:00 2.0
+ 2018-01-01 01:15:00 NaN
+ 2018-01-01 01:30:00 3.0
+ 2018-01-01 01:45:00 3.0
+ 2018-01-01 02:00:00 3.0
+ Freq: 15T, dtype: float64
+
+ Resampling a DataFrame that has missing values:
+
+ >>> df = pd.DataFrame({'a': [2, np.nan, 6], 'b': [1, 3, 5]},
+ ... index=pd.date_range('20180101', periods=3,
+ ... freq='h'))
+ >>> df
+ a b
+ 2018-01-01 00:00:00 2.0 1
+ 2018-01-01 01:00:00 NaN 3
+ 2018-01-01 02:00:00 6.0 5
+
+ >>> df.resample('30min').backfill()
+ a b
+ 2018-01-01 00:00:00 2.0 1
+ 2018-01-01 00:30:00 NaN 3
+ 2018-01-01 01:00:00 NaN 3
+ 2018-01-01 01:30:00 6.0 5
+ 2018-01-01 02:00:00 6.0 5
+
+ >>> df.resample('15min').backfill(limit=2)
+ a b
+ 2018-01-01 00:00:00 2.0 1.0
+ 2018-01-01 00:15:00 NaN NaN
+ 2018-01-01 00:30:00 NaN 3.0
+ 2018-01-01 00:45:00 NaN 3.0
+ 2018-01-01 01:00:00 NaN 3.0
+ 2018-01-01 01:15:00 NaN NaN
+ 2018-01-01 01:30:00 6.0 5.0
+ 2018-01-01 01:45:00 6.0 5.0
+ 2018-01-01 02:00:00 6.0 5.0
+ """
+ return self._upsample('backfill', limit=limit)
+ bfill = backfill
+
+ def fillna(self, method, limit=None):
+ """
+ Fill missing values introduced by upsampling.
+
+ In statistics, imputation is the process of replacing missing data with
+ substituted values [1]_. When resampling data, missing values may
+ appear (e.g., when the resampling frequency is higher than the original
+ frequency).
+
+ Missing values that existed in the original data will
+ not be modified.
+
+ Parameters
+ ----------
+ method : {'pad', 'backfill', 'ffill', 'bfill', 'nearest'}
+ Method to use for filling holes in resampled data
+
+ * 'pad' or 'ffill': use previous valid observation to fill gap
+ (forward fill).
+ * 'backfill' or 'bfill': use next valid observation to fill gap.
+ * 'nearest': use nearest valid observation to fill gap.
+
+ limit : integer, optional
+ Limit of how many consecutive missing values to fill.
+
+ Returns
+ -------
+ Series or DataFrame
+ An upsampled Series or DataFrame with missing values filled.
+
+ See Also
+ --------
+ backfill : Backward fill NaN values in the resampled data.
+ pad : Forward fill NaN values in the resampled data.
+ nearest : Fill NaN values in the resampled data
+ with nearest neighbor starting from center.
+ interpolate : Fill NaN values using interpolation.
+ pandas.Series.fillna : Fill NaN values in the Series using the
+ specified method, which can be 'bfill' and 'ffill'.
+ pandas.DataFrame.fillna : Fill NaN values in the DataFrame using the
+ specified method, which can be 'bfill' and 'ffill'.
+
+ References
+ ----------
+ .. [1] https://en.wikipedia.org/wiki/Imputation_(statistics)
+
+ Examples
+ --------
+ Resampling a Series:
+
+ >>> s = pd.Series([1, 2, 3],
+ ... index=pd.date_range('20180101', periods=3, freq='h'))
+ >>> s
+ 2018-01-01 00:00:00 1
+ 2018-01-01 01:00:00 2
+ 2018-01-01 02:00:00 3
+ Freq: H, dtype: int64
+
+ Without filling the missing values you get:
+
+ >>> s.resample("30min").asfreq()
+ 2018-01-01 00:00:00 1.0
+ 2018-01-01 00:30:00 NaN
+ 2018-01-01 01:00:00 2.0
+ 2018-01-01 01:30:00 NaN
+ 2018-01-01 02:00:00 3.0
+ Freq: 30T, dtype: float64
+
+ >>> s.resample('30min').fillna("backfill")
+ 2018-01-01 00:00:00 1
+ 2018-01-01 00:30:00 2
+ 2018-01-01 01:00:00 2
+ 2018-01-01 01:30:00 3
+ 2018-01-01 02:00:00 3
+ Freq: 30T, dtype: int64
+
+ >>> s.resample('15min').fillna("backfill", limit=2)
+ 2018-01-01 00:00:00 1.0
+ 2018-01-01 00:15:00 NaN
+ 2018-01-01 00:30:00 2.0
+ 2018-01-01 00:45:00 2.0
+ 2018-01-01 01:00:00 2.0
+ 2018-01-01 01:15:00 NaN
+ 2018-01-01 01:30:00 3.0
+ 2018-01-01 01:45:00 3.0
+ 2018-01-01 02:00:00 3.0
+ Freq: 15T, dtype: float64
+
+ >>> s.resample('30min').fillna("pad")
+ 2018-01-01 00:00:00 1
+ 2018-01-01 00:30:00 1
+ 2018-01-01 01:00:00 2
+ 2018-01-01 01:30:00 2
+ 2018-01-01 02:00:00 3
+ Freq: 30T, dtype: int64
+
+ >>> s.resample('30min').fillna("nearest")
+ 2018-01-01 00:00:00 1
+ 2018-01-01 00:30:00 2
+ 2018-01-01 01:00:00 2
+ 2018-01-01 01:30:00 3
+ 2018-01-01 02:00:00 3
+ Freq: 30T, dtype: int64
+
+ Missing values present before the upsampling are not affected.
+
+ >>> sm = pd.Series([1, None, 3],
+ ... index=pd.date_range('20180101', periods=3, freq='h'))
+ >>> sm
+ 2018-01-01 00:00:00 1.0
+ 2018-01-01 01:00:00 NaN
+ 2018-01-01 02:00:00 3.0
+ Freq: H, dtype: float64
+
+ >>> sm.resample('30min').fillna('backfill')
+ 2018-01-01 00:00:00 1.0
+ 2018-01-01 00:30:00 NaN
+ 2018-01-01 01:00:00 NaN
+ 2018-01-01 01:30:00 3.0
+ 2018-01-01 02:00:00 3.0
+ Freq: 30T, dtype: float64
+
+ >>> sm.resample('30min').fillna('pad')
+ 2018-01-01 00:00:00 1.0
+ 2018-01-01 00:30:00 1.0
+ 2018-01-01 01:00:00 NaN
+ 2018-01-01 01:30:00 NaN
+ 2018-01-01 02:00:00 3.0
+ Freq: 30T, dtype: float64
+
+ >>> sm.resample('30min').fillna('nearest')
+ 2018-01-01 00:00:00 1.0
+ 2018-01-01 00:30:00 NaN
+ 2018-01-01 01:00:00 NaN
+ 2018-01-01 01:30:00 3.0
+ 2018-01-01 02:00:00 3.0
+ Freq: 30T, dtype: float64
+
+ DataFrame resampling is done column-wise. All the same options are
+ available.
+
+ >>> df = pd.DataFrame({'a': [2, np.nan, 6], 'b': [1, 3, 5]},
+ ... index=pd.date_range('20180101', periods=3,
+ ... freq='h'))
+ >>> df
+ a b
+ 2018-01-01 00:00:00 2.0 1
+ 2018-01-01 01:00:00 NaN 3
+ 2018-01-01 02:00:00 6.0 5
+
+ >>> df.resample('30min').fillna("bfill")
+ a b
+ 2018-01-01 00:00:00 2.0 1
+ 2018-01-01 00:30:00 NaN 3
+ 2018-01-01 01:00:00 NaN 3
+ 2018-01-01 01:30:00 6.0 5
+ 2018-01-01 02:00:00 6.0 5
+ """
+ return self._upsample(method, limit=limit)
+
+ @Appender(_shared_docs['interpolate'] % _shared_docs_kwargs)
+ def interpolate(self, method='linear', axis=0, limit=None, inplace=False,
+ limit_direction='forward', limit_area=None,
+ downcast=None, **kwargs):
+ """
+ Interpolate values according to different methods.
+
+ .. versionadded:: 0.18.1
+ """
+ result = self._upsample(None)
+ return result.interpolate(method=method, axis=axis, limit=limit,
+ inplace=inplace,
+ limit_direction=limit_direction,
+ limit_area=limit_area,
+ downcast=downcast, **kwargs)
+
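+    # Editorial note, not part of pandas: a hedged sketch of the interpolate
+    # path above -- upsample with no fill, then defer to Series/DataFrame
+    # .interpolate(); the data below is illustrative only.
+    #
+    #   >>> s = pd.Series([1.0, 3.0], index=pd.date_range('2018-01-01',
+    #   ...                                               periods=2, freq='H'))
+    #   >>> s.resample('30min').interpolate('linear')   # 00:30 becomes 2.0
+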
+ def asfreq(self, fill_value=None):
+ """
+ Return the values at the new freq, essentially a reindex.
+
+ Parameters
+ ----------
+ fill_value : scalar, optional
+            Value to use for missing values, applied during upsampling (note
+            that this does not fill NaNs that were already present).
+
+ .. versionadded:: 0.20.0
+
+ See Also
+ --------
+ Series.asfreq
+ DataFrame.asfreq
+ """
+ return self._upsample('asfreq', fill_value=fill_value)
+
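+    # Editorial note, not part of pandas: asfreq() only reindexes; a hedged
+    # illustration with assumed data (new slots get NaN unless fill_value is
+    # given, and pre-existing NaNs are left alone).
+    #
+    #   >>> s = pd.Series([1, 2], index=pd.date_range('2018-01-01',
+    #   ...                                           periods=2, freq='H'))
+    #   >>> s.resample('30min').asfreq()              # 00:30 -> NaN
+    #   >>> s.resample('30min').asfreq(fill_value=0)  # 00:30 -> 0
+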
+ def std(self, ddof=1, *args, **kwargs):
+ """
+ Compute standard deviation of groups, excluding missing values.
+
+ Parameters
+ ----------
+        ddof : int, default 1
+            Degrees of freedom.
+ """
+ nv.validate_resampler_func('std', args, kwargs)
+ return self._downsample('std', ddof=ddof)
+
+ def var(self, ddof=1, *args, **kwargs):
+ """
+ Compute variance of groups, excluding missing values.
+
+ Parameters
+ ----------
+        ddof : int, default 1
+            Degrees of freedom.
+ """
+ nv.validate_resampler_func('var', args, kwargs)
+ return self._downsample('var', ddof=ddof)
+
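+    # Editorial note, not part of pandas: std/var aggregate within each bin
+    # after downsampling; a hedged illustration with assumed hourly data.
+    #
+    #   >>> s = pd.Series(range(48), index=pd.date_range('2018-01-01',
+    #   ...                                              periods=48, freq='H'))
+    #   >>> s.resample('D').std()        # sample std (ddof=1) per daily bin
+    #   >>> s.resample('D').var(ddof=0)  # population variance per daily bin
+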
+ @Appender(GroupBy.size.__doc__)
+ def size(self):
+        # This is a special case, as the higher level does return
+        # a copy of 0-len objects. GH14962
+ result = self._downsample('size')
+ if not len(self.ax) and isinstance(self._selected_obj, ABCDataFrame):
+ result = pd.Series([], index=result.index, dtype='int64')
+ return result
+
+ def quantile(self, q=0.5, **kwargs):
+ """
+ Return value at the given quantile.
+
+ .. versionadded:: 0.24.0
+
+ Parameters
+ ----------
+ q : float or array-like, default 0.5 (50% quantile)
+
+ See Also
+ --------
+ Series.quantile
+ DataFrame.quantile
+ DataFrameGroupBy.quantile
+ """
+ return self._downsample('quantile', q=q, **kwargs)
+
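+    # Editorial note, not part of pandas: a hedged sketch of quantile() on a
+    # resampler, using assumed hourly data; q may also be array-like.
+    #
+    #   >>> s = pd.Series(range(48), index=pd.date_range('2018-01-01',
+    #   ...                                              periods=48, freq='H'))
+    #   >>> s.resample('D').quantile(0.9)          # 90th percentile per day
+    #   >>> s.resample('D').quantile([0.1, 0.9])   # both quantiles per day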
+
+# downsample methods
+for method in ['sum', 'prod']:
+
+ def f(self, _method=method, min_count=0, *args, **kwargs):
+ nv.validate_resampler_func(_method, args, kwargs)
+ return self._downsample(_method, min_count=min_count)
+ f.__doc__ = getattr(GroupBy, method).__doc__
+ setattr(Resampler, method, f)
+
+
+# downsample methods
+for method in ['min', 'max', 'first', 'last', 'mean', 'sem',
+ 'median', 'ohlc']:
+
+ def f(self, _method=method, *args, **kwargs):
+ nv.validate_resampler_func(_method, args, kwargs)
+ return self._downsample(_method)
+ f.__doc__ = getattr(GroupBy, method).__doc__
+ setattr(Resampler, method, f)
+
+# groupby & aggregate methods
+for method in ['count']:
+ def f(self, _method=method):
+ return self._downsample(_method)
+ f.__doc__ = getattr(GroupBy, method).__doc__
+ setattr(Resampler, method, f)
+
+# series only methods
+for method in ['nunique']:
+ def f(self, _method=method):
+ return self._downsample(_method)
+ f.__doc__ = getattr(SeriesGroupBy, method).__doc__
+ setattr(Resampler, method, f)
+
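+# Editorial note, not part of pandas: the loops above attach thin wrappers, so
+# e.g. Resampler.sum(...) is roughly self._downsample('sum', min_count=...).
+# A hedged usage sketch with assumed data:
+#
+#   >>> s = pd.Series([1.0, np.nan], index=pd.date_range('2018-01-01',
+#   ...                                                  periods=2, freq='D'))
+#   >>> s.resample('D').sum()             # all-NaN bins sum to 0
+#   >>> s.resample('D').sum(min_count=1)  # all-NaN bins stay NaN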
+
+def _maybe_process_deprecations(r, how=None, fill_method=None, limit=None):
+ """
+    We may need to show a deprecation warning, but the appropriate
+    methods are still called either way.
+ """
+
+ if how is not None:
+
+ # .resample(..., how='sum')
+ if isinstance(how, compat.string_types):
+ method = "{0}()".format(how)
+
+ # .resample(..., how=lambda x: ....)
+ else:
+ method = ".apply(<func>)"
+
+        # only warn about `how` here when there is no fill_method;
+        # otherwise the combined warning below covers both
+ if fill_method is None:
+ warnings.warn("how in .resample() is deprecated\n"
+ "the new syntax is "
+ ".resample(...).{method}".format(
+ method=method),
+ FutureWarning, stacklevel=3)
+ r = r.aggregate(how)
+
+ if fill_method is not None:
+
+ # show the prior function call
+ method = '.' + method if how is not None else ''
+
+ args = "limit={0}".format(limit) if limit is not None else ""
+ warnings.warn("fill_method is deprecated to .resample()\n"
+ "the new syntax is .resample(...){method}"
+ ".{fill_method}({args})".format(
+ method=method,
+ fill_method=fill_method,
+ args=args),
+ FutureWarning, stacklevel=3)
+
+ if how is not None:
+ r = getattr(r, fill_method)(limit=limit)
+ else:
+ r = r.aggregate(fill_method, limit=limit)
+
+ return r
+
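+# Editorial note, not part of pandas: a hedged sketch of the legacy calls the
+# helper above translates (deprecated resample keywords, shown only for
+# illustration).
+#
+#   >>> df.resample('D', how='sum')            # warns; use .resample('D').sum()
+#   >>> df.resample('D', fill_method='ffill')  # warns; use .resample('D').ffill()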
+
+class _GroupByMixin(GroupByMixin):
+ """
+ Provide the groupby facilities.
+ """
+ def __init__(self, obj, *args, **kwargs):
+
+ parent = kwargs.pop('parent', None)
+ groupby = kwargs.pop('groupby', None)
+ if parent is None:
+ parent = obj
+
+ # initialize our GroupByMixin object with
+ # the resampler attributes
+ for attr in self._attributes:
+ setattr(self, attr, kwargs.get(attr, getattr(parent, attr)))
+
+ super(_GroupByMixin, self).__init__(None)
+ self._groupby = groupby
+ self._groupby.mutated = True
+ self._groupby.grouper.mutated = True
+ self.groupby = copy.copy(parent.groupby)
+
+ def _apply(self, f, grouper=None, *args, **kwargs):
+ """
+ Dispatch to _upsample; we are stripping all of the _upsample kwargs and
+ performing the original function call on the grouped object.
+ """
+
+ def func(x):
+ x = self._shallow_copy(x, groupby=self.groupby)
+
+ if isinstance(f, compat.string_types):
+ return getattr(x, f)(**kwargs)
+
+ return x.apply(f, *args, **kwargs)
+
+ result = self._groupby.apply(func)
+ return self._wrap_result(result)
+
+ _upsample = _apply
+ _downsample = _apply
+ _groupby_and_aggregate = _apply
+
+
+class DatetimeIndexResampler(Resampler):
+
+ @property
+ def _resampler_for_grouping(self):
+ return DatetimeIndexResamplerGroupby
+
+ def _get_binner_for_time(self):
+
+ # this is how we are actually creating the bins
+ if self.kind == 'period':
+ return self.groupby._get_time_period_bins(self.ax)
+ return self.groupby._get_time_bins(self.ax)
+
+ def _downsample(self, how, **kwargs):
+ """
+        Downsample according to the given (possibly cython-mapped) function.
+
+        Parameters
+        ----------
+        how : string or cython-mapped function
+        **kwargs : keyword arguments passed to the how function
+ """
+ self._set_binner()
+ how = self._is_cython_func(how) or how
+ ax = self.ax
+ obj = self._selected_obj
+
+ if not len(ax):
+ # reset to the new freq
+ obj = obj.copy()
+ obj.index.freq = self.freq
+ return obj
+
+ # do we have a regular frequency
+ if ax.freq is not None or ax.inferred_freq is not None:
+
+ if len(self.grouper.binlabels) > len(ax) and how is None:
+
+ # let's do an asfreq
+ return self.asfreq()
+
+ # we are downsampling
+ # we want to call the actual grouper method here
+ result = obj.groupby(
+ self.grouper, axis=self.axis).aggregate(how, **kwargs)
+
+ result = self._apply_loffset(result)
+ return self._wrap_result(result)
+
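+    # Editorial note, not part of pandas: when the target frequency is coarser
+    # than the axis, _downsample above ends up in the groupby/aggregate branch;
+    # a hedged sketch with assumed minute data.
+    #
+    #   >>> s = pd.Series(range(120), index=pd.date_range('2018-01-01',
+    #   ...                                               periods=120, freq='T'))
+    #   >>> s.resample('H').mean()   # two hourly bins, 60 minute values each
+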
+ def _adjust_binner_for_upsample(self, binner):
+ """
+ Adjust our binner when upsampling.
+
+        The range of the new index should not be outside the specified range.
+ """
+ if self.closed == 'right':
+ binner = binner[1:]
+ else:
+ binner = binner[:-1]
+ return binner
+
+ def _upsample(self, method, limit=None, fill_value=None):
+ """
+ Parameters
+ ----------
+ method : string {'backfill', 'bfill', 'pad',
+ 'ffill', 'asfreq'} method for upsampling
+ limit : int, default None
+ Maximum size gap to fill when reindexing
+ fill_value : scalar, default None
+ Value to use for missing values
+
+ See Also
+ --------
+ .fillna
+
+ """
+ self._set_binner()
+ if self.axis:
+ raise AssertionError('axis must be 0')
+ if self._from_selection:
+ raise ValueError("Upsampling from level= or on= selection"
+ " is not supported, use .set_index(...)"
+ " to explicitly set index to"
+ " datetime-like")
+
+ ax = self.ax
+ obj = self._selected_obj
+ binner = self.binner
+ res_index = self._adjust_binner_for_upsample(binner)
+
+ # if we have the same frequency as our axis, then we are equal sampling
+ if limit is None and to_offset(ax.inferred_freq) == self.freq:
+ result = obj.copy()
+ result.index = res_index
+ else:
+ result = obj.reindex(res_index, method=method,
+ limit=limit, fill_value=fill_value)
+
+ result = self._apply_loffset(result)
+ return self._wrap_result(result)
+
+ def _wrap_result(self, result):
+ result = super(DatetimeIndexResampler, self)._wrap_result(result)
+
+        # we may have a different kind than we were asked for originally;
+        # convert if needed
+ if self.kind == 'period' and not isinstance(result.index, PeriodIndex):
+ result.index = result.index.to_period(self.freq)
+ return result
+
+
+class DatetimeIndexResamplerGroupby(_GroupByMixin, DatetimeIndexResampler):
+ """
+    Provides a resample of a groupby implementation.
+
+ .. versionadded:: 0.18.1
+ """
+ @property
+ def _constructor(self):
+ return DatetimeIndexResampler
+
+
+class PeriodIndexResampler(DatetimeIndexResampler):
+
+ @property
+ def _resampler_for_grouping(self):
+ return PeriodIndexResamplerGroupby
+
+ def _get_binner_for_time(self):
+ if self.kind == 'timestamp':
+ return super(PeriodIndexResampler, self)._get_binner_for_time()
+ return self.groupby._get_period_bins(self.ax)
+
+ def _convert_obj(self, obj):
+ obj = super(PeriodIndexResampler, self)._convert_obj(obj)
+
+ if self._from_selection:
+ # see GH 14008, GH 12871
+ msg = ("Resampling from level= or on= selection"
+ " with a PeriodIndex is not currently supported,"
+ " use .set_index(...) to explicitly set index")
+ raise NotImplementedError(msg)
+
+ if self.loffset is not None:
+ # Cannot apply loffset/timedelta to PeriodIndex -> convert to
+ # timestamps
+ self.kind = 'timestamp'
+
+ # convert to timestamp
+ if self.kind == 'timestamp':
+ obj = obj.to_timestamp(how=self.convention)
+
+ return obj
+
+ def _downsample(self, how, **kwargs):
+ """
+        Downsample according to the given (possibly cython-mapped) function.
+
+        Parameters
+        ----------
+        how : string or cython-mapped function
+        **kwargs : keyword arguments passed to the how function
+ """
+
+ # we may need to actually resample as if we are timestamps
+ if self.kind == 'timestamp':
+ return super(PeriodIndexResampler, self)._downsample(how, **kwargs)
+
+ how = self._is_cython_func(how) or how
+ ax = self.ax
+
+ if is_subperiod(ax.freq, self.freq):
+ # Downsampling
+ return self._groupby_and_aggregate(how, grouper=self.grouper,
+ **kwargs)
+ elif is_superperiod(ax.freq, self.freq):
+ if how == 'ohlc':
+ # GH #13083
+ # upsampling to subperiods is handled as an asfreq, which works
+ # for pure aggregating/reducing methods
+ # OHLC reduces along the time dimension, but creates multiple
+ # values for each period -> handle by _groupby_and_aggregate()
+ return self._groupby_and_aggregate(how, grouper=self.grouper)
+ return self.asfreq()
+ elif ax.freq == self.freq:
+ return self.asfreq()
+
+ raise IncompatibleFrequency(
+ 'Frequency {} cannot be resampled to {}, as they are not '
+ 'sub or super periods'.format(ax.freq, self.freq))
+
+ def _upsample(self, method, limit=None, fill_value=None):
+ """
+ Parameters
+ ----------
+ method : string {'backfill', 'bfill', 'pad', 'ffill'}
+ method for upsampling
+ limit : int, default None
+ Maximum size gap to fill when reindexing
+ fill_value : scalar, default None
+ Value to use for missing values
+
+ See Also
+ --------
+ .fillna
+
+ """
+
+ # we may need to actually resample as if we are timestamps
+ if self.kind == 'timestamp':
+ return super(PeriodIndexResampler, self)._upsample(
+ method, limit=limit, fill_value=fill_value)
+
+ self._set_binner()
+ ax = self.ax
+ obj = self.obj
+ new_index = self.binner
+
+ # Start vs. end of period
+ memb = ax.asfreq(self.freq, how=self.convention)
+
+ # Get the fill indexer
+ indexer = memb.get_indexer(new_index, method=method, limit=limit)
+ return self._wrap_result(_take_new_index(
+ obj, indexer, new_index, axis=self.axis))
+
+
+class PeriodIndexResamplerGroupby(_GroupByMixin, PeriodIndexResampler):
+ """
+ Provides a resample of a groupby implementation.
+
+ .. versionadded:: 0.18.1
+ """
+ @property
+ def _constructor(self):
+ return PeriodIndexResampler
+
+
+class TimedeltaIndexResampler(DatetimeIndexResampler):
+
+ @property
+ def _resampler_for_grouping(self):
+ return TimedeltaIndexResamplerGroupby
+
+ def _get_binner_for_time(self):
+ return self.groupby._get_time_delta_bins(self.ax)
+
+ def _adjust_binner_for_upsample(self, binner):
+ """
+ Adjust our binner when upsampling.
+
+        The range of the new index is allowed to be greater than the original
+        range, so we don't need to change the length of the binner (GH 13022).
+ """
+ return binner
+
+
+class TimedeltaIndexResamplerGroupby(_GroupByMixin, TimedeltaIndexResampler):
+ """
+ Provides a resample of a groupby implementation.
+
+ .. versionadded:: 0.18.1
+ """
+ @property
+ def _constructor(self):
+ return TimedeltaIndexResampler
+
+
+def resample(obj, kind=None, **kwds):
+ """
+ Create a TimeGrouper and return our resampler.
+ """
+ tg = TimeGrouper(**kwds)
+ return tg._get_resampler(obj, kind=kind)
+
+
+resample.__doc__ = Resampler.__doc__
+
+
+def get_resampler_for_grouping(groupby, rule, how=None, fill_method=None,
+ limit=None, kind=None, **kwargs):
+ """
+ Return our appropriate resampler when grouping as well.
+ """
+
+ # .resample uses 'on' similar to how .groupby uses 'key'
+ kwargs['key'] = kwargs.pop('on', None)
+
+ tg = TimeGrouper(freq=rule, **kwargs)
+ resampler = tg._get_resampler(groupby.obj, kind=kind)
+ r = resampler._get_resampler_for_grouping(groupby=groupby)
+ return _maybe_process_deprecations(r,
+ how=how,
+ fill_method=fill_method,
+ limit=limit)
+
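+# Editorial note, not part of pandas: this helper backs the groupby-then-
+# resample path; a hedged sketch with assumed data and column names.
+#
+#   >>> df = pd.DataFrame({'key': ['a', 'a', 'b'], 'val': [1, 2, 3]},
+#   ...                   index=pd.date_range('2018-01-01', periods=3, freq='D'))
+#   >>> df.groupby('key').resample('D').sum()   # one Resampler per group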
+
+class TimeGrouper(Grouper):
+ """
+ Custom groupby class for time-interval grouping.
+
+ Parameters
+ ----------
+ freq : pandas date offset or offset alias for identifying bin edges
+ closed : closed end of interval; 'left' or 'right'
+ label : interval boundary to use for labeling; 'left' or 'right'
+    convention : {'start', 'end', 'e', 's'}
+        Only used if the axis is a PeriodIndex.
+ """
+ _attributes = Grouper._attributes + ('closed', 'label', 'how',
+ 'loffset', 'kind', 'convention',
+ 'base')
+
+ def __init__(self, freq='Min', closed=None, label=None, how='mean',
+ axis=0, fill_method=None, limit=None, loffset=None,
+ kind=None, convention=None, base=0, **kwargs):
+ # Check for correctness of the keyword arguments which would
+ # otherwise silently use the default if misspelled
+ if label not in {None, 'left', 'right'}:
+ raise ValueError('Unsupported value {} for `label`'.format(label))
+ if closed not in {None, 'left', 'right'}:
+ raise ValueError('Unsupported value {} for `closed`'.format(
+ closed))
+ if convention not in {None, 'start', 'end', 'e', 's'}:
+ raise ValueError('Unsupported value {} for `convention`'
+ .format(convention))
+
+ freq = to_offset(freq)
+
+ end_types = {'M', 'A', 'Q', 'BM', 'BA', 'BQ', 'W'}
+ rule = freq.rule_code
+ if (rule in end_types or
+ ('-' in rule and rule[:rule.find('-')] in end_types)):
+ if closed is None:
+ closed = 'right'
+ if label is None:
+ label = 'right'
+ else:
+ if closed is None:
+ closed = 'left'
+ if label is None:
+ label = 'left'
+
+ self.closed = closed
+ self.label = label
+ self.kind = kind
+
+ self.convention = convention or 'E'
+ self.convention = self.convention.lower()
+
+ if isinstance(loffset, compat.string_types):
+ loffset = to_offset(loffset)
+ self.loffset = loffset
+
+ self.how = how
+ self.fill_method = fill_method
+ self.limit = limit
+ self.base = base
+
+ # always sort time groupers
+ kwargs['sort'] = True
+
+ super(TimeGrouper, self).__init__(freq=freq, axis=axis, **kwargs)
+
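+    # Editorial note, not part of pandas: the defaults chosen in __init__ above
+    # make end-anchored rules ('M', 'A', 'Q', 'BM', 'BA', 'BQ', 'W' and their
+    # '-SUF' variants) closed/labelled on the right, everything else on the
+    # left; a hedged sketch of the observable effect.
+    #
+    #   >>> df.resample('M').sum()    # bins labelled by month end (right)
+    #   >>> df.resample('5T').sum()   # bins labelled by interval start (left)
+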
+ def _get_resampler(self, obj, kind=None):
+ """
+ Return my resampler or raise if we have an invalid axis.
+
+ Parameters
+ ----------
+ obj : input object
+ kind : string, optional
+            'period', 'timestamp' and 'timedelta' are valid
+
+ Returns
+ -------
+ a Resampler
+
+ Raises
+ ------
+ TypeError if incompatible axis
+
+ """
+ self._set_grouper(obj)
+
+ ax = self.ax
+ if isinstance(ax, DatetimeIndex):
+ return DatetimeIndexResampler(obj,
+ groupby=self,
+ kind=kind,
+ axis=self.axis)
+ elif isinstance(ax, PeriodIndex) or kind == 'period':
+ return PeriodIndexResampler(obj,
+ groupby=self,
+ kind=kind,
+ axis=self.axis)
+ elif isinstance(ax, TimedeltaIndex):
+ return TimedeltaIndexResampler(obj,
+ groupby=self,
+ axis=self.axis)
+
+ raise TypeError("Only valid with DatetimeIndex, "
+ "TimedeltaIndex or PeriodIndex, "
+ "but got an instance of %r" % type(ax).__name__)
+
+ def _get_grouper(self, obj, validate=True):
+ # create the resampler and return our binner
+ r = self._get_resampler(obj)
+ r._set_binner()
+ return r.binner, r.grouper, r.obj
+
+ def _get_time_bins(self, ax):
+ if not isinstance(ax, DatetimeIndex):
+ raise TypeError('axis must be a DatetimeIndex, but got '
+ 'an instance of %r' % type(ax).__name__)
+
+ if len(ax) == 0:
+ binner = labels = DatetimeIndex(
+ data=[], freq=self.freq, name=ax.name)
+ return binner, [], labels
+
+ first, last = _get_timestamp_range_edges(ax.min(), ax.max(),
+ self.freq,
+ closed=self.closed,
+ base=self.base)
+ # GH #12037
+        # use first/last directly instead of calling replace() on them,
+        # because replace() will swallow the nanosecond part; otherwise the
+        # last bin may end slightly before the end if the end contains a
+        # nanosecond part, leading to a `Values falls after last bin` error
+ binner = labels = date_range(freq=self.freq,
+ start=first,
+ end=last,
+ tz=ax.tz,
+ name=ax.name,
+ ambiguous='infer',
+ nonexistent='shift_forward')
+
+ ax_values = ax.asi8
+ binner, bin_edges = self._adjust_bin_edges(binner, ax_values)
+
+ # general version, knowing nothing about relative frequencies
+ bins = lib.generate_bins_dt64(
+ ax_values, bin_edges, self.closed, hasnans=ax.hasnans)
+
+ if self.closed == 'right':
+ labels = binner
+ if self.label == 'right':
+ labels = labels[1:]
+ elif self.label == 'right':
+ labels = labels[1:]
+
+ if ax.hasnans:
+ binner = binner.insert(0, NaT)
+ labels = labels.insert(0, NaT)
+
+ # if we end up with more labels than bins
+ # adjust the labels
+ # GH4076
+ if len(bins) < len(labels):
+ labels = labels[:len(bins)]
+
+ return binner, bins, labels
+
+ def _adjust_bin_edges(self, binner, ax_values):
+ # Some hacks for > daily data, see #1471, #1458, #1483
+
+ if self.freq != 'D' and is_superperiod(self.freq, 'D'):
+ if self.closed == 'right':
+ # GH 21459, GH 9119: Adjust the bins relative to the wall time
+ bin_edges = binner.tz_localize(None)
+ bin_edges = bin_edges + timedelta(1) - Nano(1)
+ bin_edges = bin_edges.tz_localize(binner.tz).asi8
+ else:
+ bin_edges = binner.asi8
+
+ # intraday values on last day
+ if bin_edges[-2] > ax_values.max():
+ bin_edges = bin_edges[:-1]
+ binner = binner[:-1]
+ else:
+ bin_edges = binner.asi8
+ return binner, bin_edges
+
+ def _get_time_delta_bins(self, ax):
+ if not isinstance(ax, TimedeltaIndex):
+ raise TypeError('axis must be a TimedeltaIndex, but got '
+ 'an instance of %r' % type(ax).__name__)
+
+ if not len(ax):
+ binner = labels = TimedeltaIndex(
+ data=[], freq=self.freq, name=ax.name)
+ return binner, [], labels
+
+ start, end = ax.min(), ax.max()
+ labels = binner = timedelta_range(start=start,
+ end=end,
+ freq=self.freq,
+ name=ax.name)
+
+ end_stamps = labels + self.freq
+ bins = ax.searchsorted(end_stamps, side='left')
+
+ # Addresses GH #10530
+ if self.base > 0:
+ labels += type(self.freq)(self.base)
+
+ return binner, bins, labels
+
+ def _get_time_period_bins(self, ax):
+ if not isinstance(ax, DatetimeIndex):
+ raise TypeError('axis must be a DatetimeIndex, but got '
+ 'an instance of %r' % type(ax).__name__)
+
+ freq = self.freq
+
+ if not len(ax):
+ binner = labels = PeriodIndex(data=[], freq=freq, name=ax.name)
+ return binner, [], labels
+
+ labels = binner = pd.period_range(start=ax[0],
+ end=ax[-1],
+ freq=freq,
+ name=ax.name)
+
+ end_stamps = (labels + freq).asfreq(freq, 's').to_timestamp()
+ if ax.tzinfo:
+ end_stamps = end_stamps.tz_localize(ax.tzinfo)
+ bins = ax.searchsorted(end_stamps, side='left')
+
+ return binner, bins, labels
+
+ def _get_period_bins(self, ax):
+ if not isinstance(ax, PeriodIndex):
+ raise TypeError('axis must be a PeriodIndex, but got '
+ 'an instance of %r' % type(ax).__name__)
+
+ memb = ax.asfreq(self.freq, how=self.convention)
+
+ # NaT handling as in pandas._lib.lib.generate_bins_dt64()
+ nat_count = 0
+ if memb.hasnans:
+ nat_count = np.sum(memb._isnan)
+ memb = memb[~memb._isnan]
+
+ # if index contains no valid (non-NaT) values, return empty index
+ if not len(memb):
+ binner = labels = PeriodIndex(
+ data=[], freq=self.freq, name=ax.name)
+ return binner, [], labels
+
+ freq_mult = self.freq.n
+
+ start = ax.min().asfreq(self.freq, how=self.convention)
+ end = ax.max().asfreq(self.freq, how='end')
+ bin_shift = 0
+
+ # GH 23882
+ if self.base:
+ # get base adjusted bin edge labels
+ p_start, end = _get_period_range_edges(start,
+ end,
+ self.freq,
+ closed=self.closed,
+ base=self.base)
+
+ # Get offset for bin edge (not label edge) adjustment
+ start_offset = (pd.Period(start, self.freq)
+ - pd.Period(p_start, self.freq))
+ bin_shift = start_offset.n % freq_mult
+ start = p_start
+
+ labels = binner = pd.period_range(start=start, end=end,
+ freq=self.freq, name=ax.name)
+
+ i8 = memb.asi8
+
+ # when upsampling to subperiods, we need to generate enough bins
+ expected_bins_count = len(binner) * freq_mult
+ i8_extend = expected_bins_count - (i8[-1] - i8[0])
+ rng = np.arange(i8[0], i8[-1] + i8_extend, freq_mult)
+ rng += freq_mult
+ # adjust bin edge indexes to account for base
+ rng -= bin_shift
+ bins = memb.searchsorted(rng, side='left')
+
+ if nat_count > 0:
+ # NaT handling as in pandas._lib.lib.generate_bins_dt64()
+ # shift bins by the number of NaT
+ bins += nat_count
+ bins = np.insert(bins, 0, nat_count)
+ binner = binner.insert(0, NaT)
+ labels = labels.insert(0, NaT)
+
+ return binner, bins, labels
+
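+    # Editorial note, not part of pandas: a hedged sketch of the period-bin
+    # path above with assumed monthly data; each quarterly bin groups three
+    # monthly members.
+    #
+    #   >>> s = pd.Series(range(6),
+    #   ...               index=pd.period_range('2000-01', periods=6, freq='M'))
+    #   >>> s.resample('Q').sum()   # Q1 -> 0+1+2, Q2 -> 3+4+5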
+
+def _take_new_index(obj, indexer, new_index, axis=0):
+ from pandas.core.api import Series, DataFrame
+
+ if isinstance(obj, Series):
+ new_values = algos.take_1d(obj.values, indexer)
+ return Series(new_values, index=new_index, name=obj.name)
+ elif isinstance(obj, DataFrame):
+ if axis == 1:
+ raise NotImplementedError("axis 1 is not supported")
+ return DataFrame(obj._data.reindex_indexer(
+ new_axis=new_index, indexer=indexer, axis=1))
+ else:
+ raise ValueError("'obj' should be either a Series or a DataFrame")
+
+
+def _get_timestamp_range_edges(first, last, offset, closed='left', base=0):
+ """
+    Adjust the `first` Timestamp to the preceding Timestamp that resides on
+ the provided offset. Adjust the `last` Timestamp to the following
+ Timestamp that resides on the provided offset. Input Timestamps that
+ already reside on the offset will be adjusted depending on the type of
+ offset and the `closed` parameter.
+
+ Parameters
+ ----------
+ first : pd.Timestamp
+ The beginning Timestamp of the range to be adjusted.
+ last : pd.Timestamp
+ The ending Timestamp of the range to be adjusted.
+ offset : pd.DateOffset
+ The dateoffset to which the Timestamps will be adjusted.
+    closed : {'right', 'left'}, default 'left'
+ Which side of bin interval is closed.
+ base : int, default 0
+ The "origin" of the adjusted Timestamps.
+
+ Returns
+ -------
+ A tuple of length 2, containing the adjusted pd.Timestamp objects.
+ """
+ if isinstance(offset, Tick):
+ if isinstance(offset, Day):
+ # _adjust_dates_anchored assumes 'D' means 24H, but first/last
+ # might contain a DST transition (23H, 24H, or 25H).
+ # So "pretend" the dates are naive when adjusting the endpoints
+ tz = first.tz
+ first = first.tz_localize(None)
+ last = last.tz_localize(None)
+
+ first, last = _adjust_dates_anchored(first, last, offset,
+ closed=closed, base=base)
+ if isinstance(offset, Day):
+ first = first.tz_localize(tz)
+ last = last.tz_localize(tz)
+ return first, last
+
+ else:
+ first = first.normalize()
+ last = last.normalize()
+
+ if closed == 'left':
+ first = Timestamp(offset.rollback(first))
+ else:
+ first = Timestamp(first - offset)
+
+ last = Timestamp(last + offset)
+
+ return first, last
+
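+# Editorial note, not part of pandas: a hedged worked example of the Tick
+# branch above (values chosen for illustration). For a 5-minute offset with
+# closed='left' and base=0, offsets are measured from the start of the day,
+# so first/last are rolled outward onto 5-minute boundaries:
+#
+#   first = 2018-01-01 09:03, last = 2018-01-01 09:17
+#   -> _adjust_dates_anchored returns (09:00, 09:20)
+#
+# Non-Tick offsets instead normalize the endpoints and roll them with the
+# offset itself.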
+
+def _get_period_range_edges(first, last, offset, closed='left', base=0):
+ """
+ Adjust the provided `first` and `last` Periods to the respective Period of
+ the given offset that encompasses them.
+
+ Parameters
+ ----------
+ first : pd.Period
+ The beginning Period of the range to be adjusted.
+ last : pd.Period
+ The ending Period of the range to be adjusted.
+ offset : pd.DateOffset
+ The dateoffset to which the Periods will be adjusted.
+    closed : {'right', 'left'}, default 'left'
+ Which side of bin interval is closed.
+ base : int, default 0
+ The "origin" of the adjusted Periods.
+
+ Returns
+ -------
+ A tuple of length 2, containing the adjusted pd.Period objects.
+ """
+ if not all(isinstance(obj, pd.Period) for obj in [first, last]):
+ raise TypeError("'first' and 'last' must be instances of type Period")
+
+ # GH 23882
+ first = first.to_timestamp()
+ last = last.to_timestamp()
+ adjust_first = not offset.onOffset(first)
+ adjust_last = offset.onOffset(last)
+
+ first, last = _get_timestamp_range_edges(first, last, offset,
+ closed=closed, base=base)
+
+ first = (first + adjust_first * offset).to_period(offset)
+ last = (last - adjust_last * offset).to_period(offset)
+ return first, last
+
+
+def _adjust_dates_anchored(first, last, offset, closed='right', base=0):
+ # First and last offsets should be calculated from the start day to fix an
+    # error caused by resampling across multiple days when a one-day period is
+ # not a multiple of the frequency.
+ #
+ # See https://github.com/pandas-dev/pandas/issues/8683
+
+ # GH 10117 & GH 19375. If first and last contain timezone information,
+    # perform the calculation in UTC in order to avoid localizing on an
+    # ambiguous or nonexistent time.
+ first_tzinfo = first.tzinfo
+ last_tzinfo = last.tzinfo
+ start_day_nanos = first.normalize().value
+ if first_tzinfo is not None:
+ first = first.tz_convert('UTC')
+ if last_tzinfo is not None:
+ last = last.tz_convert('UTC')
+
+ base_nanos = (base % offset.n) * offset.nanos // offset.n
+ start_day_nanos += base_nanos
+
+ foffset = (first.value - start_day_nanos) % offset.nanos
+ loffset = (last.value - start_day_nanos) % offset.nanos
+
+ if closed == 'right':
+ if foffset > 0:
+ # roll back
+ fresult = first.value - foffset
+ else:
+ fresult = first.value - offset.nanos
+
+ if loffset > 0:
+ # roll forward
+ lresult = last.value + (offset.nanos - loffset)
+ else:
+ # already the end of the road
+ lresult = last.value
+ else: # closed == 'left'
+ if foffset > 0:
+ fresult = first.value - foffset
+ else:
+ # start of the road
+ fresult = first.value
+
+ if loffset > 0:
+ # roll forward
+ lresult = last.value + (offset.nanos - loffset)
+ else:
+ lresult = last.value + offset.nanos
+ fresult = Timestamp(fresult)
+ lresult = Timestamp(lresult)
+ if first_tzinfo is not None:
+ fresult = fresult.tz_localize('UTC').tz_convert(first_tzinfo)
+ if last_tzinfo is not None:
+ lresult = lresult.tz_localize('UTC').tz_convert(last_tzinfo)
+ return fresult, lresult
+
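+# Editorial note, not part of pandas: `base` above shifts the anchoring of the
+# bin edges within one offset unit; a hedged sketch with an assumed Series.
+#
+#   >>> s.resample('5min').sum()          # edges at :00, :05, :10, ...
+#   >>> s.resample('5min', base=2).sum()  # edges at :02, :07, :12, ...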
+
+def asfreq(obj, freq, method=None, how=None, normalize=False, fill_value=None):
+ """
+ Utility frequency conversion method for Series/DataFrame.
+ """
+ if isinstance(obj.index, PeriodIndex):
+ if method is not None:
+ raise NotImplementedError("'method' argument is not supported")
+
+ if how is None:
+ how = 'E'
+
+ new_obj = obj.copy()
+ new_obj.index = obj.index.asfreq(freq, how=how)
+
+ elif len(obj.index) == 0:
+ new_obj = obj.copy()
+ new_obj.index = obj.index._shallow_copy(freq=to_offset(freq))
+
+ else:
+ dti = date_range(obj.index[0], obj.index[-1], freq=freq)
+ dti.name = obj.index.name
+ new_obj = obj.reindex(dti, method=method, fill_value=fill_value)
+ if normalize:
+ new_obj.index = new_obj.index.normalize()
+
+ return new_obj
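+
+# Editorial note, not part of pandas: the helper above backs Series/DataFrame
+# .asfreq(); a hedged sketch with assumed data.
+#
+#   >>> s = pd.Series([1, 2], index=pd.date_range('2018-01-01',
+#   ...                                           periods=2, freq='H'))
+#   >>> s.asfreq('30min')                 # the new 00:30 slot becomes NaN
+#   >>> s.asfreq('30min', method='pad')   # the new slot is forward-filled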
diff --git a/contrib/python/pandas/py2/pandas/core/reshape/__init__.py b/contrib/python/pandas/py2/pandas/core/reshape/__init__.py
new file mode 100644
index 00000000000..e69de29bb2d
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/core/reshape/__init__.py
diff --git a/contrib/python/pandas/py2/pandas/core/reshape/api.py b/contrib/python/pandas/py2/pandas/core/reshape/api.py
new file mode 100644
index 00000000000..3c76eef809c
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/core/reshape/api.py
@@ -0,0 +1,8 @@
+# flake8: noqa
+
+from pandas.core.reshape.concat import concat
+from pandas.core.reshape.melt import lreshape, melt, wide_to_long
+from pandas.core.reshape.merge import merge, merge_asof, merge_ordered
+from pandas.core.reshape.pivot import crosstab, pivot, pivot_table
+from pandas.core.reshape.reshape import get_dummies
+from pandas.core.reshape.tile import cut, qcut
diff --git a/contrib/python/pandas/py2/pandas/core/reshape/concat.py b/contrib/python/pandas/py2/pandas/core/reshape/concat.py
new file mode 100644
index 00000000000..53671e00e88
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/core/reshape/concat.py
@@ -0,0 +1,635 @@
+"""
+concat routines
+"""
+
+import numpy as np
+
+import pandas.core.dtypes.concat as _concat
+
+from pandas import DataFrame, Index, MultiIndex, Series, compat
+from pandas.core import common as com
+from pandas.core.arrays.categorical import (
+ _factorize_from_iterable, _factorize_from_iterables)
+from pandas.core.generic import NDFrame
+from pandas.core.index import (
+ _all_indexes_same, _get_consensus_names, _get_objs_combined_axis,
+ ensure_index)
+import pandas.core.indexes.base as ibase
+from pandas.core.internals import concatenate_block_managers
+
+# ---------------------------------------------------------------------
+# Concatenate DataFrame objects
+
+
+def concat(objs, axis=0, join='outer', join_axes=None, ignore_index=False,
+ keys=None, levels=None, names=None, verify_integrity=False,
+ sort=None, copy=True):
+ """
+ Concatenate pandas objects along a particular axis with optional set logic
+ along the other axes.
+
+ Can also add a layer of hierarchical indexing on the concatenation axis,
+ which may be useful if the labels are the same (or overlapping) on
+ the passed axis number.
+
+ Parameters
+ ----------
+ objs : a sequence or mapping of Series, DataFrame, or Panel objects
+ If a dict is passed, the sorted keys will be used as the `keys`
+ argument, unless it is passed, in which case the values will be
+ selected (see below). Any None objects will be dropped silently unless
+        they are all None, in which case a ValueError will be raised.
+ axis : {0/'index', 1/'columns'}, default 0
+ The axis to concatenate along
+ join : {'inner', 'outer'}, default 'outer'
+ How to handle indexes on other axis(es)
+ join_axes : list of Index objects
+ Specific indexes to use for the other n - 1 axes instead of performing
+ inner/outer set logic
+ ignore_index : boolean, default False
+ If True, do not use the index values along the concatenation axis. The
+ resulting axis will be labeled 0, ..., n - 1. This is useful if you are
+ concatenating objects where the concatenation axis does not have
+ meaningful indexing information. Note the index values on the other
+ axes are still respected in the join.
+ keys : sequence, default None
+ If multiple levels passed, should contain tuples. Construct
+ hierarchical index using the passed keys as the outermost level
+ levels : list of sequences, default None
+ Specific levels (unique values) to use for constructing a
+ MultiIndex. Otherwise they will be inferred from the keys
+ names : list, default None
+ Names for the levels in the resulting hierarchical index
+ verify_integrity : boolean, default False
+ Check whether the new concatenated axis contains duplicates. This can
+ be very expensive relative to the actual data concatenation
+ sort : boolean, default None
+ Sort non-concatenation axis if it is not already aligned when `join`
+ is 'outer'. The current default of sorting is deprecated and will
+ change to not-sorting in a future version of pandas.
+
+ Explicitly pass ``sort=True`` to silence the warning and sort.
+ Explicitly pass ``sort=False`` to silence the warning and not sort.
+
+ This has no effect when ``join='inner'``, which already preserves
+ the order of the non-concatenation axis.
+
+ .. versionadded:: 0.23.0
+
+ copy : boolean, default True
+ If False, do not copy data unnecessarily
+
+ Returns
+ -------
+ concatenated : object, type of objs
+ When concatenating all ``Series`` along the index (axis=0), a
+ ``Series`` is returned. When ``objs`` contains at least one
+ ``DataFrame``, a ``DataFrame`` is returned. When concatenating along
+ the columns (axis=1), a ``DataFrame`` is returned.
+
+ See Also
+ --------
+ Series.append
+ DataFrame.append
+ DataFrame.join
+ DataFrame.merge
+
+ Notes
+ -----
+ The keys, levels, and names arguments are all optional.
+
+ A walkthrough of how this method fits in with other tools for combining
+ pandas objects can be found `here
+ <http://pandas.pydata.org/pandas-docs/stable/merging.html>`__.
+
+ Examples
+ --------
+ Combine two ``Series``.
+
+ >>> s1 = pd.Series(['a', 'b'])
+ >>> s2 = pd.Series(['c', 'd'])
+ >>> pd.concat([s1, s2])
+ 0 a
+ 1 b
+ 0 c
+ 1 d
+ dtype: object
+
+ Clear the existing index and reset it in the result
+ by setting the ``ignore_index`` option to ``True``.
+
+ >>> pd.concat([s1, s2], ignore_index=True)
+ 0 a
+ 1 b
+ 2 c
+ 3 d
+ dtype: object
+
+ Add a hierarchical index at the outermost level of
+ the data with the ``keys`` option.
+
+ >>> pd.concat([s1, s2], keys=['s1', 's2',])
+ s1 0 a
+ 1 b
+ s2 0 c
+ 1 d
+ dtype: object
+
+ Label the index keys you create with the ``names`` option.
+
+ >>> pd.concat([s1, s2], keys=['s1', 's2'],
+ ... names=['Series name', 'Row ID'])
+ Series name Row ID
+ s1 0 a
+ 1 b
+ s2 0 c
+ 1 d
+ dtype: object
+
+ Combine two ``DataFrame`` objects with identical columns.
+
+ >>> df1 = pd.DataFrame([['a', 1], ['b', 2]],
+ ... columns=['letter', 'number'])
+ >>> df1
+ letter number
+ 0 a 1
+ 1 b 2
+ >>> df2 = pd.DataFrame([['c', 3], ['d', 4]],
+ ... columns=['letter', 'number'])
+ >>> df2
+ letter number
+ 0 c 3
+ 1 d 4
+ >>> pd.concat([df1, df2])
+ letter number
+ 0 a 1
+ 1 b 2
+ 0 c 3
+ 1 d 4
+
+ Combine ``DataFrame`` objects with overlapping columns
+ and return everything. Columns outside the intersection will
+ be filled with ``NaN`` values.
+
+ >>> df3 = pd.DataFrame([['c', 3, 'cat'], ['d', 4, 'dog']],
+ ... columns=['letter', 'number', 'animal'])
+ >>> df3
+ letter number animal
+ 0 c 3 cat
+ 1 d 4 dog
+ >>> pd.concat([df1, df3], sort=False)
+ letter number animal
+ 0 a 1 NaN
+ 1 b 2 NaN
+ 0 c 3 cat
+ 1 d 4 dog
+
+ Combine ``DataFrame`` objects with overlapping columns
+ and return only those that are shared by passing ``inner`` to
+ the ``join`` keyword argument.
+
+ >>> pd.concat([df1, df3], join="inner")
+ letter number
+ 0 a 1
+ 1 b 2
+ 0 c 3
+ 1 d 4
+
+ Combine ``DataFrame`` objects horizontally along the x axis by
+ passing in ``axis=1``.
+
+ >>> df4 = pd.DataFrame([['bird', 'polly'], ['monkey', 'george']],
+ ... columns=['animal', 'name'])
+ >>> pd.concat([df1, df4], axis=1)
+ letter number animal name
+ 0 a 1 bird polly
+ 1 b 2 monkey george
+
+ Prevent the result from including duplicate index values with the
+ ``verify_integrity`` option.
+
+ >>> df5 = pd.DataFrame([1], index=['a'])
+ >>> df5
+ 0
+ a 1
+ >>> df6 = pd.DataFrame([2], index=['a'])
+ >>> df6
+ 0
+ a 2
+ >>> pd.concat([df5, df6], verify_integrity=True)
+ Traceback (most recent call last):
+ ...
+ ValueError: Indexes have overlapping values: ['a']
+ """
+ op = _Concatenator(objs, axis=axis, join_axes=join_axes,
+ ignore_index=ignore_index, join=join,
+ keys=keys, levels=levels, names=names,
+ verify_integrity=verify_integrity,
+ copy=copy, sort=sort)
+ return op.get_result()
+
+
+class _Concatenator(object):
+ """
+ Orchestrates a concatenation operation for BlockManagers
+ """
+
+ def __init__(self, objs, axis=0, join='outer', join_axes=None,
+ keys=None, levels=None, names=None,
+ ignore_index=False, verify_integrity=False, copy=True,
+ sort=False):
+ if isinstance(objs, (NDFrame, compat.string_types)):
+ raise TypeError('first argument must be an iterable of pandas '
+ 'objects, you passed an object of type '
+ '"{name}"'.format(name=type(objs).__name__))
+
+ if join == 'outer':
+ self.intersect = False
+ elif join == 'inner':
+ self.intersect = True
+ else: # pragma: no cover
+            raise ValueError('Only "inner" (intersect) or "outer" (union) '
+                             'joins are supported for the other axis')
+
+ if isinstance(objs, dict):
+ if keys is None:
+ keys = sorted(objs)
+ objs = [objs[k] for k in keys]
+ else:
+ objs = list(objs)
+
+ if len(objs) == 0:
+ raise ValueError('No objects to concatenate')
+
+ if keys is None:
+ objs = list(com._not_none(*objs))
+ else:
+ # #1649
+ clean_keys = []
+ clean_objs = []
+ for k, v in zip(keys, objs):
+ if v is None:
+ continue
+ clean_keys.append(k)
+ clean_objs.append(v)
+ objs = clean_objs
+ name = getattr(keys, 'name', None)
+ keys = Index(clean_keys, name=name)
+
+ if len(objs) == 0:
+ raise ValueError('All objects passed were None')
+
+ # consolidate data & figure out what our result ndim is going to be
+ ndims = set()
+ for obj in objs:
+ if not isinstance(obj, NDFrame):
+ msg = ('cannot concatenate object of type "{0}";'
+ ' only pd.Series, pd.DataFrame, and pd.Panel'
+ ' (deprecated) objs are valid'.format(type(obj)))
+ raise TypeError(msg)
+
+ # consolidate
+ obj._consolidate(inplace=True)
+ ndims.add(obj.ndim)
+
+ # get the sample
+ # want the highest ndim that we have, and must be non-empty
+ # unless all objs are empty
+ sample = None
+ if len(ndims) > 1:
+ max_ndim = max(ndims)
+ for obj in objs:
+ if obj.ndim == max_ndim and np.sum(obj.shape):
+ sample = obj
+ break
+
+ else:
+            # filter out the empties if we have no multi-index possibilities
+            # note: keep empty Series, as they affect the result columns / name
+ non_empties = [obj for obj in objs
+ if sum(obj.shape) > 0 or isinstance(obj, Series)]
+
+ if (len(non_empties) and (keys is None and names is None and
+ levels is None and
+ join_axes is None and
+ not self.intersect)):
+ objs = non_empties
+ sample = objs[0]
+
+ if sample is None:
+ sample = objs[0]
+ self.objs = objs
+
+ # Standardize axis parameter to int
+ if isinstance(sample, Series):
+ axis = DataFrame._get_axis_number(axis)
+ else:
+ axis = sample._get_axis_number(axis)
+
+ # Need to flip BlockManager axis in the DataFrame special case
+ self._is_frame = isinstance(sample, DataFrame)
+ if self._is_frame:
+ axis = 1 if axis == 0 else 0
+
+ self._is_series = isinstance(sample, Series)
+ if not 0 <= axis <= sample.ndim:
+ raise AssertionError("axis must be between 0 and {ndim}, input was"
+ " {axis}".format(ndim=sample.ndim, axis=axis))
+
+ # if we have mixed ndims, then convert to highest ndim
+ # creating column numbers as needed
+ if len(ndims) > 1:
+ current_column = 0
+ max_ndim = sample.ndim
+ self.objs, objs = [], self.objs
+ for obj in objs:
+
+ ndim = obj.ndim
+ if ndim == max_ndim:
+ pass
+
+ elif ndim != max_ndim - 1:
+ raise ValueError("cannot concatenate unaligned mixed "
+ "dimensional NDFrame objects")
+
+ else:
+ name = getattr(obj, 'name', None)
+ if ignore_index or name is None:
+ name = current_column
+ current_column += 1
+
+ # doing a row-wise concatenation so need everything
+ # to line up
+ if self._is_frame and axis == 1:
+ name = 0
+ obj = sample._constructor({name: obj})
+
+ self.objs.append(obj)
+
+ # note: this is the BlockManager axis (since DataFrame is transposed)
+ self.axis = axis
+ self.join_axes = join_axes
+ self.keys = keys
+ self.names = names or getattr(keys, 'names', None)
+ self.levels = levels
+ self.sort = sort
+
+ self.ignore_index = ignore_index
+ self.verify_integrity = verify_integrity
+ self.copy = copy
+
+ self.new_axes = self._get_new_axes()
+
+ def get_result(self):
+
+ # series only
+ if self._is_series:
+
+ # stack blocks
+ if self.axis == 0:
+ name = com.consensus_name_attr(self.objs)
+
+ mgr = self.objs[0]._data.concat([x._data for x in self.objs],
+ self.new_axes)
+ cons = _concat._get_series_result_type(mgr, self.objs)
+ return cons(mgr, name=name).__finalize__(self, method='concat')
+
+ # combine as columns in a frame
+ else:
+ data = dict(zip(range(len(self.objs)), self.objs))
+ cons = _concat._get_series_result_type(data)
+
+ index, columns = self.new_axes
+ df = cons(data, index=index)
+ df.columns = columns
+ return df.__finalize__(self, method='concat')
+
+ # combine block managers
+ else:
+ mgrs_indexers = []
+ for obj in self.objs:
+ mgr = obj._data
+ indexers = {}
+ for ax, new_labels in enumerate(self.new_axes):
+ if ax == self.axis:
+ # Suppress reindexing on concat axis
+ continue
+
+ obj_labels = mgr.axes[ax]
+ if not new_labels.equals(obj_labels):
+ indexers[ax] = obj_labels.reindex(new_labels)[1]
+
+ mgrs_indexers.append((obj._data, indexers))
+
+ new_data = concatenate_block_managers(
+ mgrs_indexers, self.new_axes, concat_axis=self.axis,
+ copy=self.copy)
+ if not self.copy:
+ new_data._consolidate_inplace()
+
+ cons = _concat._get_frame_result_type(new_data, self.objs)
+ return (cons._from_axes(new_data, self.new_axes)
+ .__finalize__(self, method='concat'))
+
+ def _get_result_dim(self):
+ if self._is_series and self.axis == 1:
+ return 2
+ else:
+ return self.objs[0].ndim
+
+ def _get_new_axes(self):
+ ndim = self._get_result_dim()
+ new_axes = [None] * ndim
+
+ if self.join_axes is None:
+ for i in range(ndim):
+ if i == self.axis:
+ continue
+ new_axes[i] = self._get_comb_axis(i)
+ else:
+ if len(self.join_axes) != ndim - 1:
+ raise AssertionError("length of join_axes must be equal "
+ "to {length}".format(length=ndim - 1))
+
+ # ufff...
+ indices = compat.lrange(ndim)
+ indices.remove(self.axis)
+
+ for i, ax in zip(indices, self.join_axes):
+ new_axes[i] = ax
+
+ new_axes[self.axis] = self._get_concat_axis()
+ return new_axes
+
+ def _get_comb_axis(self, i):
+ data_axis = self.objs[0]._get_block_manager_axis(i)
+ try:
+ return _get_objs_combined_axis(self.objs, axis=data_axis,
+ intersect=self.intersect,
+ sort=self.sort)
+ except IndexError:
+ types = [type(x).__name__ for x in self.objs]
+ raise TypeError("Cannot concatenate list of {types}"
+ .format(types=types))
+
+ def _get_concat_axis(self):
+ """
+ Return index to be used along concatenation axis.
+ """
+ if self._is_series:
+ if self.axis == 0:
+ indexes = [x.index for x in self.objs]
+ elif self.ignore_index:
+ idx = ibase.default_index(len(self.objs))
+ return idx
+ elif self.keys is None:
+ names = [None] * len(self.objs)
+ num = 0
+ has_names = False
+ for i, x in enumerate(self.objs):
+ if not isinstance(x, Series):
+ raise TypeError("Cannot concatenate type 'Series' "
+ "with object of type {type!r}"
+ .format(type=type(x).__name__))
+ if x.name is not None:
+ names[i] = x.name
+ has_names = True
+ else:
+ names[i] = num
+ num += 1
+ if has_names:
+ return Index(names)
+ else:
+ return ibase.default_index(len(self.objs))
+ else:
+ return ensure_index(self.keys).set_names(self.names)
+ else:
+ indexes = [x._data.axes[self.axis] for x in self.objs]
+
+ if self.ignore_index:
+ idx = ibase.default_index(sum(len(i) for i in indexes))
+ return idx
+
+ if self.keys is None:
+ concat_axis = _concat_indexes(indexes)
+ else:
+ concat_axis = _make_concat_multiindex(indexes, self.keys,
+ self.levels, self.names)
+
+ self._maybe_check_integrity(concat_axis)
+
+ return concat_axis
+
+ def _maybe_check_integrity(self, concat_index):
+ if self.verify_integrity:
+ if not concat_index.is_unique:
+ overlap = concat_index[concat_index.duplicated()].unique()
+ raise ValueError('Indexes have overlapping values: '
+ '{overlap!s}'.format(overlap=overlap))
+
+
+def _concat_indexes(indexes):
+ return indexes[0].append(indexes[1:])
+
+
+def _make_concat_multiindex(indexes, keys, levels=None, names=None):
+
+ if ((levels is None and isinstance(keys[0], tuple)) or
+ (levels is not None and len(levels) > 1)):
+ zipped = compat.lzip(*keys)
+ if names is None:
+ names = [None] * len(zipped)
+
+ if levels is None:
+ _, levels = _factorize_from_iterables(zipped)
+ else:
+ levels = [ensure_index(x) for x in levels]
+ else:
+ zipped = [keys]
+ if names is None:
+ names = [None]
+
+ if levels is None:
+ levels = [ensure_index(keys)]
+ else:
+ levels = [ensure_index(x) for x in levels]
+
+ if not _all_indexes_same(indexes):
+ codes_list = []
+
+ # things are potentially different sizes, so compute the exact codes
+ # for each level and pass those to MultiIndex.from_arrays
+
+ for hlevel, level in zip(zipped, levels):
+ to_concat = []
+ for key, index in zip(hlevel, indexes):
+ try:
+ i = level.get_loc(key)
+ except KeyError:
+ raise ValueError('Key {key!s} not in level {level!s}'
+ .format(key=key, level=level))
+
+ to_concat.append(np.repeat(i, len(index)))
+ codes_list.append(np.concatenate(to_concat))
+
+ concat_index = _concat_indexes(indexes)
+
+ # these go at the end
+ if isinstance(concat_index, MultiIndex):
+ levels.extend(concat_index.levels)
+ codes_list.extend(concat_index.codes)
+ else:
+ codes, categories = _factorize_from_iterable(concat_index)
+ levels.append(categories)
+ codes_list.append(codes)
+
+ if len(names) == len(levels):
+ names = list(names)
+ else:
+ # make sure that all of the passed indices have the same nlevels
+ if not len({idx.nlevels for idx in indexes}) == 1:
+ raise AssertionError("Cannot concat indices that do"
+ " not have the same number of levels")
+
+ # also copies
+ names = names + _get_consensus_names(indexes)
+
+ return MultiIndex(levels=levels, codes=codes_list, names=names,
+ verify_integrity=False)
+
+ new_index = indexes[0]
+ n = len(new_index)
+ kpieces = len(indexes)
+
+ # also copies
+ new_names = list(names)
+ new_levels = list(levels)
+
+ # construct codes
+ new_codes = []
+
+ # do something a bit more speedy
+
+ for hlevel, level in zip(zipped, levels):
+ hlevel = ensure_index(hlevel)
+ mapped = level.get_indexer(hlevel)
+
+ mask = mapped == -1
+ if mask.any():
+ raise ValueError('Values not found in passed level: {hlevel!s}'
+ .format(hlevel=hlevel[mask]))
+
+ new_codes.append(np.repeat(mapped, n))
+
+ if isinstance(new_index, MultiIndex):
+ new_levels.extend(new_index.levels)
+ new_codes.extend([np.tile(lab, kpieces) for lab in new_index.codes])
+ else:
+ new_levels.append(new_index)
+ new_codes.append(np.tile(np.arange(n), kpieces))
+
+ if len(new_names) < len(new_levels):
+ new_names.extend(new_index.names)
+
+ return MultiIndex(levels=new_levels, codes=new_codes, names=new_names,
+ verify_integrity=False)
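+
+# Editorial note, not part of pandas: a hedged sketch of how `keys`/`names`
+# flow into _make_concat_multiindex above (data chosen for illustration).
+#
+#   >>> s1, s2 = pd.Series([1, 2]), pd.Series([3, 4])
+#   >>> pd.concat([s1, s2], keys=['x', 'y'], names=['group', None])
+#   # -> two-level MultiIndex ('x', 0), ('x', 1), ('y', 0), ('y', 1)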
diff --git a/contrib/python/pandas/py2/pandas/core/reshape/melt.py b/contrib/python/pandas/py2/pandas/core/reshape/melt.py
new file mode 100644
index 00000000000..312a108ad33
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/core/reshape/melt.py
@@ -0,0 +1,461 @@
+# pylint: disable=E1101,E1103
+# pylint: disable=W0703,W0622,W0613,W0201
+import re
+
+import numpy as np
+
+from pandas.util._decorators import Appender
+
+from pandas.core.dtypes.common import is_extension_type, is_list_like
+from pandas.core.dtypes.generic import ABCMultiIndex
+from pandas.core.dtypes.missing import notna
+
+from pandas import compat
+from pandas.core.arrays import Categorical
+from pandas.core.frame import _shared_docs
+from pandas.core.indexes.base import Index
+from pandas.core.reshape.concat import concat
+from pandas.core.tools.numeric import to_numeric
+
+
+@Appender(_shared_docs['melt'] %
+ dict(caller='pd.melt(df, ',
+ versionadded="",
+ other='DataFrame.melt'))
+def melt(frame, id_vars=None, value_vars=None, var_name=None,
+ value_name='value', col_level=None):
+ # TODO: what about the existing index?
+    # If multiindex, gather names of columns on all levels for checking presence
+ # of `id_vars` and `value_vars`
+ if isinstance(frame.columns, ABCMultiIndex):
+ cols = [x for c in frame.columns for x in c]
+ else:
+ cols = list(frame.columns)
+ if id_vars is not None:
+ if not is_list_like(id_vars):
+ id_vars = [id_vars]
+ elif (isinstance(frame.columns, ABCMultiIndex) and
+ not isinstance(id_vars, list)):
+ raise ValueError('id_vars must be a list of tuples when columns'
+ ' are a MultiIndex')
+ else:
+ # Check that `id_vars` are in frame
+ id_vars = list(id_vars)
+ missing = Index(np.ravel(id_vars)).difference(cols)
+ if not missing.empty:
+ raise KeyError("The following 'id_vars' are not present"
+ " in the DataFrame: {missing}"
+ "".format(missing=list(missing)))
+ else:
+ id_vars = []
+
+ if value_vars is not None:
+ if not is_list_like(value_vars):
+ value_vars = [value_vars]
+ elif (isinstance(frame.columns, ABCMultiIndex) and
+ not isinstance(value_vars, list)):
+ raise ValueError('value_vars must be a list of tuples when'
+ ' columns are a MultiIndex')
+ else:
+ value_vars = list(value_vars)
+ # Check that `value_vars` are in frame
+ missing = Index(np.ravel(value_vars)).difference(cols)
+ if not missing.empty:
+ raise KeyError("The following 'value_vars' are not present in"
+ " the DataFrame: {missing}"
+ "".format(missing=list(missing)))
+ frame = frame.loc[:, id_vars + value_vars]
+ else:
+ frame = frame.copy()
+
+ if col_level is not None: # allow list or other?
+ # frame is a copy
+ frame.columns = frame.columns.get_level_values(col_level)
+
+ if var_name is None:
+ if isinstance(frame.columns, ABCMultiIndex):
+ if len(frame.columns.names) == len(set(frame.columns.names)):
+ var_name = frame.columns.names
+ else:
+ var_name = ['variable_{i}'.format(i=i)
+ for i in range(len(frame.columns.names))]
+ else:
+ var_name = [frame.columns.name if frame.columns.name is not None
+ else 'variable']
+ if isinstance(var_name, compat.string_types):
+ var_name = [var_name]
+
+ N, K = frame.shape
+ K -= len(id_vars)
+
+ mdata = {}
+ for col in id_vars:
+ id_data = frame.pop(col)
+ if is_extension_type(id_data):
+ id_data = concat([id_data] * K, ignore_index=True)
+ else:
+ id_data = np.tile(id_data.values, K)
+ mdata[col] = id_data
+
+ mcolumns = id_vars + var_name + [value_name]
+
+ mdata[value_name] = frame.values.ravel('F')
+ for i, col in enumerate(var_name):
+ # asanyarray will keep the columns as an Index
+ mdata[col] = np.asanyarray(frame.columns
+ ._get_level_values(i)).repeat(N)
+
+ return frame._constructor(mdata, columns=mcolumns)
+
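+# Editorial note, not part of pandas: a hedged sketch of the melt() reshape
+# implemented above, using assumed column names.
+#
+#   >>> df = pd.DataFrame({'k': ['a', 'b'], 'x': [1, 2], 'y': [3, 4]})
+#   >>> pd.melt(df, id_vars=['k'], value_vars=['x', 'y'])
+#   # -> four rows with columns ['k', 'variable', 'value']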
+
+def lreshape(data, groups, dropna=True, label=None):
+ """
+ Reshape long-format data to wide. Generalized inverse of DataFrame.pivot
+
+ Parameters
+ ----------
+ data : DataFrame
+ groups : dict
+ {new_name : list_of_columns}
+ dropna : boolean, default True
+
+ Examples
+ --------
+ >>> data = pd.DataFrame({'hr1': [514, 573], 'hr2': [545, 526],
+ ... 'team': ['Red Sox', 'Yankees'],
+ ... 'year1': [2007, 2007], 'year2': [2008, 2008]})
+ >>> data
+ hr1 hr2 team year1 year2
+ 0 514 545 Red Sox 2007 2008
+ 1 573 526 Yankees 2007 2008
+
+ >>> pd.lreshape(data, {'year': ['year1', 'year2'], 'hr': ['hr1', 'hr2']})
+ team year hr
+ 0 Red Sox 2007 514
+ 1 Yankees 2007 573
+ 2 Red Sox 2008 545
+ 3 Yankees 2008 526
+
+ Returns
+ -------
+ reshaped : DataFrame
+ """
+ if isinstance(groups, dict):
+ keys = list(groups.keys())
+ values = list(groups.values())
+ else:
+ keys, values = zip(*groups)
+
+ all_cols = list(set.union(*[set(x) for x in values]))
+ id_cols = list(data.columns.difference(all_cols))
+
+ K = len(values[0])
+
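+    # K is the number of columns stacked per group; e.g. with the docstring's
+    # groups {'year': ['year1', 'year2'], 'hr': ['hr1', 'hr2']}, K == 2, so
+    # every group must contribute two columns and each id column below is
+    # tiled twice.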
+ for seq in values:
+ if len(seq) != K:
+ raise ValueError('All column lists must be same length')
+
+ mdata = {}
+ pivot_cols = []
+
+ for target, names in zip(keys, values):
+ to_concat = [data[col].values for col in names]
+
+ import pandas.core.dtypes.concat as _concat
+ mdata[target] = _concat._concat_compat(to_concat)
+ pivot_cols.append(target)
+
+ for col in id_cols:
+ mdata[col] = np.tile(data[col].values, K)
+
+ if dropna:
+ mask = np.ones(len(mdata[pivot_cols[0]]), dtype=bool)
+ for c in pivot_cols:
+ mask &= notna(mdata[c])
+ if not mask.all():
+ mdata = {k: v[mask] for k, v in compat.iteritems(mdata)}
+
+ return data._constructor(mdata, columns=id_cols + pivot_cols)
+
+
+def wide_to_long(df, stubnames, i, j, sep="", suffix=r'\d+'):
+ r"""
+ Wide panel to long format. Less flexible but more user-friendly than melt.
+
+    With stubnames ['A', 'B'], this function expects to find one or more
+    groups of columns with the format
+    A-suffix1, A-suffix2,..., B-suffix1, B-suffix2,...
+    You specify what you want to call this suffix in the resulting long
+    format with `j` (for example `j='year'`).
+
+    Each row of these wide variables is assumed to be uniquely identified
+    by `i` (can be a single column name or a list of column names).
+
+    All remaining variables in the data frame are left intact.
+
+ Parameters
+ ----------
+ df : DataFrame
+ The wide-format DataFrame
+ stubnames : str or list-like
+ The stub name(s). The wide format variables are assumed to
+ start with the stub names.
+ i : str or list-like
+ Column(s) to use as id variable(s)
+ j : str
+ The name of the sub-observation variable. What you wish to name your
+ suffix in the long format.
+ sep : str, default ""
+ A character indicating the separation of the variable names
+ in the wide format, to be stripped from the names in the long format.
+ For example, if your column names are A-suffix1, A-suffix2, you
+ can strip the hyphen by specifying `sep='-'`
+
+ .. versionadded:: 0.20.0
+
+ suffix : str, default '\\d+'
+ A regular expression capturing the wanted suffixes. '\\d+' captures
+ numeric suffixes. Suffixes with no numbers could be specified with the
+ negated character class '\\D+'. You can also further disambiguate
+ suffixes, for example, if your wide variables are of the form
+        A-one, B-two,..., and you have an unrelated column A-rating, you can
+        ignore the last one by specifying `suffix='(one|two)'`.
+
+ .. versionadded:: 0.20.0
+
+ .. versionchanged:: 0.23.0
+ When all suffixes are numeric, they are cast to int64/float64.
+
+ Returns
+ -------
+ DataFrame
+ A DataFrame that contains each stub name as a variable, with new index
+ (i, j)
+
+ Notes
+ -----
+ All extra variables are left untouched. This simply uses
+ `pandas.melt` under the hood, but is hard-coded to "do the right thing"
+ in a typical case.
+
+ Examples
+ --------
+ >>> np.random.seed(123)
+ >>> df = pd.DataFrame({"A1970" : {0 : "a", 1 : "b", 2 : "c"},
+ ... "A1980" : {0 : "d", 1 : "e", 2 : "f"},
+ ... "B1970" : {0 : 2.5, 1 : 1.2, 2 : .7},
+ ... "B1980" : {0 : 3.2, 1 : 1.3, 2 : .1},
+ ... "X" : dict(zip(range(3), np.random.randn(3)))
+ ... })
+ >>> df["id"] = df.index
+ >>> df
+ A1970 A1980 B1970 B1980 X id
+ 0 a d 2.5 3.2 -1.085631 0
+ 1 b e 1.2 1.3 0.997345 1
+ 2 c f 0.7 0.1 0.282978 2
+ >>> pd.wide_to_long(df, ["A", "B"], i="id", j="year")
+ ... # doctest: +NORMALIZE_WHITESPACE
+ X A B
+ id year
+ 0 1970 -1.085631 a 2.5
+ 1 1970 0.997345 b 1.2
+ 2 1970 0.282978 c 0.7
+ 0 1980 -1.085631 d 3.2
+ 1 1980 0.997345 e 1.3
+ 2 1980 0.282978 f 0.1
+
+ With multiple id columns
+
+ >>> df = pd.DataFrame({
+ ... 'famid': [1, 1, 1, 2, 2, 2, 3, 3, 3],
+ ... 'birth': [1, 2, 3, 1, 2, 3, 1, 2, 3],
+ ... 'ht1': [2.8, 2.9, 2.2, 2, 1.8, 1.9, 2.2, 2.3, 2.1],
+ ... 'ht2': [3.4, 3.8, 2.9, 3.2, 2.8, 2.4, 3.3, 3.4, 2.9]
+ ... })
+ >>> df
+ birth famid ht1 ht2
+ 0 1 1 2.8 3.4
+ 1 2 1 2.9 3.8
+ 2 3 1 2.2 2.9
+ 3 1 2 2.0 3.2
+ 4 2 2 1.8 2.8
+ 5 3 2 1.9 2.4
+ 6 1 3 2.2 3.3
+ 7 2 3 2.3 3.4
+ 8 3 3 2.1 2.9
+ >>> l = pd.wide_to_long(df, stubnames='ht', i=['famid', 'birth'], j='age')
+ >>> l
+ ... # doctest: +NORMALIZE_WHITESPACE
+ ht
+ famid birth age
+ 1 1 1 2.8
+ 2 3.4
+ 2 1 2.9
+ 2 3.8
+ 3 1 2.2
+ 2 2.9
+ 2 1 1 2.0
+ 2 3.2
+ 2 1 1.8
+ 2 2.8
+ 3 1 1.9
+ 2 2.4
+ 3 1 1 2.2
+ 2 3.3
+ 2 1 2.3
+ 2 3.4
+ 3 1 2.1
+ 2 2.9
+
+ Going from long back to wide just takes some creative use of `unstack`
+
+ >>> w = l.unstack()
+ >>> w.columns = w.columns.map('{0[0]}{0[1]}'.format)
+ >>> w.reset_index()
+ famid birth ht1 ht2
+ 0 1 1 2.8 3.4
+ 1 1 2 2.9 3.8
+ 2 1 3 2.2 2.9
+ 3 2 1 2.0 3.2
+ 4 2 2 1.8 2.8
+ 5 2 3 1.9 2.4
+ 6 3 1 2.2 3.3
+ 7 3 2 2.3 3.4
+ 8 3 3 2.1 2.9
+
+    Unwieldy column names are also handled
+
+ >>> np.random.seed(0)
+ >>> df = pd.DataFrame({'A(quarterly)-2010': np.random.rand(3),
+ ... 'A(quarterly)-2011': np.random.rand(3),
+ ... 'B(quarterly)-2010': np.random.rand(3),
+ ... 'B(quarterly)-2011': np.random.rand(3),
+ ... 'X' : np.random.randint(3, size=3)})
+ >>> df['id'] = df.index
+ >>> df # doctest: +NORMALIZE_WHITESPACE, +ELLIPSIS
+ A(quarterly)-2010 A(quarterly)-2011 B(quarterly)-2010 ...
+ 0 0.548814 0.544883 0.437587 ...
+ 1 0.715189 0.423655 0.891773 ...
+ 2 0.602763 0.645894 0.963663 ...
+ X id
+ 0 0 0
+ 1 1 1
+ 2 1 2
+
+ >>> pd.wide_to_long(df, ['A(quarterly)', 'B(quarterly)'], i='id',
+ ... j='year', sep='-')
+ ... # doctest: +NORMALIZE_WHITESPACE
+ X A(quarterly) B(quarterly)
+ id year
+ 0 2010 0 0.548814 0.437587
+ 1 2010 1 0.715189 0.891773
+ 2 2010 1 0.602763 0.963663
+ 0 2011 0 0.544883 0.383442
+ 1 2011 1 0.423655 0.791725
+ 2 2011 1 0.645894 0.528895
+
+ If we have many columns, we could also use a regex to find our
+ stubnames and pass that list on to wide_to_long
+
+ >>> stubnames = sorted(
+ ... set([match[0] for match in df.columns.str.findall(
+ ... r'[A-B]\(.*\)').values if match != [] ])
+ ... )
+ >>> list(stubnames)
+ ['A(quarterly)', 'B(quarterly)']
+
+ All of the above examples have integers as suffixes. It is possible to
+ have non-integers as suffixes.
+
+ >>> df = pd.DataFrame({
+ ... 'famid': [1, 1, 1, 2, 2, 2, 3, 3, 3],
+ ... 'birth': [1, 2, 3, 1, 2, 3, 1, 2, 3],
+ ... 'ht_one': [2.8, 2.9, 2.2, 2, 1.8, 1.9, 2.2, 2.3, 2.1],
+ ... 'ht_two': [3.4, 3.8, 2.9, 3.2, 2.8, 2.4, 3.3, 3.4, 2.9]
+ ... })
+ >>> df
+ birth famid ht_one ht_two
+ 0 1 1 2.8 3.4
+ 1 2 1 2.9 3.8
+ 2 3 1 2.2 2.9
+ 3 1 2 2.0 3.2
+ 4 2 2 1.8 2.8
+ 5 3 2 1.9 2.4
+ 6 1 3 2.2 3.3
+ 7 2 3 2.3 3.4
+ 8 3 3 2.1 2.9
+
+    >>> l = pd.wide_to_long(df, stubnames='ht', i=['famid', 'birth'], j='age',
+    ...                     sep='_', suffix='\w+')
+ >>> l
+ ... # doctest: +NORMALIZE_WHITESPACE
+ ht
+ famid birth age
+ 1 1 one 2.8
+ two 3.4
+ 2 one 2.9
+ two 3.8
+ 3 one 2.2
+ two 2.9
+ 2 1 one 2.0
+ two 3.2
+ 2 one 1.8
+ two 2.8
+ 3 one 1.9
+ two 2.4
+ 3 1 one 2.2
+ two 3.3
+ 2 one 2.3
+ two 3.4
+ 3 one 2.1
+ two 2.9
+ """
+ def get_var_names(df, stub, sep, suffix):
+ regex = r'^{stub}{sep}{suffix}$'.format(
+ stub=re.escape(stub), sep=re.escape(sep), suffix=suffix)
+ pattern = re.compile(regex)
+ return [col for col in df.columns if pattern.match(col)]
+
+ def melt_stub(df, stub, i, j, value_vars, sep):
+ newdf = melt(df, id_vars=i, value_vars=value_vars,
+ value_name=stub.rstrip(sep), var_name=j)
+ newdf[j] = Categorical(newdf[j])
+ newdf[j] = newdf[j].str.replace(re.escape(stub + sep), "")
+
+ # GH17627 Cast numerics suffixes to int/float
+ newdf[j] = to_numeric(newdf[j], errors='ignore')
+
+ return newdf.set_index(i + [j])
+
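+    # Rough sketch of how the two helpers above cooperate, using the
+    # docstring's 'A(quarterly)' data: get_var_names(df, 'A(quarterly)',
+    # sep='-', suffix='\d+') selects columns matching
+    # r'^A\(quarterly\)\-\d+$', and melt_stub then melts those columns and
+    # strips the 'A(quarterly)-' prefix so only the year suffix remains in j.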
+ if not is_list_like(stubnames):
+ stubnames = [stubnames]
+ else:
+ stubnames = list(stubnames)
+
+ if any(col in stubnames for col in df.columns):
+ raise ValueError("stubname can't be identical to a column name")
+
+ if not is_list_like(i):
+ i = [i]
+ else:
+ i = list(i)
+
+ if df[i].duplicated().any():
+ raise ValueError("the id variables need to uniquely identify each row")
+
+ value_vars = [get_var_names(df, stub, sep, suffix) for stub in stubnames]
+
+ value_vars_flattened = [e for sublist in value_vars for e in sublist]
+ id_vars = list(set(df.columns.tolist()).difference(value_vars_flattened))
+
+ melted = [melt_stub(df, s, i, j, v, sep)
+ for s, v in zip(stubnames, value_vars)]
+ melted = melted[0].join(melted[1:], how='outer')
+
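+    # Two ways back to the id columns: a single id column only needs a plain
+    # set_index + join, while multiple id columns go through a merge on `i`
+    # before the (i, j) index is rebuilt.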
+ if len(i) == 1:
+ new = df[id_vars].set_index(i).join(melted)
+ return new
+
+ new = df[id_vars].merge(melted.reset_index(), on=i).set_index(i + [j])
+
+ return new
diff --git a/contrib/python/pandas/py2/pandas/core/reshape/merge.py b/contrib/python/pandas/py2/pandas/core/reshape/merge.py
new file mode 100644
index 00000000000..adfd69c21d7
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/core/reshape/merge.py
@@ -0,0 +1,1752 @@
+"""
+SQL-style merge routines
+"""
+
+import copy
+import string
+import warnings
+
+import numpy as np
+
+from pandas._libs import hashtable as libhashtable, join as libjoin, lib
+import pandas.compat as compat
+from pandas.compat import filter, lzip, map, range, zip
+from pandas.errors import MergeError
+from pandas.util._decorators import Appender, Substitution
+
+from pandas.core.dtypes.common import (
+ ensure_float64, ensure_int64, ensure_object, is_array_like, is_bool,
+ is_bool_dtype, is_categorical_dtype, is_datetime64_dtype,
+ is_datetime64tz_dtype, is_datetimelike, is_dtype_equal,
+ is_extension_array_dtype, is_float_dtype, is_int64_dtype, is_integer,
+ is_integer_dtype, is_list_like, is_number, is_numeric_dtype,
+ is_object_dtype, needs_i8_conversion)
+from pandas.core.dtypes.missing import isnull, na_value_for_dtype
+
+from pandas import Categorical, DataFrame, Index, MultiIndex, Series, Timedelta
+import pandas.core.algorithms as algos
+from pandas.core.arrays.categorical import _recode_for_categories
+import pandas.core.common as com
+from pandas.core.frame import _merge_doc
+from pandas.core.internals import (
+ concatenate_block_managers, items_overlap_with_suffix)
+import pandas.core.sorting as sorting
+from pandas.core.sorting import is_int64_overflow_possible
+
+
+@Substitution('\nleft : DataFrame')
+@Appender(_merge_doc, indents=0)
+def merge(left, right, how='inner', on=None, left_on=None, right_on=None,
+ left_index=False, right_index=False, sort=False,
+ suffixes=('_x', '_y'), copy=True, indicator=False,
+ validate=None):
+ op = _MergeOperation(left, right, how=how, on=on, left_on=left_on,
+ right_on=right_on, left_index=left_index,
+ right_index=right_index, sort=sort, suffixes=suffixes,
+ copy=copy, indicator=indicator,
+ validate=validate)
+ return op.get_result()
+
+
+if __debug__:
+ merge.__doc__ = _merge_doc % '\nleft : DataFrame'
+
+
+def _groupby_and_merge(by, on, left, right, _merge_pieces,
+ check_duplicates=True):
+ """
+ groupby & merge; we are always performing a left-by type operation
+
+ Parameters
+ ----------
+ by: field to group
+ on: duplicates field
+ left: left frame
+ right: right frame
+ _merge_pieces: function for merging
+ check_duplicates: boolean, default True
+ should we check & clean duplicates
+ """
+
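+    # Used in this module by merge_ordered's left_by/right_by path: the left
+    # frame is split into groups, each group is merged (via `_merge_pieces`)
+    # against the matching slice of `right` when one exists, and the pieces
+    # are concatenated back in the original group order.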
+ pieces = []
+ if not isinstance(by, (list, tuple)):
+ by = [by]
+
+ lby = left.groupby(by, sort=False)
+
+ # if we can groupby the rhs
+ # then we can get vastly better perf
+ try:
+
+ # we will check & remove duplicates if indicated
+ if check_duplicates:
+ if on is None:
+ on = []
+ elif not isinstance(on, (list, tuple)):
+ on = [on]
+
+ if right.duplicated(by + on).any():
+ right = right.drop_duplicates(by + on, keep='last')
+ rby = right.groupby(by, sort=False)
+ except KeyError:
+ rby = None
+
+ for key, lhs in lby:
+
+ if rby is None:
+ rhs = right
+ else:
+ try:
+ rhs = right.take(rby.indices[key])
+ except KeyError:
+                # this key from the left does not exist in the right frame
+ lcols = lhs.columns.tolist()
+ cols = lcols + [r for r in right.columns
+ if r not in set(lcols)]
+ merged = lhs.reindex(columns=cols)
+ merged.index = range(len(merged))
+ pieces.append(merged)
+ continue
+
+ merged = _merge_pieces(lhs, rhs)
+
+ # make sure join keys are in the merged
+ # TODO, should _merge_pieces do this?
+ for k in by:
+ try:
+ if k in merged:
+ merged[k] = key
+ except KeyError:
+ pass
+
+ pieces.append(merged)
+
+ # preserve the original order
+ # if we have a missing piece this can be reset
+ from pandas.core.reshape.concat import concat
+ result = concat(pieces, ignore_index=True)
+ result = result.reindex(columns=pieces[0].columns, copy=False)
+ return result, lby
+
+
+def merge_ordered(left, right, on=None,
+ left_on=None, right_on=None,
+ left_by=None, right_by=None,
+ fill_method=None, suffixes=('_x', '_y'),
+ how='outer'):
+ """Perform merge with optional filling/interpolation designed for ordered
+ data like time series data. Optionally perform group-wise merge (see
+ examples)
+
+ Parameters
+ ----------
+ left : DataFrame
+ right : DataFrame
+ on : label or list
+ Field names to join on. Must be found in both DataFrames.
+ left_on : label or list, or array-like
+ Field names to join on in left DataFrame. Can be a vector or list of
+ vectors of the length of the DataFrame to use a particular vector as
+ the join key instead of columns
+ right_on : label or list, or array-like
+ Field names to join on in right DataFrame or vector/list of vectors per
+ left_on docs
+ left_by : column name or list of column names
+ Group left DataFrame by group columns and merge piece by piece with
+ right DataFrame
+ right_by : column name or list of column names
+ Group right DataFrame by group columns and merge piece by piece with
+ left DataFrame
+ fill_method : {'ffill', None}, default None
+ Interpolation method for data
+ suffixes : 2-length sequence (tuple, list, ...)
+ Suffix to apply to overlapping column names in the left and right
+ side, respectively
+ how : {'left', 'right', 'outer', 'inner'}, default 'outer'
+ * left: use only keys from left frame (SQL: left outer join)
+ * right: use only keys from right frame (SQL: right outer join)
+ * outer: use union of keys from both frames (SQL: full outer join)
+ * inner: use intersection of keys from both frames (SQL: inner join)
+
+ .. versionadded:: 0.19.0
+
+ Returns
+ -------
+ merged : DataFrame
+        The output type will be the same as 'left', if it is a subclass
+ of DataFrame.
+
+ See Also
+ --------
+ merge
+ merge_asof
+
+ Examples
+ --------
+ >>> A >>> B
+ key lvalue group key rvalue
+ 0 a 1 a 0 b 1
+ 1 c 2 a 1 c 2
+ 2 e 3 a 2 d 3
+ 3 a 1 b
+ 4 c 2 b
+ 5 e 3 b
+
+ >>> merge_ordered(A, B, fill_method='ffill', left_by='group')
+ group key lvalue rvalue
+ 0 a a 1 NaN
+ 1 a b 1 1.0
+ 2 a c 2 2.0
+ 3 a d 2 3.0
+ 4 a e 3 3.0
+ 5 b a 1 NaN
+ 6 b b 1 1.0
+ 7 b c 2 2.0
+ 8 b d 2 3.0
+ 9 b e 3 3.0
+ """
+ def _merger(x, y):
+ # perform the ordered merge operation
+ op = _OrderedMerge(x, y, on=on, left_on=left_on, right_on=right_on,
+ suffixes=suffixes, fill_method=fill_method,
+ how=how)
+ return op.get_result()
+
+ if left_by is not None and right_by is not None:
+ raise ValueError('Can only group either left or right frames')
+ elif left_by is not None:
+ result, _ = _groupby_and_merge(left_by, on, left, right,
+ lambda x, y: _merger(x, y),
+ check_duplicates=False)
+ elif right_by is not None:
+ result, _ = _groupby_and_merge(right_by, on, right, left,
+ lambda x, y: _merger(y, x),
+ check_duplicates=False)
+ else:
+ result = _merger(left, right)
+ return result
+
+
+def merge_asof(left, right, on=None,
+ left_on=None, right_on=None,
+ left_index=False, right_index=False,
+ by=None, left_by=None, right_by=None,
+ suffixes=('_x', '_y'),
+ tolerance=None,
+ allow_exact_matches=True,
+ direction='backward'):
+ """Perform an asof merge. This is similar to a left-join except that we
+ match on nearest key rather than equal keys.
+
+ Both DataFrames must be sorted by the key.
+
+ For each row in the left DataFrame:
+
+ - A "backward" search selects the last row in the right DataFrame whose
+ 'on' key is less than or equal to the left's key.
+
+ - A "forward" search selects the first row in the right DataFrame whose
+ 'on' key is greater than or equal to the left's key.
+
+ - A "nearest" search selects the row in the right DataFrame whose 'on'
+ key is closest in absolute distance to the left's key.
+
+ The default is "backward" and is compatible in versions below 0.20.0.
+ The direction parameter was added in version 0.20.0 and introduces
+ "forward" and "nearest".
+
+ Optionally match on equivalent keys with 'by' before searching with 'on'.
+
+ .. versionadded:: 0.19.0
+
+ Parameters
+ ----------
+ left : DataFrame
+ right : DataFrame
+ on : label
+ Field name to join on. Must be found in both DataFrames.
+ The data MUST be ordered. Furthermore this must be a numeric column,
+ such as datetimelike, integer, or float. On or left_on/right_on
+ must be given.
+ left_on : label
+ Field name to join on in left DataFrame.
+ right_on : label
+ Field name to join on in right DataFrame.
+ left_index : boolean
+ Use the index of the left DataFrame as the join key.
+
+ .. versionadded:: 0.19.2
+
+ right_index : boolean
+ Use the index of the right DataFrame as the join key.
+
+ .. versionadded:: 0.19.2
+
+ by : column name or list of column names
+ Match on these columns before performing merge operation.
+ left_by : column name
+ Field names to match on in the left DataFrame.
+
+ .. versionadded:: 0.19.2
+
+ right_by : column name
+ Field names to match on in the right DataFrame.
+
+ .. versionadded:: 0.19.2
+
+ suffixes : 2-length sequence (tuple, list, ...)
+ Suffix to apply to overlapping column names in the left and right
+ side, respectively.
+ tolerance : integer or Timedelta, optional, default None
+ Select asof tolerance within this range; must be compatible
+ with the merge index.
+ allow_exact_matches : boolean, default True
+
+ - If True, allow matching with the same 'on' value
+ (i.e. less-than-or-equal-to / greater-than-or-equal-to)
+ - If False, don't match the same 'on' value
+ (i.e., strictly less-than / strictly greater-than)
+
+ direction : 'backward' (default), 'forward', or 'nearest'
+ Whether to search for prior, subsequent, or closest matches.
+
+ .. versionadded:: 0.20.0
+
+ Returns
+ -------
+ merged : DataFrame
+
+ See Also
+ --------
+ merge
+ merge_ordered
+
+ Examples
+ --------
+ >>> left = pd.DataFrame({'a': [1, 5, 10], 'left_val': ['a', 'b', 'c']})
+ >>> left
+ a left_val
+ 0 1 a
+ 1 5 b
+ 2 10 c
+
+ >>> right = pd.DataFrame({'a': [1, 2, 3, 6, 7],
+ ... 'right_val': [1, 2, 3, 6, 7]})
+ >>> right
+ a right_val
+ 0 1 1
+ 1 2 2
+ 2 3 3
+ 3 6 6
+ 4 7 7
+
+ >>> pd.merge_asof(left, right, on='a')
+ a left_val right_val
+ 0 1 a 1
+ 1 5 b 3
+ 2 10 c 7
+
+ >>> pd.merge_asof(left, right, on='a', allow_exact_matches=False)
+ a left_val right_val
+ 0 1 a NaN
+ 1 5 b 3.0
+ 2 10 c 7.0
+
+ >>> pd.merge_asof(left, right, on='a', direction='forward')
+ a left_val right_val
+ 0 1 a 1.0
+ 1 5 b 6.0
+ 2 10 c NaN
+
+ >>> pd.merge_asof(left, right, on='a', direction='nearest')
+ a left_val right_val
+ 0 1 a 1
+ 1 5 b 6
+ 2 10 c 7
+
+ We can use indexed DataFrames as well.
+
+ >>> left = pd.DataFrame({'left_val': ['a', 'b', 'c']}, index=[1, 5, 10])
+ >>> left
+ left_val
+ 1 a
+ 5 b
+ 10 c
+
+ >>> right = pd.DataFrame({'right_val': [1, 2, 3, 6, 7]},
+ ... index=[1, 2, 3, 6, 7])
+ >>> right
+ right_val
+ 1 1
+ 2 2
+ 3 3
+ 6 6
+ 7 7
+
+ >>> pd.merge_asof(left, right, left_index=True, right_index=True)
+ left_val right_val
+ 1 a 1
+ 5 b 3
+ 10 c 7
+
+    Here is a real-world time-series example
+
+ >>> quotes
+ time ticker bid ask
+ 0 2016-05-25 13:30:00.023 GOOG 720.50 720.93
+ 1 2016-05-25 13:30:00.023 MSFT 51.95 51.96
+ 2 2016-05-25 13:30:00.030 MSFT 51.97 51.98
+ 3 2016-05-25 13:30:00.041 MSFT 51.99 52.00
+ 4 2016-05-25 13:30:00.048 GOOG 720.50 720.93
+ 5 2016-05-25 13:30:00.049 AAPL 97.99 98.01
+ 6 2016-05-25 13:30:00.072 GOOG 720.50 720.88
+ 7 2016-05-25 13:30:00.075 MSFT 52.01 52.03
+
+ >>> trades
+ time ticker price quantity
+ 0 2016-05-25 13:30:00.023 MSFT 51.95 75
+ 1 2016-05-25 13:30:00.038 MSFT 51.95 155
+ 2 2016-05-25 13:30:00.048 GOOG 720.77 100
+ 3 2016-05-25 13:30:00.048 GOOG 720.92 100
+ 4 2016-05-25 13:30:00.048 AAPL 98.00 100
+
+ By default we are taking the asof of the quotes
+
+ >>> pd.merge_asof(trades, quotes,
+ ... on='time',
+ ... by='ticker')
+ time ticker price quantity bid ask
+ 0 2016-05-25 13:30:00.023 MSFT 51.95 75 51.95 51.96
+ 1 2016-05-25 13:30:00.038 MSFT 51.95 155 51.97 51.98
+ 2 2016-05-25 13:30:00.048 GOOG 720.77 100 720.50 720.93
+ 3 2016-05-25 13:30:00.048 GOOG 720.92 100 720.50 720.93
+ 4 2016-05-25 13:30:00.048 AAPL 98.00 100 NaN NaN
+
+ We only asof within 2ms between the quote time and the trade time
+
+ >>> pd.merge_asof(trades, quotes,
+ ... on='time',
+ ... by='ticker',
+ ... tolerance=pd.Timedelta('2ms'))
+ time ticker price quantity bid ask
+ 0 2016-05-25 13:30:00.023 MSFT 51.95 75 51.95 51.96
+ 1 2016-05-25 13:30:00.038 MSFT 51.95 155 NaN NaN
+ 2 2016-05-25 13:30:00.048 GOOG 720.77 100 720.50 720.93
+ 3 2016-05-25 13:30:00.048 GOOG 720.92 100 720.50 720.93
+ 4 2016-05-25 13:30:00.048 AAPL 98.00 100 NaN NaN
+
+ We only asof within 10ms between the quote time and the trade time
+ and we exclude exact matches on time. However *prior* data will
+ propagate forward
+
+ >>> pd.merge_asof(trades, quotes,
+ ... on='time',
+ ... by='ticker',
+ ... tolerance=pd.Timedelta('10ms'),
+ ... allow_exact_matches=False)
+ time ticker price quantity bid ask
+ 0 2016-05-25 13:30:00.023 MSFT 51.95 75 NaN NaN
+ 1 2016-05-25 13:30:00.038 MSFT 51.95 155 51.97 51.98
+ 2 2016-05-25 13:30:00.048 GOOG 720.77 100 NaN NaN
+ 3 2016-05-25 13:30:00.048 GOOG 720.92 100 NaN NaN
+ 4 2016-05-25 13:30:00.048 AAPL 98.00 100 NaN NaN
+ """
+ op = _AsOfMerge(left, right,
+ on=on, left_on=left_on, right_on=right_on,
+ left_index=left_index, right_index=right_index,
+ by=by, left_by=left_by, right_by=right_by,
+ suffixes=suffixes,
+ how='asof', tolerance=tolerance,
+ allow_exact_matches=allow_exact_matches,
+ direction=direction)
+ return op.get_result()
+
+
+# TODO: transformations??
+# TODO: only copy DataFrames when modification necessary
+class _MergeOperation(object):
+ """
+ Perform a database (SQL) merge operation between two DataFrame objects
+ using either columns as keys or their row indexes
+ """
+ _merge_type = 'merge'
+
+ def __init__(self, left, right, how='inner', on=None,
+ left_on=None, right_on=None, axis=1,
+ left_index=False, right_index=False, sort=True,
+ suffixes=('_x', '_y'), copy=True, indicator=False,
+ validate=None):
+ left = validate_operand(left)
+ right = validate_operand(right)
+ self.left = self.orig_left = left
+ self.right = self.orig_right = right
+ self.how = how
+ self.axis = axis
+
+ self.on = com.maybe_make_list(on)
+ self.left_on = com.maybe_make_list(left_on)
+ self.right_on = com.maybe_make_list(right_on)
+
+ self.copy = copy
+ self.suffixes = suffixes
+ self.sort = sort
+
+ self.left_index = left_index
+ self.right_index = right_index
+
+ self.indicator = indicator
+
+ if isinstance(self.indicator, compat.string_types):
+ self.indicator_name = self.indicator
+ elif isinstance(self.indicator, bool):
+ self.indicator_name = '_merge' if self.indicator else None
+ else:
+ raise ValueError(
+ 'indicator option can only accept boolean or string arguments')
+
+ if not is_bool(left_index):
+ raise ValueError(
+ 'left_index parameter must be of type bool, not '
+ '{left_index}'.format(left_index=type(left_index)))
+ if not is_bool(right_index):
+ raise ValueError(
+ 'right_index parameter must be of type bool, not '
+ '{right_index}'.format(right_index=type(right_index)))
+
+ # warn user when merging between different levels
+ if left.columns.nlevels != right.columns.nlevels:
+ msg = ('merging between different levels can give an unintended '
+ 'result ({left} levels on the left, {right} on the right)'
+ ).format(left=left.columns.nlevels,
+ right=right.columns.nlevels)
+ warnings.warn(msg, UserWarning)
+
+ self._validate_specification()
+
+ # note this function has side effects
+ (self.left_join_keys,
+ self.right_join_keys,
+ self.join_names) = self._get_merge_keys()
+
+ # validate the merge keys dtypes. We may need to coerce
+ # to avoid incompat dtypes
+ self._maybe_coerce_merge_keys()
+
+ # If argument passed to validate,
+ # check if columns specified as unique
+ # are in fact unique.
+ if validate is not None:
+ self._validate(validate)
+
+ def get_result(self):
+ if self.indicator:
+ self.left, self.right = self._indicator_pre_merge(
+ self.left, self.right)
+
+ join_index, left_indexer, right_indexer = self._get_join_info()
+
+ ldata, rdata = self.left._data, self.right._data
+ lsuf, rsuf = self.suffixes
+
+ llabels, rlabels = items_overlap_with_suffix(ldata.items, lsuf,
+ rdata.items, rsuf)
+
+ lindexers = {1: left_indexer} if left_indexer is not None else {}
+ rindexers = {1: right_indexer} if right_indexer is not None else {}
+
+ result_data = concatenate_block_managers(
+ [(ldata, lindexers), (rdata, rindexers)],
+ axes=[llabels.append(rlabels), join_index],
+ concat_axis=0, copy=self.copy)
+
+ typ = self.left._constructor
+ result = typ(result_data).__finalize__(self, method=self._merge_type)
+
+ if self.indicator:
+ result = self._indicator_post_merge(result)
+
+ self._maybe_add_join_keys(result, left_indexer, right_indexer)
+
+ self._maybe_restore_index_levels(result)
+
+ return result
+
+ def _indicator_pre_merge(self, left, right):
+
+ columns = left.columns.union(right.columns)
+
+ for i in ['_left_indicator', '_right_indicator']:
+ if i in columns:
+ raise ValueError("Cannot use `indicator=True` option when "
+ "data contains a column named {name}"
+ .format(name=i))
+ if self.indicator_name in columns:
+ raise ValueError(
+ "Cannot use name of an existing column for indicator column")
+
+ left = left.copy()
+ right = right.copy()
+
+ left['_left_indicator'] = 1
+ left['_left_indicator'] = left['_left_indicator'].astype('int8')
+
+ right['_right_indicator'] = 2
+ right['_right_indicator'] = right['_right_indicator'].astype('int8')
+
+ return left, right
+
+ def _indicator_post_merge(self, result):
+
+ result['_left_indicator'] = result['_left_indicator'].fillna(0)
+ result['_right_indicator'] = result['_right_indicator'].fillna(0)
+
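+        # Left rows carry 1 and right rows carry 2, so the sum encodes the
+        # source of each row: 1 -> left_only, 2 -> right_only, 3 -> both.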
+ result[self.indicator_name] = Categorical((result['_left_indicator'] +
+ result['_right_indicator']),
+ categories=[1, 2, 3])
+ result[self.indicator_name] = (
+ result[self.indicator_name]
+ .cat.rename_categories(['left_only', 'right_only', 'both']))
+
+ result = result.drop(labels=['_left_indicator', '_right_indicator'],
+ axis=1)
+ return result
+
+ def _maybe_restore_index_levels(self, result):
+ """
+ Restore index levels specified as `on` parameters
+
+ Here we check for cases where `self.left_on` and `self.right_on` pairs
+ each reference an index level in their respective DataFrames. The
+ joined columns corresponding to these pairs are then restored to the
+ index of `result`.
+
+ **Note:** This method has side effects. It modifies `result` in-place
+
+ Parameters
+ ----------
+ result: DataFrame
+ merge result
+
+ Returns
+ -------
+ None
+ """
+ names_to_restore = []
+ for name, left_key, right_key in zip(self.join_names,
+ self.left_on,
+ self.right_on):
+ if (self.orig_left._is_level_reference(left_key) and
+ self.orig_right._is_level_reference(right_key) and
+ name not in result.index.names):
+
+ names_to_restore.append(name)
+
+ if names_to_restore:
+ result.set_index(names_to_restore, inplace=True)
+
+ def _maybe_add_join_keys(self, result, left_indexer, right_indexer):
+
+ left_has_missing = None
+ right_has_missing = None
+
+ keys = zip(self.join_names, self.left_on, self.right_on)
+ for i, (name, lname, rname) in enumerate(keys):
+ if not _should_fill(lname, rname):
+ continue
+
+ take_left, take_right = None, None
+
+ if name in result:
+
+ if left_indexer is not None and right_indexer is not None:
+ if name in self.left:
+
+ if left_has_missing is None:
+ left_has_missing = (left_indexer == -1).any()
+
+ if left_has_missing:
+ take_right = self.right_join_keys[i]
+
+ if not is_dtype_equal(result[name].dtype,
+ self.left[name].dtype):
+ take_left = self.left[name]._values
+
+ elif name in self.right:
+
+ if right_has_missing is None:
+ right_has_missing = (right_indexer == -1).any()
+
+ if right_has_missing:
+ take_left = self.left_join_keys[i]
+
+ if not is_dtype_equal(result[name].dtype,
+ self.right[name].dtype):
+ take_right = self.right[name]._values
+
+ elif left_indexer is not None \
+ and is_array_like(self.left_join_keys[i]):
+ take_left = self.left_join_keys[i]
+ take_right = self.right_join_keys[i]
+
+ if take_left is not None or take_right is not None:
+
+ if take_left is None:
+ lvals = result[name]._values
+ else:
+ lfill = na_value_for_dtype(take_left.dtype)
+ lvals = algos.take_1d(take_left, left_indexer,
+ fill_value=lfill)
+
+ if take_right is None:
+ rvals = result[name]._values
+ else:
+ rfill = na_value_for_dtype(take_right.dtype)
+ rvals = algos.take_1d(take_right, right_indexer,
+ fill_value=rfill)
+
+ # if we have an all missing left_indexer
+ # make sure to just use the right values
+ mask = left_indexer == -1
+ if mask.all():
+ key_col = rvals
+ else:
+ key_col = Index(lvals).where(~mask, rvals)
+
+ if result._is_label_reference(name):
+ result[name] = key_col
+ elif result._is_level_reference(name):
+ if isinstance(result.index, MultiIndex):
+ key_col.name = name
+ idx_list = [result.index.get_level_values(level_name)
+ if level_name != name else key_col
+ for level_name in result.index.names]
+
+ result.set_index(idx_list, inplace=True)
+ else:
+ result.index = Index(key_col, name=name)
+ else:
+ result.insert(i, name or 'key_{i}'.format(i=i), key_col)
+
+ def _get_join_indexers(self):
+ """ return the join indexers """
+ return _get_join_indexers(self.left_join_keys,
+ self.right_join_keys,
+ sort=self.sort,
+ how=self.how)
+
+ def _get_join_info(self):
+ left_ax = self.left._data.axes[self.axis]
+ right_ax = self.right._data.axes[self.axis]
+
+ if self.left_index and self.right_index and self.how != 'asof':
+ join_index, left_indexer, right_indexer = \
+ left_ax.join(right_ax, how=self.how, return_indexers=True,
+ sort=self.sort)
+ elif self.right_index and self.how == 'left':
+ join_index, left_indexer, right_indexer = \
+ _left_join_on_index(left_ax, right_ax, self.left_join_keys,
+ sort=self.sort)
+
+ elif self.left_index and self.how == 'right':
+ join_index, right_indexer, left_indexer = \
+ _left_join_on_index(right_ax, left_ax, self.right_join_keys,
+ sort=self.sort)
+ else:
+ (left_indexer,
+ right_indexer) = self._get_join_indexers()
+
+ if self.right_index:
+ if len(self.left) > 0:
+ join_index = self.left.index.take(left_indexer)
+ else:
+ join_index = self.right.index.take(right_indexer)
+ left_indexer = np.array([-1] * len(join_index))
+ elif self.left_index:
+ if len(self.right) > 0:
+ join_index = self.right.index.take(right_indexer)
+ else:
+ join_index = self.left.index.take(left_indexer)
+ right_indexer = np.array([-1] * len(join_index))
+ else:
+ join_index = Index(np.arange(len(left_indexer)))
+
+ if len(join_index) == 0:
+ join_index = join_index.astype(object)
+ return join_index, left_indexer, right_indexer
+
+ def _get_merge_keys(self):
+ """
+ Note: has side effects (copy/delete key columns)
+
+ Parameters
+ ----------
+ left
+ right
+ on
+
+ Returns
+ -------
+ left_keys, right_keys
+ """
+ left_keys = []
+ right_keys = []
+ join_names = []
+ right_drop = []
+ left_drop = []
+
+ left, right = self.left, self.right
+
+ is_lkey = lambda x: is_array_like(x) and len(x) == len(left)
+ is_rkey = lambda x: is_array_like(x) and len(x) == len(right)
+
+ # Note that pd.merge_asof() has separate 'on' and 'by' parameters. A
+ # user could, for example, request 'left_index' and 'left_by'. In a
+ # regular pd.merge(), users cannot specify both 'left_index' and
+ # 'left_on'. (Instead, users have a MultiIndex). That means the
+ # self.left_on in this function is always empty in a pd.merge(), but
+ # a pd.merge_asof(left_index=True, left_by=...) will result in a
+ # self.left_on array with a None in the middle of it. This requires
+ # a work-around as designated in the code below.
+ # See _validate_specification() for where this happens.
+
+ # ugh, spaghetti re #733
+ if _any(self.left_on) and _any(self.right_on):
+ for lk, rk in zip(self.left_on, self.right_on):
+ if is_lkey(lk):
+ left_keys.append(lk)
+ if is_rkey(rk):
+ right_keys.append(rk)
+ join_names.append(None) # what to do?
+ else:
+ if rk is not None:
+ right_keys.append(
+ right._get_label_or_level_values(rk))
+ join_names.append(rk)
+ else:
+ # work-around for merge_asof(right_index=True)
+ right_keys.append(right.index)
+ join_names.append(right.index.name)
+ else:
+ if not is_rkey(rk):
+ if rk is not None:
+ right_keys.append(
+ right._get_label_or_level_values(rk))
+ else:
+ # work-around for merge_asof(right_index=True)
+ right_keys.append(right.index)
+ if lk is not None and lk == rk:
+ # avoid key upcast in corner case (length-0)
+ if len(left) > 0:
+ right_drop.append(rk)
+ else:
+ left_drop.append(lk)
+ else:
+ right_keys.append(rk)
+ if lk is not None:
+ left_keys.append(left._get_label_or_level_values(lk))
+ join_names.append(lk)
+ else:
+ # work-around for merge_asof(left_index=True)
+ left_keys.append(left.index)
+ join_names.append(left.index.name)
+ elif _any(self.left_on):
+ for k in self.left_on:
+ if is_lkey(k):
+ left_keys.append(k)
+ join_names.append(None)
+ else:
+ left_keys.append(left._get_label_or_level_values(k))
+ join_names.append(k)
+ if isinstance(self.right.index, MultiIndex):
+ right_keys = [lev._values.take(lev_codes) for lev, lev_codes
+ in zip(self.right.index.levels,
+ self.right.index.codes)]
+ else:
+ right_keys = [self.right.index._values]
+ elif _any(self.right_on):
+ for k in self.right_on:
+ if is_rkey(k):
+ right_keys.append(k)
+ join_names.append(None)
+ else:
+ right_keys.append(right._get_label_or_level_values(k))
+ join_names.append(k)
+ if isinstance(self.left.index, MultiIndex):
+ left_keys = [lev._values.take(lev_codes) for lev, lev_codes
+ in zip(self.left.index.levels,
+ self.left.index.codes)]
+ else:
+ left_keys = [self.left.index.values]
+
+ if left_drop:
+ self.left = self.left._drop_labels_or_levels(left_drop)
+
+ if right_drop:
+ self.right = self.right._drop_labels_or_levels(right_drop)
+
+ return left_keys, right_keys, join_names
+
+ def _maybe_coerce_merge_keys(self):
+ # we have valid mergees but we may have to further
+ # coerce these if they are originally incompatible types
+ #
+ # for example if these are categorical, but are not dtype_equal
+ # or if we have object and integer dtypes
+
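+        # A rough illustration of the rules below, with hypothetical keys:
+        # int64 vs. float64 proceeds (warning only if the floats are not
+        # exact integer values), int64 vs. object-of-strings raises, and a
+        # categorical key merged against a non-categorical one is converted
+        # back to its categories' dtype (the other side to object).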
+ for lk, rk, name in zip(self.left_join_keys,
+ self.right_join_keys,
+ self.join_names):
+ if (len(lk) and not len(rk)) or (not len(lk) and len(rk)):
+ continue
+
+ lk_is_cat = is_categorical_dtype(lk)
+ rk_is_cat = is_categorical_dtype(rk)
+ lk_is_object = is_object_dtype(lk)
+ rk_is_object = is_object_dtype(rk)
+
+ # if either left or right is a categorical
+            # then they must match exactly in categories & ordered
+ if lk_is_cat and rk_is_cat:
+ if lk.is_dtype_equal(rk):
+ continue
+
+ elif lk_is_cat or rk_is_cat:
+ pass
+
+ elif is_dtype_equal(lk.dtype, rk.dtype):
+ continue
+
+ msg = ("You are trying to merge on {lk_dtype} and "
+ "{rk_dtype} columns. If you wish to proceed "
+ "you should use pd.concat".format(lk_dtype=lk.dtype,
+ rk_dtype=rk.dtype))
+
+            # if we are numeric, then allow differing
+            # kinds to proceed, e.g. int64 and int8, int and float;
+            # further, if we are object but infer the same type, then proceed
+ if is_numeric_dtype(lk) and is_numeric_dtype(rk):
+ if lk.dtype.kind == rk.dtype.kind:
+ continue
+
+ # check whether ints and floats
+ elif is_integer_dtype(rk) and is_float_dtype(lk):
+ if not (lk == lk.astype(rk.dtype))[~np.isnan(lk)].all():
+ warnings.warn('You are merging on int and float '
+ 'columns where the float values '
+ 'are not equal to their int '
+ 'representation', UserWarning)
+ continue
+
+ elif is_float_dtype(rk) and is_integer_dtype(lk):
+ if not (rk == rk.astype(lk.dtype))[~np.isnan(rk)].all():
+ warnings.warn('You are merging on int and float '
+ 'columns where the float values '
+ 'are not equal to their int '
+ 'representation', UserWarning)
+ continue
+
+ # let's infer and see if we are ok
+ elif (lib.infer_dtype(lk, skipna=False)
+ == lib.infer_dtype(rk, skipna=False)):
+ continue
+
+ # Check if we are trying to merge on obviously
+ # incompatible dtypes GH 9780, GH 15800
+
+ # bool values are coerced to object
+ elif ((lk_is_object and is_bool_dtype(rk)) or
+ (is_bool_dtype(lk) and rk_is_object)):
+ pass
+
+ # object values are allowed to be merged
+ elif ((lk_is_object and is_numeric_dtype(rk)) or
+ (is_numeric_dtype(lk) and rk_is_object)):
+ inferred_left = lib.infer_dtype(lk, skipna=False)
+ inferred_right = lib.infer_dtype(rk, skipna=False)
+ bool_types = ['integer', 'mixed-integer', 'boolean', 'empty']
+ string_types = ['string', 'unicode', 'mixed', 'bytes', 'empty']
+
+ # inferred bool
+ if (inferred_left in bool_types and
+ inferred_right in bool_types):
+ pass
+
+ # unless we are merging non-string-like with string-like
+ elif ((inferred_left in string_types and
+ inferred_right not in string_types) or
+ (inferred_right in string_types and
+ inferred_left not in string_types)):
+ raise ValueError(msg)
+
+ # datetimelikes must match exactly
+ elif is_datetimelike(lk) and not is_datetimelike(rk):
+ raise ValueError(msg)
+ elif not is_datetimelike(lk) and is_datetimelike(rk):
+ raise ValueError(msg)
+ elif is_datetime64tz_dtype(lk) and not is_datetime64tz_dtype(rk):
+ raise ValueError(msg)
+ elif not is_datetime64tz_dtype(lk) and is_datetime64tz_dtype(rk):
+ raise ValueError(msg)
+
+ elif lk_is_object and rk_is_object:
+ continue
+
+ # Houston, we have a problem!
+ # let's coerce to object if the dtypes aren't
+ # categorical, otherwise coerce to the category
+ # dtype. If we coerced categories to object,
+ # then we would lose type information on some
+ # columns, and end up trying to merge
+ # incompatible dtypes. See GH 16900.
+ if name in self.left.columns:
+ typ = lk.categories.dtype if lk_is_cat else object
+ self.left = self.left.assign(
+ **{name: self.left[name].astype(typ)})
+ if name in self.right.columns:
+ typ = rk.categories.dtype if rk_is_cat else object
+ self.right = self.right.assign(
+ **{name: self.right[name].astype(typ)})
+
+ def _validate_specification(self):
+ # Hm, any way to make this logic less complicated??
+ if self.on is None and self.left_on is None and self.right_on is None:
+
+ if self.left_index and self.right_index:
+ self.left_on, self.right_on = (), ()
+ elif self.left_index:
+ if self.right_on is None:
+ raise MergeError('Must pass right_on or right_index=True')
+ elif self.right_index:
+ if self.left_on is None:
+ raise MergeError('Must pass left_on or left_index=True')
+ else:
+ # use the common columns
+ common_cols = self.left.columns.intersection(
+ self.right.columns)
+ if len(common_cols) == 0:
+ raise MergeError(
+ 'No common columns to perform merge on. '
+ 'Merge options: left_on={lon}, right_on={ron}, '
+ 'left_index={lidx}, right_index={ridx}'
+ .format(lon=self.left_on, ron=self.right_on,
+ lidx=self.left_index, ridx=self.right_index))
+ if not common_cols.is_unique:
+ raise MergeError("Data columns not unique: {common!r}"
+ .format(common=common_cols))
+ self.left_on = self.right_on = common_cols
+ elif self.on is not None:
+ if self.left_on is not None or self.right_on is not None:
+ raise MergeError('Can only pass argument "on" OR "left_on" '
+ 'and "right_on", not a combination of both.')
+ self.left_on = self.right_on = self.on
+ elif self.left_on is not None:
+ n = len(self.left_on)
+ if self.right_index:
+ if len(self.left_on) != self.right.index.nlevels:
+ raise ValueError('len(left_on) must equal the number '
+ 'of levels in the index of "right"')
+ self.right_on = [None] * n
+ elif self.right_on is not None:
+ n = len(self.right_on)
+ if self.left_index:
+ if len(self.right_on) != self.left.index.nlevels:
+ raise ValueError('len(right_on) must equal the number '
+ 'of levels in the index of "left"')
+ self.left_on = [None] * n
+ if len(self.right_on) != len(self.left_on):
+ raise ValueError("len(right_on) must equal len(left_on)")
+
+ def _validate(self, validate):
+
+ # Check uniqueness of each
+ if self.left_index:
+ left_unique = self.orig_left.index.is_unique
+ else:
+ left_unique = MultiIndex.from_arrays(self.left_join_keys
+ ).is_unique
+
+ if self.right_index:
+ right_unique = self.orig_right.index.is_unique
+ else:
+ right_unique = MultiIndex.from_arrays(self.right_join_keys
+ ).is_unique
+
+ # Check data integrity
+ if validate in ["one_to_one", "1:1"]:
+ if not left_unique and not right_unique:
+ raise MergeError("Merge keys are not unique in either left"
+ " or right dataset; not a one-to-one merge")
+ elif not left_unique:
+ raise MergeError("Merge keys are not unique in left dataset;"
+ " not a one-to-one merge")
+ elif not right_unique:
+ raise MergeError("Merge keys are not unique in right dataset;"
+ " not a one-to-one merge")
+
+ elif validate in ["one_to_many", "1:m"]:
+ if not left_unique:
+ raise MergeError("Merge keys are not unique in left dataset;"
+ " not a one-to-many merge")
+
+ elif validate in ["many_to_one", "m:1"]:
+ if not right_unique:
+ raise MergeError("Merge keys are not unique in right dataset;"
+ " not a many-to-one merge")
+
+ elif validate in ['many_to_many', 'm:m']:
+ pass
+
+ else:
+ raise ValueError("Not a valid argument for validate")
+
+
+def _get_join_indexers(left_keys, right_keys, sort=False, how='inner',
+ **kwargs):
+ """
+
+ Parameters
+ ----------
+ left_keys: ndarray, Index, Series
+ right_keys: ndarray, Index, Series
+ sort: boolean, default False
+ how: string {'inner', 'outer', 'left', 'right'}, default 'inner'
+
+ Returns
+ -------
+ tuple of (left_indexer, right_indexer)
+ indexers into the left_keys, right_keys
+
+ """
+ from functools import partial
+
+ assert len(left_keys) == len(right_keys), \
+        'left_keys and right_keys must be the same length'
+
+ # bind `sort` arg. of _factorize_keys
+ fkeys = partial(_factorize_keys, sort=sort)
+
+ # get left & right join labels and num. of levels at each location
+ llab, rlab, shape = map(list, zip(* map(fkeys, left_keys, right_keys)))
+
+ # get flat i8 keys from label lists
+ lkey, rkey = _get_join_keys(llab, rlab, shape, sort)
+
+ # factorize keys to a dense i8 space
+ # `count` is the num. of unique keys
+ # set(lkey) | set(rkey) == range(count)
+ lkey, rkey, count = fkeys(lkey, rkey)
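+    # For illustration, with hypothetical keys: left_keys=[['a', 'b', 'a']]
+    # and right_keys=[['b', 'c']] factorize to lkey=[0, 1, 0], rkey=[1, 2]
+    # with count == 3; the join function below then works purely on these
+    # dense integer labels.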
+
+ # preserve left frame order if how == 'left' and sort == False
+ kwargs = copy.copy(kwargs)
+ if how == 'left':
+ kwargs['sort'] = sort
+ join_func = _join_functions[how]
+
+ return join_func(lkey, rkey, count, **kwargs)
+
+
+def _restore_dropped_levels_multijoin(left, right, dropped_level_names,
+ join_index, lindexer, rindexer):
+ """
+ *this is an internal non-public method*
+
+ Returns the levels, labels and names of a multi-index to multi-index join.
+ Depending on the type of join, this method restores the appropriate
+ dropped levels of the joined multi-index.
+    The method relies on lindexer, rindexer which hold the index positions
+    of left and right, where a join was feasible.
+
+ Parameters
+ ----------
+ left : MultiIndex
+ left index
+ right : MultiIndex
+ right index
+ dropped_level_names : str array
+ list of non-common level names
+ join_index : MultiIndex
+ the index of the join between the
+ common levels of left and right
+ lindexer : intp array
+ left indexer
+ rindexer : intp array
+ right indexer
+
+ Returns
+ -------
+ levels : list of Index
+ levels of combined multiindexes
+ labels : intp array
+ labels of combined multiindexes
+ names : str array
+ names of combined multiindexes
+
+ """
+
+    def _convert_to_multiindex(index):
+ if isinstance(index, MultiIndex):
+ return index
+ else:
+ return MultiIndex.from_arrays([index.values],
+ names=[index.name])
+
+    # For multi-multi joins with one overlapping level,
+    # the returned index is of type Index.
+    # Ensure that join_index is of type MultiIndex
+    # so that dropped levels can be appended
+    join_index = _convert_to_multiindex(join_index)
+
+ join_levels = join_index.levels
+ join_codes = join_index.codes
+ join_names = join_index.names
+
+    # lindexer and rindexer hold the indexes where the join occurred
+    # for left and right respectively. If lindexer/rindexer is None then
+    # the join occurred on all indices of left/right
+ if lindexer is None:
+ lindexer = range(left.size)
+
+ if rindexer is None:
+ rindexer = range(right.size)
+
+ # Iterate through the levels that must be restored
+ for dropped_level_name in dropped_level_names:
+ if dropped_level_name in left.names:
+ idx = left
+ indexer = lindexer
+ else:
+ idx = right
+ indexer = rindexer
+
+ # The index of the level name to be restored
+ name_idx = idx.names.index(dropped_level_name)
+
+ restore_levels = idx.levels[name_idx]
+ # Inject -1 in the codes list where a join was not possible
+ # IOW indexer[i]=-1
+ codes = idx.codes[name_idx]
+ restore_codes = algos.take_nd(codes, indexer, fill_value=-1)
+
+ join_levels = join_levels + [restore_levels]
+ join_codes = join_codes + [restore_codes]
+ join_names = join_names + [dropped_level_name]
+
+ return join_levels, join_codes, join_names
+
+
+class _OrderedMerge(_MergeOperation):
+ _merge_type = 'ordered_merge'
+
+ def __init__(self, left, right, on=None, left_on=None, right_on=None,
+ left_index=False, right_index=False, axis=1,
+ suffixes=('_x', '_y'), copy=True,
+ fill_method=None, how='outer'):
+
+ self.fill_method = fill_method
+ _MergeOperation.__init__(self, left, right, on=on, left_on=left_on,
+ left_index=left_index,
+ right_index=right_index,
+ right_on=right_on, axis=axis,
+ how=how, suffixes=suffixes,
+ sort=True # factorize sorts
+ )
+
+ def get_result(self):
+ join_index, left_indexer, right_indexer = self._get_join_info()
+
+ # this is a bit kludgy
+ ldata, rdata = self.left._data, self.right._data
+ lsuf, rsuf = self.suffixes
+
+ llabels, rlabels = items_overlap_with_suffix(ldata.items, lsuf,
+ rdata.items, rsuf)
+
+ if self.fill_method == 'ffill':
+ left_join_indexer = libjoin.ffill_indexer(left_indexer)
+ right_join_indexer = libjoin.ffill_indexer(right_indexer)
+ else:
+ left_join_indexer = left_indexer
+ right_join_indexer = right_indexer
+
+ lindexers = {
+ 1: left_join_indexer} if left_join_indexer is not None else {}
+ rindexers = {
+ 1: right_join_indexer} if right_join_indexer is not None else {}
+
+ result_data = concatenate_block_managers(
+ [(ldata, lindexers), (rdata, rindexers)],
+ axes=[llabels.append(rlabels), join_index],
+ concat_axis=0, copy=self.copy)
+
+ typ = self.left._constructor
+ result = typ(result_data).__finalize__(self, method=self._merge_type)
+
+ self._maybe_add_join_keys(result, left_indexer, right_indexer)
+
+ return result
+
+
+def _asof_function(direction):
+ name = 'asof_join_{dir}'.format(dir=direction)
+ return getattr(libjoin, name, None)
+
+
+def _asof_by_function(direction):
+ name = 'asof_join_{dir}_on_X_by_Y'.format(dir=direction)
+ return getattr(libjoin, name, None)
+
+
+_type_casters = {
+ 'int64_t': ensure_int64,
+ 'double': ensure_float64,
+ 'object': ensure_object,
+}
+
+
+def _get_cython_type_upcast(dtype):
+ """ Upcast a dtype to 'int64_t', 'double', or 'object' """
+ if is_integer_dtype(dtype):
+ return 'int64_t'
+ elif is_float_dtype(dtype):
+ return 'double'
+ else:
+ return 'object'
+
+
+class _AsOfMerge(_OrderedMerge):
+ _merge_type = 'asof_merge'
+
+ def __init__(self, left, right, on=None, left_on=None, right_on=None,
+ left_index=False, right_index=False,
+ by=None, left_by=None, right_by=None,
+ axis=1, suffixes=('_x', '_y'), copy=True,
+ fill_method=None,
+ how='asof', tolerance=None,
+ allow_exact_matches=True,
+ direction='backward'):
+
+ self.by = by
+ self.left_by = left_by
+ self.right_by = right_by
+ self.tolerance = tolerance
+ self.allow_exact_matches = allow_exact_matches
+ self.direction = direction
+
+ _OrderedMerge.__init__(self, left, right, on=on, left_on=left_on,
+ right_on=right_on, left_index=left_index,
+ right_index=right_index, axis=axis,
+ how=how, suffixes=suffixes,
+ fill_method=fill_method)
+
+ def _validate_specification(self):
+ super(_AsOfMerge, self)._validate_specification()
+
+        # we only allow 'on' to be a single key
+ if len(self.left_on) != 1 and not self.left_index:
+ raise MergeError("can only asof on a key for left")
+
+ if len(self.right_on) != 1 and not self.right_index:
+ raise MergeError("can only asof on a key for right")
+
+ if self.left_index and isinstance(self.left.index, MultiIndex):
+ raise MergeError("left can only have one index")
+
+ if self.right_index and isinstance(self.right.index, MultiIndex):
+ raise MergeError("right can only have one index")
+
+ # set 'by' columns
+ if self.by is not None:
+ if self.left_by is not None or self.right_by is not None:
+ raise MergeError('Can only pass by OR left_by '
+ 'and right_by')
+ self.left_by = self.right_by = self.by
+ if self.left_by is None and self.right_by is not None:
+ raise MergeError('missing left_by')
+ if self.left_by is not None and self.right_by is None:
+ raise MergeError('missing right_by')
+
+ # add 'by' to our key-list so we can have it in the
+ # output as a key
+ if self.left_by is not None:
+ if not is_list_like(self.left_by):
+ self.left_by = [self.left_by]
+ if not is_list_like(self.right_by):
+ self.right_by = [self.right_by]
+
+ if len(self.left_by) != len(self.right_by):
+ raise MergeError('left_by and right_by must be same length')
+
+ self.left_on = self.left_by + list(self.left_on)
+ self.right_on = self.right_by + list(self.right_on)
+
+ # check 'direction' is valid
+ if self.direction not in ['backward', 'forward', 'nearest']:
+ raise MergeError('direction invalid: {direction}'
+ .format(direction=self.direction))
+
+ @property
+ def _asof_key(self):
+ """ This is our asof key, the 'on' """
+ return self.left_on[-1]
+
+ def _get_merge_keys(self):
+
+ # note this function has side effects
+ (left_join_keys,
+ right_join_keys,
+ join_names) = super(_AsOfMerge, self)._get_merge_keys()
+
+ # validate index types are the same
+ for i, (lk, rk) in enumerate(zip(left_join_keys, right_join_keys)):
+ if not is_dtype_equal(lk.dtype, rk.dtype):
+ raise MergeError("incompatible merge keys [{i}] {lkdtype} and "
+ "{rkdtype}, must be the same type"
+ .format(i=i, lkdtype=lk.dtype,
+ rkdtype=rk.dtype))
+
+ # validate tolerance; must be a Timedelta if we have a DTI
+ if self.tolerance is not None:
+
+ if self.left_index:
+ lt = self.left.index
+ else:
+ lt = left_join_keys[-1]
+
+ msg = ("incompatible tolerance {tolerance}, must be compat "
+ "with type {lkdtype}".format(
+ tolerance=type(self.tolerance),
+ lkdtype=lt.dtype))
+
+ if is_datetime64_dtype(lt) or is_datetime64tz_dtype(lt):
+ if not isinstance(self.tolerance, Timedelta):
+ raise MergeError(msg)
+ if self.tolerance < Timedelta(0):
+ raise MergeError("tolerance must be positive")
+
+ elif is_int64_dtype(lt):
+ if not is_integer(self.tolerance):
+ raise MergeError(msg)
+ if self.tolerance < 0:
+ raise MergeError("tolerance must be positive")
+
+ elif is_float_dtype(lt):
+ if not is_number(self.tolerance):
+ raise MergeError(msg)
+ if self.tolerance < 0:
+ raise MergeError("tolerance must be positive")
+
+ else:
+ raise MergeError("key must be integer, timestamp or float")
+
+ # validate allow_exact_matches
+ if not is_bool(self.allow_exact_matches):
+ msg = "allow_exact_matches must be boolean, passed {passed}"
+ raise MergeError(msg.format(passed=self.allow_exact_matches))
+
+ return left_join_keys, right_join_keys, join_names
+
+ def _get_join_indexers(self):
+ """ return the join indexers """
+
+ def flip(xs):
+ """ unlike np.transpose, this returns an array of tuples """
+ labels = list(string.ascii_lowercase[:len(xs)])
+ dtypes = [x.dtype for x in xs]
+ labeled_dtypes = list(zip(labels, dtypes))
+ return np.array(lzip(*xs), labeled_dtypes)
+
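+        # flip packs multiple 'by' arrays into one structured array so that a
+        # composite by-key behaves like a single hashable value; e.g. two
+        # hypothetical by-columns [1, 2] and ['x', 'y'] become the records
+        # (1, 'x') and (2, 'y').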
+ # values to compare
+ left_values = (self.left.index.values if self.left_index else
+ self.left_join_keys[-1])
+ right_values = (self.right.index.values if self.right_index else
+ self.right_join_keys[-1])
+ tolerance = self.tolerance
+
+ # we require sortedness and non-null values in the join keys
+ msg_sorted = "{side} keys must be sorted"
+ msg_missings = "Merge keys contain null values on {side} side"
+
+ if not Index(left_values).is_monotonic:
+ if isnull(left_values).any():
+ raise ValueError(msg_missings.format(side='left'))
+ else:
+ raise ValueError(msg_sorted.format(side='left'))
+
+ if not Index(right_values).is_monotonic:
+ if isnull(right_values).any():
+ raise ValueError(msg_missings.format(side='right'))
+ else:
+ raise ValueError(msg_sorted.format(side='right'))
+
+ # initial type conversion as needed
+ if needs_i8_conversion(left_values):
+ left_values = left_values.view('i8')
+ right_values = right_values.view('i8')
+ if tolerance is not None:
+ tolerance = tolerance.value
+
+ # a "by" parameter requires special handling
+ if self.left_by is not None:
+ # remove 'on' parameter from values if one existed
+ if self.left_index and self.right_index:
+ left_by_values = self.left_join_keys
+ right_by_values = self.right_join_keys
+ else:
+ left_by_values = self.left_join_keys[0:-1]
+ right_by_values = self.right_join_keys[0:-1]
+
+ # get tuple representation of values if more than one
+ if len(left_by_values) == 1:
+ left_by_values = left_by_values[0]
+ right_by_values = right_by_values[0]
+ else:
+ left_by_values = flip(left_by_values)
+ right_by_values = flip(right_by_values)
+
+ # upcast 'by' parameter because HashTable is limited
+ by_type = _get_cython_type_upcast(left_by_values.dtype)
+ by_type_caster = _type_casters[by_type]
+ left_by_values = by_type_caster(left_by_values)
+ right_by_values = by_type_caster(right_by_values)
+
+ # choose appropriate function by type
+ func = _asof_by_function(self.direction)
+ return func(left_values,
+ right_values,
+ left_by_values,
+ right_by_values,
+ self.allow_exact_matches,
+ tolerance)
+ else:
+ # choose appropriate function by type
+ func = _asof_function(self.direction)
+ return func(left_values,
+ right_values,
+ self.allow_exact_matches,
+ tolerance)
+
+
+def _get_multiindex_indexer(join_keys, index, sort):
+ from functools import partial
+
+ # bind `sort` argument
+ fkeys = partial(_factorize_keys, sort=sort)
+
+ # left & right join labels and num. of levels at each location
+ rcodes, lcodes, shape = map(list, zip(* map(fkeys,
+ index.levels,
+ join_keys)))
+ if sort:
+ rcodes = list(map(np.take, rcodes, index.codes))
+ else:
+ i8copy = lambda a: a.astype('i8', subok=False, copy=True)
+ rcodes = list(map(i8copy, index.codes))
+
+ # fix right labels if there were any nulls
+ for i in range(len(join_keys)):
+ mask = index.codes[i] == -1
+ if mask.any():
+            # check if there were already any nulls at this location;
+            # if there were, they are factorized to `shape[i] - 1`
+ a = join_keys[i][lcodes[i] == shape[i] - 1]
+ if a.size == 0 or not a[0] != a[0]:
+ shape[i] += 1
+
+ rcodes[i][mask] = shape[i] - 1
+
+ # get flat i8 join keys
+ lkey, rkey = _get_join_keys(lcodes, rcodes, shape, sort)
+
+ # factorize keys to a dense i8 space
+ lkey, rkey, count = fkeys(lkey, rkey)
+
+ return libjoin.left_outer_join(lkey, rkey, count, sort=sort)
+
+
+def _get_single_indexer(join_key, index, sort=False):
+ left_key, right_key, count = _factorize_keys(join_key, index, sort=sort)
+
+ left_indexer, right_indexer = libjoin.left_outer_join(
+ ensure_int64(left_key),
+ ensure_int64(right_key),
+ count, sort=sort)
+
+ return left_indexer, right_indexer
+
+
+def _left_join_on_index(left_ax, right_ax, join_keys, sort=False):
+ if len(join_keys) > 1:
+ if not ((isinstance(right_ax, MultiIndex) and
+ len(join_keys) == right_ax.nlevels)):
+ raise AssertionError("If more than one join key is given then "
+ "'right_ax' must be a MultiIndex and the "
+ "number of join keys must be the number of "
+ "levels in right_ax")
+
+ left_indexer, right_indexer = \
+ _get_multiindex_indexer(join_keys, right_ax, sort=sort)
+ else:
+ jkey = join_keys[0]
+
+ left_indexer, right_indexer = \
+ _get_single_indexer(jkey, right_ax, sort=sort)
+
+ if sort or len(left_ax) != len(left_indexer):
+ # if asked to sort or there are 1-to-many matches
+ join_index = left_ax.take(left_indexer)
+ return join_index, left_indexer, right_indexer
+
+ # left frame preserves order & length of its index
+ return left_ax, None, right_indexer
+
+
+def _right_outer_join(x, y, max_groups):
+ right_indexer, left_indexer = libjoin.left_outer_join(y, x, max_groups)
+ return left_indexer, right_indexer
+
+
+_join_functions = {
+ 'inner': libjoin.inner_join,
+ 'left': libjoin.left_outer_join,
+ 'right': _right_outer_join,
+ 'outer': libjoin.full_outer_join,
+}
+
+
+def _factorize_keys(lk, rk, sort=True):
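+    # Map lk and rk into a shared dense integer label space. For hypothetical
+    # keys lk=[1, 3, 1] and rk=[3, 4] (with sort=True) the result is roughly
+    # llab=[0, 1, 0], rlab=[1, 2] and count == 3; missing values get their
+    # own label equal to `count`, which is then bumped by one.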
+ # Some pre-processing for non-ndarray lk / rk
+ if is_datetime64tz_dtype(lk) and is_datetime64tz_dtype(rk):
+ lk = lk._data
+ rk = rk._data
+
+ elif (is_categorical_dtype(lk) and
+ is_categorical_dtype(rk) and
+ lk.is_dtype_equal(rk)):
+ if lk.categories.equals(rk.categories):
+ # if we exactly match in categories, allow us to factorize on codes
+ rk = rk.codes
+ else:
+ # Same categories in different orders -> recode
+ rk = _recode_for_categories(rk.codes, rk.categories, lk.categories)
+
+ lk = ensure_int64(lk.codes)
+ rk = ensure_int64(rk)
+
+ elif (is_extension_array_dtype(lk.dtype) and
+ is_extension_array_dtype(rk.dtype) and
+ lk.dtype == rk.dtype):
+ lk, _ = lk._values_for_factorize()
+ rk, _ = rk._values_for_factorize()
+
+ if is_integer_dtype(lk) and is_integer_dtype(rk):
+ # GH#23917 TODO: needs tests for case where lk is integer-dtype
+ # and rk is datetime-dtype
+ klass = libhashtable.Int64Factorizer
+ lk = ensure_int64(com.values_from_object(lk))
+ rk = ensure_int64(com.values_from_object(rk))
+ elif (issubclass(lk.dtype.type, (np.timedelta64, np.datetime64)) and
+ issubclass(rk.dtype.type, (np.timedelta64, np.datetime64))):
+ # GH#23917 TODO: Needs tests for non-matching dtypes
+ klass = libhashtable.Int64Factorizer
+ lk = ensure_int64(com.values_from_object(lk))
+ rk = ensure_int64(com.values_from_object(rk))
+ else:
+ klass = libhashtable.Factorizer
+ lk = ensure_object(lk)
+ rk = ensure_object(rk)
+
+ rizer = klass(max(len(lk), len(rk)))
+
+ llab = rizer.factorize(lk)
+ rlab = rizer.factorize(rk)
+
+ count = rizer.get_count()
+
+ if sort:
+ uniques = rizer.uniques.to_array()
+ llab, rlab = _sort_labels(uniques, llab, rlab)
+
+ # NA group
+ lmask = llab == -1
+ lany = lmask.any()
+ rmask = rlab == -1
+ rany = rmask.any()
+
+ if lany or rany:
+ if lany:
+ np.putmask(llab, lmask, count)
+ if rany:
+ np.putmask(rlab, rmask, count)
+ count += 1
+
+ return llab, rlab, count
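+
+# Illustrative sketch (not part of the upstream code): _factorize_keys maps
+# matching values in `lk` and `rk` to the same dense integer code, e.g.
+#
+#   >>> llab, rlab, count = _factorize_keys(
+#   ...     np.array(['a', 'b', 'a'], dtype=object),
+#   ...     np.array(['b', 'c'], dtype=object), sort=True)
+#   >>> llab, rlab, count
+#   (array([0, 1, 0]), array([1, 2]), 3)
+#
+# With sort=True the codes follow the sorted order of the combined uniques
+# ('a' < 'b' < 'c'); any missing values would be appended as one extra group.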
+
+
+def _sort_labels(uniques, left, right):
+ if not isinstance(uniques, np.ndarray):
+ # tuplesafe
+ uniques = Index(uniques).values
+
+ llength = len(left)
+ labels = np.concatenate([left, right])
+
+ _, new_labels = sorting.safe_sort(uniques, labels, na_sentinel=-1)
+ new_labels = ensure_int64(new_labels)
+ new_left, new_right = new_labels[:llength], new_labels[llength:]
+
+ return new_left, new_right
+
+
+def _get_join_keys(llab, rlab, shape, sort):
+
+ # how many levels can be done without overflow
+ pred = lambda i: not is_int64_overflow_possible(shape[:i])
+ nlev = next(filter(pred, range(len(shape), 0, -1)))
+
+ # get keys for the first `nlev` levels
+ stride = np.prod(shape[1:nlev], dtype='i8')
+ lkey = stride * llab[0].astype('i8', subok=False, copy=False)
+ rkey = stride * rlab[0].astype('i8', subok=False, copy=False)
+
+ for i in range(1, nlev):
+ with np.errstate(divide='ignore'):
+ stride //= shape[i]
+ lkey += llab[i] * stride
+ rkey += rlab[i] * stride
+
+ if nlev == len(shape): # all done!
+ return lkey, rkey
+
+ # densify current keys to avoid overflow
+ lkey, rkey, count = _factorize_keys(lkey, rkey, sort=sort)
+
+ llab = [lkey] + llab[nlev:]
+ rlab = [rkey] + rlab[nlev:]
+ shape = [count] + shape[nlev:]
+
+ return _get_join_keys(llab, rlab, shape, sort)
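+
+# Worked example (illustrative, not part of the upstream code): for two
+# levels with shape = [3, 4] no int64 overflow is possible, so nlev == 2 and
+# stride starts at prod(shape[1:2]) == 4.  A row whose per-level labels are
+# (1, 2) therefore gets the flat key 1 * 4 + 2 == 6, and every flat key lies
+# in [0, 3 * 4).  Only when the product of the level sizes could overflow
+# int64 are the keys densified via _factorize_keys and the remaining levels
+# folded in recursively.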
+
+
+def _should_fill(lname, rname):
+ if (not isinstance(lname, compat.string_types) or
+ not isinstance(rname, compat.string_types)):
+ return True
+ return lname == rname
+
+
+def _any(x):
+ return x is not None and com._any_not_none(*x)
+
+
+def validate_operand(obj):
+ if isinstance(obj, DataFrame):
+ return obj
+ elif isinstance(obj, Series):
+ if obj.name is None:
+ raise ValueError('Cannot merge a Series without a name')
+ else:
+ return obj.to_frame()
+ else:
+ raise TypeError('Can only merge Series or DataFrame objects, '
+ 'a {obj} was passed'.format(obj=type(obj)))
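+
+
+# Usage sketch (illustrative, not part of the upstream code):
+#   validate_operand(pd.Series([1, 2], name='a'))  -> one-column DataFrame 'a'
+#   validate_operand(pd.Series([1, 2]))            -> ValueError (no name)
+#   validate_operand([1, 2])                       -> TypeError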
diff --git a/contrib/python/pandas/py2/pandas/core/reshape/pivot.py b/contrib/python/pandas/py2/pandas/core/reshape/pivot.py
new file mode 100644
index 00000000000..c7c447d18b6
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/core/reshape/pivot.py
@@ -0,0 +1,618 @@
+# pylint: disable=E1103
+import numpy as np
+
+from pandas.compat import lrange, range, zip
+from pandas.util._decorators import Appender, Substitution
+
+from pandas.core.dtypes.cast import maybe_downcast_to_dtype
+from pandas.core.dtypes.common import is_integer_dtype, is_list_like, is_scalar
+from pandas.core.dtypes.generic import ABCDataFrame, ABCSeries
+
+from pandas import compat
+import pandas.core.common as com
+from pandas.core.frame import _shared_docs
+from pandas.core.groupby import Grouper
+from pandas.core.index import Index, MultiIndex, _get_objs_combined_axis
+from pandas.core.reshape.concat import concat
+from pandas.core.reshape.util import cartesian_product
+from pandas.core.series import Series
+
+
+# Note: We need to make sure `frame` is imported before `pivot`, otherwise
+# _shared_docs['pivot_table'] will not yet exist. TODO: Fix this dependency
+@Substitution('\ndata : DataFrame')
+@Appender(_shared_docs['pivot_table'], indents=1)
+def pivot_table(data, values=None, index=None, columns=None, aggfunc='mean',
+ fill_value=None, margins=False, dropna=True,
+ margins_name='All'):
+ index = _convert_by(index)
+ columns = _convert_by(columns)
+
+ if isinstance(aggfunc, list):
+ pieces = []
+ keys = []
+ for func in aggfunc:
+ table = pivot_table(data, values=values, index=index,
+ columns=columns,
+ fill_value=fill_value, aggfunc=func,
+ margins=margins, margins_name=margins_name)
+ pieces.append(table)
+ keys.append(getattr(func, '__name__', func))
+
+ return concat(pieces, keys=keys, axis=1)
+
+ keys = index + columns
+
+ values_passed = values is not None
+ if values_passed:
+ if is_list_like(values):
+ values_multi = True
+ values = list(values)
+ else:
+ values_multi = False
+ values = [values]
+
+ # GH14938 Make sure value labels are in data
+ for i in values:
+ if i not in data:
+ raise KeyError(i)
+
+ to_filter = []
+ for x in keys + values:
+ if isinstance(x, Grouper):
+ x = x.key
+ try:
+ if x in data:
+ to_filter.append(x)
+ except TypeError:
+ pass
+ if len(to_filter) < len(data.columns):
+ data = data[to_filter]
+
+ else:
+ values = data.columns
+ for key in keys:
+ try:
+ values = values.drop(key)
+ except (TypeError, ValueError, KeyError):
+ pass
+ values = list(values)
+
+ grouped = data.groupby(keys, observed=False)
+ agged = grouped.agg(aggfunc)
+ if dropna and isinstance(agged, ABCDataFrame) and len(agged.columns):
+ agged = agged.dropna(how='all')
+
+ # gh-21133
+ # we want to down cast if
+ # the original values are ints
+ # as we grouped with a NaN value
+ # and then dropped, coercing to floats
+ for v in [v for v in values if v in data and v in agged]:
+ if (is_integer_dtype(data[v]) and
+ not is_integer_dtype(agged[v])):
+ agged[v] = maybe_downcast_to_dtype(agged[v], data[v].dtype)
+
+ table = agged
+ if table.index.nlevels > 1:
+ # Related GH #17123
+ # If index_names are integers, determine whether the integers refer
+ # to the level position or name.
+ index_names = agged.index.names[:len(index)]
+ to_unstack = []
+ for i in range(len(index), len(keys)):
+ name = agged.index.names[i]
+ if name is None or name in index_names:
+ to_unstack.append(i)
+ else:
+ to_unstack.append(name)
+ table = agged.unstack(to_unstack)
+
+ if not dropna:
+ from pandas import MultiIndex
+ if table.index.nlevels > 1:
+ m = MultiIndex.from_arrays(cartesian_product(table.index.levels),
+ names=table.index.names)
+ table = table.reindex(m, axis=0)
+
+ if table.columns.nlevels > 1:
+ m = MultiIndex.from_arrays(cartesian_product(table.columns.levels),
+ names=table.columns.names)
+ table = table.reindex(m, axis=1)
+
+ if isinstance(table, ABCDataFrame):
+ table = table.sort_index(axis=1)
+
+ if fill_value is not None:
+ table = table.fillna(value=fill_value, downcast='infer')
+
+ if margins:
+ if dropna:
+ data = data[data.notna().all(axis=1)]
+ table = _add_margins(table, data, values, rows=index,
+ cols=columns, aggfunc=aggfunc,
+ observed=dropna,
+ margins_name=margins_name, fill_value=fill_value)
+
+ # discard the top level
+ if (values_passed and not values_multi and not table.empty and
+ (table.columns.nlevels > 1)):
+ table = table[values[0]]
+
+ if len(index) == 0 and len(columns) > 0:
+ table = table.T
+
+ # GH 15193 Make sure empty columns are removed if dropna=True
+ if isinstance(table, ABCDataFrame) and dropna:
+ table = table.dropna(how='all', axis=1)
+
+ return table
+
+
+def _add_margins(table, data, values, rows, cols, aggfunc,
+ observed=None, margins_name='All', fill_value=None):
+ if not isinstance(margins_name, compat.string_types):
+ raise ValueError('margins_name argument must be a string')
+
+ msg = u'Conflicting name "{name}" in margins'.format(name=margins_name)
+ for level in table.index.names:
+ if margins_name in table.index.get_level_values(level):
+ raise ValueError(msg)
+
+ grand_margin = _compute_grand_margin(data, values, aggfunc, margins_name)
+
+ # could be passed a Series object with no 'columns'
+ if hasattr(table, 'columns'):
+ for level in table.columns.names[1:]:
+ if margins_name in table.columns.get_level_values(level):
+ raise ValueError(msg)
+
+ if len(rows) > 1:
+ key = (margins_name,) + ('',) * (len(rows) - 1)
+ else:
+ key = margins_name
+
+ if not values and isinstance(table, ABCSeries):
+        # If there are no values and the table is a Series, then there is only
+        # one column in the data. Compute grand margin and return it.
+ return table.append(Series({key: grand_margin[margins_name]}))
+
+ if values:
+ marginal_result_set = _generate_marginal_results(table, data, values,
+ rows, cols, aggfunc,
+ observed,
+ grand_margin,
+ margins_name)
+ if not isinstance(marginal_result_set, tuple):
+ return marginal_result_set
+ result, margin_keys, row_margin = marginal_result_set
+ else:
+ marginal_result_set = _generate_marginal_results_without_values(
+ table, data, rows, cols, aggfunc, observed, margins_name)
+ if not isinstance(marginal_result_set, tuple):
+ return marginal_result_set
+ result, margin_keys, row_margin = marginal_result_set
+ row_margin = row_margin.reindex(result.columns, fill_value=fill_value)
+ # populate grand margin
+ for k in margin_keys:
+ if isinstance(k, compat.string_types):
+ row_margin[k] = grand_margin[k]
+ else:
+ row_margin[k] = grand_margin[k[0]]
+
+ from pandas import DataFrame
+ margin_dummy = DataFrame(row_margin, columns=[key]).T
+
+ row_names = result.index.names
+ try:
+ for dtype in set(result.dtypes):
+ cols = result.select_dtypes([dtype]).columns
+ margin_dummy[cols] = margin_dummy[cols].astype(dtype)
+ result = result.append(margin_dummy)
+ except TypeError:
+
+ # we cannot reshape, so coerce the axis
+ result.index = result.index._to_safe_for_reshape()
+ result = result.append(margin_dummy)
+ result.index.names = row_names
+
+ return result
+
+
+def _compute_grand_margin(data, values, aggfunc,
+ margins_name='All'):
+
+ if values:
+ grand_margin = {}
+ for k, v in data[values].iteritems():
+ try:
+ if isinstance(aggfunc, compat.string_types):
+ grand_margin[k] = getattr(v, aggfunc)()
+ elif isinstance(aggfunc, dict):
+ if isinstance(aggfunc[k], compat.string_types):
+ grand_margin[k] = getattr(v, aggfunc[k])()
+ else:
+ grand_margin[k] = aggfunc[k](v)
+ else:
+ grand_margin[k] = aggfunc(v)
+ except TypeError:
+ pass
+ return grand_margin
+ else:
+ return {margins_name: aggfunc(data.index)}
+
+
+def _generate_marginal_results(table, data, values, rows, cols, aggfunc,
+ observed,
+ grand_margin,
+ margins_name='All'):
+ if len(cols) > 0:
+ # need to "interleave" the margins
+ table_pieces = []
+ margin_keys = []
+
+ def _all_key(key):
+ return (key, margins_name) + ('',) * (len(cols) - 1)
+
+ if len(rows) > 0:
+ margin = data[rows + values].groupby(
+ rows, observed=observed).agg(aggfunc)
+ cat_axis = 1
+
+ for key, piece in table.groupby(level=0,
+ axis=cat_axis,
+ observed=observed):
+ all_key = _all_key(key)
+
+ # we are going to mutate this, so need to copy!
+ piece = piece.copy()
+ try:
+ piece[all_key] = margin[key]
+ except TypeError:
+
+ # we cannot reshape, so coerce the axis
+ piece.set_axis(piece._get_axis(
+ cat_axis)._to_safe_for_reshape(),
+ axis=cat_axis, inplace=True)
+ piece[all_key] = margin[key]
+
+ table_pieces.append(piece)
+ margin_keys.append(all_key)
+ else:
+ margin = grand_margin
+ cat_axis = 0
+ for key, piece in table.groupby(level=0,
+ axis=cat_axis,
+ observed=observed):
+ all_key = _all_key(key)
+ table_pieces.append(piece)
+ table_pieces.append(Series(margin[key], index=[all_key]))
+ margin_keys.append(all_key)
+
+ result = concat(table_pieces, axis=cat_axis)
+
+ if len(rows) == 0:
+ return result
+ else:
+ result = table
+ margin_keys = table.columns
+
+ if len(cols) > 0:
+ row_margin = data[cols + values].groupby(
+ cols, observed=observed).agg(aggfunc)
+ row_margin = row_margin.stack()
+
+ # slight hack
+ new_order = [len(cols)] + lrange(len(cols))
+ row_margin.index = row_margin.index.reorder_levels(new_order)
+ else:
+ row_margin = Series(np.nan, index=result.columns)
+
+ return result, margin_keys, row_margin
+
+
+def _generate_marginal_results_without_values(
+ table, data, rows, cols, aggfunc,
+ observed, margins_name='All'):
+ if len(cols) > 0:
+ # need to "interleave" the margins
+ margin_keys = []
+
+ def _all_key():
+ if len(cols) == 1:
+ return margins_name
+ return (margins_name, ) + ('', ) * (len(cols) - 1)
+
+ if len(rows) > 0:
+ margin = data[rows].groupby(rows,
+ observed=observed).apply(aggfunc)
+ all_key = _all_key()
+ table[all_key] = margin
+ result = table
+ margin_keys.append(all_key)
+
+ else:
+ margin = data.groupby(level=0,
+ axis=0,
+ observed=observed).apply(aggfunc)
+ all_key = _all_key()
+ table[all_key] = margin
+ result = table
+ margin_keys.append(all_key)
+ return result
+ else:
+ result = table
+ margin_keys = table.columns
+
+ if len(cols):
+ row_margin = data[cols].groupby(cols, observed=observed).apply(aggfunc)
+ else:
+ row_margin = Series(np.nan, index=result.columns)
+
+ return result, margin_keys, row_margin
+
+
+def _convert_by(by):
+ if by is None:
+ by = []
+ elif (is_scalar(by) or
+ isinstance(by, (np.ndarray, Index, ABCSeries, Grouper)) or
+ hasattr(by, '__call__')):
+ by = [by]
+ else:
+ by = list(by)
+ return by
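+
+# Illustrative behaviour (not part of the upstream code):
+#   _convert_by(None)        -> []
+#   _convert_by('a')         -> ['a']
+#   _convert_by(np.mean)     -> [np.mean]      (callables are wrapped)
+#   _convert_by(['a', 'b'])  -> ['a', 'b']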
+
+
+@Substitution('\ndata : DataFrame')
+@Appender(_shared_docs['pivot'], indents=1)
+def pivot(data, index=None, columns=None, values=None):
+ if values is None:
+ cols = [columns] if index is None else [index, columns]
+ append = index is None
+ indexed = data.set_index(cols, append=append)
+ else:
+ if index is None:
+ index = data.index
+ else:
+ index = data[index]
+ index = MultiIndex.from_arrays([index, data[columns]])
+
+ if is_list_like(values) and not isinstance(values, tuple):
+ # Exclude tuple because it is seen as a single column name
+ indexed = data._constructor(data[values].values, index=index,
+ columns=values)
+ else:
+ indexed = data._constructor_sliced(data[values].values,
+ index=index)
+ return indexed.unstack(columns)
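+
+# Illustrative sketch (not part of the upstream code): for a frame like
+#
+#       date        variable  value
+#       2019-01-01  A             1
+#       2019-01-01  B             2
+#       2019-01-02  A             3
+#       2019-01-02  B             4
+#
+# pivot(df, index='date', columns='variable', values='value') builds a
+# MultiIndex of (date, variable) pairs, wraps the 'value' column in a Series
+# and unstacks 'variable' into the columns:
+#
+#       variable     A  B
+#       date
+#       2019-01-01   1  2
+#       2019-01-02   3  4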
+
+
+def crosstab(index, columns, values=None, rownames=None, colnames=None,
+ aggfunc=None, margins=False, margins_name='All', dropna=True,
+ normalize=False):
+ """
+ Compute a simple cross-tabulation of two (or more) factors. By default
+ computes a frequency table of the factors unless an array of values and an
+    aggregation function are passed.
+
+ Parameters
+ ----------
+ index : array-like, Series, or list of arrays/Series
+ Values to group by in the rows
+ columns : array-like, Series, or list of arrays/Series
+ Values to group by in the columns
+ values : array-like, optional
+ Array of values to aggregate according to the factors.
+ Requires `aggfunc` be specified.
+ rownames : sequence, default None
+ If passed, must match number of row arrays passed
+ colnames : sequence, default None
+ If passed, must match number of column arrays passed
+ aggfunc : function, optional
+ If specified, requires `values` be specified as well
+ margins : boolean, default False
+ Add row/column margins (subtotals)
+ margins_name : string, default 'All'
+ Name of the row / column that will contain the totals
+ when margins is True.
+
+ .. versionadded:: 0.21.0
+
+ dropna : boolean, default True
+ Do not include columns whose entries are all NaN
+ normalize : boolean, {'all', 'index', 'columns'}, or {0,1}, default False
+ Normalize by dividing all values by the sum of values.
+
+ - If passed 'all' or `True`, will normalize over all values.
+ - If passed 'index' will normalize over each row.
+ - If passed 'columns' will normalize over each column.
+ - If margins is `True`, will also normalize margin values.
+
+ .. versionadded:: 0.18.1
+
+ Returns
+ -------
+ crosstab : DataFrame
+
+ Notes
+ -----
+ Any Series passed will have their name attributes used unless row or column
+ names for the cross-tabulation are specified.
+
+ Any input passed containing Categorical data will have **all** of its
+ categories included in the cross-tabulation, even if the actual data does
+ not contain any instances of a particular category.
+
+    In the event that there aren't overlapping indexes, an empty DataFrame
+    will be returned.
+
+ Examples
+ --------
+ >>> a = np.array(["foo", "foo", "foo", "foo", "bar", "bar",
+ ... "bar", "bar", "foo", "foo", "foo"], dtype=object)
+ >>> b = np.array(["one", "one", "one", "two", "one", "one",
+ ... "one", "two", "two", "two", "one"], dtype=object)
+ >>> c = np.array(["dull", "dull", "shiny", "dull", "dull", "shiny",
+ ... "shiny", "dull", "shiny", "shiny", "shiny"],
+ ... dtype=object)
+
+ >>> pd.crosstab(a, [b, c], rownames=['a'], colnames=['b', 'c'])
+ ... # doctest: +NORMALIZE_WHITESPACE
+ b one two
+ c dull shiny dull shiny
+ a
+ bar 1 2 1 0
+ foo 2 2 1 2
+
+ >>> foo = pd.Categorical(['a', 'b'], categories=['a', 'b', 'c'])
+ >>> bar = pd.Categorical(['d', 'e'], categories=['d', 'e', 'f'])
+ >>> crosstab(foo, bar) # 'c' and 'f' are not represented in the data,
+ # and will not be shown in the output because
+ # dropna is True by default. Set 'dropna=False'
+ # to preserve categories with no data
+ ... # doctest: +SKIP
+ col_0 d e
+ row_0
+ a 1 0
+ b 0 1
+
+ >>> crosstab(foo, bar, dropna=False) # 'c' and 'f' are not represented
+ # in the data, but they still will be counted
+ # and shown in the output
+ ... # doctest: +SKIP
+ col_0 d e f
+ row_0
+ a 1 0 0
+ b 0 1 0
+ c 0 0 0
+ """
+
+ index = com.maybe_make_list(index)
+ columns = com.maybe_make_list(columns)
+
+ rownames = _get_names(index, rownames, prefix='row')
+ colnames = _get_names(columns, colnames, prefix='col')
+
+ common_idx = _get_objs_combined_axis(index + columns, intersect=True,
+ sort=False)
+
+ data = {}
+ data.update(zip(rownames, index))
+ data.update(zip(colnames, columns))
+
+ if values is None and aggfunc is not None:
+ raise ValueError("aggfunc cannot be used without values.")
+
+ if values is not None and aggfunc is None:
+ raise ValueError("values cannot be used without an aggfunc.")
+
+ from pandas import DataFrame
+ df = DataFrame(data, index=common_idx)
+ if values is None:
+ df['__dummy__'] = 0
+ kwargs = {'aggfunc': len, 'fill_value': 0}
+ else:
+ df['__dummy__'] = values
+ kwargs = {'aggfunc': aggfunc}
+
+ table = df.pivot_table('__dummy__', index=rownames, columns=colnames,
+ margins=margins, margins_name=margins_name,
+ dropna=dropna, **kwargs)
+
+ # Post-process
+ if normalize is not False:
+ table = _normalize(table, normalize=normalize, margins=margins,
+ margins_name=margins_name)
+
+ return table
+
+
+def _normalize(table, normalize, margins, margins_name='All'):
+
+ if not isinstance(normalize, bool) and not isinstance(normalize,
+ compat.string_types):
+ axis_subs = {0: 'index', 1: 'columns'}
+ try:
+ normalize = axis_subs[normalize]
+ except KeyError:
+ raise ValueError("Not a valid normalize argument")
+
+ if margins is False:
+
+ # Actual Normalizations
+ normalizers = {
+ 'all': lambda x: x / x.sum(axis=1).sum(axis=0),
+ 'columns': lambda x: x / x.sum(),
+ 'index': lambda x: x.div(x.sum(axis=1), axis=0)
+ }
+
+ normalizers[True] = normalizers['all']
+
+ try:
+ f = normalizers[normalize]
+ except KeyError:
+ raise ValueError("Not a valid normalize argument")
+
+ table = f(table)
+ table = table.fillna(0)
+
+ elif margins is True:
+
+ column_margin = table.loc[:, margins_name].drop(margins_name)
+ index_margin = table.loc[margins_name, :].drop(margins_name)
+ table = table.drop(margins_name, axis=1).drop(margins_name)
+ # to keep index and columns names
+ table_index_names = table.index.names
+ table_columns_names = table.columns.names
+
+ # Normalize core
+ table = _normalize(table, normalize=normalize, margins=False)
+
+ # Fix Margins
+ if normalize == 'columns':
+ column_margin = column_margin / column_margin.sum()
+ table = concat([table, column_margin], axis=1)
+ table = table.fillna(0)
+
+ elif normalize == 'index':
+ index_margin = index_margin / index_margin.sum()
+ table = table.append(index_margin)
+ table = table.fillna(0)
+
+ elif normalize == "all" or normalize is True:
+ column_margin = column_margin / column_margin.sum()
+ index_margin = index_margin / index_margin.sum()
+ index_margin.loc[margins_name] = 1
+ table = concat([table, column_margin], axis=1)
+ table = table.append(index_margin)
+
+ table = table.fillna(0)
+
+ else:
+ raise ValueError("Not a valid normalize argument")
+
+ table.index.names = table_index_names
+ table.columns.names = table_columns_names
+
+ else:
+ raise ValueError("Not a valid margins argument")
+
+ return table
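+
+# Worked example (illustrative, not part of the upstream code), margins=False,
+# for table = DataFrame([[1, 1], [2, 2]]):
+#   normalize='index'   -> each row divided by its row sum:  [[.5, .5], [.5, .5]]
+#   normalize='columns' -> each column divided by its sum:   [[1/3, 1/3], [2/3, 2/3]]
+#   normalize='all'     -> every cell divided by the grand total (6)
+# With margins=True the margin row/column is normalized separately and glued
+# back on, as implemented above.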
+
+
+def _get_names(arrs, names, prefix='row'):
+ if names is None:
+ names = []
+ for i, arr in enumerate(arrs):
+ if isinstance(arr, ABCSeries) and arr.name is not None:
+ names.append(arr.name)
+ else:
+ names.append('{prefix}_{i}'.format(prefix=prefix, i=i))
+ else:
+ if len(names) != len(arrs):
+ raise AssertionError('arrays and names must have the same length')
+ if not isinstance(names, list):
+ names = list(names)
+
+ return names
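+
+# Illustrative behaviour (not part of the upstream code):
+#   _get_names([pd.Series([1], name='x'), [2, 3]], None, prefix='row')
+#       -> ['x', 'row_1']
+#   _get_names([[1, 2]], ['idx'], prefix='row')      -> ['idx']
+#   _get_names([[1, 2]], ['a', 'b'], prefix='row')   -> AssertionError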
diff --git a/contrib/python/pandas/py2/pandas/core/reshape/reshape.py b/contrib/python/pandas/py2/pandas/core/reshape/reshape.py
new file mode 100644
index 00000000000..f436b3b92a3
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/core/reshape/reshape.py
@@ -0,0 +1,1044 @@
+# pylint: disable=E1101,E1103
+# pylint: disable=W0703,W0622,W0613,W0201
+from functools import partial
+import itertools
+
+import numpy as np
+
+from pandas._libs import algos as _algos, reshape as _reshape
+from pandas._libs.sparse import IntIndex
+from pandas.compat import PY2, range, text_type, u, zip
+
+from pandas.core.dtypes.cast import maybe_promote
+from pandas.core.dtypes.common import (
+ ensure_platform_int, is_bool_dtype, is_extension_array_dtype,
+ is_integer_dtype, is_list_like, is_object_dtype, needs_i8_conversion)
+from pandas.core.dtypes.missing import notna
+
+from pandas import compat
+import pandas.core.algorithms as algos
+from pandas.core.arrays import SparseArray
+from pandas.core.arrays.categorical import _factorize_from_iterable
+from pandas.core.frame import DataFrame
+from pandas.core.index import Index, MultiIndex
+from pandas.core.internals.arrays import extract_array
+from pandas.core.series import Series
+from pandas.core.sorting import (
+ compress_group_index, decons_obs_group_ids, get_compressed_ids,
+ get_group_index)
+
+
+class _Unstacker(object):
+ """
+ Helper class to unstack data / pivot with multi-level index
+
+ Parameters
+ ----------
+ values : ndarray
+ Values of DataFrame to "Unstack"
+ index : object
+ Pandas ``Index``
+ level : int or str, default last level
+ Level to "unstack". Accepts a name for the level.
+ value_columns : Index, optional
+ Pandas ``Index`` or ``MultiIndex`` object if unstacking a DataFrame
+ fill_value : scalar, optional
+ Default value to fill in missing values if subgroups do not have the
+ same set of labels. By default, missing values will be replaced with
+ the default fill value for that data type, NaN for float, NaT for
+        datetimelike, etc. For integer types, by default data will be converted to
+ float and missing values will be set to NaN.
+ constructor : object
+ Pandas ``DataFrame`` or subclass used to create unstacked
+ response. If None, DataFrame or SparseDataFrame will be used.
+
+ Examples
+ --------
+ >>> index = pd.MultiIndex.from_tuples([('one', 'a'), ('one', 'b'),
+ ... ('two', 'a'), ('two', 'b')])
+ >>> s = pd.Series(np.arange(1, 5, dtype=np.int64), index=index)
+ >>> s
+ one a 1
+ b 2
+ two a 3
+ b 4
+ dtype: int64
+
+ >>> s.unstack(level=-1)
+ a b
+ one 1 2
+ two 3 4
+
+ >>> s.unstack(level=0)
+ one two
+ a 1 3
+ b 2 4
+
+ Returns
+ -------
+ unstacked : DataFrame
+ """
+
+ def __init__(self, values, index, level=-1, value_columns=None,
+ fill_value=None, constructor=None):
+
+ if values.ndim == 1:
+ values = values[:, np.newaxis]
+ self.values = values
+ self.value_columns = value_columns
+ self.fill_value = fill_value
+
+ if constructor is None:
+ constructor = DataFrame
+ self.constructor = constructor
+
+ if value_columns is None and values.shape[1] != 1: # pragma: no cover
+ raise ValueError('must pass column labels for multi-column data')
+
+ self.index = index.remove_unused_levels()
+
+ self.level = self.index._get_level_number(level)
+
+ # when index includes `nan`, need to lift levels/strides by 1
+ self.lift = 1 if -1 in self.index.codes[self.level] else 0
+
+ self.new_index_levels = list(self.index.levels)
+ self.new_index_names = list(self.index.names)
+
+ self.removed_name = self.new_index_names.pop(self.level)
+ self.removed_level = self.new_index_levels.pop(self.level)
+ self.removed_level_full = index.levels[self.level]
+
+        # Bug fix GH 20601
+        # If the data frame is too big, the number of unique index combinations
+        # will cause int32 overflow on Windows environments.
+        # We want to check and raise an error before this happens.
+ num_rows = np.max([index_level.size for index_level
+ in self.new_index_levels])
+ num_columns = self.removed_level.size
+
+ # GH20601: This forces an overflow if the number of cells is too high.
+ num_cells = np.multiply(num_rows, num_columns, dtype=np.int32)
+
+ if num_rows > 0 and num_columns > 0 and num_cells <= 0:
+ raise ValueError('Unstacked DataFrame is too big, '
+ 'causing int32 overflow')
+
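+        # Illustrative note (not part of the upstream code): int32 arithmetic
+        # wraps around, e.g. np.multiply(2 ** 16, 2 ** 16, dtype=np.int32)
+        # evaluates to 0, so a 65536 x 65536 unstack would be rejected by the
+        # check above even though its true cell count (2 ** 32) is positive.
+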
+ self._make_sorted_values_labels()
+ self._make_selectors()
+
+ def _make_sorted_values_labels(self):
+ v = self.level
+
+ codes = list(self.index.codes)
+ levs = list(self.index.levels)
+ to_sort = codes[:v] + codes[v + 1:] + [codes[v]]
+ sizes = [len(x) for x in levs[:v] + levs[v + 1:] + [levs[v]]]
+
+ comp_index, obs_ids = get_compressed_ids(to_sort, sizes)
+ ngroups = len(obs_ids)
+
+ indexer = _algos.groupsort_indexer(comp_index, ngroups)[0]
+ indexer = ensure_platform_int(indexer)
+
+ self.sorted_values = algos.take_nd(self.values, indexer, axis=0)
+ self.sorted_labels = [l.take(indexer) for l in to_sort]
+
+ def _make_selectors(self):
+ new_levels = self.new_index_levels
+
+ # make the mask
+ remaining_labels = self.sorted_labels[:-1]
+ level_sizes = [len(x) for x in new_levels]
+
+ comp_index, obs_ids = get_compressed_ids(remaining_labels, level_sizes)
+ ngroups = len(obs_ids)
+
+ comp_index = ensure_platform_int(comp_index)
+ stride = self.index.levshape[self.level] + self.lift
+ self.full_shape = ngroups, stride
+
+ selector = self.sorted_labels[-1] + stride * comp_index + self.lift
+ mask = np.zeros(np.prod(self.full_shape), dtype=bool)
+ mask.put(selector, True)
+
+ if mask.sum() < len(self.index):
+ raise ValueError('Index contains duplicate entries, '
+ 'cannot reshape')
+
+ self.group_index = comp_index
+ self.mask = mask
+ self.unique_groups = obs_ids
+ self.compressor = comp_index.searchsorted(np.arange(ngroups))
+
+ def get_result(self):
+ values, _ = self.get_new_values()
+ columns = self.get_new_columns()
+ index = self.get_new_index()
+
+ return self.constructor(values, index=index, columns=columns)
+
+ def get_new_values(self):
+ values = self.values
+
+ # place the values
+ length, width = self.full_shape
+ stride = values.shape[1]
+ result_width = width * stride
+ result_shape = (length, result_width)
+ mask = self.mask
+ mask_all = mask.all()
+
+ # we can simply reshape if we don't have a mask
+ if mask_all and len(values):
+ new_values = (self.sorted_values
+ .reshape(length, width, stride)
+ .swapaxes(1, 2)
+ .reshape(result_shape)
+ )
+ new_mask = np.ones(result_shape, dtype=bool)
+ return new_values, new_mask
+
+ # if our mask is all True, then we can use our existing dtype
+ if mask_all:
+ dtype = values.dtype
+ new_values = np.empty(result_shape, dtype=dtype)
+ else:
+ dtype, fill_value = maybe_promote(values.dtype, self.fill_value)
+ new_values = np.empty(result_shape, dtype=dtype)
+ new_values.fill(fill_value)
+
+ new_mask = np.zeros(result_shape, dtype=bool)
+
+ name = np.dtype(dtype).name
+ sorted_values = self.sorted_values
+
+ # we need to convert to a basic dtype
+ # and possibly coerce an input to our output dtype
+ # e.g. ints -> floats
+ if needs_i8_conversion(values):
+ sorted_values = sorted_values.view('i8')
+ new_values = new_values.view('i8')
+ name = 'int64'
+ elif is_bool_dtype(values):
+ sorted_values = sorted_values.astype('object')
+ new_values = new_values.astype('object')
+ name = 'object'
+ else:
+ sorted_values = sorted_values.astype(name, copy=False)
+
+ # fill in our values & mask
+ f = getattr(_reshape, "unstack_{name}".format(name=name))
+ f(sorted_values,
+ mask.view('u1'),
+ stride,
+ length,
+ width,
+ new_values,
+ new_mask.view('u1'))
+
+ # reconstruct dtype if needed
+ if needs_i8_conversion(values):
+ new_values = new_values.view(values.dtype)
+
+ return new_values, new_mask
+
+ def get_new_columns(self):
+ if self.value_columns is None:
+ if self.lift == 0:
+ return self.removed_level
+
+ lev = self.removed_level
+ return lev.insert(0, lev._na_value)
+
+ stride = len(self.removed_level) + self.lift
+ width = len(self.value_columns)
+ propagator = np.repeat(np.arange(width), stride)
+ if isinstance(self.value_columns, MultiIndex):
+ new_levels = self.value_columns.levels + (self.removed_level_full,)
+ new_names = self.value_columns.names + (self.removed_name,)
+
+ new_codes = [lab.take(propagator)
+ for lab in self.value_columns.codes]
+ else:
+ new_levels = [self.value_columns, self.removed_level_full]
+ new_names = [self.value_columns.name, self.removed_name]
+ new_codes = [propagator]
+
+ # The two indices differ only if the unstacked level had unused items:
+ if len(self.removed_level_full) != len(self.removed_level):
+ # In this case, we remap the new codes to the original level:
+ repeater = self.removed_level_full.get_indexer(self.removed_level)
+ if self.lift:
+ repeater = np.insert(repeater, 0, -1)
+ else:
+ # Otherwise, we just use each level item exactly once:
+ repeater = np.arange(stride) - self.lift
+
+ # The entire level is then just a repetition of the single chunk:
+ new_codes.append(np.tile(repeater, width))
+ return MultiIndex(levels=new_levels, codes=new_codes,
+ names=new_names, verify_integrity=False)
+
+ def get_new_index(self):
+ result_codes = [lab.take(self.compressor)
+ for lab in self.sorted_labels[:-1]]
+
+ # construct the new index
+ if len(self.new_index_levels) == 1:
+ lev, lab = self.new_index_levels[0], result_codes[0]
+ if (lab == -1).any():
+ lev = lev.insert(len(lev), lev._na_value)
+ return lev.take(lab)
+
+ return MultiIndex(levels=self.new_index_levels, codes=result_codes,
+ names=self.new_index_names, verify_integrity=False)
+
+
+def _unstack_multiple(data, clocs, fill_value=None):
+ if len(clocs) == 0:
+ return data
+
+ # NOTE: This doesn't deal with hierarchical columns yet
+
+ index = data.index
+
+ clocs = [index._get_level_number(i) for i in clocs]
+
+ rlocs = [i for i in range(index.nlevels) if i not in clocs]
+
+ clevels = [index.levels[i] for i in clocs]
+ ccodes = [index.codes[i] for i in clocs]
+ cnames = [index.names[i] for i in clocs]
+ rlevels = [index.levels[i] for i in rlocs]
+ rcodes = [index.codes[i] for i in rlocs]
+ rnames = [index.names[i] for i in rlocs]
+
+ shape = [len(x) for x in clevels]
+ group_index = get_group_index(ccodes, shape, sort=False, xnull=False)
+
+ comp_ids, obs_ids = compress_group_index(group_index, sort=False)
+ recons_codes = decons_obs_group_ids(comp_ids, obs_ids, shape, ccodes,
+ xnull=False)
+
+ if rlocs == []:
+ # Everything is in clocs, so the dummy df has a regular index
+ dummy_index = Index(obs_ids, name='__placeholder__')
+ else:
+ dummy_index = MultiIndex(levels=rlevels + [obs_ids],
+ codes=rcodes + [comp_ids],
+ names=rnames + ['__placeholder__'],
+ verify_integrity=False)
+
+ if isinstance(data, Series):
+ dummy = data.copy()
+ dummy.index = dummy_index
+
+ unstacked = dummy.unstack('__placeholder__', fill_value=fill_value)
+ new_levels = clevels
+ new_names = cnames
+ new_codes = recons_codes
+ else:
+ if isinstance(data.columns, MultiIndex):
+ result = data
+ for i in range(len(clocs)):
+ val = clocs[i]
+ result = result.unstack(val)
+ clocs = [v if i > v else v - 1 for v in clocs]
+
+ return result
+
+ dummy = data.copy()
+ dummy.index = dummy_index
+
+ unstacked = dummy.unstack('__placeholder__', fill_value=fill_value)
+ if isinstance(unstacked, Series):
+ unstcols = unstacked.index
+ else:
+ unstcols = unstacked.columns
+ new_levels = [unstcols.levels[0]] + clevels
+ new_names = [data.columns.name] + cnames
+
+ new_codes = [unstcols.codes[0]]
+ for rec in recons_codes:
+ new_codes.append(rec.take(unstcols.codes[-1]))
+
+ new_columns = MultiIndex(levels=new_levels, codes=new_codes,
+ names=new_names, verify_integrity=False)
+
+ if isinstance(unstacked, Series):
+ unstacked.index = new_columns
+ else:
+ unstacked.columns = new_columns
+
+ return unstacked
+
+
+def unstack(obj, level, fill_value=None):
+ if isinstance(level, (tuple, list)):
+ if len(level) != 1:
+ # _unstack_multiple only handles MultiIndexes,
+ # and isn't needed for a single level
+ return _unstack_multiple(obj, level, fill_value=fill_value)
+ else:
+ level = level[0]
+
+ if isinstance(obj, DataFrame):
+ if isinstance(obj.index, MultiIndex):
+ return _unstack_frame(obj, level, fill_value=fill_value)
+ else:
+ return obj.T.stack(dropna=False)
+ else:
+ if is_extension_array_dtype(obj.dtype):
+ return _unstack_extension_series(obj, level, fill_value)
+ unstacker = _Unstacker(obj.values, obj.index, level=level,
+ fill_value=fill_value,
+ constructor=obj._constructor_expanddim)
+ return unstacker.get_result()
+
+
+def _unstack_frame(obj, level, fill_value=None):
+ if obj._is_mixed_type:
+ unstacker = partial(_Unstacker, index=obj.index,
+ level=level, fill_value=fill_value)
+ blocks = obj._data.unstack(unstacker,
+ fill_value=fill_value)
+ return obj._constructor(blocks)
+ else:
+ unstacker = _Unstacker(obj.values, obj.index, level=level,
+ value_columns=obj.columns,
+ fill_value=fill_value,
+ constructor=obj._constructor)
+ return unstacker.get_result()
+
+
+def _unstack_extension_series(series, level, fill_value):
+ """
+ Unstack an ExtensionArray-backed Series.
+
+ The ExtensionDtype is preserved.
+
+ Parameters
+ ----------
+ series : Series
+ A Series with an ExtensionArray for values
+ level : Any
+ The level name or number.
+ fill_value : Any
+ The user-level (not physical storage) fill value to use for
+ missing values introduced by the reshape. Passed to
+ ``series.values.take``.
+
+ Returns
+ -------
+ DataFrame
+ Each column of the DataFrame will have the same dtype as
+ the input Series.
+ """
+ # Implementation note: the basic idea is to
+ # 1. Do a regular unstack on a dummy array of integers
+ # 2. Followup with a columnwise take.
+ # We use the dummy take to discover newly-created missing values
+ # introduced by the reshape.
+ from pandas.core.reshape.concat import concat
+
+ dummy_arr = np.arange(len(series))
+ # fill_value=-1, since we will do a series.values.take later
+ result = _Unstacker(dummy_arr, series.index,
+ level=level, fill_value=-1).get_result()
+
+ out = []
+ values = extract_array(series, extract_numpy=False)
+
+ for col, indices in result.iteritems():
+ out.append(Series(values.take(indices.values,
+ allow_fill=True,
+ fill_value=fill_value),
+ name=col, index=result.index))
+ return concat(out, axis='columns', copy=False, keys=result.columns)
+
+
+def stack(frame, level=-1, dropna=True):
+ """
+ Convert DataFrame to Series with multi-level Index. Columns become the
+    second level of the resulting hierarchical index.
+
+ Returns
+ -------
+ stacked : Series
+ """
+ def factorize(index):
+ if index.is_unique:
+ return index, np.arange(len(index))
+ codes, categories = _factorize_from_iterable(index)
+ return categories, codes
+
+ N, K = frame.shape
+
+ # Will also convert negative level numbers and check if out of bounds.
+ level_num = frame.columns._get_level_number(level)
+
+ if isinstance(frame.columns, MultiIndex):
+ return _stack_multi_columns(frame, level_num=level_num, dropna=dropna)
+ elif isinstance(frame.index, MultiIndex):
+ new_levels = list(frame.index.levels)
+ new_codes = [lab.repeat(K) for lab in frame.index.codes]
+
+ clev, clab = factorize(frame.columns)
+ new_levels.append(clev)
+ new_codes.append(np.tile(clab, N).ravel())
+
+ new_names = list(frame.index.names)
+ new_names.append(frame.columns.name)
+ new_index = MultiIndex(levels=new_levels, codes=new_codes,
+ names=new_names, verify_integrity=False)
+ else:
+ levels, (ilab, clab) = zip(*map(factorize, (frame.index,
+ frame.columns)))
+ codes = ilab.repeat(K), np.tile(clab, N).ravel()
+ new_index = MultiIndex(levels=levels, codes=codes,
+ names=[frame.index.name, frame.columns.name],
+ verify_integrity=False)
+
+ if frame._is_homogeneous_type:
+ # For homogeneous EAs, frame.values will coerce to object. So
+ # we concatenate instead.
+ dtypes = list(frame.dtypes.values)
+ dtype = dtypes[0]
+
+ if is_extension_array_dtype(dtype):
+ arr = dtype.construct_array_type()
+ new_values = arr._concat_same_type([
+ col._values for _, col in frame.iteritems()
+ ])
+ new_values = _reorder_for_extension_array_stack(new_values, N, K)
+ else:
+ # homogeneous, non-EA
+ new_values = frame.values.ravel()
+
+ else:
+ # non-homogeneous
+ new_values = frame.values.ravel()
+
+ if dropna:
+ mask = notna(new_values)
+ new_values = new_values[mask]
+ new_index = new_index[mask]
+
+ return frame._constructor_sliced(new_values, index=new_index)
+
+
+def stack_multiple(frame, level, dropna=True):
+ # If all passed levels match up to column names, no
+ # ambiguity about what to do
+ if all(lev in frame.columns.names for lev in level):
+ result = frame
+ for lev in level:
+ result = stack(result, lev, dropna=dropna)
+
+ # Otherwise, level numbers may change as each successive level is stacked
+ elif all(isinstance(lev, int) for lev in level):
+ # As each stack is done, the level numbers decrease, so we need
+ # to account for that when level is a sequence of ints
+ result = frame
+ # _get_level_number() checks level numbers are in range and converts
+ # negative numbers to positive
+ level = [frame.columns._get_level_number(lev) for lev in level]
+
+ # Can't iterate directly through level as we might need to change
+ # values as we go
+ for index in range(len(level)):
+ lev = level[index]
+ result = stack(result, lev, dropna=dropna)
+ # Decrement all level numbers greater than current, as these
+ # have now shifted down by one
+ updated_level = []
+ for other in level:
+ if other > lev:
+ updated_level.append(other - 1)
+ else:
+ updated_level.append(other)
+ level = updated_level
+
+ else:
+ raise ValueError("level should contain all level names or all level "
+ "numbers, not a mixture of the two.")
+
+ return result
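+
+# Illustrative note (not part of the upstream code): when stacking integer
+# levels such as [0, 2] of a three-level column MultiIndex, stacking level 0
+# shifts the remaining levels down, so the old level 2 must be renumbered to
+# 1 before the second stack; that is what the `updated_level` bookkeeping
+# above implements.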
+
+
+def _stack_multi_columns(frame, level_num=-1, dropna=True):
+ def _convert_level_number(level_num, columns):
+ """
+ Logic for converting the level number to something we can safely pass
+ to swaplevel:
+
+        We generally want to convert the level number into a level name, except
+        when columns do not have names, in which case we must leave it as a
+        level number.
+ """
+ if level_num in columns.names:
+ return columns.names[level_num]
+ else:
+ if columns.names[level_num] is None:
+ return level_num
+ else:
+ return columns.names[level_num]
+
+ this = frame.copy()
+
+ # this makes life much simpler
+ if level_num != frame.columns.nlevels - 1:
+ # roll levels to put selected level at end
+ roll_columns = this.columns
+ for i in range(level_num, frame.columns.nlevels - 1):
+ # Need to check if the ints conflict with level names
+ lev1 = _convert_level_number(i, roll_columns)
+ lev2 = _convert_level_number(i + 1, roll_columns)
+ roll_columns = roll_columns.swaplevel(lev1, lev2)
+ this.columns = roll_columns
+
+ if not this.columns.is_lexsorted():
+ # Workaround the edge case where 0 is one of the column names,
+ # which interferes with trying to sort based on the first
+ # level
+ level_to_sort = _convert_level_number(0, this.columns)
+ this = this.sort_index(level=level_to_sort, axis=1)
+
+ # tuple list excluding level for grouping columns
+ if len(frame.columns.levels) > 2:
+ tuples = list(zip(*[lev.take(level_codes) for lev, level_codes
+ in zip(this.columns.levels[:-1],
+ this.columns.codes[:-1])]))
+ unique_groups = [key for key, _ in itertools.groupby(tuples)]
+ new_names = this.columns.names[:-1]
+ new_columns = MultiIndex.from_tuples(unique_groups, names=new_names)
+ else:
+ new_columns = unique_groups = this.columns.levels[0]
+
+ # time to ravel the values
+ new_data = {}
+ level_vals = this.columns.levels[-1]
+ level_codes = sorted(set(this.columns.codes[-1]))
+ level_vals_used = level_vals[level_codes]
+ levsize = len(level_codes)
+ drop_cols = []
+ for key in unique_groups:
+ try:
+ loc = this.columns.get_loc(key)
+ except KeyError:
+ drop_cols.append(key)
+ continue
+
+        # could this be made more efficient?
+        # get_loc almost always returns a slice, but if the columns are
+        # unsorted it can return a boolean indexer
+ if not isinstance(loc, slice):
+ slice_len = len(loc)
+ else:
+ slice_len = loc.stop - loc.start
+
+ if slice_len != levsize:
+ chunk = this.loc[:, this.columns[loc]]
+ chunk.columns = level_vals.take(chunk.columns.codes[-1])
+ value_slice = chunk.reindex(columns=level_vals_used).values
+ else:
+ if (frame._is_homogeneous_type and
+ is_extension_array_dtype(frame.dtypes.iloc[0])):
+ dtype = this[this.columns[loc]].dtypes.iloc[0]
+ subset = this[this.columns[loc]]
+
+ value_slice = dtype.construct_array_type()._concat_same_type(
+ [x._values for _, x in subset.iteritems()]
+ )
+ N, K = this.shape
+ idx = np.arange(N * K).reshape(K, N).T.ravel()
+ value_slice = value_slice.take(idx)
+
+ elif frame._is_mixed_type:
+ value_slice = this[this.columns[loc]].values
+ else:
+ value_slice = this.values[:, loc]
+
+ if value_slice.ndim > 1:
+ # i.e. not extension
+ value_slice = value_slice.ravel()
+
+ new_data[key] = value_slice
+
+ if len(drop_cols) > 0:
+ new_columns = new_columns.difference(drop_cols)
+
+ N = len(this)
+
+ if isinstance(this.index, MultiIndex):
+ new_levels = list(this.index.levels)
+ new_names = list(this.index.names)
+ new_codes = [lab.repeat(levsize) for lab in this.index.codes]
+ else:
+ new_levels = [this.index]
+ new_codes = [np.arange(N).repeat(levsize)]
+ new_names = [this.index.name] # something better?
+
+ new_levels.append(level_vals)
+ new_codes.append(np.tile(level_codes, N))
+ new_names.append(frame.columns.names[level_num])
+
+ new_index = MultiIndex(levels=new_levels, codes=new_codes,
+ names=new_names, verify_integrity=False)
+
+ result = frame._constructor(new_data, index=new_index, columns=new_columns)
+
+    # Is there a more efficient way to do this? Doing the masking up front
+    # would only save a small amount of time.
+ if dropna:
+ result = result.dropna(axis=0, how='all')
+
+ return result
+
+
+def get_dummies(data, prefix=None, prefix_sep='_', dummy_na=False,
+ columns=None, sparse=False, drop_first=False, dtype=None):
+ """
+ Convert categorical variable into dummy/indicator variables
+
+ Parameters
+ ----------
+ data : array-like, Series, or DataFrame
+ prefix : string, list of strings, or dict of strings, default None
+ String to append DataFrame column names.
+ Pass a list with length equal to the number of columns
+ when calling get_dummies on a DataFrame. Alternatively, `prefix`
+ can be a dictionary mapping column names to prefixes.
+ prefix_sep : string, default '_'
+ If appending prefix, separator/delimiter to use. Or pass a
+ list or dictionary as with `prefix.`
+ dummy_na : bool, default False
+ Add a column to indicate NaNs, if False NaNs are ignored.
+ columns : list-like, default None
+ Column names in the DataFrame to be encoded.
+ If `columns` is None then all the columns with
+ `object` or `category` dtype will be converted.
+ sparse : bool, default False
+        Whether the dummy-encoded columns should be backed by
+ a :class:`SparseArray` (True) or a regular NumPy array (False).
+ drop_first : bool, default False
+ Whether to get k-1 dummies out of k categorical levels by removing the
+ first level.
+
+ .. versionadded:: 0.18.0
+
+ dtype : dtype, default np.uint8
+ Data type for new columns. Only a single dtype is allowed.
+
+ .. versionadded:: 0.23.0
+
+ Returns
+ -------
+ dummies : DataFrame
+
+ See Also
+ --------
+ Series.str.get_dummies
+
+ Examples
+ --------
+ >>> s = pd.Series(list('abca'))
+
+ >>> pd.get_dummies(s)
+ a b c
+ 0 1 0 0
+ 1 0 1 0
+ 2 0 0 1
+ 3 1 0 0
+
+ >>> s1 = ['a', 'b', np.nan]
+
+ >>> pd.get_dummies(s1)
+ a b
+ 0 1 0
+ 1 0 1
+ 2 0 0
+
+ >>> pd.get_dummies(s1, dummy_na=True)
+ a b NaN
+ 0 1 0 0
+ 1 0 1 0
+ 2 0 0 1
+
+ >>> df = pd.DataFrame({'A': ['a', 'b', 'a'], 'B': ['b', 'a', 'c'],
+ ... 'C': [1, 2, 3]})
+
+ >>> pd.get_dummies(df, prefix=['col1', 'col2'])
+ C col1_a col1_b col2_a col2_b col2_c
+ 0 1 1 0 0 1 0
+ 1 2 0 1 1 0 0
+ 2 3 1 0 0 0 1
+
+ >>> pd.get_dummies(pd.Series(list('abcaa')))
+ a b c
+ 0 1 0 0
+ 1 0 1 0
+ 2 0 0 1
+ 3 1 0 0
+ 4 1 0 0
+
+ >>> pd.get_dummies(pd.Series(list('abcaa')), drop_first=True)
+ b c
+ 0 0 0
+ 1 1 0
+ 2 0 1
+ 3 0 0
+ 4 0 0
+
+ >>> pd.get_dummies(pd.Series(list('abc')), dtype=float)
+ a b c
+ 0 1.0 0.0 0.0
+ 1 0.0 1.0 0.0
+ 2 0.0 0.0 1.0
+ """
+ from pandas.core.reshape.concat import concat
+ from itertools import cycle
+
+ dtypes_to_encode = ['object', 'category']
+
+ if isinstance(data, DataFrame):
+ # determine columns being encoded
+ if columns is None:
+ data_to_encode = data.select_dtypes(
+ include=dtypes_to_encode)
+ else:
+ data_to_encode = data[columns]
+
+ # validate prefixes and separator to avoid silently dropping cols
+ def check_len(item, name):
+ len_msg = ("Length of '{name}' ({len_item}) did not match the "
+ "length of the columns being encoded ({len_enc}).")
+
+ if is_list_like(item):
+ if not len(item) == data_to_encode.shape[1]:
+ len_msg = len_msg.format(name=name, len_item=len(item),
+ len_enc=data_to_encode.shape[1])
+ raise ValueError(len_msg)
+
+ check_len(prefix, 'prefix')
+ check_len(prefix_sep, 'prefix_sep')
+
+ if isinstance(prefix, compat.string_types):
+ prefix = cycle([prefix])
+ if isinstance(prefix, dict):
+ prefix = [prefix[col] for col in data_to_encode.columns]
+
+ if prefix is None:
+ prefix = data_to_encode.columns
+
+ # validate separators
+ if isinstance(prefix_sep, compat.string_types):
+ prefix_sep = cycle([prefix_sep])
+ elif isinstance(prefix_sep, dict):
+ prefix_sep = [prefix_sep[col] for col in data_to_encode.columns]
+
+ if data_to_encode.shape == data.shape:
+ # Encoding the entire df, do not prepend any dropped columns
+ with_dummies = []
+ elif columns is not None:
+ # Encoding only cols specified in columns. Get all cols not in
+ # columns to prepend to result.
+ with_dummies = [data.drop(columns, axis=1)]
+ else:
+ # Encoding only object and category dtype columns. Get remaining
+ # columns to prepend to result.
+ with_dummies = [data.select_dtypes(exclude=dtypes_to_encode)]
+
+ for (col, pre, sep) in zip(data_to_encode.iteritems(), prefix,
+ prefix_sep):
+ # col is (column_name, column), use just column data here
+ dummy = _get_dummies_1d(col[1], prefix=pre, prefix_sep=sep,
+ dummy_na=dummy_na, sparse=sparse,
+ drop_first=drop_first, dtype=dtype)
+ with_dummies.append(dummy)
+ result = concat(with_dummies, axis=1)
+ else:
+ result = _get_dummies_1d(data, prefix, prefix_sep, dummy_na,
+ sparse=sparse,
+ drop_first=drop_first,
+ dtype=dtype)
+ return result
+
+
+def _get_dummies_1d(data, prefix, prefix_sep='_', dummy_na=False,
+ sparse=False, drop_first=False, dtype=None):
+ from pandas.core.reshape.concat import concat
+ # Series avoids inconsistent NaN handling
+ codes, levels = _factorize_from_iterable(Series(data))
+
+ if dtype is None:
+ dtype = np.uint8
+ dtype = np.dtype(dtype)
+
+ if is_object_dtype(dtype):
+ raise ValueError("dtype=object is not a valid dtype for get_dummies")
+
+ def get_empty_frame(data):
+ if isinstance(data, Series):
+ index = data.index
+ else:
+ index = np.arange(len(data))
+ return DataFrame(index=index)
+
+ # if all NaN
+ if not dummy_na and len(levels) == 0:
+ return get_empty_frame(data)
+
+ codes = codes.copy()
+ if dummy_na:
+ codes[codes == -1] = len(levels)
+ levels = np.append(levels, np.nan)
+
+ # if dummy_na, we just fake a nan level. drop_first will drop it again
+ if drop_first and len(levels) == 1:
+ return get_empty_frame(data)
+
+ number_of_cols = len(levels)
+
+ if prefix is None:
+ dummy_cols = levels
+ else:
+
+ # PY2 embedded unicode, gh-22084
+ def _make_col_name(prefix, prefix_sep, level):
+ fstr = '{prefix}{prefix_sep}{level}'
+ if PY2 and (isinstance(prefix, text_type) or
+ isinstance(prefix_sep, text_type) or
+ isinstance(level, text_type)):
+ fstr = u(fstr)
+ return fstr.format(prefix=prefix,
+ prefix_sep=prefix_sep,
+ level=level)
+
+ dummy_cols = [_make_col_name(prefix, prefix_sep, level)
+ for level in levels]
+
+ if isinstance(data, Series):
+ index = data.index
+ else:
+ index = None
+
+ if sparse:
+
+ if is_integer_dtype(dtype):
+ fill_value = 0
+ elif dtype == bool:
+ fill_value = False
+ else:
+ fill_value = 0.0
+
+ sparse_series = []
+ N = len(data)
+ sp_indices = [[] for _ in range(len(dummy_cols))]
+ mask = codes != -1
+ codes = codes[mask]
+ n_idx = np.arange(N)[mask]
+
+ for ndx, code in zip(n_idx, codes):
+ sp_indices[code].append(ndx)
+
+ if drop_first:
+ # remove first categorical level to avoid perfect collinearity
+ # GH12042
+ sp_indices = sp_indices[1:]
+ dummy_cols = dummy_cols[1:]
+ for col, ixs in zip(dummy_cols, sp_indices):
+ sarr = SparseArray(np.ones(len(ixs), dtype=dtype),
+ sparse_index=IntIndex(N, ixs),
+ fill_value=fill_value,
+ dtype=dtype)
+ sparse_series.append(Series(data=sarr, index=index, name=col))
+
+ out = concat(sparse_series, axis=1, copy=False)
+ return out
+
+ else:
+ dummy_mat = np.eye(number_of_cols, dtype=dtype).take(codes, axis=0)
+
+ if not dummy_na:
+ # reset NaN GH4446
+ dummy_mat[codes == -1] = 0
+
+ if drop_first:
+ # remove first GH12042
+ dummy_mat = dummy_mat[:, 1:]
+ dummy_cols = dummy_cols[1:]
+ return DataFrame(dummy_mat, index=index, columns=dummy_cols)
+
+
+def make_axis_dummies(frame, axis='minor', transform=None):
+ """
+ Construct 1-0 dummy variables corresponding to designated axis
+ labels
+
+ Parameters
+ ----------
+ frame : DataFrame
+ axis : {'major', 'minor'}, default 'minor'
+ transform : function, default None
+ Function to apply to axis labels first. For example, to
+ get "day of week" dummies in a time series regression
+ you might call::
+
+ make_axis_dummies(panel, axis='major',
+ transform=lambda d: d.weekday())
+
+    Returns
+ -------
+ dummies : DataFrame
+ Column names taken from chosen axis
+ """
+ numbers = {'major': 0, 'minor': 1}
+ num = numbers.get(axis, axis)
+
+ items = frame.index.levels[num]
+ codes = frame.index.codes[num]
+ if transform is not None:
+ mapped_items = items.map(transform)
+ codes, items = _factorize_from_iterable(mapped_items.take(codes))
+
+ values = np.eye(len(items), dtype=float)
+ values = values.take(codes, axis=0)
+
+ return DataFrame(values, columns=items, index=frame.index)
+
+
+def _reorder_for_extension_array_stack(arr, n_rows, n_columns):
+ """
+ Re-orders the values when stacking multiple extension-arrays.
+
+ The indirect stacking method used for EAs requires a followup
+ take to get the order correct.
+
+ Parameters
+ ----------
+ arr : ExtensionArray
+ n_rows, n_columns : int
+ The number of rows and columns in the original DataFrame.
+
+ Returns
+ -------
+ taken : ExtensionArray
+ The original `arr` with elements re-ordered appropriately
+
+ Examples
+ --------
+ >>> arr = np.array(['a', 'b', 'c', 'd', 'e', 'f'])
+ >>> _reorder_for_extension_array_stack(arr, 2, 3)
+ array(['a', 'c', 'e', 'b', 'd', 'f'], dtype='<U1')
+
+ >>> _reorder_for_extension_array_stack(arr, 3, 2)
+ array(['a', 'd', 'b', 'e', 'c', 'f'], dtype='<U1')
+ """
+ # final take to get the order correct.
+ # idx is an indexer like
+ # [c0r0, c1r0, c2r0, ...,
+ # c0r1, c1r1, c2r1, ...]
+ idx = np.arange(n_rows * n_columns).reshape(n_columns, n_rows).T.ravel()
+ return arr.take(idx)
diff --git a/contrib/python/pandas/py2/pandas/core/reshape/tile.py b/contrib/python/pandas/py2/pandas/core/reshape/tile.py
new file mode 100644
index 00000000000..c107ed51226
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/core/reshape/tile.py
@@ -0,0 +1,559 @@
+"""
+Quantilization functions and related stuff
+"""
+from functools import partial
+
+import numpy as np
+
+from pandas._libs.lib import infer_dtype
+
+from pandas.core.dtypes.common import (
+ _NS_DTYPE, ensure_int64, is_categorical_dtype, is_datetime64_dtype,
+ is_datetime64tz_dtype, is_datetime_or_timedelta_dtype, is_integer,
+ is_scalar, is_timedelta64_dtype)
+from pandas.core.dtypes.missing import isna
+
+from pandas import (
+ Categorical, Index, Interval, IntervalIndex, Series, Timedelta, Timestamp,
+ to_datetime, to_timedelta)
+import pandas.core.algorithms as algos
+import pandas.core.nanops as nanops
+
+
+def cut(x, bins, right=True, labels=None, retbins=False, precision=3,
+ include_lowest=False, duplicates='raise'):
+ """
+ Bin values into discrete intervals.
+
+ Use `cut` when you need to segment and sort data values into bins. This
+ function is also useful for going from a continuous variable to a
+ categorical variable. For example, `cut` could convert ages to groups of
+ age ranges. Supports binning into an equal number of bins, or a
+ pre-specified array of bins.
+
+ Parameters
+ ----------
+ x : array-like
+ The input array to be binned. Must be 1-dimensional.
+ bins : int, sequence of scalars, or pandas.IntervalIndex
+ The criteria to bin by.
+
+ * int : Defines the number of equal-width bins in the range of `x`. The
+ range of `x` is extended by .1% on each side to include the minimum
+ and maximum values of `x`.
+ * sequence of scalars : Defines the bin edges allowing for non-uniform
+ width. No extension of the range of `x` is done.
+ * IntervalIndex : Defines the exact bins to be used. Note that
+ IntervalIndex for `bins` must be non-overlapping.
+
+ right : bool, default True
+ Indicates whether `bins` includes the rightmost edge or not. If
+ ``right == True`` (the default), then the `bins` ``[1, 2, 3, 4]``
+ indicate (1,2], (2,3], (3,4]. This argument is ignored when
+ `bins` is an IntervalIndex.
+ labels : array or bool, optional
+ Specifies the labels for the returned bins. Must be the same length as
+ the resulting bins. If False, returns only integer indicators of the
+ bins. This affects the type of the output container (see below).
+ This argument is ignored when `bins` is an IntervalIndex.
+ retbins : bool, default False
+ Whether to return the bins or not. Useful when bins is provided
+ as a scalar.
+ precision : int, default 3
+ The precision at which to store and display the bins labels.
+ include_lowest : bool, default False
+ Whether the first interval should be left-inclusive or not.
+    duplicates : {'raise', 'drop'}, default 'raise'
+ If bin edges are not unique, raise ValueError or drop non-uniques.
+
+ .. versionadded:: 0.23.0
+
+ Returns
+ -------
+ out : pandas.Categorical, Series, or ndarray
+ An array-like object representing the respective bin for each value
+ of `x`. The type depends on the value of `labels`.
+
+ * True (default) : returns a Series for Series `x` or a
+ pandas.Categorical for all other inputs. The values stored within
+ are Interval dtype.
+
+ * sequence of scalars : returns a Series for Series `x` or a
+ pandas.Categorical for all other inputs. The values stored within
+ are whatever the type in the sequence is.
+
+ * False : returns an ndarray of integers.
+
+ bins : numpy.ndarray or IntervalIndex.
+ The computed or specified bins. Only returned when `retbins=True`.
+ For scalar or sequence `bins`, this is an ndarray with the computed
+        bins. If ``duplicates='drop'`` is set, non-unique bin edges are dropped. For
+ an IntervalIndex `bins`, this is equal to `bins`.
+
+ See Also
+ --------
+ qcut : Discretize variable into equal-sized buckets based on rank
+ or based on sample quantiles.
+ pandas.Categorical : Array type for storing data that come from a
+ fixed set of values.
+ Series : One-dimensional array with axis labels (including time series).
+ pandas.IntervalIndex : Immutable Index implementing an ordered,
+ sliceable set.
+
+ Notes
+ -----
+ Any NA values will be NA in the result. Out of bounds values will be NA in
+ the resulting Series or pandas.Categorical object.
+
+ Examples
+ --------
+ Discretize into three equal-sized bins.
+
+ >>> pd.cut(np.array([1, 7, 5, 4, 6, 3]), 3)
+ ... # doctest: +ELLIPSIS
+ [(0.994, 3.0], (5.0, 7.0], (3.0, 5.0], (3.0, 5.0], (5.0, 7.0], ...
+ Categories (3, interval[float64]): [(0.994, 3.0] < (3.0, 5.0] ...
+
+ >>> pd.cut(np.array([1, 7, 5, 4, 6, 3]), 3, retbins=True)
+ ... # doctest: +ELLIPSIS
+ ([(0.994, 3.0], (5.0, 7.0], (3.0, 5.0], (3.0, 5.0], (5.0, 7.0], ...
+ Categories (3, interval[float64]): [(0.994, 3.0] < (3.0, 5.0] ...
+ array([0.994, 3. , 5. , 7. ]))
+
+ Discovers the same bins, but assigns them specific labels. Notice that
+ the returned Categorical's categories are `labels` and are ordered.
+
+ >>> pd.cut(np.array([1, 7, 5, 4, 6, 3]),
+ ... 3, labels=["bad", "medium", "good"])
+ [bad, good, medium, medium, good, bad]
+ Categories (3, object): [bad < medium < good]
+
+ ``labels=False`` returns only integer indicators of the bins.
+
+ >>> pd.cut([0, 1, 1, 2], bins=4, labels=False)
+ array([0, 1, 1, 3])
+
+ Passing a Series as an input returns a Series with categorical dtype:
+
+ >>> s = pd.Series(np.array([2, 4, 6, 8, 10]),
+ ... index=['a', 'b', 'c', 'd', 'e'])
+ >>> pd.cut(s, 3)
+ ... # doctest: +ELLIPSIS
+ a (1.992, 4.667]
+ b (1.992, 4.667]
+ c (4.667, 7.333]
+ d (7.333, 10.0]
+ e (7.333, 10.0]
+ dtype: category
+ Categories (3, interval[float64]): [(1.992, 4.667] < (4.667, ...
+
+ Passing a Series as input with ``labels=False`` returns a Series of
+ integer codes mapping each value to its interval, as defined by `bins`.
+
+ >>> s = pd.Series(np.array([2, 4, 6, 8, 10]),
+ ... index=['a', 'b', 'c', 'd', 'e'])
+ >>> pd.cut(s, [0, 2, 4, 6, 8, 10], labels=False, retbins=True, right=False)
+ ... # doctest: +ELLIPSIS
+ (a 0.0
+ b 1.0
+ c 2.0
+ d 3.0
+ e 4.0
+ dtype: float64, array([0, 2, 4, 6, 8]))
+
+ Use ``duplicates='drop'`` when the bin edges are not unique:
+
+ >>> pd.cut(s, [0, 2, 4, 6, 10, 10], labels=False, retbins=True,
+ ... right=False, duplicates='drop')
+ ... # doctest: +ELLIPSIS
+ (a 0.0
+ b 1.0
+ c 2.0
+ d 3.0
+ e 3.0
+ dtype: float64, array([0, 2, 4, 6, 8]))
+
+ Passing an IntervalIndex for `bins` results in those categories exactly.
+ Notice that values not covered by the IntervalIndex are set to NaN. 0
+ is to the left of the first bin (which is closed on the right), and 1.5
+ falls between two bins.
+
+ >>> bins = pd.IntervalIndex.from_tuples([(0, 1), (2, 3), (4, 5)])
+ >>> pd.cut([0, 0.5, 1.5, 2.5, 4.5], bins)
+ [NaN, (0, 1], NaN, (2, 3], (4, 5]]
+ Categories (3, interval[int64]): [(0, 1] < (2, 3] < (4, 5]]
+ """
+ # NOTE: this binning code is changed a bit from histogram for var(x) == 0
+
+ # for handling the cut for datetime and timedelta objects
+ x_is_series, series_index, name, x = _preprocess_for_cut(x)
+ x, dtype = _coerce_to_type(x)
+
+ if not np.iterable(bins):
+ if is_scalar(bins) and bins < 1:
+ raise ValueError("`bins` should be a positive integer.")
+
+ try: # for array-like
+ sz = x.size
+ except AttributeError:
+ x = np.asarray(x)
+ sz = x.size
+
+ if sz == 0:
+ raise ValueError('Cannot cut empty array')
+
+ rng = (nanops.nanmin(x), nanops.nanmax(x))
+ mn, mx = [mi + 0.0 for mi in rng]
+
+ if np.isinf(mn) or np.isinf(mx):
+ # GH 24314
+ raise ValueError('cannot specify integer `bins` when input data '
+ 'contains infinity')
+ elif mn == mx: # adjust end points before binning
+ mn -= .001 * abs(mn) if mn != 0 else .001
+ mx += .001 * abs(mx) if mx != 0 else .001
+ bins = np.linspace(mn, mx, bins + 1, endpoint=True)
+ else: # adjust end points after binning
+ bins = np.linspace(mn, mx, bins + 1, endpoint=True)
+ adj = (mx - mn) * 0.001 # 0.1% of the range
+ if right:
+ bins[0] -= adj
+ else:
+ bins[-1] += adj
+
+ elif isinstance(bins, IntervalIndex):
+ if bins.is_overlapping:
+ raise ValueError('Overlapping IntervalIndex is not accepted.')
+
+ else:
+ if is_datetime64tz_dtype(bins):
+ bins = np.asarray(bins, dtype=_NS_DTYPE)
+ else:
+ bins = np.asarray(bins)
+ bins = _convert_bin_to_numeric_type(bins, dtype)
+ if (np.diff(bins) < 0).any():
+ raise ValueError('bins must increase monotonically.')
+
+ fac, bins = _bins_to_cuts(x, bins, right=right, labels=labels,
+ precision=precision,
+ include_lowest=include_lowest,
+ dtype=dtype,
+ duplicates=duplicates)
+
+ return _postprocess_for_cut(fac, bins, retbins, x_is_series,
+ series_index, name, dtype)
+
+
+def qcut(x, q, labels=None, retbins=False, precision=3, duplicates='raise'):
+ """
+ Quantile-based discretization function. Discretize variable into
+ equal-sized buckets based on rank or based on sample quantiles. For example,
+ 1000 values for 10 quantiles would produce a Categorical object indicating
+ quantile membership for each data point.
+
+ Parameters
+ ----------
+ x : 1d ndarray or Series
+ q : integer or array of quantiles
+ Number of quantiles. 10 for deciles, 4 for quartiles, etc. Alternately
+ array of quantiles, e.g. [0, .25, .5, .75, 1.] for quartiles
+ labels : array or boolean, default None
+ Used as labels for the resulting bins. Must be of the same length as
+ the resulting bins. If False, return only integer indicators of the
+ bins.
+ retbins : bool, optional
+ Whether to return the (bins, labels) or not. Can be useful if bins
+ is given as a scalar.
+ precision : int, optional
+ The precision at which to store and display the bins labels
+ duplicates : {default 'raise', 'drop'}, optional
+ If bin edges are not unique, raise ValueError or drop non-uniques.
+
+ .. versionadded:: 0.20.0
+
+ Returns
+ -------
+ out : Categorical or Series or array of integers if labels is False
+ The return type (Categorical or Series) depends on the input: a Series
+ of type category if input is a Series else Categorical. Bins are
+ represented as categories when categorical data is returned.
+ bins : ndarray of floats
+ Returned only if `retbins` is True.
+
+ Notes
+ -----
+ Out of bounds values will be NA in the resulting Categorical object.
+
+ Examples
+ --------
+ >>> pd.qcut(range(5), 4)
+ ... # doctest: +ELLIPSIS
+ [(-0.001, 1.0], (-0.001, 1.0], (1.0, 2.0], (2.0, 3.0], (3.0, 4.0]]
+ Categories (4, interval[float64]): [(-0.001, 1.0] < (1.0, 2.0] ...
+
+ >>> pd.qcut(range(5), 3, labels=["good", "medium", "bad"])
+ ... # doctest: +SKIP
+ [good, good, medium, bad, bad]
+ Categories (3, object): [good < medium < bad]
+
+ >>> pd.qcut(range(5), 4, labels=False)
+ array([0, 0, 1, 2, 3])
+ """
+ x_is_series, series_index, name, x = _preprocess_for_cut(x)
+
+ x, dtype = _coerce_to_type(x)
+
+ if is_integer(q):
+ quantiles = np.linspace(0, 1, q + 1)
+ else:
+ quantiles = q
+ bins = algos.quantile(x, quantiles)
+ fac, bins = _bins_to_cuts(x, bins, labels=labels,
+ precision=precision, include_lowest=True,
+ dtype=dtype, duplicates=duplicates)
+
+ return _postprocess_for_cut(fac, bins, retbins, x_is_series,
+ series_index, name, dtype)
+
+
+def _bins_to_cuts(x, bins, right=True, labels=None,
+ precision=3, include_lowest=False,
+ dtype=None, duplicates='raise'):
+
+ if duplicates not in ['raise', 'drop']:
+ raise ValueError("invalid value for 'duplicates' parameter, "
+ "valid options are: raise, drop")
+
+ if isinstance(bins, IntervalIndex):
+ # we have a fast-path here
+ ids = bins.get_indexer(x)
+ result = algos.take_nd(bins, ids)
+ result = Categorical(result, categories=bins, ordered=True)
+ return result, bins
+
+ unique_bins = algos.unique(bins)
+ if len(unique_bins) < len(bins) and len(bins) != 2:
+ if duplicates == 'raise':
+ raise ValueError("Bin edges must be unique: {bins!r}.\nYou "
+ "can drop duplicate edges by setting "
+ "the 'duplicates' kwarg".format(bins=bins))
+ else:
+ bins = unique_bins
+
+ side = 'left' if right else 'right'
+ ids = ensure_int64(bins.searchsorted(x, side=side))
+
+ if include_lowest:
+ ids[x == bins[0]] = 1
+
+ na_mask = isna(x) | (ids == len(bins)) | (ids == 0)
+ has_nas = na_mask.any()
+
+ if labels is not False:
+ if labels is None:
+ labels = _format_labels(bins, precision, right=right,
+ include_lowest=include_lowest,
+ dtype=dtype)
+ else:
+ if len(labels) != len(bins) - 1:
+ raise ValueError('Bin labels must be one fewer than '
+ 'the number of bin edges')
+ if not is_categorical_dtype(labels):
+ labels = Categorical(labels, categories=labels, ordered=True)
+
+ np.putmask(ids, na_mask, 0)
+ result = algos.take_nd(labels, ids - 1)
+
+ else:
+ result = ids - 1
+ if has_nas:
+ result = result.astype(np.float64)
+ np.putmask(result, na_mask, np.nan)
+
+ return result, bins
+
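A small NumPy sketch of the ``searchsorted`` bookkeeping above (assuming the default ``right=True``, i.e. right-closed bins): an id of 0 or ``len(bins)`` marks an out-of-range value, and ``include_lowest`` folds ``x == bins[0]`` into the first bin.

    import numpy as np

    bins = np.array([0, 5, 10])
    x = np.array([-1, 0, 3, 5, 10, 12])

    ids = bins.searchsorted(x, side='left')    # right-closed bins -> side='left'
    # ids == [0, 0, 1, 1, 2, 3]
    na_mask = (ids == 0) | (ids == len(bins))  # -1, 0 and 12 fall outside the edges
    # with include_lowest=True, x == bins[0] would instead get id 1:
    # ids[x == bins[0]] = 1
    codes = ids - 1                            # bin number per value, -1 where masked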
+
+def _trim_zeros(x):
+ while len(x) > 1 and x[-1] == '0':
+ x = x[:-1]
+ if len(x) > 1 and x[-1] == '.':
+ x = x[:-1]
+ return x
+
+
+def _coerce_to_type(x):
+ """
+ If the passed data is of datetime/timedelta type,
+ this method converts it to numeric so that the cut
+ method can handle it.
+ """
+ dtype = None
+
+ if is_datetime64tz_dtype(x):
+ dtype = x.dtype
+ elif is_datetime64_dtype(x):
+ x = to_datetime(x)
+ dtype = np.dtype('datetime64[ns]')
+ elif is_timedelta64_dtype(x):
+ x = to_timedelta(x)
+ dtype = np.dtype('timedelta64[ns]')
+
+ if dtype is not None:
+ # GH 19768: force NaT to NaN during integer conversion
+ x = np.where(x.notna(), x.view(np.int64), np.nan)
+
+ return x, dtype
+
+
+def _convert_bin_to_numeric_type(bins, dtype):
+ """
+ If the passed bins are of datetime/timedelta type,
+ this method converts them to integer.
+
+ Parameters
+ ----------
+ bins : list-like of bins
+ dtype : dtype of data
+
+ Raises
+ ------
+ ValueError if the bins are not compatible with the dtype of the data.
+ """
+ bins_dtype = infer_dtype(bins, skipna=False)
+ if is_timedelta64_dtype(dtype):
+ if bins_dtype in ['timedelta', 'timedelta64']:
+ bins = to_timedelta(bins).view(np.int64)
+ else:
+ raise ValueError("bins must be of timedelta64 dtype")
+ elif is_datetime64_dtype(dtype) or is_datetime64tz_dtype(dtype):
+ if bins_dtype in ['datetime', 'datetime64']:
+ bins = to_datetime(bins).view(np.int64)
+ else:
+ raise ValueError("bins must be of datetime64 dtype")
+
+ return bins
+
+
+def _convert_bin_to_datelike_type(bins, dtype):
+ """
+ Convert bins to a DatetimeIndex or TimedeltaIndex if the original dtype is
+ datelike
+
+ Parameters
+ ----------
+ bins : list-like of bins
+ dtype : dtype of data
+
+ Returns
+ -------
+ bins : Array-like of bins, DatetimeIndex or TimedeltaIndex if dtype is
+ datelike
+ """
+ if is_datetime64tz_dtype(dtype):
+ bins = to_datetime(bins.astype(np.int64),
+ utc=True).tz_convert(dtype.tz)
+ elif is_datetime_or_timedelta_dtype(dtype):
+ bins = Index(bins.astype(np.int64), dtype=dtype)
+ return bins
+
+
+def _format_labels(bins, precision, right=True,
+ include_lowest=False, dtype=None):
+ """ based on the dtype, return our labels """
+
+ closed = 'right' if right else 'left'
+
+ if is_datetime64tz_dtype(dtype):
+ formatter = partial(Timestamp, tz=dtype.tz)
+ adjust = lambda x: x - Timedelta('1ns')
+ elif is_datetime64_dtype(dtype):
+ formatter = Timestamp
+ adjust = lambda x: x - Timedelta('1ns')
+ elif is_timedelta64_dtype(dtype):
+ formatter = Timedelta
+ adjust = lambda x: x - Timedelta('1ns')
+ else:
+ precision = _infer_precision(precision, bins)
+ formatter = lambda x: _round_frac(x, precision)
+ adjust = lambda x: x - 10 ** (-precision)
+
+ breaks = [formatter(b) for b in bins]
+ labels = IntervalIndex.from_breaks(breaks, closed=closed)
+
+ if right and include_lowest:
+ # we will adjust the left hand side by precision to
+ # account that we are all right closed
+ v = adjust(labels[0].left)
+
+ i = IntervalIndex([Interval(v, labels[0].right, closed='right')])
+ labels = i.append(labels[1:])
+
+ return labels
+
+
+def _preprocess_for_cut(x):
+ """
+ handles preprocessing for cut where we convert passed
+ input to array, strip the index information and store it
+ separately
+ """
+ x_is_series = isinstance(x, Series)
+ series_index = None
+ name = None
+
+ if x_is_series:
+ series_index = x.index
+ name = x.name
+
+ # Check that the passed array is a Pandas or Numpy object
+ # We don't want to strip away a Pandas data-type here (e.g. datetimetz)
+ ndim = getattr(x, 'ndim', None)
+ if ndim is None:
+ x = np.asarray(x)
+ if x.ndim != 1:
+ raise ValueError("Input array must be 1 dimensional")
+
+ return x_is_series, series_index, name, x
+
+
+def _postprocess_for_cut(fac, bins, retbins, x_is_series,
+ series_index, name, dtype):
+ """
+ handles post processing for the cut method where
+ we combine the index information if the originally passed
+ datatype was a series
+ """
+ if x_is_series:
+ fac = Series(fac, index=series_index, name=name)
+
+ if not retbins:
+ return fac
+
+ bins = _convert_bin_to_datelike_type(bins, dtype)
+
+ return fac, bins
+
+
+def _round_frac(x, precision):
+ """
+ Round the fractional part of the given number
+ """
+ if not np.isfinite(x) or x == 0:
+ return x
+ else:
+ frac, whole = np.modf(x)
+ if whole == 0:
+ digits = -int(np.floor(np.log10(abs(frac)))) - 1 + precision
+ else:
+ digits = precision
+ return np.around(x, digits)
+
+
+def _infer_precision(base_precision, bins):
+ """Infer an appropriate precision for _round_frac
+ """
+ for precision in range(base_precision, 20):
+ levels = [_round_frac(b, precision) for b in bins]
+ if algos.unique(levels).size == bins.size:
+ return precision
+ return base_precision # default
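Taken together, a short usage sketch of the edge cases handled above (assuming a pandas release of this vintage): with an integer ``bins`` the left edge is nudged just below ``min(x)`` (by 0.1% of the range) so the minimum is included, and repeated edges are only accepted with ``duplicates='drop'``.

    import numpy as np
    import pandas as pd

    # Integer `bins`: the range of x is widened so min(x) falls in the first bin.
    cats, edges = pd.cut(np.array([1, 7, 5, 4, 6, 3]), 3, retbins=True)
    # edges -> array([0.994, 3., 5., 7.]); the first edge sits just below 1

    # Explicit edges must be unique unless duplicates='drop'.
    pd.cut([1, 3, 5], bins=[0, 2, 2, 6], duplicates='drop')
    # behaves like bins=[0, 2, 6]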
diff --git a/contrib/python/pandas/py2/pandas/core/reshape/util.py b/contrib/python/pandas/py2/pandas/core/reshape/util.py
new file mode 100644
index 00000000000..9d4135a7f31
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/core/reshape/util.py
@@ -0,0 +1,57 @@
+import numpy as np
+
+from pandas.core.dtypes.common import is_list_like
+
+from pandas.core import common as com
+
+
+def cartesian_product(X):
+ """
+ Numpy version of itertools.product or pandas.compat.product.
+ Sometimes faster (for large inputs)...
+
+ Parameters
+ ----------
+ X : list-like of list-likes
+
+ Returns
+ -------
+ product : list of ndarrays
+
+ Examples
+ --------
+ >>> cartesian_product([list('ABC'), [1, 2]])
+ [array(['A', 'A', 'B', 'B', 'C', 'C'], dtype='|S1'),
+ array([1, 2, 1, 2, 1, 2])]
+
+ See Also
+ --------
+ itertools.product : Cartesian product of input iterables. Equivalent to
+ nested for-loops.
+ pandas.compat.product : An alias for itertools.product.
+ """
+ msg = "Input must be a list-like of list-likes"
+ if not is_list_like(X):
+ raise TypeError(msg)
+ for x in X:
+ if not is_list_like(x):
+ raise TypeError(msg)
+
+ if len(X) == 0:
+ return []
+
+ lenX = np.fromiter((len(x) for x in X), dtype=np.intp)
+ cumprodX = np.cumproduct(lenX)
+
+ a = np.roll(cumprodX, 1)
+ a[0] = 1
+
+ if cumprodX[-1] != 0:
+ b = cumprodX[-1] / cumprodX
+ else:
+ # if any factor is empty, the cartesian product is empty
+ b = np.zeros_like(cumprodX)
+
+ return [np.tile(np.repeat(np.asarray(com.values_from_object(x)), b[i]),
+ np.product(a[i]))
+ for i, x in enumerate(X)]
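As a rough illustration of the repeat/tile arithmetic in ``cartesian_product`` (a plain-NumPy sketch, not the vendored helper itself): each factor is repeated by the product of the lengths to its right and tiled by the product of the lengths to its left.

    import numpy as np

    X = [np.array(['A', 'B', 'C']), np.array([1, 2])]
    lens = np.array([len(x) for x in X])   # [3, 2]
    cum = np.cumprod(lens)                 # [3, 6]
    a = np.roll(cum, 1)
    a[0] = 1                               # tiles per factor:   [1, 3]
    b = cum[-1] // cum                     # repeats per factor: [2, 1]

    out = [np.tile(np.repeat(x, b[i]), a[i]) for i, x in enumerate(X)]
    # out[0] -> ['A' 'A' 'B' 'B' 'C' 'C'], out[1] -> [1 2 1 2 1 2]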
diff --git a/contrib/python/pandas/py2/pandas/core/series.py b/contrib/python/pandas/py2/pandas/core/series.py
new file mode 100644
index 00000000000..3ed4e2e12ed
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/core/series.py
@@ -0,0 +1,4394 @@
+"""
+Data structure for 1-dimensional cross-sectional and time series data
+"""
+from __future__ import division
+
+from textwrap import dedent
+import warnings
+
+import numpy as np
+
+from pandas._libs import iNaT, index as libindex, lib, tslibs
+import pandas.compat as compat
+from pandas.compat import PY36, OrderedDict, StringIO, u, zip
+from pandas.compat.numpy import function as nv
+from pandas.util._decorators import Appender, Substitution, deprecate
+from pandas.util._validators import validate_bool_kwarg
+
+from pandas.core.dtypes.common import (
+ _is_unorderable_exception, ensure_platform_int, is_bool,
+ is_categorical_dtype, is_datetime64_dtype, is_datetimelike, is_dict_like,
+ is_extension_array_dtype, is_extension_type, is_hashable, is_integer,
+ is_iterator, is_list_like, is_scalar, is_string_like, is_timedelta64_dtype)
+from pandas.core.dtypes.generic import (
+ ABCDataFrame, ABCDatetimeArray, ABCDatetimeIndex, ABCSeries,
+ ABCSparseArray, ABCSparseSeries)
+from pandas.core.dtypes.missing import (
+ isna, na_value_for_dtype, notna, remove_na_arraylike)
+
+from pandas.core import algorithms, base, generic, nanops, ops
+from pandas.core.accessor import CachedAccessor
+from pandas.core.arrays import ExtensionArray, SparseArray
+from pandas.core.arrays.categorical import Categorical, CategoricalAccessor
+from pandas.core.arrays.sparse import SparseAccessor
+import pandas.core.common as com
+from pandas.core.config import get_option
+from pandas.core.index import (
+ Float64Index, Index, InvalidIndexError, MultiIndex, ensure_index)
+from pandas.core.indexes.accessors import CombinedDatetimelikeProperties
+import pandas.core.indexes.base as ibase
+from pandas.core.indexes.datetimes import DatetimeIndex
+from pandas.core.indexes.period import PeriodIndex
+from pandas.core.indexes.timedeltas import TimedeltaIndex
+from pandas.core.indexing import check_bool_indexer, maybe_convert_indices
+from pandas.core.internals import SingleBlockManager
+from pandas.core.internals.construction import sanitize_array
+from pandas.core.strings import StringMethods
+from pandas.core.tools.datetimes import to_datetime
+
+import pandas.io.formats.format as fmt
+from pandas.io.formats.terminal import get_terminal_size
+import pandas.plotting._core as gfx
+
+# pylint: disable=E1101,E1103
+# pylint: disable=W0703,W0622,W0613,W0201
+
+
+__all__ = ['Series']
+
+_shared_doc_kwargs = dict(
+ axes='index', klass='Series', axes_single_arg="{0 or 'index'}",
+ axis="""axis : {0 or 'index'}
+ Parameter needed for compatibility with DataFrame.""",
+ inplace="""inplace : boolean, default False
+ If True, performs operation inplace and returns None.""",
+ unique='np.ndarray', duplicated='Series',
+ optional_by='', optional_mapper='', optional_labels='', optional_axis='',
+ versionadded_to_excel='\n .. versionadded:: 0.20.0\n')
+
+
+# see gh-16971
+def remove_na(arr):
+ """
+ Remove null values from an array-like structure.
+
+ .. deprecated:: 0.21.0
+ Use s[s.notnull()] instead.
+ """
+
+ warnings.warn("remove_na is deprecated and is a private "
+ "function. Do not use.", FutureWarning, stacklevel=2)
+ return remove_na_arraylike(arr)
+
+
+def _coerce_method(converter):
+ """
+ Install the scalar coercion methods.
+ """
+
+ def wrapper(self):
+ if len(self) == 1:
+ return converter(self.iloc[0])
+ raise TypeError("cannot convert the series to "
+ "{0}".format(str(converter)))
+
+ wrapper.__name__ = "__{name}__".format(name=converter.__name__)
+ return wrapper
+
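For reference, a small sketch of what these coercion methods permit (assuming standard pandas behaviour): only a length-1 Series converts to a Python scalar, anything longer raises.

    import pandas as pd

    float(pd.Series([2.5]))     # 2.5
    int(pd.Series([7]))         # 7
    # float(pd.Series([1, 2]))  # raises TypeError -- more than one element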
+# ----------------------------------------------------------------------
+# Series class
+
+
+class Series(base.IndexOpsMixin, generic.NDFrame):
+ """
+ One-dimensional ndarray with axis labels (including time series).
+
+ Labels need not be unique but must be a hashable type. The object
+ supports both integer- and label-based indexing and provides a host of
+ methods for performing operations involving the index. Statistical
+ methods from ndarray have been overridden to automatically exclude
+ missing data (currently represented as NaN).
+
+ Operations between Series (+, -, /, *, **) align values based on their
+ associated index values; they need not be the same length. The result
+ index will be the sorted union of the two indexes.
+
+ Parameters
+ ----------
+ data : array-like, Iterable, dict, or scalar value
+ Contains data stored in Series.
+
+ .. versionchanged:: 0.23.0
+ If data is a dict, argument order is maintained for Python 3.6
+ and later.
+
+ index : array-like or Index (1d)
+ Values must be hashable and have the same length as `data`.
+ Non-unique index values are allowed. Will default to
+ RangeIndex (0, 1, 2, ..., n) if not provided. If both a dict and index
+ sequence are used, the index will override the keys found in the
+ dict.
+ dtype : str, numpy.dtype, or ExtensionDtype, optional
+ dtype for the output Series. If not specified, this will be
+ inferred from `data`.
+ See the :ref:`user guide <basics.dtypes>` for more usages.
+ copy : bool, default False
+ Copy input data.
+ """
+ _metadata = ['name']
+ _accessors = {'dt', 'cat', 'str', 'sparse'}
+ # tolist is not actually deprecated, just suppressed in the __dir__
+ _deprecations = generic.NDFrame._deprecations | frozenset(
+ ['asobject', 'reshape', 'get_value', 'set_value',
+ 'from_csv', 'valid', 'tolist'])
+
+ # Override cache_readonly bc Series is mutable
+ hasnans = property(base.IndexOpsMixin.hasnans.func,
+ doc=base.IndexOpsMixin.hasnans.__doc__)
+
+ # ----------------------------------------------------------------------
+ # Constructors
+
+ def __init__(self, data=None, index=None, dtype=None, name=None,
+ copy=False, fastpath=False):
+
+ # we are called internally, so short-circuit
+ if fastpath:
+
+ # data is an ndarray, index is defined
+ if not isinstance(data, SingleBlockManager):
+ data = SingleBlockManager(data, index, fastpath=True)
+ if copy:
+ data = data.copy()
+ if index is None:
+ index = data.index
+
+ else:
+
+ if index is not None:
+ index = ensure_index(index)
+
+ if data is None:
+ data = {}
+ if dtype is not None:
+ dtype = self._validate_dtype(dtype)
+
+ if isinstance(data, MultiIndex):
+ raise NotImplementedError("initializing a Series from a "
+ "MultiIndex is not supported")
+ elif isinstance(data, Index):
+ if name is None:
+ name = data.name
+
+ if dtype is not None:
+ # astype copies
+ data = data.astype(dtype)
+ else:
+ # need to copy to avoid aliasing issues
+ data = data._values.copy()
+ if (isinstance(data, ABCDatetimeIndex) and
+ data.tz is not None):
+ # GH#24096 need copy to be deep for datetime64tz case
+ # TODO: See if we can avoid these copies
+ data = data._values.copy(deep=True)
+ copy = False
+
+ elif isinstance(data, np.ndarray):
+ pass
+ elif isinstance(data, (ABCSeries, ABCSparseSeries)):
+ if name is None:
+ name = data.name
+ if index is None:
+ index = data.index
+ else:
+ data = data.reindex(index, copy=copy)
+ data = data._data
+ elif isinstance(data, dict):
+ data, index = self._init_dict(data, index, dtype)
+ dtype = None
+ copy = False
+ elif isinstance(data, SingleBlockManager):
+ if index is None:
+ index = data.index
+ elif not data.index.equals(index) or copy:
+ # GH#19275 SingleBlockManager input should only be called
+ # internally
+ raise AssertionError('Cannot pass both SingleBlockManager '
+ '`data` argument and a different '
+ '`index` argument. `copy` must '
+ 'be False.')
+
+ elif is_extension_array_dtype(data):
+ pass
+ elif isinstance(data, (set, frozenset)):
+ raise TypeError("{0!r} type is unordered"
+ "".format(data.__class__.__name__))
+ # If data is Iterable but not list-like, consume into list.
+ elif (isinstance(data, compat.Iterable)
+ and not isinstance(data, compat.Sized)):
+ data = list(data)
+ else:
+
+ # handle sparse passed here (and force conversion)
+ if isinstance(data, ABCSparseArray):
+ data = data.to_dense()
+
+ if index is None:
+ if not is_list_like(data):
+ data = [data]
+ index = ibase.default_index(len(data))
+ elif is_list_like(data):
+
+ # a scalar numpy array is list-like but doesn't
+ # have a proper length
+ try:
+ if len(index) != len(data):
+ raise ValueError(
+ 'Length of passed values is {val}, '
+ 'index implies {ind}'
+ .format(val=len(data), ind=len(index)))
+ except TypeError:
+ pass
+
+ # create/copy the manager
+ if isinstance(data, SingleBlockManager):
+ if dtype is not None:
+ data = data.astype(dtype=dtype, errors='ignore',
+ copy=copy)
+ elif copy:
+ data = data.copy()
+ else:
+ data = sanitize_array(data, index, dtype, copy,
+ raise_cast_failure=True)
+
+ data = SingleBlockManager(data, index, fastpath=True)
+
+ generic.NDFrame.__init__(self, data, fastpath=True)
+
+ self.name = name
+ self._set_axis(0, index, fastpath=True)
+
+ def _init_dict(self, data, index=None, dtype=None):
+ """
+ Derive the "_data" and "index" attributes of a new Series from a
+ dictionary input.
+
+ Parameters
+ ----------
+ data : dict or dict-like
+ Data used to populate the new Series
+ index : Index or index-like, default None
+ index for the new Series: if None, use dict keys
+ dtype : dtype, default None
+ dtype for the new Series: if None, infer from data
+
+ Returns
+ -------
+ _data : BlockManager for the new Series
+ index : index for the new Series
+ """
+ # Looking for NaN in dict doesn't work ({np.nan : 1}[float('nan')]
+ # raises KeyError), so we iterate the entire dict, and align
+ if data:
+ keys, values = zip(*compat.iteritems(data))
+ values = list(values)
+ elif index is not None:
+ # fastpath for Series(data=None). Just broadcast a scalar
+ # instead of reindexing.
+ values = na_value_for_dtype(dtype)
+ keys = index
+ else:
+ keys, values = [], []
+
+ # Input is now list-like, so rely on "standard" construction:
+ s = Series(values, index=keys, dtype=dtype)
+
+ # Now we just make sure the order is respected, if any
+ if data and index is not None:
+ s = s.reindex(index, copy=False)
+ elif not PY36 and not isinstance(data, OrderedDict) and data:
+ # Need the `and data` to avoid sorting Series(None, index=[...])
+ # since that isn't really dict-like
+ try:
+ s = s.sort_index()
+ except TypeError:
+ pass
+ return s._data, s.index
+
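A brief sketch of the dict path above (assuming this pandas vintage): dict keys become the index, an explicit ``index`` argument wins, and keys missing from the dict become NaN.

    import pandas as pd

    pd.Series({'b': 2, 'a': 1})
    # a    1
    # b    2    (keys are sorted on Python < 3.6 unless an OrderedDict is passed)

    pd.Series({'b': 2, 'a': 1}, index=['a', 'c'])
    # a    1.0
    # c    NaN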
+ @classmethod
+ def from_array(cls, arr, index=None, name=None, dtype=None, copy=False,
+ fastpath=False):
+ """
+ Construct Series from array.
+
+ .. deprecated:: 0.23.0
+ Use pd.Series(..) constructor instead.
+ """
+ warnings.warn("'from_array' is deprecated and will be removed in a "
+ "future version. Please use the pd.Series(..) "
+ "constructor instead.", FutureWarning, stacklevel=2)
+ if isinstance(arr, ABCSparseArray):
+ from pandas.core.sparse.series import SparseSeries
+ cls = SparseSeries
+ return cls(arr, index=index, name=name, dtype=dtype,
+ copy=copy, fastpath=fastpath)
+
+ # ----------------------------------------------------------------------
+
+ @property
+ def _constructor(self):
+ return Series
+
+ @property
+ def _constructor_expanddim(self):
+ from pandas.core.frame import DataFrame
+ return DataFrame
+
+ # types
+ @property
+ def _can_hold_na(self):
+ return self._data._can_hold_na
+
+ _index = None
+
+ def _set_axis(self, axis, labels, fastpath=False):
+ """
+ Override generic, we want to set the _typ here.
+ """
+
+ if not fastpath:
+ labels = ensure_index(labels)
+
+ is_all_dates = labels.is_all_dates
+ if is_all_dates:
+ if not isinstance(labels,
+ (DatetimeIndex, PeriodIndex, TimedeltaIndex)):
+ try:
+ labels = DatetimeIndex(labels)
+ # need to set here because we changed the index
+ if fastpath:
+ self._data.set_axis(axis, labels)
+ except (tslibs.OutOfBoundsDatetime, ValueError):
+ # labels may exceed datetime bounds,
+ # or not be a DatetimeIndex
+ pass
+
+ self._set_subtyp(is_all_dates)
+
+ object.__setattr__(self, '_index', labels)
+ if not fastpath:
+ self._data.set_axis(axis, labels)
+
+ def _set_subtyp(self, is_all_dates):
+ if is_all_dates:
+ object.__setattr__(self, '_subtyp', 'time_series')
+ else:
+ object.__setattr__(self, '_subtyp', 'series')
+
+ def _update_inplace(self, result, **kwargs):
+ # we want to call the generic version and not the IndexOpsMixin
+ return generic.NDFrame._update_inplace(self, result, **kwargs)
+
+ @property
+ def name(self):
+ """
+ Return name of the Series.
+ """
+ return self._name
+
+ @name.setter
+ def name(self, value):
+ if value is not None and not is_hashable(value):
+ raise TypeError('Series.name must be a hashable type')
+ object.__setattr__(self, '_name', value)
+
+ # ndarray compatibility
+ @property
+ def dtype(self):
+ """
+ Return the dtype object of the underlying data.
+ """
+ return self._data.dtype
+
+ @property
+ def dtypes(self):
+ """
+ Return the dtype object of the underlying data.
+ """
+ return self._data.dtype
+
+ @property
+ def ftype(self):
+ """
+ Return if the data is sparse|dense.
+ """
+ return self._data.ftype
+
+ @property
+ def ftypes(self):
+ """
+ Return if the data is sparse|dense.
+ """
+ return self._data.ftype
+
+ @property
+ def values(self):
+ """
+ Return Series as ndarray or ndarray-like depending on the dtype.
+
+ .. warning::
+
+ We recommend using :attr:`Series.array` or
+ :meth:`Series.to_numpy`, depending on whether you need
+ a reference to the underlying data or a NumPy array.
+
+ Returns
+ -------
+ arr : numpy.ndarray or ndarray-like
+
+ See Also
+ --------
+ Series.array : Reference to the underlying data.
+ Series.to_numpy : A NumPy array representing the underlying data.
+
+ Examples
+ --------
+ >>> pd.Series([1, 2, 3]).values
+ array([1, 2, 3])
+
+ >>> pd.Series(list('aabc')).values
+ array(['a', 'a', 'b', 'c'], dtype=object)
+
+ >>> pd.Series(list('aabc')).astype('category').values
+ [a, a, b, c]
+ Categories (3, object): [a, b, c]
+
+ Timezone aware datetime data is converted to UTC:
+
+ >>> pd.Series(pd.date_range('20130101', periods=3,
+ ... tz='US/Eastern')).values
+ array(['2013-01-01T05:00:00.000000000',
+ '2013-01-02T05:00:00.000000000',
+ '2013-01-03T05:00:00.000000000'], dtype='datetime64[ns]')
+ """
+ return self._data.external_values()
+
+ @property
+ def _values(self):
+ """
+ Return the internal repr of this data.
+ """
+ return self._data.internal_values()
+
+ def _formatting_values(self):
+ """
+ Return the values that can be formatted (used by SeriesFormatter
+ and DataFrameFormatter).
+ """
+ return self._data.formatting_values()
+
+ def get_values(self):
+ """
+ Same as values (but handles sparseness conversions); is a view.
+ """
+ return self._data.get_values()
+
+ @property
+ def asobject(self):
+ """
+ Return object Series which contains boxed values.
+
+ .. deprecated:: 0.23.0
+
+ Use ``astype(object)`` instead.
+
+ *this is an internal non-public method*
+ """
+ warnings.warn("'asobject' is deprecated. Use 'astype(object)'"
+ " instead", FutureWarning, stacklevel=2)
+ return self.astype(object).values
+
+ # ops
+ def ravel(self, order='C'):
+ """
+ Return the flattened underlying data as an ndarray.
+
+ See Also
+ --------
+ numpy.ndarray.ravel
+ """
+ return self._values.ravel(order=order)
+
+ def compress(self, condition, *args, **kwargs):
+ """
+ Return selected slices of an array along given axis as a Series.
+
+ .. deprecated:: 0.24.0
+
+ See Also
+ --------
+ numpy.ndarray.compress
+ """
+ msg = ("Series.compress(condition) is deprecated. "
+ "Use 'Series[condition]' or "
+ "'np.asarray(series).compress(condition)' instead.")
+ warnings.warn(msg, FutureWarning, stacklevel=2)
+ nv.validate_compress(args, kwargs)
+ return self[condition]
+
+ def nonzero(self):
+ """
+ Return the *integer* indices of the elements that are non-zero.
+
+ .. deprecated:: 0.24.0
+ Please use .to_numpy().nonzero() as a replacement.
+
+ This method is equivalent to calling `numpy.nonzero` on the
+ series data. For compatibility with NumPy, the return value is
+ the same (a tuple with an array of indices for each dimension),
+ but it will always be a one-item tuple because series only have
+ one dimension.
+
+ See Also
+ --------
+ numpy.nonzero
+
+ Examples
+ --------
+ >>> s = pd.Series([0, 3, 0, 4])
+ >>> s.nonzero()
+ (array([1, 3]),)
+ >>> s.iloc[s.nonzero()[0]]
+ 1 3
+ 3 4
+ dtype: int64
+
+ >>> s = pd.Series([0, 3, 0, 4], index=['a', 'b', 'c', 'd'])
+ # same return although index of s is different
+ >>> s.nonzero()
+ (array([1, 3]),)
+ >>> s.iloc[s.nonzero()[0]]
+ b 3
+ d 4
+ dtype: int64
+ """
+ msg = ("Series.nonzero() is deprecated "
+ "and will be removed in a future version."
+ "Use Series.to_numpy().nonzero() instead")
+ warnings.warn(msg, FutureWarning, stacklevel=2)
+ return self._values.nonzero()
+
+ def put(self, *args, **kwargs):
+ """
+ Applies the `put` method to its `values` attribute if it has one.
+
+ See Also
+ --------
+ numpy.ndarray.put
+ """
+ self._values.put(*args, **kwargs)
+
+ def __len__(self):
+ """
+ Return the length of the Series.
+ """
+ return len(self._data)
+
+ def view(self, dtype=None):
+ """
+ Create a new view of the Series.
+
+ This function will return a new Series with a view of the same
+ underlying values in memory, optionally reinterpreted with a new data
+ type. The new data type must preserve the same size in bytes as to not
+ cause index misalignment.
+
+ Parameters
+ ----------
+ dtype : data type
+ Data type object or one of their string representations.
+
+ Returns
+ -------
+ Series
+ A new Series object as a view of the same data in memory.
+
+ See Also
+ --------
+ numpy.ndarray.view : Equivalent numpy function to create a new view of
+ the same data in memory.
+
+ Notes
+ -----
+ Series are instantiated with ``dtype=float64`` by default. While
+ ``numpy.ndarray.view()`` will return a view with the same data type as
+ the original array, ``Series.view()`` (without specified dtype)
+ will try using ``float64`` and may fail if the original data type size
+ in bytes is not the same.
+
+ Examples
+ --------
+ >>> s = pd.Series([-2, -1, 0, 1, 2], dtype='int8')
+ >>> s
+ 0 -2
+ 1 -1
+ 2 0
+ 3 1
+ 4 2
+ dtype: int8
+
+ The 8 bit signed integer representation of `-1` is `0b11111111`, but
+ the same bytes represent 255 if read as an 8 bit unsigned integer:
+
+ >>> us = s.view('uint8')
+ >>> us
+ 0 254
+ 1 255
+ 2 0
+ 3 1
+ 4 2
+ dtype: uint8
+
+ The views share the same underlying values:
+
+ >>> us[0] = 128
+ >>> s
+ 0 -128
+ 1 -1
+ 2 0
+ 3 1
+ 4 2
+ dtype: int8
+ """
+ return self._constructor(self._values.view(dtype),
+ index=self.index).__finalize__(self)
+
+ # ----------------------------------------------------------------------
+ # NDArray Compat
+
+ def __array__(self, dtype=None):
+ """
+ Return the values as a NumPy array.
+
+ Users should not call this directly. Rather, it is invoked by
+ :func:`numpy.array` and :func:`numpy.asarray`.
+
+ Parameters
+ ----------
+ dtype : str or numpy.dtype, optional
+ The dtype to use for the resulting NumPy array. By default,
+ the dtype is inferred from the data.
+
+ Returns
+ -------
+ numpy.ndarray
+ The values in the series converted to a :class:`numpy.ndarray`
+ with the specified `dtype`.
+
+ See Also
+ --------
+ pandas.array : Create a new array from data.
+ Series.array : Zero-copy view to the array backing the Series.
+ Series.to_numpy : Series method for similar behavior.
+
+ Examples
+ --------
+ >>> ser = pd.Series([1, 2, 3])
+ >>> np.asarray(ser)
+ array([1, 2, 3])
+
+ For timezone-aware data, the timezones may be retained with
+ ``dtype='object'``
+
+ >>> tzser = pd.Series(pd.date_range('2000', periods=2, tz="CET"))
+ >>> np.asarray(tzser, dtype="object")
+ array([Timestamp('2000-01-01 00:00:00+0100', tz='CET', freq='D'),
+ Timestamp('2000-01-02 00:00:00+0100', tz='CET', freq='D')],
+ dtype=object)
+
+ Or the values may be localized to UTC and the tzinfo discarded with
+ ``dtype='datetime64[ns]'``
+
+ >>> np.asarray(tzser, dtype="datetime64[ns]") # doctest: +ELLIPSIS
+ array(['1999-12-31T23:00:00.000000000', ...],
+ dtype='datetime64[ns]')
+ """
+ if (dtype is None and isinstance(self.array, ABCDatetimeArray)
+ and getattr(self.dtype, 'tz', None)):
+ msg = (
+ "Converting timezone-aware DatetimeArray to timezone-naive "
+ "ndarray with 'datetime64[ns]' dtype. In the future, this "
+ "will return an ndarray with 'object' dtype where each "
+ "element is a 'pandas.Timestamp' with the correct 'tz'.\n\t"
+ "To accept the future behavior, pass 'dtype=object'.\n\t"
+ "To keep the old behavior, pass 'dtype=\"datetime64[ns]\"'."
+ )
+ warnings.warn(msg, FutureWarning, stacklevel=3)
+ dtype = 'M8[ns]'
+ return np.asarray(self.array, dtype)
+
+ def __array_wrap__(self, result, context=None):
+ """
+ Gets called after a ufunc.
+ """
+ return self._constructor(result, index=self.index,
+ copy=False).__finalize__(self)
+
+ def __array_prepare__(self, result, context=None):
+ """
+ Gets called prior to a ufunc.
+ """
+
+ # nice error message for non-ufunc types
+ if (context is not None and
+ (not isinstance(self._values, (np.ndarray, ExtensionArray))
+ or isinstance(self._values, Categorical))):
+ obj = context[1][0]
+ raise TypeError("{obj} with dtype {dtype} cannot perform "
+ "the numpy op {op}".format(
+ obj=type(obj).__name__,
+ dtype=getattr(obj, 'dtype', None),
+ op=context[0].__name__))
+ return result
+
+ # ----------------------------------------------------------------------
+ # Unary Methods
+
+ @property
+ def real(self):
+ """
+ Return the real value of vector.
+ """
+ return self.values.real
+
+ @real.setter
+ def real(self, v):
+ self.values.real = v
+
+ @property
+ def imag(self):
+ """
+ Return imag value of vector.
+ """
+ return self.values.imag
+
+ @imag.setter
+ def imag(self, v):
+ self.values.imag = v
+
+ # coercion
+ __float__ = _coerce_method(float)
+ __long__ = _coerce_method(int)
+ __int__ = _coerce_method(int)
+
+ # ----------------------------------------------------------------------
+
+ def _unpickle_series_compat(self, state):
+ if isinstance(state, dict):
+ self._data = state['_data']
+ self.name = state['name']
+ self.index = self._data.index
+
+ elif isinstance(state, tuple):
+
+ # < 0.12 series pickle
+
+ nd_state, own_state = state
+
+ # recreate the ndarray
+ data = np.empty(nd_state[1], dtype=nd_state[2])
+ np.ndarray.__setstate__(data, nd_state)
+
+ # backwards compat
+ index, name = own_state[0], None
+ if len(own_state) > 1:
+ name = own_state[1]
+
+ # recreate
+ self._data = SingleBlockManager(data, index, fastpath=True)
+ self._index = index
+ self.name = name
+
+ else:
+ raise Exception("cannot unpickle legacy formats -> [%s]" % state)
+
+ # indexers
+ @property
+ def axes(self):
+ """
+ Return a list of the row axis labels.
+ """
+ return [self.index]
+
+ def _ixs(self, i, axis=0):
+ """
+ Return the i-th value or values in the Series by location.
+
+ Parameters
+ ----------
+ i : int, slice, or sequence of integers
+
+ Returns
+ -------
+ value : scalar (int) or Series (slice, sequence)
+ """
+ try:
+
+ # dispatch to the values if we need
+ values = self._values
+ if isinstance(values, np.ndarray):
+ return libindex.get_value_at(values, i)
+ else:
+ return values[i]
+ except IndexError:
+ raise
+ except Exception:
+ if isinstance(i, slice):
+ indexer = self.index._convert_slice_indexer(i, kind='iloc')
+ return self._get_values(indexer)
+ else:
+ label = self.index[i]
+ if isinstance(label, Index):
+ return self.take(i, axis=axis, convert=True)
+ else:
+ return libindex.get_value_at(self, i)
+
+ @property
+ def _is_mixed_type(self):
+ return False
+
+ def _slice(self, slobj, axis=0, kind=None):
+ slobj = self.index._convert_slice_indexer(slobj,
+ kind=kind or 'getitem')
+ return self._get_values(slobj)
+
+ def __getitem__(self, key):
+ key = com.apply_if_callable(key, self)
+ try:
+ result = self.index.get_value(self, key)
+
+ if not is_scalar(result):
+ if is_list_like(result) and not isinstance(result, Series):
+
+ # we need to box if loc of the key isn't scalar here
+ # otherwise have inline ndarray/lists
+ try:
+ if not is_scalar(self.index.get_loc(key)):
+ result = self._constructor(
+ result, index=[key] * len(result),
+ dtype=self.dtype).__finalize__(self)
+ except KeyError:
+ pass
+ return result
+ except InvalidIndexError:
+ pass
+ except (KeyError, ValueError):
+ if isinstance(key, tuple) and isinstance(self.index, MultiIndex):
+ # kludge
+ pass
+ elif key is Ellipsis:
+ return self
+ elif com.is_bool_indexer(key):
+ pass
+ else:
+
+ # we can try to coerce the indexer (or this will raise)
+ new_key = self.index._convert_scalar_indexer(key,
+ kind='getitem')
+ if type(new_key) != type(key):
+ return self.__getitem__(new_key)
+ raise
+
+ except Exception:
+ raise
+
+ if is_iterator(key):
+ key = list(key)
+
+ if com.is_bool_indexer(key):
+ key = check_bool_indexer(self.index, key)
+
+ return self._get_with(key)
+
+ def _get_with(self, key):
+ # other: fancy integer or otherwise
+ if isinstance(key, slice):
+ indexer = self.index._convert_slice_indexer(key, kind='getitem')
+ return self._get_values(indexer)
+ elif isinstance(key, ABCDataFrame):
+ raise TypeError('Indexing a Series with DataFrame is not '
+ 'supported, use the appropriate DataFrame column')
+ elif isinstance(key, tuple):
+ try:
+ return self._get_values_tuple(key)
+ except Exception:
+ if len(key) == 1:
+ key = key[0]
+ if isinstance(key, slice):
+ return self._get_values(key)
+ raise
+
+ # pragma: no cover
+ if not isinstance(key, (list, np.ndarray, Series, Index)):
+ key = list(key)
+
+ if isinstance(key, Index):
+ key_type = key.inferred_type
+ else:
+ key_type = lib.infer_dtype(key, skipna=False)
+
+ if key_type == 'integer':
+ if self.index.is_integer() or self.index.is_floating():
+ return self.loc[key]
+ else:
+ return self._get_values(key)
+ elif key_type == 'boolean':
+ return self._get_values(key)
+
+ try:
+ # handle the dup indexing case (GH 4246)
+ if isinstance(key, (list, tuple)):
+ return self.loc[key]
+
+ return self.reindex(key)
+ except Exception:
+ # [slice(0, 5, None)] will break if you convert to ndarray,
+ # e.g. as requested by np.median
+ # hack
+ if isinstance(key[0], slice):
+ return self._get_values(key)
+ raise
+
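A minimal sketch of the integer-key handling above (assuming this pandas vintage, where positional fallback is still allowed): an integer key is positional for a non-integer index and label-based otherwise.

    import pandas as pd

    s = pd.Series([10, 20, 30], index=['a', 'b', 'c'])
    s[1]     # 20 -- non-integer index, so the key is treated positionally

    s2 = pd.Series([10, 20, 30], index=[3, 2, 1])
    s2[1]    # 30 -- integer index, so the key is looked up as a label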
+ def _get_values_tuple(self, key):
+ # mpl hackaround
+ if com._any_none(*key):
+ return self._get_values(key)
+
+ if not isinstance(self.index, MultiIndex):
+ raise ValueError('Can only tuple-index with a MultiIndex')
+
+ # If key is contained, would have returned by now
+ indexer, new_index = self.index.get_loc_level(key)
+ return self._constructor(self._values[indexer],
+ index=new_index).__finalize__(self)
+
+ def _get_values(self, indexer):
+ try:
+ return self._constructor(self._data.get_slice(indexer),
+ fastpath=True).__finalize__(self)
+ except Exception:
+ return self._values[indexer]
+
+ def __setitem__(self, key, value):
+ key = com.apply_if_callable(key, self)
+
+ def setitem(key, value):
+ try:
+ self._set_with_engine(key, value)
+ return
+ except com.SettingWithCopyError:
+ raise
+ except (KeyError, ValueError):
+ values = self._values
+ if (is_integer(key) and
+ not self.index.inferred_type == 'integer'):
+
+ values[key] = value
+ return
+ elif key is Ellipsis:
+ self[:] = value
+ return
+ elif com.is_bool_indexer(key):
+ pass
+ elif is_timedelta64_dtype(self.dtype):
+ # reassign a null value to iNaT
+ if isna(value):
+ value = iNaT
+
+ try:
+ self.index._engine.set_value(self._values, key,
+ value)
+ return
+ except TypeError:
+ pass
+
+ self.loc[key] = value
+ return
+
+ except TypeError as e:
+ if (isinstance(key, tuple) and
+ not isinstance(self.index, MultiIndex)):
+ raise ValueError("Can only tuple-index with a MultiIndex")
+
+ # python 3 type errors should be raised
+ if _is_unorderable_exception(e):
+ raise IndexError(key)
+
+ if com.is_bool_indexer(key):
+ key = check_bool_indexer(self.index, key)
+ try:
+ self._where(~key, value, inplace=True)
+ return
+ except InvalidIndexError:
+ pass
+
+ self._set_with(key, value)
+
+ # do the setitem
+ cacher_needs_updating = self._check_is_chained_assignment_possible()
+ setitem(key, value)
+ if cacher_needs_updating:
+ self._maybe_update_cacher()
+
+ def _set_with_engine(self, key, value):
+ values = self._values
+ try:
+ self.index._engine.set_value(values, key, value)
+ return
+ except KeyError:
+ values[self.index.get_loc(key)] = value
+ return
+
+ def _set_with(self, key, value):
+ # other: fancy integer or otherwise
+ if isinstance(key, slice):
+ indexer = self.index._convert_slice_indexer(key, kind='getitem')
+ return self._set_values(indexer, value)
+ else:
+ if isinstance(key, tuple):
+ try:
+ self._set_values(key, value)
+ except Exception:
+ pass
+
+ if is_scalar(key):
+ key = [key]
+ elif not isinstance(key, (list, Series, np.ndarray)):
+ try:
+ key = list(key)
+ except Exception:
+ key = [key]
+
+ if isinstance(key, Index):
+ key_type = key.inferred_type
+ else:
+ key_type = lib.infer_dtype(key, skipna=False)
+
+ if key_type == 'integer':
+ if self.index.inferred_type == 'integer':
+ self._set_labels(key, value)
+ else:
+ return self._set_values(key, value)
+ elif key_type == 'boolean':
+ self._set_values(key.astype(np.bool_), value)
+ else:
+ self._set_labels(key, value)
+
+ def _set_labels(self, key, value):
+ if isinstance(key, Index):
+ key = key.values
+ else:
+ key = com.asarray_tuplesafe(key)
+ indexer = self.index.get_indexer(key)
+ mask = indexer == -1
+ if mask.any():
+ raise ValueError('%s not contained in the index' % str(key[mask]))
+ self._set_values(indexer, value)
+
+ def _set_values(self, key, value):
+ if isinstance(key, Series):
+ key = key._values
+ self._data = self._data.setitem(indexer=key, value=value)
+ self._maybe_update_cacher()
+
+ def repeat(self, repeats, axis=None):
+ """
+ Repeat elements of a Series.
+
+ Returns a new Series where each element of the current Series
+ is repeated consecutively a given number of times.
+
+ Parameters
+ ----------
+ repeats : int or array of ints
+ The number of repetitions for each element. This should be a
+ non-negative integer. Repeating 0 times will return an empty
+ Series.
+ axis : None
+ Must be ``None``. Has no effect but is accepted for compatibility
+ with numpy.
+
+ Returns
+ -------
+ repeated_series : Series
+ Newly created Series with repeated elements.
+
+ See Also
+ --------
+ Index.repeat : Equivalent function for Index.
+ numpy.repeat : Similar method for :class:`numpy.ndarray`.
+
+ Examples
+ --------
+ >>> s = pd.Series(['a', 'b', 'c'])
+ >>> s
+ 0 a
+ 1 b
+ 2 c
+ dtype: object
+ >>> s.repeat(2)
+ 0 a
+ 0 a
+ 1 b
+ 1 b
+ 2 c
+ 2 c
+ dtype: object
+ >>> s.repeat([1, 2, 3])
+ 0 a
+ 1 b
+ 1 b
+ 2 c
+ 2 c
+ 2 c
+ dtype: object
+ """
+ nv.validate_repeat(tuple(), dict(axis=axis))
+ new_index = self.index.repeat(repeats)
+ new_values = self._values.repeat(repeats)
+ return self._constructor(new_values,
+ index=new_index).__finalize__(self)
+
+ def get_value(self, label, takeable=False):
+ """
+ Quickly retrieve single value at passed index label.
+
+ .. deprecated:: 0.21.0
+ Please use .at[] or .iat[] accessors.
+
+ Parameters
+ ----------
+ label : object
+ takeable : interpret the index as indexers, default False
+
+ Returns
+ -------
+ value : scalar value
+ """
+ warnings.warn("get_value is deprecated and will be removed "
+ "in a future release. Please use "
+ ".at[] or .iat[] accessors instead", FutureWarning,
+ stacklevel=2)
+ return self._get_value(label, takeable=takeable)
+
+ def _get_value(self, label, takeable=False):
+ if takeable is True:
+ return com.maybe_box_datetimelike(self._values[label])
+ return self.index.get_value(self._values, label)
+ _get_value.__doc__ = get_value.__doc__
+
+ def set_value(self, label, value, takeable=False):
+ """
+ Quickly set single value at passed label.
+
+ .. deprecated:: 0.21.0
+ Please use .at[] or .iat[] accessors.
+
+ If label is not contained, a new object is created with the label
+ placed at the end of the result index.
+
+ Parameters
+ ----------
+ label : object
+ Partial indexing with MultiIndex not allowed
+ value : object
+ Scalar value
+ takeable : interpret the index as indexers, default False
+
+ Returns
+ -------
+ series : Series
+ If label is contained, will be reference to calling Series,
+ otherwise a new object
+ """
+ warnings.warn("set_value is deprecated and will be removed "
+ "in a future release. Please use "
+ ".at[] or .iat[] accessors instead", FutureWarning,
+ stacklevel=2)
+ return self._set_value(label, value, takeable=takeable)
+
+ def _set_value(self, label, value, takeable=False):
+ try:
+ if takeable:
+ self._values[label] = value
+ else:
+ self.index._engine.set_value(self._values, label, value)
+ except KeyError:
+
+ # set using a non-recursive method
+ self.loc[label] = value
+
+ return self
+ _set_value.__doc__ = set_value.__doc__
+
+ def reset_index(self, level=None, drop=False, name=None, inplace=False):
+ """
+ Generate a new DataFrame or Series with the index reset.
+
+ This is useful when the index needs to be treated as a column, or
+ when the index is meaningless and needs to be reset to the default
+ before another operation.
+
+ Parameters
+ ----------
+ level : int, str, tuple, or list, default optional
+ For a Series with a MultiIndex, only remove the specified levels
+ from the index. Removes all levels by default.
+ drop : bool, default False
+ Just reset the index, without inserting it as a column in
+ the new DataFrame.
+ name : object, optional
+ The name to use for the column containing the original Series
+ values. Uses ``self.name`` by default. This argument is ignored
+ when `drop` is True.
+ inplace : bool, default False
+ Modify the Series in place (do not create a new object).
+
+ Returns
+ -------
+ Series or DataFrame
+ When `drop` is False (the default), a DataFrame is returned.
+ The newly created columns will come first in the DataFrame,
+ followed by the original Series values.
+ When `drop` is True, a `Series` is returned.
+ In either case, if ``inplace=True``, no value is returned.
+
+ See Also
+ --------
+ DataFrame.reset_index: Analogous function for DataFrame.
+
+ Examples
+ --------
+ >>> s = pd.Series([1, 2, 3, 4], name='foo',
+ ... index=pd.Index(['a', 'b', 'c', 'd'], name='idx'))
+
+ Generate a DataFrame with default index.
+
+ >>> s.reset_index()
+ idx foo
+ 0 a 1
+ 1 b 2
+ 2 c 3
+ 3 d 4
+
+ To specify the name of the new column use `name`.
+
+ >>> s.reset_index(name='values')
+ idx values
+ 0 a 1
+ 1 b 2
+ 2 c 3
+ 3 d 4
+
+ To generate a new Series with the default set `drop` to True.
+
+ >>> s.reset_index(drop=True)
+ 0 1
+ 1 2
+ 2 3
+ 3 4
+ Name: foo, dtype: int64
+
+ To update the Series in place, without generating a new one
+ set `inplace` to True. Note that it also requires ``drop=True``.
+
+ >>> s.reset_index(inplace=True, drop=True)
+ >>> s
+ 0 1
+ 1 2
+ 2 3
+ 3 4
+ Name: foo, dtype: int64
+
+ The `level` parameter is interesting for Series with a multi-level
+ index.
+
+ >>> arrays = [np.array(['bar', 'bar', 'baz', 'baz']),
+ ... np.array(['one', 'two', 'one', 'two'])]
+ >>> s2 = pd.Series(
+ ... range(4), name='foo',
+ ... index=pd.MultiIndex.from_arrays(arrays,
+ ... names=['a', 'b']))
+
+ To remove a specific level from the Index, use `level`.
+
+ >>> s2.reset_index(level='a')
+ a foo
+ b
+ one bar 0
+ two bar 1
+ one baz 2
+ two baz 3
+
+ If `level` is not set, all levels are removed from the Index.
+
+ >>> s2.reset_index()
+ a b foo
+ 0 bar one 0
+ 1 bar two 1
+ 2 baz one 2
+ 3 baz two 3
+ """
+ inplace = validate_bool_kwarg(inplace, 'inplace')
+ if drop:
+ new_index = ibase.default_index(len(self))
+ if level is not None:
+ if not isinstance(level, (tuple, list)):
+ level = [level]
+ level = [self.index._get_level_number(lev) for lev in level]
+ if len(level) < self.index.nlevels:
+ new_index = self.index.droplevel(level)
+
+ if inplace:
+ self.index = new_index
+ # set name if it was passed, otherwise, keep the previous name
+ self.name = name or self.name
+ else:
+ return self._constructor(self._values.copy(),
+ index=new_index).__finalize__(self)
+ elif inplace:
+ raise TypeError('Cannot reset_index inplace on a Series '
+ 'to create a DataFrame')
+ else:
+ df = self.to_frame(name)
+ return df.reset_index(level=level, drop=drop)
+
+ # ----------------------------------------------------------------------
+ # Rendering Methods
+
+ def __unicode__(self):
+ """
+ Return a string representation for a particular Series.
+
+ Invoked by unicode(df) in py2 only. Yields a Unicode String in both
+ py2/py3.
+ """
+ buf = StringIO(u(""))
+ width, height = get_terminal_size()
+ max_rows = (height if get_option("display.max_rows") == 0 else
+ get_option("display.max_rows"))
+ show_dimensions = get_option("display.show_dimensions")
+
+ self.to_string(buf=buf, name=self.name, dtype=self.dtype,
+ max_rows=max_rows, length=show_dimensions)
+ result = buf.getvalue()
+
+ return result
+
+ def to_string(self, buf=None, na_rep='NaN', float_format=None, header=True,
+ index=True, length=False, dtype=False, name=False,
+ max_rows=None):
+ """
+ Render a string representation of the Series.
+
+ Parameters
+ ----------
+ buf : StringIO-like, optional
+ buffer to write to
+ na_rep : string, optional
+ string representation of NaN to use, default 'NaN'
+ float_format : one-parameter function, optional
+ formatter function to apply to columns' elements if they are floats
+ default None
+ header : boolean, default True
+ Add the Series header (index name)
+ index : bool, optional
+ Add index (row) labels, default True
+ length : boolean, default False
+ Add the Series length
+ dtype : boolean, default False
+ Add the Series dtype
+ name : boolean, default False
+ Add the Series name if not None
+ max_rows : int, optional
+ Maximum number of rows to show before truncating. If None, show
+ all.
+
+ Returns
+ -------
+ formatted : string (if not buffer passed)
+ """
+
+ formatter = fmt.SeriesFormatter(self, name=name, length=length,
+ header=header, index=index,
+ dtype=dtype, na_rep=na_rep,
+ float_format=float_format,
+ max_rows=max_rows)
+ result = formatter.to_string()
+
+ # catch contract violations
+ if not isinstance(result, compat.text_type):
+ raise AssertionError("result must be of type unicode, type"
+ " of result is {0!r}"
+ "".format(result.__class__.__name__))
+
+ if buf is None:
+ return result
+ else:
+ try:
+ buf.write(result)
+ except AttributeError:
+ with open(buf, 'w') as f:
+ f.write(result)
+
+ # ----------------------------------------------------------------------
+
+ def iteritems(self):
+ """
+ Lazily iterate over (index, value) tuples.
+ """
+ return zip(iter(self.index), iter(self))
+
+ items = iteritems
+
+ # ----------------------------------------------------------------------
+ # Misc public methods
+
+ def keys(self):
+ """
+ Alias for index.
+ """
+ return self.index
+
+ def to_dict(self, into=dict):
+ """
+ Convert Series to {label -> value} dict or dict-like object.
+
+ Parameters
+ ----------
+ into : class, default dict
+ The collections.Mapping subclass to use as the return
+ object. Can be the actual class or an empty
+ instance of the mapping type you want. If you want a
+ collections.defaultdict, you must pass it initialized.
+
+ .. versionadded:: 0.21.0
+
+ Returns
+ -------
+ value_dict : collections.Mapping
+
+ Examples
+ --------
+ >>> s = pd.Series([1, 2, 3, 4])
+ >>> s.to_dict()
+ {0: 1, 1: 2, 2: 3, 3: 4}
+ >>> from collections import OrderedDict, defaultdict
+ >>> s.to_dict(OrderedDict)
+ OrderedDict([(0, 1), (1, 2), (2, 3), (3, 4)])
+ >>> dd = defaultdict(list)
+ >>> s.to_dict(dd)
+ defaultdict(<type 'list'>, {0: 1, 1: 2, 2: 3, 3: 4})
+ """
+ # GH16122
+ into_c = com.standardize_mapping(into)
+ return into_c(compat.iteritems(self))
+
+ def to_frame(self, name=None):
+ """
+ Convert Series to DataFrame.
+
+ Parameters
+ ----------
+ name : object, default None
+ The passed name should substitute for the series name (if it has
+ one).
+
+ Returns
+ -------
+ data_frame : DataFrame
+ """
+ if name is None:
+ df = self._constructor_expanddim(self)
+ else:
+ df = self._constructor_expanddim({name: self})
+
+ return df
+
+ def to_sparse(self, kind='block', fill_value=None):
+ """
+ Convert Series to SparseSeries.
+
+ Parameters
+ ----------
+ kind : {'block', 'integer'}
+ fill_value : float, defaults to NaN (missing)
+
+ Returns
+ -------
+ sp : SparseSeries
+ """
+ # TODO: deprecate
+ from pandas.core.sparse.series import SparseSeries
+
+ values = SparseArray(self, kind=kind, fill_value=fill_value)
+ return SparseSeries(
+ values, index=self.index, name=self.name
+ ).__finalize__(self)
+
+ def _set_name(self, name, inplace=False):
+ """
+ Set the Series name.
+
+ Parameters
+ ----------
+ name : str
+ inplace : bool
+ whether to modify `self` directly or return a copy
+ """
+ inplace = validate_bool_kwarg(inplace, 'inplace')
+ ser = self if inplace else self.copy()
+ ser.name = name
+ return ser
+
+ # ----------------------------------------------------------------------
+ # Statistics, overridden ndarray methods
+
+ # TODO: integrate bottleneck
+
+ def count(self, level=None):
+ """
+ Return number of non-NA/null observations in the Series.
+
+ Parameters
+ ----------
+ level : int or level name, default None
+ If the axis is a MultiIndex (hierarchical), count along a
+ particular level, collapsing into a smaller Series
+
+ Returns
+ -------
+ nobs : int or Series (if level specified)
+ """
+ if level is None:
+ return notna(com.values_from_object(self)).sum()
+
+ if isinstance(level, compat.string_types):
+ level = self.index._get_level_number(level)
+
+ lev = self.index.levels[level]
+ level_codes = np.array(self.index.codes[level], subok=False, copy=True)
+
+ mask = level_codes == -1
+ if mask.any():
+ level_codes[mask] = cnt = len(lev)
+ lev = lev.insert(cnt, lev._na_value)
+
+ obs = level_codes[notna(self.values)]
+ out = np.bincount(obs, minlength=len(lev) or None)
+ return self._constructor(out, index=lev,
+ dtype='int64').__finalize__(self)
+
+ def mode(self, dropna=True):
+ """
+ Return the mode(s) of the dataset.
+
+ Always returns Series even if only one value is returned.
+
+ Parameters
+ ----------
+ dropna : boolean, default True
+ Don't consider counts of NaN/NaT.
+
+ .. versionadded:: 0.24.0
+
+ Returns
+ -------
+ modes : Series (sorted)
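+
+ Examples
+ --------
+ A minimal illustrative example (two values tie for most frequent):
+
+ >>> s = pd.Series([2, 2, 3, 3, 4])
+ >>> s.mode()
+ 0 2
+ 1 3
+ dtype: int64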
+ """
+ # TODO: Add option for bins like value_counts()
+ return algorithms.mode(self, dropna=dropna)
+
+ def unique(self):
+ """
+ Return unique values of Series object.
+
+ Uniques are returned in order of appearance. Hash table-based unique,
+ therefore does NOT sort.
+
+ Returns
+ -------
+ ndarray or ExtensionArray
+ The unique values returned as a NumPy array. In case of an
+ extension-array backed Series, a new
+ :class:`~api.extensions.ExtensionArray` of that type with just
+ the unique values is returned. This includes
+
+ * Categorical
+ * Period
+ * Datetime with Timezone
+ * Interval
+ * Sparse
+ * IntegerNA
+
+ See Also
+ --------
+ unique : Top-level unique method for any 1-d array-like object.
+ Index.unique : Return Index with unique values from an Index object.
+
+ Examples
+ --------
+ >>> pd.Series([2, 1, 3, 3], name='A').unique()
+ array([2, 1, 3])
+
+ >>> pd.Series([pd.Timestamp('2016-01-01') for _ in range(3)]).unique()
+ array(['2016-01-01T00:00:00.000000000'], dtype='datetime64[ns]')
+
+ >>> pd.Series([pd.Timestamp('2016-01-01', tz='US/Eastern')
+ ... for _ in range(3)]).unique()
+ <DatetimeArray>
+ ['2016-01-01 00:00:00-05:00']
+ Length: 1, dtype: datetime64[ns, US/Eastern]
+
+ An unordered Categorical will return categories in the order of
+ appearance.
+
+ >>> pd.Series(pd.Categorical(list('baabc'))).unique()
+ [b, a, c]
+ Categories (3, object): [b, a, c]
+
+ An ordered Categorical preserves the category ordering.
+
+ >>> pd.Series(pd.Categorical(list('baabc'), categories=list('abc'),
+ ... ordered=True)).unique()
+ [b, a, c]
+ Categories (3, object): [a < b < c]
+ """
+ result = super(Series, self).unique()
+ return result
+
+ def drop_duplicates(self, keep='first', inplace=False):
+ """
+ Return Series with duplicate values removed.
+
+ Parameters
+ ----------
+ keep : {'first', 'last', ``False``}, default 'first'
+ - 'first' : Drop duplicates except for the first occurrence.
+ - 'last' : Drop duplicates except for the last occurrence.
+ - ``False`` : Drop all duplicates.
+ inplace : boolean, default ``False``
+ If ``True``, performs operation inplace and returns None.
+
+ Returns
+ -------
+ deduplicated : Series
+
+ See Also
+ --------
+ Index.drop_duplicates : Equivalent method on Index.
+ DataFrame.drop_duplicates : Equivalent method on DataFrame.
+ Series.duplicated : Related method on Series, indicating duplicate
+ Series values.
+
+ Examples
+ --------
+ Generate a Series with duplicated entries.
+
+ >>> s = pd.Series(['lama', 'cow', 'lama', 'beetle', 'lama', 'hippo'],
+ ... name='animal')
+ >>> s
+ 0 lama
+ 1 cow
+ 2 lama
+ 3 beetle
+ 4 lama
+ 5 hippo
+ Name: animal, dtype: object
+
+ With the 'keep' parameter, the selection behaviour of duplicated values
+ can be changed. The value 'first' keeps the first occurrence for each
+ set of duplicated entries. The default value of keep is 'first'.
+
+ >>> s.drop_duplicates()
+ 0 lama
+ 1 cow
+ 3 beetle
+ 5 hippo
+ Name: animal, dtype: object
+
+ The value 'last' for parameter 'keep' keeps the last occurrence for
+ each set of duplicated entries.
+
+ >>> s.drop_duplicates(keep='last')
+ 1 cow
+ 3 beetle
+ 4 lama
+ 5 hippo
+ Name: animal, dtype: object
+
+ The value ``False`` for parameter 'keep' discards all sets of
+ duplicated entries. Setting the value of 'inplace' to ``True`` performs
+ the operation inplace and returns ``None``.
+
+ >>> s.drop_duplicates(keep=False, inplace=True)
+ >>> s
+ 1 cow
+ 3 beetle
+ 5 hippo
+ Name: animal, dtype: object
+ """
+ return super(Series, self).drop_duplicates(keep=keep, inplace=inplace)
+
+ def duplicated(self, keep='first'):
+ """
+ Indicate duplicate Series values.
+
+ Duplicated values are indicated as ``True`` values in the resulting
+ Series. Either all duplicates, all except the first or all except the
+ last occurrence of duplicates can be indicated.
+
+ Parameters
+ ----------
+ keep : {'first', 'last', False}, default 'first'
+ - 'first' : Mark duplicates as ``True`` except for the first
+ occurrence.
+ - 'last' : Mark duplicates as ``True`` except for the last
+ occurrence.
+ - ``False`` : Mark all duplicates as ``True``.
+
+ Returns
+ -------
+ pandas.core.series.Series
+
+ See Also
+ --------
+ Index.duplicated : Equivalent method on pandas.Index.
+ DataFrame.duplicated : Equivalent method on pandas.DataFrame.
+ Series.drop_duplicates : Remove duplicate values from Series.
+
+ Examples
+ --------
+ By default, for each set of duplicated values, the first occurrence is
+ set to ``False`` and all others to ``True``:
+
+ >>> animals = pd.Series(['lama', 'cow', 'lama', 'beetle', 'lama'])
+ >>> animals.duplicated()
+ 0 False
+ 1 False
+ 2 True
+ 3 False
+ 4 True
+ dtype: bool
+
+ which is equivalent to
+
+ >>> animals.duplicated(keep='first')
+ 0 False
+ 1 False
+ 2 True
+ 3 False
+ 4 True
+ dtype: bool
+
+ By using 'last', the last occurrence of each set of duplicated values
+ is set to ``False`` and all others to ``True``:
+
+ >>> animals.duplicated(keep='last')
+ 0 True
+ 1 False
+ 2 True
+ 3 False
+ 4 False
+ dtype: bool
+
+ By setting keep to ``False``, all duplicates are marked ``True``:
+
+ >>> animals.duplicated(keep=False)
+ 0 True
+ 1 False
+ 2 True
+ 3 False
+ 4 True
+ dtype: bool
+ """
+ return super(Series, self).duplicated(keep=keep)
+
+ def idxmin(self, axis=0, skipna=True, *args, **kwargs):
+ """
+ Return the row label of the minimum value.
+
+ If multiple values equal the minimum, the first row label with that
+ value is returned.
+
+ Parameters
+ ----------
+ skipna : boolean, default True
+ Exclude NA/null values. If the entire Series is NA, the result
+ will be NA.
+ axis : int, default 0
+ For compatibility with DataFrame.idxmin. Redundant for application
+ on Series.
+ *args, **kwargs
+ Additional keywords have no effect but might be accepted
+ for compatibility with NumPy.
+
+ Returns
+ -------
+ idxmin : Index label of the minimum value.
+
+ Raises
+ ------
+ ValueError
+ If the Series is empty.
+
+ See Also
+ --------
+ numpy.argmin : Return indices of the minimum values
+ along the given axis.
+ DataFrame.idxmin : Return index of first occurrence of minimum
+ over requested axis.
+ Series.idxmax : Return index *label* of the first occurrence
+ of maximum of values.
+
+ Notes
+ -----
+ This method is the Series version of ``ndarray.argmin``. This method
+ returns the label of the minimum, while ``ndarray.argmin`` returns
+ the position. To get the position, use ``series.values.argmin()``.
+
+ Examples
+ --------
+ >>> s = pd.Series(data=[1, None, 4, 1],
+ ... index=['A' ,'B' ,'C' ,'D'])
+ >>> s
+ A 1.0
+ B NaN
+ C 4.0
+ D 1.0
+ dtype: float64
+
+ >>> s.idxmin()
+ 'A'
+
+ If `skipna` is False and there is an NA value in the data,
+ the function returns ``nan``.
+
+ >>> s.idxmin(skipna=False)
+ nan
+ """
+ skipna = nv.validate_argmin_with_skipna(skipna, args, kwargs)
+ i = nanops.nanargmin(com.values_from_object(self), skipna=skipna)
+ if i == -1:
+ return np.nan
+ return self.index[i]
+
+ def idxmax(self, axis=0, skipna=True, *args, **kwargs):
+ """
+ Return the row label of the maximum value.
+
+ If multiple values equal the maximum, the first row label with that
+ value is returned.
+
+ Parameters
+ ----------
+ skipna : boolean, default True
+ Exclude NA/null values. If the entire Series is NA, the result
+ will be NA.
+ axis : int, default 0
+ For compatibility with DataFrame.idxmax. Redundant for application
+ on Series.
+ *args, **kwargs
+ Additional keywords have no effect but might be accepted
+ for compatibility with NumPy.
+
+ Returns
+ -------
+ idxmax : Index label of the maximum value.
+
+ Raises
+ ------
+ ValueError
+ If the Series is empty.
+
+ See Also
+ --------
+ numpy.argmax : Return indices of the maximum values
+ along the given axis.
+ DataFrame.idxmax : Return index of first occurrence of maximum
+ over requested axis.
+ Series.idxmin : Return index *label* of the first occurrence
+ of minimum of values.
+
+ Notes
+ -----
+ This method is the Series version of ``ndarray.argmax``. This method
+ returns the label of the maximum, while ``ndarray.argmax`` returns
+ the position. To get the position, use ``series.values.argmax()``.
+
+ Examples
+ --------
+ >>> s = pd.Series(data=[1, None, 4, 3, 4],
+ ... index=['A', 'B', 'C', 'D', 'E'])
+ >>> s
+ A 1.0
+ B NaN
+ C 4.0
+ D 3.0
+ E 4.0
+ dtype: float64
+
+ >>> s.idxmax()
+ 'C'
+
+ If `skipna` is False and there is an NA value in the data,
+ the function returns ``nan``.
+
+ >>> s.idxmax(skipna=False)
+ nan
+ """
+ skipna = nv.validate_argmax_with_skipna(skipna, args, kwargs)
+ i = nanops.nanargmax(com.values_from_object(self), skipna=skipna)
+ if i == -1:
+ return np.nan
+ return self.index[i]
+
+ # ndarray compat
+ argmin = deprecate(
+ 'argmin', idxmin, '0.21.0',
+ msg=dedent("""
+ The current behaviour of 'Series.argmin' is deprecated, use 'idxmin'
+ instead.
+ The behavior of 'argmin' will be corrected to return the positional
+ minimum in the future. For now, use 'series.values.argmin' or
+ 'np.argmin(np.array(values))' to get the position of the minimum
+ row.""")
+ )
+ argmax = deprecate(
+ 'argmax', idxmax, '0.21.0',
+ msg=dedent("""
+ The current behaviour of 'Series.argmax' is deprecated, use 'idxmax'
+ instead.
+ The behavior of 'argmax' will be corrected to return the positional
+ maximum in the future. For now, use 'series.values.argmax' or
+ 'np.argmax(np.array(values))' to get the position of the maximum
+ row.""")
+ )
+
+ def round(self, decimals=0, *args, **kwargs):
+ """
+ Round each value in a Series to the given number of decimals.
+
+ Parameters
+ ----------
+ decimals : int
+ Number of decimal places to round to (default: 0).
+ If decimals is negative, it specifies the number of
+ positions to the left of the decimal point.
+
+ Returns
+ -------
+ Series object
+
+ See Also
+ --------
+ numpy.around
+ DataFrame.round
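+
+ Examples
+ --------
+ A small illustrative example (the sample values are arbitrary):
+
+ >>> s = pd.Series([0.1, 1.3, 2.7])
+ >>> s.round()
+ 0 0.0
+ 1 1.0
+ 2 3.0
+ dtype: float64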
+ """
+ nv.validate_round(args, kwargs)
+ result = com.values_from_object(self).round(decimals)
+ result = self._constructor(result, index=self.index).__finalize__(self)
+
+ return result
+
+ def quantile(self, q=0.5, interpolation='linear'):
+ """
+ Return value at the given quantile.
+
+ Parameters
+ ----------
+ q : float or array-like, default 0.5 (50% quantile)
+ 0 <= q <= 1, the quantile(s) to compute
+ interpolation : {'linear', 'lower', 'higher', 'midpoint', 'nearest'}
+ .. versionadded:: 0.18.0
+
+ This optional parameter specifies the interpolation method to use,
+ when the desired quantile lies between two data points `i` and `j`:
+
+ * linear: `i + (j - i) * fraction`, where `fraction` is the
+ fractional part of the index surrounded by `i` and `j`.
+ * lower: `i`.
+ * higher: `j`.
+ * nearest: `i` or `j` whichever is nearest.
+ * midpoint: (`i` + `j`) / 2.
+
+ Returns
+ -------
+ quantile : float or Series
+ if ``q`` is an array, a Series will be returned where the
+ index is ``q`` and the values are the quantiles.
+
+ See Also
+ --------
+ core.window.Rolling.quantile
+ numpy.percentile
+
+ Examples
+ --------
+ >>> s = pd.Series([1, 2, 3, 4])
+ >>> s.quantile(.5)
+ 2.5
+ >>> s.quantile([.25, .5, .75])
+ 0.25 1.75
+ 0.50 2.50
+ 0.75 3.25
+ dtype: float64
+ """
+
+ self._check_percentile(q)
+
+ # We dispatch to DataFrame so that core.internals only has to worry
+ # about 2D cases.
+ df = self.to_frame()
+
+ result = df.quantile(q=q, interpolation=interpolation,
+ numeric_only=False)
+ if result.ndim == 2:
+ result = result.iloc[:, 0]
+
+ if is_list_like(q):
+ result.name = self.name
+ return self._constructor(result,
+ index=Float64Index(q),
+ name=self.name)
+ else:
+ # scalar
+ return result.iloc[0]
+
+ def corr(self, other, method='pearson', min_periods=None):
+ """
+ Compute correlation with `other` Series, excluding missing values.
+
+ Parameters
+ ----------
+ other : Series
+ method : {'pearson', 'kendall', 'spearman'} or callable
+ * pearson : standard correlation coefficient
+ * kendall : Kendall Tau correlation coefficient
+ * spearman : Spearman rank correlation
+ * callable: callable with input two 1d ndarray
+ and returning a float
+ .. versionadded:: 0.24.0
+
+ min_periods : int, optional
+ Minimum number of observations needed to have a valid result
+
+ Returns
+ -------
+ correlation : float
+
+ Examples
+ --------
+ >>> histogram_intersection = lambda a, b: np.minimum(a, b
+ ... ).sum().round(decimals=1)
+ >>> s1 = pd.Series([.2, .0, .6, .2])
+ >>> s2 = pd.Series([.3, .6, .0, .1])
+ >>> s1.corr(s2, method=histogram_intersection)
+ 0.3
+ """
+ this, other = self.align(other, join='inner', copy=False)
+ if len(this) == 0:
+ return np.nan
+
+ if method in ['pearson', 'spearman', 'kendall'] or callable(method):
+ return nanops.nancorr(this.values, other.values, method=method,
+ min_periods=min_periods)
+
+ raise ValueError("method must be either 'pearson', "
+ "'spearman', or 'kendall', '{method}' "
+ "was supplied".format(method=method))
+
+ def cov(self, other, min_periods=None):
+ """
+ Compute covariance with Series, excluding missing values.
+
+ Parameters
+ ----------
+ other : Series
+ min_periods : int, optional
+ Minimum number of observations needed to have a valid result
+
+ Returns
+ -------
+ covariance : float
+
+ Normalized by N-1 (unbiased estimator).
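+
+ Examples
+ --------
+ A minimal illustrative example (the sample values are chosen so the
+ result is exact):
+
+ >>> s1 = pd.Series([1, 2, 3])
+ >>> s2 = pd.Series([2, 4, 6])
+ >>> s1.cov(s2)
+ 2.0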
+ """
+ this, other = self.align(other, join='inner', copy=False)
+ if len(this) == 0:
+ return np.nan
+ return nanops.nancov(this.values, other.values,
+ min_periods=min_periods)
+
+ def diff(self, periods=1):
+ """
+ First discrete difference of element.
+
+ Calculates the difference of a Series element compared with another
+ element in the Series (default is element in previous row).
+
+ Parameters
+ ----------
+ periods : int, default 1
+ Periods to shift for calculating difference, accepts negative
+ values.
+
+ Returns
+ -------
+ diffed : Series
+
+ See Also
+ --------
+ Series.pct_change: Percent change over given number of periods.
+ Series.shift: Shift index by desired number of periods with an
+ optional time freq.
+ DataFrame.diff: First discrete difference of object.
+
+ Examples
+ --------
+ Difference with previous row
+
+ >>> s = pd.Series([1, 1, 2, 3, 5, 8])
+ >>> s.diff()
+ 0 NaN
+ 1 0.0
+ 2 1.0
+ 3 1.0
+ 4 2.0
+ 5 3.0
+ dtype: float64
+
+ Difference with 3rd previous row
+
+ >>> s.diff(periods=3)
+ 0 NaN
+ 1 NaN
+ 2 NaN
+ 3 2.0
+ 4 4.0
+ 5 6.0
+ dtype: float64
+
+ Difference with following row
+
+ >>> s.diff(periods=-1)
+ 0 0.0
+ 1 -1.0
+ 2 -1.0
+ 3 -2.0
+ 4 -3.0
+ 5 NaN
+ dtype: float64
+ """
+ result = algorithms.diff(com.values_from_object(self), periods)
+ return self._constructor(result, index=self.index).__finalize__(self)
+
+ def autocorr(self, lag=1):
+ """
+ Compute the lag-N autocorrelation.
+
+ This method computes the Pearson correlation between
+ the Series and its shifted self.
+
+ Parameters
+ ----------
+ lag : int, default 1
+ Number of lags to apply before performing autocorrelation.
+
+ Returns
+ -------
+ float
+ The Pearson correlation between self and self.shift(lag).
+
+ See Also
+ --------
+ Series.corr : Compute the correlation between two Series.
+ Series.shift : Shift index by desired number of periods.
+ DataFrame.corr : Compute pairwise correlation of columns.
+ DataFrame.corrwith : Compute pairwise correlation between rows or
+ columns of two DataFrame objects.
+
+ Notes
+ -----
+ If the Pearson correlation is not well defined return 'NaN'.
+
+ Examples
+ --------
+ >>> s = pd.Series([0.25, 0.5, 0.2, -0.05])
+ >>> s.autocorr() # doctest: +ELLIPSIS
+ 0.10355...
+ >>> s.autocorr(lag=2) # doctest: +ELLIPSIS
+ -0.99999...
+
+ If the Pearson correlation is not well defined, then 'NaN' is returned.
+
+ >>> s = pd.Series([1, 0, 0, 0])
+ >>> s.autocorr()
+ nan
+ """
+ return self.corr(self.shift(lag))
+
+ def dot(self, other):
+ """
+ Compute the dot product between the Series and the columns of other.
+
+ This method computes the dot product between the Series and another
+ one, or the Series and each column of a DataFrame, or the Series and
+ each column of an array.
+
+ It can also be called using `self @ other` in Python >= 3.5.
+
+ Parameters
+ ----------
+ other : Series, DataFrame or array-like
+ The other object to compute the dot product with its columns.
+
+ Returns
+ -------
+ scalar, Series or numpy.ndarray
+ Return the dot product of the Series and other if other is a
+ Series, a Series of the dot products of the Series and each column
+ of other if other is a DataFrame, or a numpy.ndarray of the dot
+ products of the Series and each column of the array if other is a
+ numpy.ndarray.
+
+ See Also
+ --------
+ DataFrame.dot: Compute the matrix product with the DataFrame.
+ Series.mul: Multiplication of series and other, element-wise.
+
+ Notes
+ -----
+ The Series and other have to share the same index if other is a Series
+ or a DataFrame.
+
+ Examples
+ --------
+ >>> s = pd.Series([0, 1, 2, 3])
+ >>> other = pd.Series([-1, 2, -3, 4])
+ >>> s.dot(other)
+ 8
+ >>> s @ other
+ 8
+ >>> df = pd.DataFrame([[0, 1], [-2, 3], [4, -5], [6, 7]])
+ >>> s.dot(df)
+ 0 24
+ 1 14
+ dtype: int64
+ >>> arr = np.array([[0, 1], [-2, 3], [4, -5], [6, 7]])
+ >>> s.dot(arr)
+ array([24, 14])
+ """
+ from pandas.core.frame import DataFrame
+ if isinstance(other, (Series, DataFrame)):
+ common = self.index.union(other.index)
+ if (len(common) > len(self.index) or
+ len(common) > len(other.index)):
+ raise ValueError('matrices are not aligned')
+
+ left = self.reindex(index=common, copy=False)
+ right = other.reindex(index=common, copy=False)
+ lvals = left.values
+ rvals = right.values
+ else:
+ lvals = self.values
+ rvals = np.asarray(other)
+ if lvals.shape[0] != rvals.shape[0]:
+ raise Exception('Dot product shape mismatch, %s vs %s' %
+ (lvals.shape, rvals.shape))
+
+ if isinstance(other, DataFrame):
+ return self._constructor(np.dot(lvals, rvals),
+ index=other.columns).__finalize__(self)
+ elif isinstance(other, Series):
+ return np.dot(lvals, rvals)
+ elif isinstance(rvals, np.ndarray):
+ return np.dot(lvals, rvals)
+ else: # pragma: no cover
+ raise TypeError('unsupported type: %s' % type(other))
+
+ def __matmul__(self, other):
+ """
+ Matrix multiplication using binary `@` operator in Python>=3.5.
+ """
+ return self.dot(other)
+
+ def __rmatmul__(self, other):
+ """
+ Matrix multiplication using binary `@` operator in Python>=3.5.
+ """
+ return self.dot(np.transpose(other))
+
+ @Substitution(klass='Series')
+ @Appender(base._shared_docs['searchsorted'])
+ def searchsorted(self, value, side='left', sorter=None):
+ if sorter is not None:
+ sorter = ensure_platform_int(sorter)
+ result = self._values.searchsorted(Series(value)._values,
+ side=side, sorter=sorter)
+
+ return result[0] if is_scalar(value) else result
+
+ # -------------------------------------------------------------------
+ # Combination
+
+ def append(self, to_append, ignore_index=False, verify_integrity=False):
+ """
+ Concatenate two or more Series.
+
+ Parameters
+ ----------
+ to_append : Series or list/tuple of Series
+ ignore_index : boolean, default False
+ If True, do not use the index labels.
+
+ .. versionadded:: 0.19.0
+
+ verify_integrity : boolean, default False
+ If True, raise Exception on creating index with duplicates
+
+ Returns
+ -------
+ appended : Series
+
+ See Also
+ --------
+ concat : General function to concatenate DataFrame, Series
+ or Panel objects.
+
+ Notes
+ -----
+ Iteratively appending to a Series can be more computationally intensive
+ than a single concatenate. A better solution is to append values to a
+ list and then concatenate the list with the original Series all at
+ once.
+
+ Examples
+ --------
+ >>> s1 = pd.Series([1, 2, 3])
+ >>> s2 = pd.Series([4, 5, 6])
+ >>> s3 = pd.Series([4, 5, 6], index=[3,4,5])
+ >>> s1.append(s2)
+ 0 1
+ 1 2
+ 2 3
+ 0 4
+ 1 5
+ 2 6
+ dtype: int64
+
+ >>> s1.append(s3)
+ 0 1
+ 1 2
+ 2 3
+ 3 4
+ 4 5
+ 5 6
+ dtype: int64
+
+ With `ignore_index` set to True:
+
+ >>> s1.append(s2, ignore_index=True)
+ 0 1
+ 1 2
+ 2 3
+ 3 4
+ 4 5
+ 5 6
+ dtype: int64
+
+ With `verify_integrity` set to True:
+
+ >>> s1.append(s2, verify_integrity=True)
+ Traceback (most recent call last):
+ ...
+ ValueError: Indexes have overlapping values: [0, 1, 2]
+ """
+ from pandas.core.reshape.concat import concat
+
+ if isinstance(to_append, (list, tuple)):
+ to_concat = [self] + to_append
+ else:
+ to_concat = [self, to_append]
+ return concat(to_concat, ignore_index=ignore_index,
+ verify_integrity=verify_integrity)
+
+ def _binop(self, other, func, level=None, fill_value=None):
+ """
+ Perform generic binary operation with optional fill value.
+
+ Parameters
+ ----------
+ other : Series
+ func : binary operator
+ fill_value : float or object
+ Value to substitute for NA/null values. If both Series are NA in a
+ location, the result will be NA regardless of the passed fill value
+ level : int or level name, default None
+ Broadcast across a level, matching Index values on the
+ passed MultiIndex level
+
+ Returns
+ -------
+ combined : Series
+ """
+ if not isinstance(other, Series):
+ raise AssertionError('Other operand must be Series')
+
+ new_index = self.index
+ this = self
+
+ if not self.index.equals(other.index):
+ this, other = self.align(other, level=level, join='outer',
+ copy=False)
+ new_index = this.index
+
+ this_vals, other_vals = ops.fill_binop(this.values, other.values,
+ fill_value)
+
+ with np.errstate(all='ignore'):
+ result = func(this_vals, other_vals)
+ name = ops.get_op_result_name(self, other)
+ result = self._constructor(result, index=new_index, name=name)
+ result = result.__finalize__(self)
+ if name is None:
+ # When name is None, __finalize__ overwrites current name
+ result.name = None
+ return result
+
+ def combine(self, other, func, fill_value=None):
+ """
+ Combine the Series with a Series or scalar according to `func`.
+
+ Combine the Series and `other` using `func` to perform elementwise
+ selection for combined Series.
+ `fill_value` is assumed when value is missing at some index
+ from one of the two objects being combined.
+
+ Parameters
+ ----------
+ other : Series or scalar
+ The value(s) to be combined with the `Series`.
+ func : function
+ Function that takes two scalars as inputs and returns an element.
+ fill_value : scalar, optional
+ The value to assume when an index is missing from
+ one Series or the other. The default specifies to use the
+ appropriate NaN value for the underlying dtype of the Series.
+
+ Returns
+ -------
+ Series
+ The result of combining the Series with the other object.
+
+ See Also
+ --------
+ Series.combine_first : Combine Series values, choosing the calling
+ Series' values first.
+
+ Examples
+ --------
+ Consider two datasets ``s1`` and ``s2`` containing
+ highest clocked speeds of different birds.
+
+ >>> s1 = pd.Series({'falcon': 330.0, 'eagle': 160.0})
+ >>> s1
+ falcon 330.0
+ eagle 160.0
+ dtype: float64
+ >>> s2 = pd.Series({'falcon': 345.0, 'eagle': 200.0, 'duck': 30.0})
+ >>> s2
+ falcon 345.0
+ eagle 200.0
+ duck 30.0
+ dtype: float64
+
+ Now, to combine the two datasets and view the highest speeds
+ of the birds across the two datasets
+
+ >>> s1.combine(s2, max)
+ duck NaN
+ eagle 200.0
+ falcon 345.0
+ dtype: float64
+
+ In the previous example, the resulting value for duck is missing,
+ because the maximum of a NaN and a float is a NaN.
+ So, in the next example, we set ``fill_value=0``,
+ so the maximum value returned will be the value from one of the two
+ datasets.
+
+ >>> s1.combine(s2, max, fill_value=0)
+ duck 30.0
+ eagle 200.0
+ falcon 345.0
+ dtype: float64
+ """
+ if fill_value is None:
+ fill_value = na_value_for_dtype(self.dtype, compat=False)
+
+ if isinstance(other, Series):
+ # If other is a Series, result is based on union of Series,
+ # so do this element by element
+ new_index = self.index.union(other.index)
+ new_name = ops.get_op_result_name(self, other)
+ new_values = []
+ for idx in new_index:
+ lv = self.get(idx, fill_value)
+ rv = other.get(idx, fill_value)
+ with np.errstate(all='ignore'):
+ new_values.append(func(lv, rv))
+ else:
+ # Assume that other is a scalar, so apply the function for
+ # each element in the Series
+ new_index = self.index
+ with np.errstate(all='ignore'):
+ new_values = [func(lv, other) for lv in self._values]
+ new_name = self.name
+
+ if is_categorical_dtype(self.values):
+ pass
+ elif is_extension_array_dtype(self.values):
+ # The function can return something of any type, so check
+ # if the type is compatible with the calling EA.
+ try:
+ new_values = self._values._from_sequence(new_values)
+ except Exception:
+ # https://github.com/pandas-dev/pandas/issues/22850
+ # pandas has no control over what 3rd-party ExtensionArrays
+ # do in _values_from_sequence. We still want ops to work
+ # though, so we catch any regular Exception.
+ pass
+
+ return self._constructor(new_values, index=new_index, name=new_name)
+
+ def combine_first(self, other):
+ """
+ Combine Series values, choosing the calling Series's values first.
+
+ Parameters
+ ----------
+ other : Series
+ The value(s) to be combined with the `Series`.
+
+ Returns
+ -------
+ Series
+ The result of combining the Series with the other object.
+
+ See Also
+ --------
+ Series.combine : Perform elementwise operation on two Series
+ using a given function.
+
+ Notes
+ -----
+ Result index will be the union of the two indexes.
+
+ Examples
+ --------
+ >>> s1 = pd.Series([1, np.nan])
+ >>> s2 = pd.Series([3, 4])
+ >>> s1.combine_first(s2)
+ 0 1.0
+ 1 4.0
+ dtype: float64
+ """
+ new_index = self.index.union(other.index)
+ this = self.reindex(new_index, copy=False)
+ other = other.reindex(new_index, copy=False)
+ if is_datetimelike(this) and not is_datetimelike(other):
+ other = to_datetime(other)
+
+ return this.where(notna(this), other)
+
+ def update(self, other):
+ """
+ Modify Series in place using non-NA values from passed
+ Series. Aligns on index.
+
+ Parameters
+ ----------
+ other : Series
+
+ Examples
+ --------
+ >>> s = pd.Series([1, 2, 3])
+ >>> s.update(pd.Series([4, 5, 6]))
+ >>> s
+ 0 4
+ 1 5
+ 2 6
+ dtype: int64
+
+ >>> s = pd.Series(['a', 'b', 'c'])
+ >>> s.update(pd.Series(['d', 'e'], index=[0, 2]))
+ >>> s
+ 0 d
+ 1 b
+ 2 e
+ dtype: object
+
+ >>> s = pd.Series([1, 2, 3])
+ >>> s.update(pd.Series([4, 5, 6, 7, 8]))
+ >>> s
+ 0 4
+ 1 5
+ 2 6
+ dtype: int64
+
+ If ``other`` contains NaNs the corresponding values are not updated
+ in the original Series.
+
+ >>> s = pd.Series([1, 2, 3])
+ >>> s.update(pd.Series([4, np.nan, 6]))
+ >>> s
+ 0 4
+ 1 2
+ 2 6
+ dtype: int64
+ """
+ other = other.reindex_like(self)
+ mask = notna(other)
+
+ self._data = self._data.putmask(mask=mask, new=other, inplace=True)
+ self._maybe_update_cacher()
+
+ # ----------------------------------------------------------------------
+ # Reindexing, sorting
+
+ def sort_values(self, axis=0, ascending=True, inplace=False,
+ kind='quicksort', na_position='last'):
+ """
+ Sort by the values.
+
+ Sort a Series in ascending or descending order by some
+ criterion.
+
+ Parameters
+ ----------
+ axis : {0 or 'index'}, default 0
+ Axis to direct sorting. The value 'index' is accepted for
+ compatibility with DataFrame.sort_values.
+ ascending : bool, default True
+ If True, sort values in ascending order, otherwise descending.
+ inplace : bool, default False
+ If True, perform operation in-place.
+ kind : {'quicksort', 'mergesort' or 'heapsort'}, default 'quicksort'
+ Choice of sorting algorithm. See also :func:`numpy.sort` for more
+ information. 'mergesort' is the only stable algorithm.
+ na_position : {'first' or 'last'}, default 'last'
+ Argument 'first' puts NaNs at the beginning, 'last' puts NaNs at
+ the end.
+
+ Returns
+ -------
+ Series
+ Series ordered by values.
+
+ See Also
+ --------
+ Series.sort_index : Sort by the Series indices.
+ DataFrame.sort_values : Sort DataFrame by the values along either axis.
+ DataFrame.sort_index : Sort DataFrame by indices.
+
+ Examples
+ --------
+ >>> s = pd.Series([np.nan, 1, 3, 10, 5])
+ >>> s
+ 0 NaN
+ 1 1.0
+ 2 3.0
+ 3 10.0
+ 4 5.0
+ dtype: float64
+
+ Sort values in ascending order (default behaviour)
+
+ >>> s.sort_values(ascending=True)
+ 1 1.0
+ 2 3.0
+ 4 5.0
+ 3 10.0
+ 0 NaN
+ dtype: float64
+
+ Sort values in descending order
+
+ >>> s.sort_values(ascending=False)
+ 3 10.0
+ 4 5.0
+ 2 3.0
+ 1 1.0
+ 0 NaN
+ dtype: float64
+
+ Sort values inplace
+
+ >>> s.sort_values(ascending=False, inplace=True)
+ >>> s
+ 3 10.0
+ 4 5.0
+ 2 3.0
+ 1 1.0
+ 0 NaN
+ dtype: float64
+
+ Sort values putting NAs first
+
+ >>> s.sort_values(na_position='first')
+ 0 NaN
+ 1 1.0
+ 2 3.0
+ 4 5.0
+ 3 10.0
+ dtype: float64
+
+ Sort a series of strings
+
+ >>> s = pd.Series(['z', 'b', 'd', 'a', 'c'])
+ >>> s
+ 0 z
+ 1 b
+ 2 d
+ 3 a
+ 4 c
+ dtype: object
+
+ >>> s.sort_values()
+ 3 a
+ 1 b
+ 4 c
+ 2 d
+ 0 z
+ dtype: object
+ """
+ inplace = validate_bool_kwarg(inplace, 'inplace')
+ # Validate the axis parameter
+ self._get_axis_number(axis)
+
+ # GH 5856/5853
+ if inplace and self._is_cached:
+ raise ValueError("This Series is a view of some other array, to "
+ "sort in-place you must create a copy")
+
+ def _try_kind_sort(arr):
+ # easier to ask forgiveness than permission
+ try:
+ # if kind==mergesort, it can fail for object dtype
+ return arr.argsort(kind=kind)
+ except TypeError:
+ # stable sort not available for object dtype
+ # uses the argsort default quicksort
+ return arr.argsort(kind='quicksort')
+
+ arr = self._values
+ sortedIdx = np.empty(len(self), dtype=np.int32)
+
+ bad = isna(arr)
+
+ good = ~bad
+ idx = ibase.default_index(len(self))
+
+ argsorted = _try_kind_sort(arr[good])
+
+ if is_list_like(ascending):
+ if len(ascending) != 1:
+ raise ValueError('Length of ascending (%d) must be 1 '
+ 'for Series' % (len(ascending)))
+ ascending = ascending[0]
+
+ if not is_bool(ascending):
+ raise ValueError('ascending must be boolean')
+
+ if not ascending:
+ argsorted = argsorted[::-1]
+
+ if na_position == 'last':
+ n = good.sum()
+ sortedIdx[:n] = idx[good][argsorted]
+ sortedIdx[n:] = idx[bad]
+ elif na_position == 'first':
+ n = bad.sum()
+ sortedIdx[n:] = idx[good][argsorted]
+ sortedIdx[:n] = idx[bad]
+ else:
+ raise ValueError('invalid na_position: {!r}'.format(na_position))
+
+ result = self._constructor(arr[sortedIdx], index=self.index[sortedIdx])
+
+ if inplace:
+ self._update_inplace(result)
+ else:
+ return result.__finalize__(self)
+
+ def sort_index(self, axis=0, level=None, ascending=True, inplace=False,
+ kind='quicksort', na_position='last', sort_remaining=True):
+ """
+ Sort Series by index labels.
+
+ Returns a new Series sorted by label if `inplace` argument is
+ ``False``, otherwise updates the original series and returns None.
+
+ Parameters
+ ----------
+ axis : int, default 0
+ Axis to direct sorting. This can only be 0 for Series.
+ level : int, optional
+ If not None, sort on values in specified index level(s).
+ ascending : bool, default True
+ Sort ascending vs. descending.
+ inplace : bool, default False
+ If True, perform operation in-place.
+ kind : {'quicksort', 'mergesort', 'heapsort'}, default 'quicksort'
+ Choice of sorting algorithm. See also :func:`numpy.sort` for more
+ information. 'mergesort' is the only stable algorithm. For
+ DataFrames, this option is only applied when sorting on a single
+ column or label.
+ na_position : {'first', 'last'}, default 'last'
+ If 'first' puts NaNs at the beginning, 'last' puts NaNs at the end.
+ Not implemented for MultiIndex.
+ sort_remaining : bool, default True
+ If true and sorting by level and index is multilevel, sort by other
+ levels too (in order) after sorting by specified level.
+
+ Returns
+ -------
+ pandas.Series
+ The original Series sorted by the labels
+
+ See Also
+ --------
+ DataFrame.sort_index: Sort DataFrame by the index.
+ DataFrame.sort_values: Sort DataFrame by the value.
+ Series.sort_values : Sort Series by the value.
+
+ Examples
+ --------
+ >>> s = pd.Series(['a', 'b', 'c', 'd'], index=[3, 2, 1, 4])
+ >>> s.sort_index()
+ 1 c
+ 2 b
+ 3 a
+ 4 d
+ dtype: object
+
+ Sort Descending
+
+ >>> s.sort_index(ascending=False)
+ 4 d
+ 3 a
+ 2 b
+ 1 c
+ dtype: object
+
+ Sort Inplace
+
+ >>> s.sort_index(inplace=True)
+ >>> s
+ 1 c
+ 2 b
+ 3 a
+ 4 d
+ dtype: object
+
+ By default NaNs are put at the end, but use `na_position` to place
+ them at the beginning
+
+ >>> s = pd.Series(['a', 'b', 'c', 'd'], index=[3, 2, 1, np.nan])
+ >>> s.sort_index(na_position='first')
+ NaN d
+ 1.0 c
+ 2.0 b
+ 3.0 a
+ dtype: object
+
+ Specify index level to sort
+
+ >>> arrays = [np.array(['qux', 'qux', 'foo', 'foo',
+ ... 'baz', 'baz', 'bar', 'bar']),
+ ... np.array(['two', 'one', 'two', 'one',
+ ... 'two', 'one', 'two', 'one'])]
+ >>> s = pd.Series([1, 2, 3, 4, 5, 6, 7, 8], index=arrays)
+ >>> s.sort_index(level=1)
+ bar one 8
+ baz one 6
+ foo one 4
+ qux one 2
+ bar two 7
+ baz two 5
+ foo two 3
+ qux two 1
+ dtype: int64
+
+ Does not sort by remaining levels when sorting by levels
+
+ >>> s.sort_index(level=1, sort_remaining=False)
+ qux one 2
+ foo one 4
+ baz one 6
+ bar one 8
+ qux two 1
+ foo two 3
+ baz two 5
+ bar two 7
+ dtype: int64
+ """
+ # TODO: this can be combined with DataFrame.sort_index impl as
+ # almost identical
+ inplace = validate_bool_kwarg(inplace, 'inplace')
+ # Validate the axis parameter
+ self._get_axis_number(axis)
+ index = self.index
+
+ if level is not None:
+ new_index, indexer = index.sortlevel(level, ascending=ascending,
+ sort_remaining=sort_remaining)
+ elif isinstance(index, MultiIndex):
+ from pandas.core.sorting import lexsort_indexer
+ labels = index._sort_levels_monotonic()
+ indexer = lexsort_indexer(labels._get_codes_for_sorting(),
+ orders=ascending,
+ na_position=na_position)
+ else:
+ from pandas.core.sorting import nargsort
+
+ # Check monotonic-ness before sort an index
+ # GH11080
+ if ((ascending and index.is_monotonic_increasing) or
+ (not ascending and index.is_monotonic_decreasing)):
+ if inplace:
+ return
+ else:
+ return self.copy()
+
+ indexer = nargsort(index, kind=kind, ascending=ascending,
+ na_position=na_position)
+
+ indexer = ensure_platform_int(indexer)
+ new_index = index.take(indexer)
+ new_index = new_index._sort_levels_monotonic()
+
+ new_values = self._values.take(indexer)
+ result = self._constructor(new_values, index=new_index)
+
+ if inplace:
+ self._update_inplace(result)
+ else:
+ return result.__finalize__(self)
+
+ def argsort(self, axis=0, kind='quicksort', order=None):
+ """
+ Overrides ndarray.argsort. Argsorts the values, omitting NA/null values,
+ and places the result in the same locations as the non-NA values.
+
+ Parameters
+ ----------
+ axis : int
+ Has no effect but is accepted for compatibility with numpy.
+ kind : {'mergesort', 'quicksort', 'heapsort'}, default 'quicksort'
+ Choice of sorting algorithm. See np.sort for more
+ information. 'mergesort' is the only stable algorithm
+ order : None
+ Has no effect but is accepted for compatibility with numpy.
+
+ Returns
+ -------
+ argsorted : Series, with -1 indicating where NaN values are present
+
+ See Also
+ --------
+ numpy.ndarray.argsort
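+
+ Examples
+ --------
+ A small illustrative example (the sample values are arbitrary):
+
+ >>> s = pd.Series([3, 1, 2])
+ >>> s.argsort()
+ 0 1
+ 1 2
+ 2 0
+ dtype: int64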
+ """
+ values = self._values
+ mask = isna(values)
+
+ if mask.any():
+ result = Series(-1, index=self.index, name=self.name,
+ dtype='int64')
+ notmask = ~mask
+ result[notmask] = np.argsort(values[notmask], kind=kind)
+ return self._constructor(result,
+ index=self.index).__finalize__(self)
+ else:
+ return self._constructor(
+ np.argsort(values, kind=kind), index=self.index,
+ dtype='int64').__finalize__(self)
+
+ def nlargest(self, n=5, keep='first'):
+ """
+ Return the largest `n` elements.
+
+ Parameters
+ ----------
+ n : int, default 5
+ Return this many descending sorted values.
+ keep : {'first', 'last', 'all'}, default 'first'
+ When there are duplicate values that cannot all fit in a
+ Series of `n` elements:
+
+ - ``first`` : take the first occurrences based on the index order
+ - ``last`` : take the last occurrences based on the index order
+ - ``all`` : keep all occurrences. This can result in a Series of
+ size larger than `n`.
+
+ Returns
+ -------
+ Series
+ The `n` largest values in the Series, sorted in decreasing order.
+
+ See Also
+ --------
+ Series.nsmallest: Get the `n` smallest elements.
+ Series.sort_values: Sort Series by values.
+ Series.head: Return the first `n` rows.
+
+ Notes
+ -----
+ Faster than ``.sort_values(ascending=False).head(n)`` for small `n`
+ relative to the size of the ``Series`` object.
+
+ Examples
+ --------
+ >>> countries_population = {"Italy": 59000000, "France": 65000000,
+ ... "Malta": 434000, "Maldives": 434000,
+ ... "Brunei": 434000, "Iceland": 337000,
+ ... "Nauru": 11300, "Tuvalu": 11300,
+ ... "Anguilla": 11300, "Monserat": 5200}
+ >>> s = pd.Series(countries_population)
+ >>> s
+ Italy 59000000
+ France 65000000
+ Malta 434000
+ Maldives 434000
+ Brunei 434000
+ Iceland 337000
+ Nauru 11300
+ Tuvalu 11300
+ Anguilla 11300
+ Monserat 5200
+ dtype: int64
+
+ The `n` largest elements where ``n=5`` by default.
+
+ >>> s.nlargest()
+ France 65000000
+ Italy 59000000
+ Malta 434000
+ Maldives 434000
+ Brunei 434000
+ dtype: int64
+
+ The `n` largest elements where ``n=3``. Default `keep` value is 'first'
+ so Malta will be kept.
+
+ >>> s.nlargest(3)
+ France 65000000
+ Italy 59000000
+ Malta 434000
+ dtype: int64
+
+ The `n` largest elements where ``n=3`` and keeping the last duplicates.
+ Brunei will be kept since it is the last with value 434000 based on
+ the index order.
+
+ >>> s.nlargest(3, keep='last')
+ France 65000000
+ Italy 59000000
+ Brunei 434000
+ dtype: int64
+
+ The `n` largest elements where ``n=3`` with all duplicates kept. Note
+ that the returned Series has five elements due to the three duplicates.
+
+ >>> s.nlargest(3, keep='all')
+ France 65000000
+ Italy 59000000
+ Malta 434000
+ Maldives 434000
+ Brunei 434000
+ dtype: int64
+ """
+ return algorithms.SelectNSeries(self, n=n, keep=keep).nlargest()
+
+ def nsmallest(self, n=5, keep='first'):
+ """
+ Return the smallest `n` elements.
+
+ Parameters
+ ----------
+ n : int, default 5
+ Return this many ascending sorted values.
+ keep : {'first', 'last', 'all'}, default 'first'
+ When there are duplicate values that cannot all fit in a
+ Series of `n` elements:
+
+ - ``first`` : take the first occurrences based on the index order
+ - ``last`` : take the last occurrences based on the index order
+ - ``all`` : keep all occurrences. This can result in a Series of
+ size larger than `n`.
+
+ Returns
+ -------
+ Series
+ The `n` smallest values in the Series, sorted in increasing order.
+
+ See Also
+ --------
+ Series.nlargest: Get the `n` largest elements.
+ Series.sort_values: Sort Series by values.
+ Series.head: Return the first `n` rows.
+
+ Notes
+ -----
+ Faster than ``.sort_values().head(n)`` for small `n` relative to
+ the size of the ``Series`` object.
+
+ Examples
+ --------
+ >>> countries_population = {"Italy": 59000000, "France": 65000000,
+ ... "Brunei": 434000, "Malta": 434000,
+ ... "Maldives": 434000, "Iceland": 337000,
+ ... "Nauru": 11300, "Tuvalu": 11300,
+ ... "Anguilla": 11300, "Monserat": 5200}
+ >>> s = pd.Series(countries_population)
+ >>> s
+ Italy 59000000
+ France 65000000
+ Brunei 434000
+ Malta 434000
+ Maldives 434000
+ Iceland 337000
+ Nauru 11300
+ Tuvalu 11300
+ Anguilla 11300
+ Monserat 5200
+ dtype: int64
+
+ The `n` smallest elements where ``n=5`` by default.
+
+ >>> s.nsmallest()
+ Monserat 5200
+ Nauru 11300
+ Tuvalu 11300
+ Anguilla 11300
+ Iceland 337000
+ dtype: int64
+
+ The `n` smallest elements where ``n=3``. Default `keep` value is
+ 'first' so Nauru and Tuvalu will be kept.
+
+ >>> s.nsmallest(3)
+ Monserat 5200
+ Nauru 11300
+ Tuvalu 11300
+ dtype: int64
+
+ The `n` smallest elements where ``n=3`` and keeping the last
+ duplicates. Anguilla and Tuvalu will be kept since they are the last
+ with value 11300 based on the index order.
+
+ >>> s.nsmallest(3, keep='last')
+ Monserat 5200
+ Anguilla 11300
+ Tuvalu 11300
+ dtype: int64
+
+ The `n` smallest elements where ``n=3`` with all duplicates kept. Note
+ that the returned Series has four elements due to the three duplicates.
+
+ >>> s.nsmallest(3, keep='all')
+ Monserat 5200
+ Nauru 11300
+ Tuvalu 11300
+ Anguilla 11300
+ dtype: int64
+ """
+ return algorithms.SelectNSeries(self, n=n, keep=keep).nsmallest()
+
+ def swaplevel(self, i=-2, j=-1, copy=True):
+ """
+ Swap levels i and j in a MultiIndex.
+
+ Parameters
+ ----------
+ i, j : int, string (can be mixed)
+ Level of index to be swapped. Can pass level name as string.
+
+ Returns
+ -------
+ swapped : Series
+
+ .. versionchanged:: 0.18.1
+
+ The indexes ``i`` and ``j`` are now optional, and default to
+ the two innermost levels of the index.
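+
+ Examples
+ --------
+ A minimal illustrative sketch (the level values are arbitrary):
+
+ >>> mi = pd.MultiIndex.from_arrays([['a', 'a'], ['x', 'y']])
+ >>> s = pd.Series([1, 2], index=mi)
+ >>> s.swaplevel()
+ x a 1
+ y a 2
+ dtype: int64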
+ """
+ new_index = self.index.swaplevel(i, j)
+ return self._constructor(self._values, index=new_index,
+ copy=copy).__finalize__(self)
+
+ def reorder_levels(self, order):
+ """
+ Rearrange index levels using input order.
+
+ May not drop or duplicate levels.
+
+ Parameters
+ ----------
+ order : list of int representing new level order
+ (reference level by number or key)
+
+ Returns
+ -------
+ type of caller (new object)
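+
+ Examples
+ --------
+ A minimal illustrative sketch (the level values are arbitrary):
+
+ >>> mi = pd.MultiIndex.from_arrays([['a', 'b'], ['x', 'y']])
+ >>> s = pd.Series([1, 2], index=mi)
+ >>> s.reorder_levels([1, 0])
+ x a 1
+ y b 2
+ dtype: int64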
+ """
+ if not isinstance(self.index, MultiIndex): # pragma: no cover
+ raise Exception('Can only reorder levels on a hierarchical axis.')
+
+ result = self.copy()
+ result.index = result.index.reorder_levels(order)
+ return result
+
+ def unstack(self, level=-1, fill_value=None):
+ """
+ Unstack, a.k.a. pivot, Series with MultiIndex to produce DataFrame.
+ The level involved will automatically get sorted.
+
+ Parameters
+ ----------
+ level : int, string, or list of these, default last level
+ Level(s) to unstack, can pass level name
+ fill_value : replace NaN with this value if the unstack produces
+ missing values
+
+ .. versionadded:: 0.18.0
+
+ Returns
+ -------
+ unstacked : DataFrame
+
+ Examples
+ --------
+ >>> s = pd.Series([1, 2, 3, 4],
+ ... index=pd.MultiIndex.from_product([['one', 'two'], ['a', 'b']]))
+ >>> s
+ one a 1
+ b 2
+ two a 3
+ b 4
+ dtype: int64
+
+ >>> s.unstack(level=-1)
+ a b
+ one 1 2
+ two 3 4
+
+ >>> s.unstack(level=0)
+ one two
+ a 1 3
+ b 2 4
+ """
+ from pandas.core.reshape.reshape import unstack
+ return unstack(self, level, fill_value)
+
+ # ----------------------------------------------------------------------
+ # function application
+
+ def map(self, arg, na_action=None):
+ """
+ Map values of Series according to input correspondence.
+
+ Used for substituting each value in a Series with another value,
+ that may be derived from a function, a ``dict`` or
+ a :class:`Series`.
+
+ Parameters
+ ----------
+ arg : function, dict, or Series
+ Mapping correspondence.
+ na_action : {None, 'ignore'}, default None
+ If 'ignore', propagate NaN values, without passing them to the
+ mapping correspondence.
+
+ Returns
+ -------
+ Series
+ Same index as caller.
+
+ See Also
+ --------
+ Series.apply : For applying more complex functions on a Series.
+ DataFrame.apply : Apply a function row-/column-wise.
+ DataFrame.applymap : Apply a function elementwise on a whole DataFrame.
+
+ Notes
+ -----
+ When ``arg`` is a dictionary, values in Series that are not in the
+ dictionary (as keys) are converted to ``NaN``. However, if the
+ dictionary is a ``dict`` subclass that defines ``__missing__`` (i.e.
+ provides a method for default values), then this default is used
+ rather than ``NaN``.
+
+ Examples
+ --------
+ >>> s = pd.Series(['cat', 'dog', np.nan, 'rabbit'])
+ >>> s
+ 0 cat
+ 1 dog
+ 2 NaN
+ 3 rabbit
+ dtype: object
+
+ ``map`` accepts a ``dict`` or a ``Series``. Values that are not found
+ in the ``dict`` are converted to ``NaN``, unless the dict has a default
+ value (e.g. ``defaultdict``):
+
+ >>> s.map({'cat': 'kitten', 'dog': 'puppy'})
+ 0 kitten
+ 1 puppy
+ 2 NaN
+ 3 NaN
+ dtype: object
+
+ It also accepts a function:
+
+ >>> s.map('I am a {}'.format)
+ 0 I am a cat
+ 1 I am a dog
+ 2 I am a nan
+ 3 I am a rabbit
+ dtype: object
+
+ To avoid applying the function to missing values (and keep them as
+ ``NaN``) ``na_action='ignore'`` can be used:
+
+ >>> s.map('I am a {}'.format, na_action='ignore')
+ 0 I am a cat
+ 1 I am a dog
+ 2 NaN
+ 3 I am a rabbit
+ dtype: object
+ """
+ new_values = super(Series, self)._map_values(
+ arg, na_action=na_action)
+ return self._constructor(new_values,
+ index=self.index).__finalize__(self)
+
+ def _gotitem(self, key, ndim, subset=None):
+ """
+ Sub-classes to define. Return a sliced object.
+
+ Parameters
+ ----------
+ key : string / list of selections
+ ndim : 1,2
+ requested ndim of result
+ subset : object, default None
+ subset to act on
+ """
+ return self
+
+ _agg_see_also_doc = dedent("""
+ See Also
+ --------
+ Series.apply : Invoke function on a Series.
+ Series.transform : Transform function producing a Series with like indexes.
+ """)
+
+ _agg_examples_doc = dedent("""
+ Examples
+ --------
+ >>> s = pd.Series([1, 2, 3, 4])
+ >>> s
+ 0 1
+ 1 2
+ 2 3
+ 3 4
+ dtype: int64
+
+ >>> s.agg('min')
+ 1
+
+ >>> s.agg(['min', 'max'])
+ min 1
+ max 4
+ dtype: int64
+ """)
+
+ @Substitution(see_also=_agg_see_also_doc,
+ examples=_agg_examples_doc,
+ versionadded='.. versionadded:: 0.20.0',
+ **_shared_doc_kwargs)
+ @Appender(generic._shared_docs['aggregate'])
+ def aggregate(self, func, axis=0, *args, **kwargs):
+ # Validate the axis parameter
+ self._get_axis_number(axis)
+ result, how = self._aggregate(func, *args, **kwargs)
+ if result is None:
+
+ # we can be called from an inner function which
+ # passes this meta-data
+ kwargs.pop('_axis', None)
+ kwargs.pop('_level', None)
+
+ # try a regular apply, this evaluates lambdas
+ # row-by-row; however, if the lambda is expected to be a Series
+ # expression, e.g.: lambda x: x - x.quantile(0.25),
+ # this will fail, so we then try a vectorized evaluation
+
+ # we cannot FIRST try the vectorized evaluation, because
+ # then .agg and .apply would have different semantics if the
+ # operation is actually defined on the Series, e.g. str
+ try:
+ result = self.apply(func, *args, **kwargs)
+ except (ValueError, AttributeError, TypeError):
+ result = func(self, *args, **kwargs)
+
+ return result
+
+ agg = aggregate
+
+ @Appender(generic._shared_docs['transform'] % _shared_doc_kwargs)
+ def transform(self, func, axis=0, *args, **kwargs):
+ # Validate the axis parameter
+ self._get_axis_number(axis)
+ return super(Series, self).transform(func, *args, **kwargs)
+
+ def apply(self, func, convert_dtype=True, args=(), **kwds):
+ """
+ Invoke function on values of Series.
+
+ Can be ufunc (a NumPy function that applies to the entire Series)
+ or a Python function that only works on single values.
+
+ Parameters
+ ----------
+ func : function
+ Python function or NumPy ufunc to apply.
+ convert_dtype : bool, default True
+ Try to find better dtype for elementwise function results. If
+ False, leave as dtype=object.
+ args : tuple
+ Positional arguments passed to func after the series value.
+ **kwds
+ Additional keyword arguments passed to func.
+
+ Returns
+ -------
+ Series or DataFrame
+ If func returns a Series object the result will be a DataFrame.
+
+ See Also
+ --------
+ Series.map: For element-wise operations.
+ Series.agg: Only perform aggregating type operations.
+ Series.transform: Only perform transforming type operations.
+
+ Examples
+ --------
+ Create a series with typical summer temperatures for each city.
+
+ >>> s = pd.Series([20, 21, 12],
+ ... index=['London', 'New York', 'Helsinki'])
+ >>> s
+ London 20
+ New York 21
+ Helsinki 12
+ dtype: int64
+
+ Square the values by defining a function and passing it as an
+ argument to ``apply()``.
+
+ >>> def square(x):
+ ... return x ** 2
+ >>> s.apply(square)
+ London 400
+ New York 441
+ Helsinki 144
+ dtype: int64
+
+ Square the values by passing an anonymous function as an
+ argument to ``apply()``.
+
+ >>> s.apply(lambda x: x ** 2)
+ London 400
+ New York 441
+ Helsinki 144
+ dtype: int64
+
+ Define a custom function that needs additional positional
+ arguments and pass these additional arguments using the
+ ``args`` keyword.
+
+ >>> def subtract_custom_value(x, custom_value):
+ ... return x - custom_value
+
+ >>> s.apply(subtract_custom_value, args=(5,))
+ London 15
+ New York 16
+ Helsinki 7
+ dtype: int64
+
+ Define a custom function that takes keyword arguments
+ and pass these arguments to ``apply``.
+
+ >>> def add_custom_values(x, **kwargs):
+ ... for month in kwargs:
+ ... x += kwargs[month]
+ ... return x
+
+ >>> s.apply(add_custom_values, june=30, july=20, august=25)
+ London 95
+ New York 96
+ Helsinki 87
+ dtype: int64
+
+ Use a function from the Numpy library.
+
+ >>> s.apply(np.log)
+ London 2.995732
+ New York 3.044522
+ Helsinki 2.484907
+ dtype: float64
+ """
+ if len(self) == 0:
+ return self._constructor(dtype=self.dtype,
+ index=self.index).__finalize__(self)
+
+ # dispatch to agg
+ if isinstance(func, (list, dict)):
+ return self.aggregate(func, *args, **kwds)
+
+ # if we are a string, try to dispatch
+ if isinstance(func, compat.string_types):
+ return self._try_aggregate_string_function(func, *args, **kwds)
+
+ # handle ufuncs and lambdas
+ if kwds or args and not isinstance(func, np.ufunc):
+ def f(x):
+ return func(x, *args, **kwds)
+ else:
+ f = func
+
+ with np.errstate(all='ignore'):
+ if isinstance(f, np.ufunc):
+ return f(self)
+
+ # row-wise access
+ if is_extension_type(self.dtype):
+ mapped = self._values.map(f)
+ else:
+ values = self.astype(object).values
+ mapped = lib.map_infer(values, f, convert=convert_dtype)
+
+ if len(mapped) and isinstance(mapped[0], Series):
+ from pandas.core.frame import DataFrame
+ return DataFrame(mapped.tolist(), index=self.index)
+ else:
+ return self._constructor(mapped,
+ index=self.index).__finalize__(self)
+
+ def _reduce(self, op, name, axis=0, skipna=True, numeric_only=None,
+ filter_type=None, **kwds):
+ """
+ Perform a reduction operation.
+
+ If we have an ndarray as a value, then simply perform the operation,
+ otherwise delegate to the object.
+ """
+ delegate = self._values
+
+ if axis is not None:
+ self._get_axis_number(axis)
+
+ if isinstance(delegate, Categorical):
+ # TODO deprecate numeric_only argument for Categorical and use
+ # skipna as well, see GH25303
+ return delegate._reduce(name, numeric_only=numeric_only, **kwds)
+ elif isinstance(delegate, ExtensionArray):
+ # dispatch to ExtensionArray interface
+ return delegate._reduce(name, skipna=skipna, **kwds)
+ elif is_datetime64_dtype(delegate):
+ # use DatetimeIndex implementation to handle skipna correctly
+ delegate = DatetimeIndex(delegate)
+
+ # dispatch to numpy arrays
+ elif isinstance(delegate, np.ndarray):
+ if numeric_only:
+ raise NotImplementedError('Series.{0} does not implement '
+ 'numeric_only.'.format(name))
+ with np.errstate(all='ignore'):
+ return op(delegate, skipna=skipna, **kwds)
+
+ # TODO(EA) dispatch to Index
+ # remove once all internals extension types are
+ # moved to ExtensionArrays
+ return delegate._reduce(op=op, name=name, axis=axis, skipna=skipna,
+ numeric_only=numeric_only,
+ filter_type=filter_type, **kwds)
+
+ def _reindex_indexer(self, new_index, indexer, copy):
+ if indexer is None:
+ if copy:
+ return self.copy()
+ return self
+
+ new_values = algorithms.take_1d(self._values, indexer,
+ allow_fill=True, fill_value=None)
+ return self._constructor(new_values, index=new_index)
+
+ def _needs_reindex_multi(self, axes, method, level):
+ """
+ Check if we do need a multi reindex; this is for compat with
+ higher dims.
+ """
+ return False
+
+ @Appender(generic._shared_docs['align'] % _shared_doc_kwargs)
+ def align(self, other, join='outer', axis=None, level=None, copy=True,
+ fill_value=None, method=None, limit=None, fill_axis=0,
+ broadcast_axis=None):
+ return super(Series, self).align(other, join=join, axis=axis,
+ level=level, copy=copy,
+ fill_value=fill_value, method=method,
+ limit=limit, fill_axis=fill_axis,
+ broadcast_axis=broadcast_axis)
+
+ def rename(self, index=None, **kwargs):
+ """
+ Alter Series index labels or name.
+
+ Function / dict values must be unique (1-to-1). Labels not contained in
+ a dict / Series will be left as-is. Extra labels listed don't throw an
+ error.
+
+ Alternatively, change ``Series.name`` with a scalar value.
+
+ See the :ref:`user guide <basics.rename>` for more.
+
+ Parameters
+ ----------
+ index : scalar, hashable sequence, dict-like or function, optional
+ dict-like or functions are transformations to apply to
+ the index.
+ Scalar or hashable sequence-like will alter the ``Series.name``
+ attribute.
+ copy : bool, default True
+ Also copy underlying data
+ inplace : bool, default False
+ Whether to return a new Series. If True then value of copy is
+ ignored.
+ level : int or level name, default None
+ In case of a MultiIndex, only rename labels in the specified
+ level.
+
+ Returns
+ -------
+ renamed : Series (new object)
+
+ See Also
+ --------
+ Series.rename_axis
+
+ Examples
+ --------
+ >>> s = pd.Series([1, 2, 3])
+ >>> s
+ 0 1
+ 1 2
+ 2 3
+ dtype: int64
+ >>> s.rename("my_name") # scalar, changes Series.name
+ 0 1
+ 1 2
+ 2 3
+ Name: my_name, dtype: int64
+ >>> s.rename(lambda x: x ** 2) # function, changes labels
+ 0 1
+ 1 2
+ 4 3
+ dtype: int64
+ >>> s.rename({1: 3, 2: 5}) # mapping, changes labels
+ 0 1
+ 3 2
+ 5 3
+ dtype: int64
+ """
+ kwargs['inplace'] = validate_bool_kwarg(kwargs.get('inplace', False),
+ 'inplace')
+
+ non_mapping = is_scalar(index) or (is_list_like(index) and
+ not is_dict_like(index))
+ if non_mapping:
+ return self._set_name(index, inplace=kwargs.get('inplace'))
+ return super(Series, self).rename(index=index, **kwargs)
+
+ @Substitution(**_shared_doc_kwargs)
+ @Appender(generic.NDFrame.reindex.__doc__)
+ def reindex(self, index=None, **kwargs):
+ return super(Series, self).reindex(index=index, **kwargs)
+
+ def drop(self, labels=None, axis=0, index=None, columns=None,
+ level=None, inplace=False, errors='raise'):
+ """
+ Return Series with specified index labels removed.
+
+ Remove elements of a Series based on specifying the index labels.
+ When using a multi-index, labels on different levels can be removed
+ by specifying the level.
+
+ Parameters
+ ----------
+ labels : single label or list-like
+ Index labels to drop.
+ axis : 0, default 0
+ Redundant for application on Series.
+ index, columns : None
+ Redundant for application on Series, but index can be used instead
+ of labels.
+
+ .. versionadded:: 0.21.0
+ level : int or level name, optional
+ For MultiIndex, level for which the labels will be removed.
+ inplace : bool, default False
+ If True, do operation inplace and return None.
+ errors : {'ignore', 'raise'}, default 'raise'
+ If 'ignore', suppress error and only existing labels are dropped.
+
+ Returns
+ -------
+ dropped : pandas.Series
+
+ Raises
+ ------
+ KeyError
+ If none of the labels are found in the index.
+
+ See Also
+ --------
+ Series.reindex : Return only specified index labels of Series.
+ Series.dropna : Return series without null values.
+ Series.drop_duplicates : Return Series with duplicate values removed.
+ DataFrame.drop : Drop specified labels from rows or columns.
+
+ Examples
+ --------
+ >>> s = pd.Series(data=np.arange(3), index=['A','B','C'])
+ >>> s
+ A 0
+ B 1
+ C 2
+ dtype: int64
+
+ Drop labels B and C
+
+ >>> s.drop(labels=['B','C'])
+ A 0
+ dtype: int64
+
+ Drop 2nd level label in MultiIndex Series
+
+ >>> midx = pd.MultiIndex(levels=[['lama', 'cow', 'falcon'],
+ ... ['speed', 'weight', 'length']],
+ ... codes=[[0, 0, 0, 1, 1, 1, 2, 2, 2],
+ ... [0, 1, 2, 0, 1, 2, 0, 1, 2]])
+ >>> s = pd.Series([45, 200, 1.2, 30, 250, 1.5, 320, 1, 0.3],
+ ... index=midx)
+ >>> s
+ lama speed 45.0
+ weight 200.0
+ length 1.2
+ cow speed 30.0
+ weight 250.0
+ length 1.5
+ falcon speed 320.0
+ weight 1.0
+ length 0.3
+ dtype: float64
+
+ >>> s.drop(labels='weight', level=1)
+ lama speed 45.0
+ length 1.2
+ cow speed 30.0
+ length 1.5
+ falcon speed 320.0
+ length 0.3
+ dtype: float64
+ """
+ return super(Series, self).drop(labels=labels, axis=axis, index=index,
+ columns=columns, level=level,
+ inplace=inplace, errors=errors)
+
+ @Substitution(**_shared_doc_kwargs)
+ @Appender(generic.NDFrame.fillna.__doc__)
+ def fillna(self, value=None, method=None, axis=None, inplace=False,
+ limit=None, downcast=None, **kwargs):
+ return super(Series, self).fillna(value=value, method=method,
+ axis=axis, inplace=inplace,
+ limit=limit, downcast=downcast,
+ **kwargs)
+
+ @Appender(generic._shared_docs['replace'] % _shared_doc_kwargs)
+ def replace(self, to_replace=None, value=None, inplace=False, limit=None,
+ regex=False, method='pad'):
+ return super(Series, self).replace(to_replace=to_replace, value=value,
+ inplace=inplace, limit=limit,
+ regex=regex, method=method)
+
+ @Appender(generic._shared_docs['shift'] % _shared_doc_kwargs)
+ def shift(self, periods=1, freq=None, axis=0, fill_value=None):
+ return super(Series, self).shift(periods=periods, freq=freq, axis=axis,
+ fill_value=fill_value)
+
+ def reindex_axis(self, labels, axis=0, **kwargs):
+ """
+ Conform Series to new index with optional filling logic.
+
+ .. deprecated:: 0.21.0
+ Use ``Series.reindex`` instead.
+ """
+ # for compatibility with higher dims
+ if axis != 0:
+ raise ValueError("cannot reindex series on non-zero axis!")
+ msg = ("'.reindex_axis' is deprecated and will be removed in a future "
+ "version. Use '.reindex' instead.")
+ warnings.warn(msg, FutureWarning, stacklevel=2)
+
+ return self.reindex(index=labels, **kwargs)
+
+ def memory_usage(self, index=True, deep=False):
+ """
+ Return the memory usage of the Series.
+
+ The memory usage can optionally include the contribution of
+ the index and of elements of `object` dtype.
+
+ Parameters
+ ----------
+ index : bool, default True
+ Specifies whether to include the memory usage of the Series index.
+ deep : bool, default False
+ If True, introspect the data deeply by interrogating
+ `object` dtypes for system-level memory consumption, and include
+ it in the returned value.
+
+ Returns
+ -------
+ int
+ Bytes of memory consumed.
+
+ See Also
+ --------
+ numpy.ndarray.nbytes : Total bytes consumed by the elements of the
+ array.
+ DataFrame.memory_usage : Bytes consumed by a DataFrame.
+
+ Examples
+ --------
+ >>> s = pd.Series(range(3))
+ >>> s.memory_usage()
+ 104
+
+ Not including the index gives the size of the rest of the data, which
+ is necessarily smaller:
+
+ >>> s.memory_usage(index=False)
+ 24
+
+ The memory footprint of `object` values is ignored by default:
+
+ >>> s = pd.Series(["a", "b"])
+ >>> s.values
+ array(['a', 'b'], dtype=object)
+ >>> s.memory_usage()
+ 96
+ >>> s.memory_usage(deep=True)
+ 212
+ """
+ v = super(Series, self).memory_usage(deep=deep)
+ if index:
+ v += self.index.memory_usage(deep=deep)
+ return v
+
+ @Appender(generic.NDFrame._take.__doc__)
+ def _take(self, indices, axis=0, is_copy=False):
+
+ indices = ensure_platform_int(indices)
+ new_index = self.index.take(indices)
+
+ if is_categorical_dtype(self):
+ # https://github.com/pandas-dev/pandas/issues/20664
+ # TODO: remove when the default Categorical.take behavior changes
+ indices = maybe_convert_indices(indices, len(self._get_axis(axis)))
+ kwargs = {'allow_fill': False}
+ else:
+ kwargs = {}
+ new_values = self._values.take(indices, **kwargs)
+
+ result = (self._constructor(new_values, index=new_index,
+ fastpath=True).__finalize__(self))
+
+ # Maybe set copy if we didn't actually change the index.
+ if is_copy:
+ if not result._get_axis(axis).equals(self._get_axis(axis)):
+ result._set_is_copy(self)
+
+ return result
+
+ def isin(self, values):
+ """
+ Check whether `values` are contained in Series.
+
+ Return a boolean Series showing whether each element in the Series
+ matches an element in the passed sequence of `values` exactly.
+
+ Parameters
+ ----------
+ values : set or list-like
+ The sequence of values to test. Passing in a single string will
+ raise a ``TypeError``. Instead, turn a single string into a
+ list of one element.
+
+ .. versionadded:: 0.18.1
+
+ Support for values as a set.
+
+ Returns
+ -------
+ isin : Series (bool dtype)
+
+ Raises
+ ------
+ TypeError
+ * If `values` is a string
+
+ See Also
+ --------
+ DataFrame.isin : Equivalent method on DataFrame.
+
+ Examples
+ --------
+ >>> s = pd.Series(['lama', 'cow', 'lama', 'beetle', 'lama',
+ ... 'hippo'], name='animal')
+ >>> s.isin(['cow', 'lama'])
+ 0 True
+ 1 True
+ 2 True
+ 3 False
+ 4 True
+ 5 False
+ Name: animal, dtype: bool
+
+ Passing a single string as ``s.isin('lama')`` will raise an error. Use
+ a list of one element instead:
+
+ >>> s.isin(['lama'])
+ 0 True
+ 1 False
+ 2 True
+ 3 False
+ 4 True
+ 5 False
+ Name: animal, dtype: bool
+ """
+ result = algorithms.isin(self, values)
+ return self._constructor(result, index=self.index).__finalize__(self)
+
+ def between(self, left, right, inclusive=True):
+ """
+ Return boolean Series equivalent to left <= series <= right.
+
+ This function returns a boolean vector containing `True` wherever the
+ corresponding Series element is between the boundary values `left` and
+ `right`. NA values are treated as `False`.
+
+ Parameters
+ ----------
+ left : scalar
+ Left boundary.
+ right : scalar
+ Right boundary.
+ inclusive : bool, default True
+ Include boundaries.
+
+ Returns
+ -------
+ Series
+ Each element will be a boolean.
+
+ See Also
+ --------
+ Series.gt : Greater than of series and other.
+ Series.lt : Less than of series and other.
+
+ Notes
+ -----
+ This function is equivalent to ``(left <= ser) & (ser <= right)``
+
+ Examples
+ --------
+ >>> s = pd.Series([2, 0, 4, 8, np.nan])
+
+ Boundary values are included by default:
+
+ >>> s.between(1, 4)
+ 0 True
+ 1 False
+ 2 True
+ 3 False
+ 4 False
+ dtype: bool
+
+ With `inclusive` set to ``False`` boundary values are excluded:
+
+ >>> s.between(1, 4, inclusive=False)
+ 0 True
+ 1 False
+ 2 False
+ 3 False
+ 4 False
+ dtype: bool
+
+ `left` and `right` can be any scalar value:
+
+ >>> s = pd.Series(['Alice', 'Bob', 'Carol', 'Eve'])
+ >>> s.between('Anna', 'Daniel')
+ 0 False
+ 1 True
+ 2 True
+ 3 False
+ dtype: bool
+ """
+ if inclusive:
+ lmask = self >= left
+ rmask = self <= right
+ else:
+ lmask = self > left
+ rmask = self < right
+
+ return lmask & rmask
+
+ @classmethod
+ def from_csv(cls, path, sep=',', parse_dates=True, header=None,
+ index_col=0, encoding=None, infer_datetime_format=False):
+ """
+ Read CSV file.
+
+ .. deprecated:: 0.21.0
+ Use :func:`pandas.read_csv` instead.
+
+ It is preferable to use the more powerful :func:`pandas.read_csv`
+ for most general purposes, but ``from_csv`` makes for an easy
+ roundtrip to and from a file (the exact counterpart of
+ ``to_csv``), especially with a time-indexed Series.
+
+ This method only differs from :func:`pandas.read_csv` in some defaults:
+
+ - `index_col` is ``0`` instead of ``None`` (take first column as index
+ by default)
+ - `header` is ``None`` instead of ``0`` (the first row is not used as
+ the column names)
+ - `parse_dates` is ``True`` instead of ``False`` (try parsing the index
+ as datetime by default)
+
+ With :func:`pandas.read_csv`, the option ``squeeze=True`` can be used
+ to return a Series like ``from_csv``.
+
+ Parameters
+ ----------
+ path : string file path or file handle / StringIO
+ sep : string, default ','
+ Field delimiter
+ parse_dates : boolean, default True
+ Parse dates. Different default from read_table
+ header : int, default None
+ Row to use as header (skip prior rows)
+ index_col : int or sequence, default 0
+ Column to use for index. If a sequence is given, a MultiIndex
+ is used. Different default from read_table
+ encoding : string, optional
+ A string representing the encoding to use if the contents are
+ non-ASCII, for Python versions prior to 3.
+ infer_datetime_format : boolean, default False
+ If True and `parse_dates` is True for a column, try to infer the
+ datetime format based on the first datetime string. If the format
+ can be inferred, there often will be a large parsing speed-up.
+
+ Returns
+ -------
+ y : Series
+
+ See Also
+ --------
+ read_csv
+ """
+
+ # We're calling `DataFrame.from_csv` in the implementation,
+ # which will propagate a warning regarding `from_csv` deprecation.
+ from pandas.core.frame import DataFrame
+ df = DataFrame.from_csv(path, header=header, index_col=index_col,
+ sep=sep, parse_dates=parse_dates,
+ encoding=encoding,
+ infer_datetime_format=infer_datetime_format)
+ result = df.iloc[:, 0]
+ if header is None:
+ result.index.name = result.name = None
+
+ return result
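+ # Illustrative note (not part of the upstream pandas source): given the
+ # defaults documented above, a call such as ``Series.from_csv(path)`` is
+ # roughly equivalent to the recommended replacement
+ # >>> pd.read_csv(path, sep=',', header=None, index_col=0,
+ # ...             parse_dates=True, squeeze=True)
+ # where ``path`` is whatever file path or buffer would have been passed in.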
+
+ @Appender(generic.NDFrame.to_csv.__doc__)
+ def to_csv(self, *args, **kwargs):
+
+ names = ["path_or_buf", "sep", "na_rep", "float_format", "columns",
+ "header", "index", "index_label", "mode", "encoding",
+ "compression", "quoting", "quotechar", "line_terminator",
+ "chunksize", "tupleize_cols", "date_format", "doublequote",
+ "escapechar", "decimal"]
+
+ old_names = ["path_or_buf", "index", "sep", "na_rep", "float_format",
+ "header", "index_label", "mode", "encoding",
+ "compression", "date_format", "decimal"]
+
+ if "path" in kwargs:
+ warnings.warn("The signature of `Series.to_csv` was aligned "
+ "to that of `DataFrame.to_csv`, and argument "
+ "'path' will be renamed to 'path_or_buf'.",
+ FutureWarning, stacklevel=2)
+ kwargs["path_or_buf"] = kwargs.pop("path")
+
+ if len(args) > 1:
+ # Either "index" (old signature) or "sep" (new signature) is being
+ # passed as second argument (while the first is the same)
+ maybe_sep = args[1]
+
+ if not (is_string_like(maybe_sep) and len(maybe_sep) == 1):
+ # old signature
+ warnings.warn("The signature of `Series.to_csv` was aligned "
+ "to that of `DataFrame.to_csv`. Note that the "
+ "order of arguments changed, and the new one "
+ "has 'sep' in first place, for which \"{}\" is "
+ "not a valid value. The old order will cease to "
+ "be supported in a future version. Please refer "
+ "to the documentation for `DataFrame.to_csv` "
+ "when updating your function "
+ "calls.".format(maybe_sep),
+ FutureWarning, stacklevel=2)
+ names = old_names
+
+ pos_args = dict(zip(names[:len(args)], args))
+
+ for key in pos_args:
+ if key in kwargs:
+ raise ValueError("Argument given by name ('{}') and position "
+ "({})".format(key, names.index(key)))
+ kwargs[key] = pos_args[key]
+
+ if kwargs.get("header", None) is None:
+ warnings.warn("The signature of `Series.to_csv` was aligned "
+ "to that of `DataFrame.to_csv`, and argument "
+ "'header' will change its default value from False "
+ "to True: please pass an explicit value to suppress "
+ "this warning.", FutureWarning,
+ stacklevel=2)
+ kwargs["header"] = False # Backwards compatibility.
+ return self.to_frame().to_csv(**kwargs)
+
+ @Appender(generic._shared_docs['isna'] % _shared_doc_kwargs)
+ def isna(self):
+ return super(Series, self).isna()
+
+ @Appender(generic._shared_docs['isna'] % _shared_doc_kwargs)
+ def isnull(self):
+ return super(Series, self).isnull()
+
+ @Appender(generic._shared_docs['notna'] % _shared_doc_kwargs)
+ def notna(self):
+ return super(Series, self).notna()
+
+ @Appender(generic._shared_docs['notna'] % _shared_doc_kwargs)
+ def notnull(self):
+ return super(Series, self).notnull()
+
+ def dropna(self, axis=0, inplace=False, **kwargs):
+ """
+ Return a new Series with missing values removed.
+
+ See the :ref:`User Guide <missing_data>` for more on which values are
+ considered missing, and how to work with missing data.
+
+ Parameters
+ ----------
+ axis : {0 or 'index'}, default 0
+ There is only one axis to drop values from.
+ inplace : bool, default False
+ If True, do operation inplace and return None.
+ **kwargs
+ Not in use.
+
+ Returns
+ -------
+ Series
+ Series with NA entries dropped from it.
+
+ See Also
+ --------
+ Series.isna: Indicate missing values.
+ Series.notna : Indicate existing (non-missing) values.
+ Series.fillna : Replace missing values.
+ DataFrame.dropna : Drop rows or columns which contain NA values.
+ Index.dropna : Drop missing indices.
+
+ Examples
+ --------
+ >>> ser = pd.Series([1., 2., np.nan])
+ >>> ser
+ 0 1.0
+ 1 2.0
+ 2 NaN
+ dtype: float64
+
+ Drop NA values from a Series.
+
+ >>> ser.dropna()
+ 0 1.0
+ 1 2.0
+ dtype: float64
+
+ Keep the Series with valid entries in the same variable.
+
+ >>> ser.dropna(inplace=True)
+ >>> ser
+ 0 1.0
+ 1 2.0
+ dtype: float64
+
+ Empty strings are not considered NA values. ``None`` is considered an
+ NA value.
+
+ >>> ser = pd.Series([np.NaN, 2, pd.NaT, '', None, 'I stay'])
+ >>> ser
+ 0 NaN
+ 1 2
+ 2 NaT
+ 3
+ 4 None
+ 5 I stay
+ dtype: object
+ >>> ser.dropna()
+ 1 2
+ 3
+ 5 I stay
+ dtype: object
+ """
+ inplace = validate_bool_kwarg(inplace, 'inplace')
+ kwargs.pop('how', None)
+ if kwargs:
+ raise TypeError('dropna() got an unexpected keyword '
+ 'argument "{0}"'.format(list(kwargs.keys())[0]))
+ # Validate the axis parameter
+ self._get_axis_number(axis or 0)
+
+ if self._can_hold_na:
+ result = remove_na_arraylike(self)
+ if inplace:
+ self._update_inplace(result)
+ else:
+ return result
+ else:
+ if inplace:
+ # do nothing
+ pass
+ else:
+ return self.copy()
+
+ def valid(self, inplace=False, **kwargs):
+ """
+ Return Series without null values.
+
+ .. deprecated:: 0.23.0
+ Use :meth:`Series.dropna` instead.
+ """
+ warnings.warn("Method .valid will be removed in a future version. "
+ "Use .dropna instead.", FutureWarning, stacklevel=2)
+ return self.dropna(inplace=inplace, **kwargs)
+
+ # ----------------------------------------------------------------------
+ # Time series-oriented methods
+
+ def to_timestamp(self, freq=None, how='start', copy=True):
+ """
+ Cast to DatetimeIndex of Timestamps, at *beginning* of period.
+
+ Parameters
+ ----------
+ freq : string, default frequency of PeriodIndex
+ Desired frequency
+ how : {'s', 'e', 'start', 'end'}
+ Convention for converting period to timestamp; start of period
+ vs. end
+
+ Returns
+ -------
+ ts : Series with DatetimeIndex
+ """
+ new_values = self._values
+ if copy:
+ new_values = new_values.copy()
+
+ new_index = self.index.to_timestamp(freq=freq, how=how)
+ return self._constructor(new_values,
+ index=new_index).__finalize__(self)
+
+ def to_period(self, freq=None, copy=True):
+ """
+ Convert Series from DatetimeIndex to PeriodIndex with desired
+ frequency (inferred from index if not passed).
+
+ Parameters
+ ----------
+ freq : string, default None
+ Desired frequency.
+
+ Returns
+ -------
+ ts : Series with PeriodIndex
+ """
+ new_values = self._values
+ if copy:
+ new_values = new_values.copy()
+
+ new_index = self.index.to_period(freq=freq)
+ return self._constructor(new_values,
+ index=new_index).__finalize__(self)
+
+ # ----------------------------------------------------------------------
+ # Accessor Methods
+ # ----------------------------------------------------------------------
+ str = CachedAccessor("str", StringMethods)
+ dt = CachedAccessor("dt", CombinedDatetimelikeProperties)
+ cat = CachedAccessor("cat", CategoricalAccessor)
+ plot = CachedAccessor("plot", gfx.SeriesPlotMethods)
+ sparse = CachedAccessor("sparse", SparseAccessor)
+
+ # ----------------------------------------------------------------------
+ # Add plotting methods to Series
+ hist = gfx.hist_series
+
+
+Series._setup_axes(['index'], info_axis=0, stat_axis=0, aliases={'rows': 0},
+ docs={'index': 'The index (axis labels) of the Series.'})
+Series._add_numeric_operations()
+Series._add_series_only_operations()
+Series._add_series_or_dataframe_operations()
+
+# Add arithmetic!
+ops.add_flex_arithmetic_methods(Series)
+ops.add_special_arithmetic_methods(Series)
diff --git a/contrib/python/pandas/py2/pandas/core/sorting.py b/contrib/python/pandas/py2/pandas/core/sorting.py
new file mode 100644
index 00000000000..0b5b017bec9
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/core/sorting.py
@@ -0,0 +1,508 @@
+""" miscellaneous sorting / groupby utilities """
+import warnings
+
+import numpy as np
+
+from pandas._libs import algos, hashtable, lib
+from pandas._libs.hashtable import unique_label_indices
+from pandas.compat import PY3, long, string_types
+
+from pandas.core.dtypes.cast import infer_dtype_from_array
+from pandas.core.dtypes.common import (
+ ensure_int64, ensure_platform_int, is_categorical_dtype, is_list_like)
+from pandas.core.dtypes.missing import isna
+
+import pandas.core.algorithms as algorithms
+
+_INT64_MAX = np.iinfo(np.int64).max
+
+
+def get_group_index(labels, shape, sort, xnull):
+ """
+ For the particular label_list, gets the offsets into the hypothetical list
+ representing the totally ordered cartesian product of all possible label
+ combinations, *as long as* this space fits within int64 bounds;
+ otherwise, though group indices identify unique combinations of
+ labels, they cannot be deconstructed.
+ - If `sort`, the ranks of the returned ids preserve the lexical ranks of
+ the labels, i.e. the returned ids can be used to do a lexical sort on
+ the labels;
+ - If `xnull` nulls (-1 labels) are passed through.
+
+ Parameters
+ ----------
+ labels: sequence of arrays
+ Integers identifying levels at each location
+ shape: sequence of ints same length as labels
+ Number of unique levels at each location
+ sort: boolean
+ If the ranks of returned ids should match lexical ranks of labels
+ xnull: boolean
+ If true nulls are excluded. i.e. -1 values in the labels are
+ passed through
+
+ Returns
+ -------
+ An array of type int64 where two elements are equal if their corresponding
+ labels are equal at all locations.
+ """
+ def _int64_cut_off(shape):
+ acc = long(1)
+ for i, mul in enumerate(shape):
+ acc *= long(mul)
+ if not acc < _INT64_MAX:
+ return i
+ return len(shape)
+
+ def maybe_lift(lab, size):
+ # promote nan values (assigned -1 label in lab array)
+ # so that all output values are non-negative
+ return (lab + 1, size + 1) if (lab == -1).any() else (lab, size)
+
+ labels = map(ensure_int64, labels)
+ if not xnull:
+ labels, shape = map(list, zip(*map(maybe_lift, labels, shape)))
+
+ labels = list(labels)
+ shape = list(shape)
+
+ # Iteratively process all the labels in chunks sized so less
+ # than _INT64_MAX unique int ids will be required for each chunk
+ while True:
+ # how many levels can be done without overflow:
+ nlev = _int64_cut_off(shape)
+
+ # compute flat ids for the first `nlev` levels
+ stride = np.prod(shape[1:nlev], dtype='i8')
+ out = stride * labels[0].astype('i8', subok=False, copy=False)
+
+ for i in range(1, nlev):
+ if shape[i] == 0:
+ stride = 0
+ else:
+ stride //= shape[i]
+ out += labels[i] * stride
+
+ if xnull: # exclude nulls
+ mask = labels[0] == -1
+ for lab in labels[1:nlev]:
+ mask |= lab == -1
+ out[mask] = -1
+
+ if nlev == len(shape): # all levels done!
+ break
+
+ # compress what has been done so far in order to avoid overflow
+ # to retain lexical ranks, obs_ids should be sorted
+ comp_ids, obs_ids = compress_group_index(out, sort=sort)
+
+ labels = [comp_ids] + labels[nlev:]
+ shape = [len(obs_ids)] + shape[nlev:]
+
+ return out
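+ # Illustrative sketch (not part of the upstream source): with two label
+ # arrays and shape (2, 3), the flat id of row k is
+ # labels[0][k] * 3 + labels[1][k], i.e. an offset into the 2 x 3 cartesian
+ # product of levels:
+ # >>> labels = [np.array([0, 1, 1]), np.array([1, 0, 2])]
+ # >>> get_group_index(labels, shape=(2, 3), sort=True, xnull=True)
+ # array([1, 3, 5])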
+
+
+def get_compressed_ids(labels, sizes):
+ """
+
+ Group_index is offsets into cartesian product of all possible labels. This
+ space can be huge, so this function compresses it, by computing offsets
+ (comp_ids) into the list of unique labels (obs_group_ids).
+
+ Parameters
+ ----------
+ labels : list of label arrays
+ sizes : list of size of the levels
+
+ Returns
+ -------
+ tuple of (comp_ids, obs_group_ids)
+
+ """
+ ids = get_group_index(labels, sizes, sort=True, xnull=False)
+ return compress_group_index(ids, sort=True)
+
+
+def is_int64_overflow_possible(shape):
+ the_prod = long(1)
+ for x in shape:
+ the_prod *= long(x)
+
+ return the_prod >= _INT64_MAX
+
+
+def decons_group_index(comp_labels, shape):
+ # reconstruct labels
+ if is_int64_overflow_possible(shape):
+ # at some point group indices are factorized,
+ # and may not be deconstructed here! wrong path!
+ raise ValueError('cannot deconstruct factorized group indices!')
+
+ label_list = []
+ factor = 1
+ y = 0
+ x = comp_labels
+ for i in reversed(range(len(shape))):
+ labels = (x - y) % (factor * shape[i]) // factor
+ np.putmask(labels, comp_labels < 0, -1)
+ label_list.append(labels)
+ y = labels * factor
+ factor *= shape[i]
+ return label_list[::-1]
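+ # Illustrative sketch (not part of the upstream source): this inverts the
+ # flat-id computation of get_group_index when no overflow occurred, e.g.
+ # >>> decons_group_index(np.array([1, 3, 5]), shape=(2, 3))
+ # [array([0, 1, 1]), array([1, 0, 2])]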
+
+
+def decons_obs_group_ids(comp_ids, obs_ids, shape, labels, xnull):
+ """
+ reconstruct labels from observed group ids
+
+ Parameters
+ ----------
+ xnull: boolean,
+ if nulls are excluded; i.e. -1 labels are passed through
+ """
+
+ if not xnull:
+ lift = np.fromiter(((a == -1).any() for a in labels), dtype='i8')
+ shape = np.asarray(shape, dtype='i8') + lift
+
+ if not is_int64_overflow_possible(shape):
+ # obs ids are deconstructable! take the fast route!
+ out = decons_group_index(obs_ids, shape)
+ return out if xnull or not lift.any() \
+ else [x - y for x, y in zip(out, lift)]
+
+ i = unique_label_indices(comp_ids)
+ i8copy = lambda a: a.astype('i8', subok=False, copy=True)
+ return [i8copy(lab[i]) for lab in labels]
+
+
+def indexer_from_factorized(labels, shape, compress=True):
+ ids = get_group_index(labels, shape, sort=True, xnull=False)
+
+ if not compress:
+ ngroups = (ids.size and ids.max()) + 1
+ else:
+ ids, obs = compress_group_index(ids, sort=True)
+ ngroups = len(obs)
+
+ return get_group_index_sorter(ids, ngroups)
+
+
+def lexsort_indexer(keys, orders=None, na_position='last'):
+ from pandas.core.arrays import Categorical
+
+ labels = []
+ shape = []
+ if isinstance(orders, bool):
+ orders = [orders] * len(keys)
+ elif orders is None:
+ orders = [True] * len(keys)
+
+ for key, order in zip(keys, orders):
+
+ # we are already a Categorical
+ if is_categorical_dtype(key):
+ c = key
+
+ # create the Categorical
+ else:
+ c = Categorical(key, ordered=True)
+
+ if na_position not in ['last', 'first']:
+ raise ValueError('invalid na_position: {!r}'.format(na_position))
+
+ n = len(c.categories)
+ codes = c.codes.copy()
+
+ mask = (c.codes == -1)
+ if order: # ascending
+ if na_position == 'last':
+ codes = np.where(mask, n, codes)
+ elif na_position == 'first':
+ codes += 1
+ else: # not order means descending
+ if na_position == 'last':
+ codes = np.where(mask, n, n - codes - 1)
+ elif na_position == 'first':
+ codes = np.where(mask, 0, n - codes)
+ if mask.any():
+ n += 1
+
+ shape.append(n)
+ labels.append(codes)
+
+ return indexer_from_factorized(labels, shape)
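+ # Illustrative sketch (not part of the upstream source): keys are given
+ # most-significant first, so rows are ordered by the first key with ties
+ # broken by the second key, e.g.
+ # >>> lexsort_indexer([[1, 2, 1], ['b', 'a', 'c']])
+ # array([0, 2, 1])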
+
+
+def nargsort(items, kind='quicksort', ascending=True, na_position='last'):
+ """
+ This is intended to be a drop-in replacement for np.argsort which
+ handles NaNs. It adds ascending and na_position parameters.
+ GH #6399, #5231
+ """
+
+ # specially handle Categorical
+ if is_categorical_dtype(items):
+ if na_position not in {'first', 'last'}:
+ raise ValueError('invalid na_position: {!r}'.format(na_position))
+
+ mask = isna(items)
+ cnt_null = mask.sum()
+ sorted_idx = items.argsort(ascending=ascending, kind=kind)
+ if ascending and na_position == 'last':
+ # NaN is coded as -1 and is listed in front after sorting
+ sorted_idx = np.roll(sorted_idx, -cnt_null)
+ elif not ascending and na_position == 'first':
+ # NaN is coded as -1 and is listed in the end after sorting
+ sorted_idx = np.roll(sorted_idx, cnt_null)
+ return sorted_idx
+
+ with warnings.catch_warnings():
+ # https://github.com/pandas-dev/pandas/issues/25439
+ # can be removed once ExtensionArrays are properly handled by nargsort
+ warnings.filterwarnings(
+ "ignore", category=FutureWarning,
+ message="Converting timezone-aware DatetimeArray to")
+ items = np.asanyarray(items)
+ idx = np.arange(len(items))
+ mask = isna(items)
+ non_nans = items[~mask]
+ non_nan_idx = idx[~mask]
+ nan_idx = np.nonzero(mask)[0]
+ if not ascending:
+ non_nans = non_nans[::-1]
+ non_nan_idx = non_nan_idx[::-1]
+ indexer = non_nan_idx[non_nans.argsort(kind=kind)]
+ if not ascending:
+ indexer = indexer[::-1]
+ # Finally, place the NaNs at the end or the beginning according to
+ # na_position
+ if na_position == 'last':
+ indexer = np.concatenate([indexer, nan_idx])
+ elif na_position == 'first':
+ indexer = np.concatenate([nan_idx, indexer])
+ else:
+ raise ValueError('invalid na_position: {!r}'.format(na_position))
+ return indexer
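+ # Illustrative sketch (not part of the upstream source): like np.argsort,
+ # but NaNs go to the end (or front) as requested by na_position, e.g.
+ # >>> nargsort(np.array([3.0, np.nan, 1.0]))
+ # array([2, 0, 1])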
+
+
+class _KeyMapper(object):
+
+ """
+ Ease my suffering. Map compressed group id -> key tuple
+ """
+
+ def __init__(self, comp_ids, ngroups, levels, labels):
+ self.levels = levels
+ self.labels = labels
+ self.comp_ids = comp_ids.astype(np.int64)
+
+ self.k = len(labels)
+ self.tables = [hashtable.Int64HashTable(ngroups)
+ for _ in range(self.k)]
+
+ self._populate_tables()
+
+ def _populate_tables(self):
+ for labs, table in zip(self.labels, self.tables):
+ table.map(self.comp_ids, labs.astype(np.int64))
+
+ def get_key(self, comp_id):
+ return tuple(level[table.get_item(comp_id)]
+ for table, level in zip(self.tables, self.levels))
+
+
+def get_flattened_iterator(comp_ids, ngroups, levels, labels):
+ # provide "flattened" iterator for multi-group setting
+ mapper = _KeyMapper(comp_ids, ngroups, levels, labels)
+ return [mapper.get_key(i) for i in range(ngroups)]
+
+
+def get_indexer_dict(label_list, keys):
+ """ return a diction of {labels} -> {indexers} """
+ shape = list(map(len, keys))
+
+ group_index = get_group_index(label_list, shape, sort=True, xnull=True)
+ ngroups = ((group_index.size and group_index.max()) + 1) \
+ if is_int64_overflow_possible(shape) \
+ else np.prod(shape, dtype='i8')
+
+ sorter = get_group_index_sorter(group_index, ngroups)
+
+ sorted_labels = [lab.take(sorter) for lab in label_list]
+ group_index = group_index.take(sorter)
+
+ return lib.indices_fast(sorter, group_index, keys, sorted_labels)
+
+
+# ----------------------------------------------------------------------
+# sorting levels...cleverly?
+
+def get_group_index_sorter(group_index, ngroups):
+ """
+ algos.groupsort_indexer implements `counting sort` and it is at least
+ O(ngroups), where
+ ngroups = prod(shape)
+ shape = map(len, keys)
+ that is, linear in the number of combinations (cartesian product) of unique
+ values of groupby keys. This can be huge when doing multi-key groupby.
+ np.argsort(kind='mergesort') is O(count x log(count)) where count is the
+ length of the DataFrame;
+ Both algorithms are stable sorts, which is necessary for the correctness of
+ groupby operations, e.g. consider:
+ df.groupby(key)[col].transform('first')
+ """
+ count = len(group_index)
+ alpha = 0.0 # taking complexities literally; there may be
+ beta = 1.0 # some room for fine-tuning these parameters
+ do_groupsort = (count > 0 and ((alpha + beta * ngroups) <
+ (count * np.log(count))))
+ if do_groupsort:
+ sorter, _ = algos.groupsort_indexer(ensure_int64(group_index),
+ ngroups)
+ return ensure_platform_int(sorter)
+ else:
+ return group_index.argsort(kind='mergesort')
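+ # Illustrative sketch (not part of the upstream source): either branch
+ # returns a stable ordering of positions grouped by group id, e.g.
+ # >>> get_group_index_sorter(np.array([2, 0, 2, 1]), ngroups=3)
+ # array([1, 3, 0, 2])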
+
+
+def compress_group_index(group_index, sort=True):
+ """
+ Group_index is offsets into cartesian product of all possible labels. This
+ space can be huge, so this function compresses it, by computing offsets
+ (comp_ids) into the list of unique labels (obs_group_ids).
+ """
+
+ size_hint = min(len(group_index), hashtable._SIZE_HINT_LIMIT)
+ table = hashtable.Int64HashTable(size_hint)
+
+ group_index = ensure_int64(group_index)
+
+ # note, group labels come out ascending (ie, 1,2,3 etc)
+ comp_ids, obs_group_ids = table.get_labels_groupby(group_index)
+
+ if sort and len(obs_group_ids) > 0:
+ obs_group_ids, comp_ids = _reorder_by_uniques(obs_group_ids, comp_ids)
+
+ return comp_ids, obs_group_ids
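+ # Illustrative sketch (not part of the upstream source):
+ # >>> compress_group_index(np.array([5, 1, 5, 3]))
+ # (array([2, 0, 2, 1]), array([1, 3, 5]))
+ # i.e. comp_ids index into the sorted unique ids in obs_group_ids.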
+
+
+def _reorder_by_uniques(uniques, labels):
+ # sorter is index where elements ought to go
+ sorter = uniques.argsort()
+
+ # reverse_indexer is where elements came from
+ reverse_indexer = np.empty(len(sorter), dtype=np.int64)
+ reverse_indexer.put(sorter, np.arange(len(sorter)))
+
+ mask = labels < 0
+
+ # move labels to right locations (ie, unsort ascending labels)
+ labels = algorithms.take_nd(reverse_indexer, labels, allow_fill=False)
+ np.putmask(labels, mask, -1)
+
+ # sort observed ids
+ uniques = algorithms.take_nd(uniques, sorter, allow_fill=False)
+
+ return uniques, labels
+
+
+def safe_sort(values, labels=None, na_sentinel=-1, assume_unique=False):
+ """
+ Sort ``values`` and reorder corresponding ``labels``.
+ ``values`` should be unique if ``labels`` is not None.
+ Safe for use with mixed types (int, str), orders ints before strs.
+
+ .. versionadded:: 0.19.0
+
+ Parameters
+ ----------
+ values : list-like
+ Sequence; must be unique if ``labels`` is not None.
+ labels : list_like
+ Indices to ``values``. All out of bound indices are treated as
+ "not found" and will be masked with ``na_sentinel``.
+ na_sentinel : int, default -1
+ Value in ``labels`` to mark "not found".
+ Ignored when ``labels`` is None.
+ assume_unique : bool, default False
+ When True, ``values`` are assumed to be unique, which can speed up
+ the calculation. Ignored when ``labels`` is None.
+
+ Returns
+ -------
+ ordered : ndarray
+ Sorted ``values``
+ new_labels : ndarray
+ Reordered ``labels``; returned when ``labels`` is not None.
+
+ Raises
+ ------
+ TypeError
+ * If ``values`` is not list-like or if ``labels`` is neither None
+ nor list-like
+ * If ``values`` cannot be sorted
+ ValueError
+ * If ``labels`` is not None and ``values`` contain duplicates.
+ """
+ if not is_list_like(values):
+ raise TypeError("Only list-like objects are allowed to be passed to"
+ "safe_sort as values")
+
+ if not isinstance(values, np.ndarray):
+
+ # don't convert to string types
+ dtype, _ = infer_dtype_from_array(values)
+ values = np.asarray(values, dtype=dtype)
+
+ def sort_mixed(values):
+ # order ints before strings, safe in py3
+ str_pos = np.array([isinstance(x, string_types) for x in values],
+ dtype=bool)
+ nums = np.sort(values[~str_pos])
+ strs = np.sort(values[str_pos])
+ return np.concatenate([nums, np.asarray(strs, dtype=object)])
+
+ sorter = None
+ if PY3 and lib.infer_dtype(values, skipna=False) == 'mixed-integer':
+ # unorderable in py3 if mixed str/int
+ ordered = sort_mixed(values)
+ else:
+ try:
+ sorter = values.argsort()
+ ordered = values.take(sorter)
+ except TypeError:
+ # try this anyway
+ ordered = sort_mixed(values)
+
+ # labels:
+
+ if labels is None:
+ return ordered
+
+ if not is_list_like(labels):
+ raise TypeError("Only list-like objects or None are allowed to be"
+ "passed to safe_sort as labels")
+ labels = ensure_platform_int(np.asarray(labels))
+
+ from pandas import Index
+ if not assume_unique and not Index(values).is_unique:
+ raise ValueError("values should be unique if labels is not None")
+
+ if sorter is None:
+ # mixed types
+ (hash_klass, _), values = algorithms._get_data_algo(
+ values, algorithms._hashtables)
+ t = hash_klass(len(values))
+ t.map_locations(values)
+ sorter = ensure_platform_int(t.lookup(ordered))
+
+ reverse_indexer = np.empty(len(sorter), dtype=np.int_)
+ reverse_indexer.put(sorter, np.arange(len(sorter)))
+
+ mask = (labels < -len(values)) | (labels >= len(values)) | \
+ (labels == na_sentinel)
+
+ # (Out of bound indices will be masked with `na_sentinel` next, so we may
+ # deal with them here without performance loss using `mode='wrap'`.)
+ new_labels = reverse_indexer.take(labels, mode='wrap')
+ np.putmask(new_labels, mask, na_sentinel)
+
+ return ordered, ensure_platform_int(new_labels)
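+ # Illustrative sketch (not part of the upstream source):
+ # >>> safe_sort([3, 1, 2], labels=[0, 1, 1, 2, -1])
+ # (array([1, 2, 3]), array([2, 0, 0, 1, -1]))
+ # values come back sorted and labels are remapped to the new positions,
+ # with the na_sentinel (-1) passed through.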
diff --git a/contrib/python/pandas/py2/pandas/core/sparse/__init__.py b/contrib/python/pandas/py2/pandas/core/sparse/__init__.py
new file mode 100644
index 00000000000..e69de29bb2d
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/core/sparse/__init__.py
diff --git a/contrib/python/pandas/py2/pandas/core/sparse/api.py b/contrib/python/pandas/py2/pandas/core/sparse/api.py
new file mode 100644
index 00000000000..33e8b921905
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/core/sparse/api.py
@@ -0,0 +1,5 @@
+# pylint: disable=W0611
+# flake8: noqa
+from pandas.core.arrays.sparse import SparseArray, SparseDtype
+from pandas.core.sparse.frame import SparseDataFrame
+from pandas.core.sparse.series import SparseSeries
diff --git a/contrib/python/pandas/py2/pandas/core/sparse/frame.py b/contrib/python/pandas/py2/pandas/core/sparse/frame.py
new file mode 100644
index 00000000000..586193fe118
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/core/sparse/frame.py
@@ -0,0 +1,1043 @@
+"""
+Data structures for sparse float data. Life is made simpler by dealing only
+with float64 data
+"""
+from __future__ import division
+
+import warnings
+
+import numpy as np
+
+from pandas._libs.sparse import BlockIndex, get_blocks
+import pandas.compat as compat
+from pandas.compat import lmap
+from pandas.compat.numpy import function as nv
+from pandas.util._decorators import Appender
+
+from pandas.core.dtypes.cast import find_common_type, maybe_upcast
+from pandas.core.dtypes.common import ensure_platform_int, is_scipy_sparse
+from pandas.core.dtypes.missing import isna, notna
+
+import pandas.core.algorithms as algos
+from pandas.core.arrays.sparse import SparseArray, SparseDtype
+import pandas.core.common as com
+from pandas.core.frame import DataFrame
+import pandas.core.generic as generic
+from pandas.core.index import Index, MultiIndex, ensure_index
+import pandas.core.indexes.base as ibase
+from pandas.core.internals import (
+ BlockManager, create_block_manager_from_arrays)
+from pandas.core.internals.construction import extract_index, prep_ndarray
+import pandas.core.ops as ops
+from pandas.core.series import Series
+from pandas.core.sparse.series import SparseSeries
+
+# pylint: disable=E1101,E1103,W0231,E0202
+
+
+_shared_doc_kwargs = dict(klass='SparseDataFrame')
+
+
+class SparseDataFrame(DataFrame):
+ """
+ DataFrame containing sparse floating point data in the form of SparseSeries
+ objects
+
+ Parameters
+ ----------
+ data : same types as can be passed to DataFrame or scipy.sparse.spmatrix
+ .. versionchanged:: 0.23.0
+ If data is a dict, argument order is maintained for Python 3.6
+ and later.
+
+ index : array-like, optional
+ columns : array-like, optional
+ default_kind : {'block', 'integer'}, default 'block'
+ Default sparse kind for converting Series to SparseSeries. Will not
+ override SparseSeries passed into constructor
+ default_fill_value : float
+ Default fill_value for converting Series to SparseSeries
+ (default: nan). Will not override SparseSeries passed in.
+ """
+ _subtyp = 'sparse_frame'
+
+ def __init__(self, data=None, index=None, columns=None, default_kind=None,
+ default_fill_value=None, dtype=None, copy=False):
+
+ # pick up the defaults from the Sparse structures
+ if isinstance(data, SparseDataFrame):
+ if index is None:
+ index = data.index
+ if columns is None:
+ columns = data.columns
+ if default_fill_value is None:
+ default_fill_value = data.default_fill_value
+ if default_kind is None:
+ default_kind = data.default_kind
+ elif isinstance(data, (SparseSeries, SparseArray)):
+ if index is None:
+ index = data.index
+ if default_fill_value is None:
+ default_fill_value = data.fill_value
+ if columns is None and hasattr(data, 'name'):
+ columns = [data.name]
+ if columns is None:
+ raise Exception("cannot pass a series w/o a name or columns")
+ data = {columns[0]: data}
+
+ if default_fill_value is None:
+ default_fill_value = np.nan
+ if default_kind is None:
+ default_kind = 'block'
+
+ self._default_kind = default_kind
+ self._default_fill_value = default_fill_value
+
+ if is_scipy_sparse(data):
+ mgr = self._init_spmatrix(data, index, columns, dtype=dtype,
+ fill_value=default_fill_value)
+ elif isinstance(data, dict):
+ mgr = self._init_dict(data, index, columns, dtype=dtype)
+ elif isinstance(data, (np.ndarray, list)):
+ mgr = self._init_matrix(data, index, columns, dtype=dtype)
+ elif isinstance(data, SparseDataFrame):
+ mgr = self._init_mgr(data._data,
+ dict(index=index, columns=columns),
+ dtype=dtype, copy=copy)
+ elif isinstance(data, DataFrame):
+ mgr = self._init_dict(data, data.index, data.columns, dtype=dtype)
+ elif isinstance(data, Series):
+ mgr = self._init_dict(data.to_frame(), data.index,
+ columns=None, dtype=dtype)
+ elif isinstance(data, BlockManager):
+ mgr = self._init_mgr(data, axes=dict(index=index, columns=columns),
+ dtype=dtype, copy=copy)
+ elif data is None:
+ data = DataFrame()
+
+ if index is None:
+ index = Index([])
+ else:
+ index = ensure_index(index)
+
+ if columns is None:
+ columns = Index([])
+ else:
+ for c in columns:
+ data[c] = SparseArray(np.nan, index=index,
+ kind=self._default_kind,
+ fill_value=self._default_fill_value)
+ mgr = to_manager(data, columns, index)
+ if dtype is not None:
+ mgr = mgr.astype(dtype)
+ else:
+ msg = ('SparseDataFrame called with unknown type "{data_type}" '
+ 'for data argument')
+ raise TypeError(msg.format(data_type=type(data).__name__))
+
+ generic.NDFrame.__init__(self, mgr)
+
+ @property
+ def _constructor(self):
+ return SparseDataFrame
+
+ _constructor_sliced = SparseSeries
+
+ def _init_dict(self, data, index, columns, dtype=None):
+ # pre-filter out columns if we passed it
+ if columns is not None:
+ columns = ensure_index(columns)
+ data = {k: v for k, v in compat.iteritems(data) if k in columns}
+ else:
+ keys = com.dict_keys_to_ordered_list(data)
+ columns = Index(keys)
+
+ if index is None:
+ index = extract_index(list(data.values()))
+
+ def sp_maker(x):
+ return SparseArray(x, kind=self._default_kind,
+ fill_value=self._default_fill_value,
+ copy=True, dtype=dtype)
+ sdict = {}
+ for k, v in compat.iteritems(data):
+ if isinstance(v, Series):
+ # Force alignment, no copy necessary
+ if not v.index.equals(index):
+ v = v.reindex(index)
+
+ if not isinstance(v, SparseSeries):
+ v = sp_maker(v.values)
+ elif isinstance(v, SparseArray):
+ v = v.copy()
+ else:
+ if isinstance(v, dict):
+ v = [v.get(i, np.nan) for i in index]
+
+ v = sp_maker(v)
+
+ if index is not None and len(v) != len(index):
+ msg = "Length of passed values is {}, index implies {}"
+ raise ValueError(msg.format(len(v), len(index)))
+ sdict[k] = v
+
+ if len(columns.difference(sdict)):
+ # TODO: figure out how to handle this case, all nan's?
+ # add in any other columns we want to have (completeness)
+ nan_arr = np.empty(len(index), dtype='float64')
+ nan_arr.fill(np.nan)
+ nan_arr = SparseArray(nan_arr, kind=self._default_kind,
+ fill_value=self._default_fill_value,
+ copy=False)
+ sdict.update((c, nan_arr) for c in columns if c not in sdict)
+
+ return to_manager(sdict, columns, index)
+
+ def _init_matrix(self, data, index, columns, dtype=None):
+ """ Init self from ndarray or list of lists """
+ data = prep_ndarray(data, copy=False)
+ index, columns = self._prep_index(data, index, columns)
+ data = {idx: data[:, i] for i, idx in enumerate(columns)}
+ return self._init_dict(data, index, columns, dtype)
+
+ def _init_spmatrix(self, data, index, columns, dtype=None,
+ fill_value=None):
+ """ Init self from scipy.sparse matrix """
+ index, columns = self._prep_index(data, index, columns)
+ data = data.tocoo()
+ N = len(index)
+
+ # Construct a dict of SparseSeries
+ sdict = {}
+ values = Series(data.data, index=data.row, copy=False)
+ for col, rowvals in values.groupby(data.col):
+ # get_blocks expects int32 row indices in sorted order
+ rowvals = rowvals.sort_index()
+ rows = rowvals.index.values.astype(np.int32)
+ blocs, blens = get_blocks(rows)
+
+ sdict[columns[col]] = SparseSeries(
+ rowvals.values, index=index,
+ fill_value=fill_value,
+ sparse_index=BlockIndex(N, blocs, blens))
+
+ # Add any columns that were empty and thus not grouped on above
+ sdict.update({column: SparseSeries(index=index,
+ fill_value=fill_value,
+ sparse_index=BlockIndex(N, [], []))
+ for column in columns
+ if column not in sdict})
+
+ return self._init_dict(sdict, index, columns, dtype)
+
+ def _prep_index(self, data, index, columns):
+ N, K = data.shape
+ if index is None:
+ index = ibase.default_index(N)
+ if columns is None:
+ columns = ibase.default_index(K)
+
+ if len(columns) != K:
+ raise ValueError('Column length mismatch: {columns} vs. {K}'
+ .format(columns=len(columns), K=K))
+ if len(index) != N:
+ raise ValueError('Index length mismatch: {index} vs. {N}'
+ .format(index=len(index), N=N))
+ return index, columns
+
+ def to_coo(self):
+ """
+ Return the contents of the frame as a sparse SciPy COO matrix.
+
+ .. versionadded:: 0.20.0
+
+ Returns
+ -------
+ coo_matrix : scipy.sparse.spmatrix
+ If the caller is heterogeneous and contains booleans or objects,
+ the result will be of dtype=object. See Notes.
+
+ Notes
+ -----
+ The dtype will be the lowest-common-denominator type (implicit
+ upcasting); that is to say if the dtypes (even of numeric types)
+ are mixed, the one that accommodates all will be chosen.
+
+ e.g. If the dtypes are float16 and float32, dtype will be upcast to
+ float32. By numpy.find_common_type convention, mixing int64 and
+ uint64 will result in a float64 dtype.
+ """
+ try:
+ from scipy.sparse import coo_matrix
+ except ImportError:
+ raise ImportError('Scipy is not installed')
+
+ dtype = find_common_type(self.dtypes)
+ if isinstance(dtype, SparseDtype):
+ dtype = dtype.subtype
+
+ cols, rows, datas = [], [], []
+ for col, name in enumerate(self):
+ s = self[name]
+ row = s.sp_index.to_int_index().indices
+ cols.append(np.repeat(col, len(row)))
+ rows.append(row)
+ datas.append(s.sp_values.astype(dtype, copy=False))
+
+ cols = np.concatenate(cols)
+ rows = np.concatenate(rows)
+ datas = np.concatenate(datas)
+ return coo_matrix((datas, (rows, cols)), shape=self.shape)
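+ # Illustrative note (not part of the upstream source): the common dtype is
+ # picked via numpy.find_common_type, so mixing int64 and uint64 columns
+ # upcasts the COO data to float64:
+ # >>> np.find_common_type([np.int64, np.uint64], [])
+ # dtype('float64')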
+
+ def __array_wrap__(self, result):
+ return self._constructor(
+ result, index=self.index, columns=self.columns,
+ default_kind=self._default_kind,
+ default_fill_value=self._default_fill_value).__finalize__(self)
+
+ def __getstate__(self):
+ # pickling
+ return dict(_typ=self._typ, _subtyp=self._subtyp, _data=self._data,
+ _default_fill_value=self._default_fill_value,
+ _default_kind=self._default_kind)
+
+ def _unpickle_sparse_frame_compat(self, state):
+ """ original pickle format """
+ series, cols, idx, fv, kind = state
+
+ if not isinstance(cols, Index): # pragma: no cover
+ from pandas.io.pickle import _unpickle_array
+ columns = _unpickle_array(cols)
+ else:
+ columns = cols
+
+ if not isinstance(idx, Index): # pragma: no cover
+ from pandas.io.pickle import _unpickle_array
+ index = _unpickle_array(idx)
+ else:
+ index = idx
+
+ series_dict = DataFrame()
+ for col, (sp_index, sp_values) in compat.iteritems(series):
+ series_dict[col] = SparseSeries(sp_values, sparse_index=sp_index,
+ fill_value=fv)
+
+ self._data = to_manager(series_dict, columns, index)
+ self._default_fill_value = fv
+ self._default_kind = kind
+
+ def to_dense(self):
+ """
+ Convert to dense DataFrame
+
+ Returns
+ -------
+ df : DataFrame
+ """
+ data = {k: v.to_dense() for k, v in compat.iteritems(self)}
+ return DataFrame(data, index=self.index, columns=self.columns)
+
+ def _apply_columns(self, func):
+ """ get new SparseDataFrame applying func to each columns """
+
+ new_data = {col: func(series)
+ for col, series in compat.iteritems(self)}
+
+ return self._constructor(
+ data=new_data, index=self.index, columns=self.columns,
+ default_fill_value=self.default_fill_value).__finalize__(self)
+
+ def astype(self, dtype):
+ return self._apply_columns(lambda x: x.astype(dtype))
+
+ def copy(self, deep=True):
+ """
+ Make a copy of this SparseDataFrame
+ """
+ result = super(SparseDataFrame, self).copy(deep=deep)
+ result._default_fill_value = self._default_fill_value
+ result._default_kind = self._default_kind
+ return result
+
+ @property
+ def default_fill_value(self):
+ return self._default_fill_value
+
+ @property
+ def default_kind(self):
+ return self._default_kind
+
+ @property
+ def density(self):
+ """
+ Ratio of non-sparse points to total (dense) data points
+ represented in the frame
+ """
+ tot_nonsparse = sum(ser.sp_index.npoints
+ for _, ser in compat.iteritems(self))
+ tot = len(self.index) * len(self.columns)
+ return tot_nonsparse / float(tot)
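+ # Illustrative sketch (not part of the upstream source): with the default
+ # NaN fill value only the two non-NaN cells below are actually stored:
+ # >>> sdf = SparseDataFrame({'A': [1.0, np.nan, 3.0]})
+ # >>> sdf.density
+ # 0.6666666666666666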
+
+ def fillna(self, value=None, method=None, axis=0, inplace=False,
+ limit=None, downcast=None):
+ new_self = super(SparseDataFrame,
+ self).fillna(value=value, method=method, axis=axis,
+ inplace=inplace, limit=limit,
+ downcast=downcast)
+ if not inplace:
+ self = new_self
+
+ # set the fill value if we are filling as a scalar with nothing special
+ # going on
+ if (value is not None and value == value and method is None and
+ limit is None):
+ self._default_fill_value = value
+
+ if not inplace:
+ return self
+
+ # ----------------------------------------------------------------------
+ # Support different internal representation of SparseDataFrame
+
+ def _sanitize_column(self, key, value, **kwargs):
+ """
+ Creates a new SparseArray from the input value.
+
+ Parameters
+ ----------
+ key : object
+ value : scalar, Series, or array-like
+ kwargs : dict
+
+ Returns
+ -------
+ sanitized_column : SparseArray
+
+ """
+ def sp_maker(x, index=None):
+ return SparseArray(x, index=index,
+ fill_value=self._default_fill_value,
+ kind=self._default_kind)
+ if isinstance(value, SparseSeries):
+ clean = value.reindex(self.index).as_sparse_array(
+ fill_value=self._default_fill_value, kind=self._default_kind)
+
+ elif isinstance(value, SparseArray):
+ if len(value) != len(self.index):
+ raise AssertionError('Length of values does not match '
+ 'length of index')
+ clean = value
+
+ elif hasattr(value, '__iter__'):
+ if isinstance(value, Series):
+ clean = value.reindex(self.index)
+ if not isinstance(value, SparseSeries):
+ clean = sp_maker(clean)
+ else:
+ if len(value) != len(self.index):
+ raise AssertionError('Length of values does not match '
+ 'length of index')
+ clean = sp_maker(value)
+
+ # Scalar
+ else:
+ clean = sp_maker(value, self.index)
+
+ # always return a SparseArray!
+ return clean
+
+ def get_value(self, index, col, takeable=False):
+ """
+ Quickly retrieve single value at passed column and index
+
+ .. deprecated:: 0.21.0
+
+ Please use .at[] or .iat[] accessors.
+
+ Parameters
+ ----------
+ index : row label
+ col : column label
+ takeable : interpret the index/col as indexers, default False
+
+ Returns
+ -------
+ value : scalar value
+ """
+ warnings.warn("get_value is deprecated and will be removed "
+ "in a future release. Please use "
+ ".at[] or .iat[] accessors instead", FutureWarning,
+ stacklevel=2)
+ return self._get_value(index, col, takeable=takeable)
+
+ def _get_value(self, index, col, takeable=False):
+ if takeable is True:
+ series = self._iget_item_cache(col)
+ else:
+ series = self._get_item_cache(col)
+
+ return series._get_value(index, takeable=takeable)
+ _get_value.__doc__ = get_value.__doc__
+
+ def set_value(self, index, col, value, takeable=False):
+ """
+ Put single value at passed column and index
+
+ .. deprecated:: 0.21.0
+
+ Please use .at[] or .iat[] accessors.
+
+ Parameters
+ ----------
+ index : row label
+ col : column label
+ value : scalar value
+ takeable : interpret the index/col as indexers, default False
+
+ Notes
+ -----
+ This method *always* returns a new object. It is currently not
+ particularly efficient (and potentially very expensive) but is provided
+ for API compatibility with DataFrame
+
+ Returns
+ -------
+ frame : DataFrame
+ """
+ warnings.warn("set_value is deprecated and will be removed "
+ "in a future release. Please use "
+ ".at[] or .iat[] accessors instead", FutureWarning,
+ stacklevel=2)
+ return self._set_value(index, col, value, takeable=takeable)
+
+ def _set_value(self, index, col, value, takeable=False):
+ dense = self.to_dense()._set_value(
+ index, col, value, takeable=takeable)
+ return dense.to_sparse(kind=self._default_kind,
+ fill_value=self._default_fill_value)
+ _set_value.__doc__ = set_value.__doc__
+
+ def _slice(self, slobj, axis=0, kind=None):
+ if axis == 0:
+ new_index = self.index[slobj]
+ new_columns = self.columns
+ else:
+ new_index = self.index
+ new_columns = self.columns[slobj]
+
+ return self.reindex(index=new_index, columns=new_columns)
+
+ def xs(self, key, axis=0, copy=False):
+ """
+ Returns a row (cross-section) from the SparseDataFrame as a Series
+ object.
+
+ Parameters
+ ----------
+ key : some index contained in the index
+
+ Returns
+ -------
+ xs : Series
+ """
+ if axis == 1:
+ data = self[key]
+ return data
+
+ i = self.index.get_loc(key)
+ data = self.take([i]).get_values()[0]
+ return Series(data, index=self.columns)
+
+ # ----------------------------------------------------------------------
+ # Arithmetic-related methods
+
+ def _combine_frame(self, other, func, fill_value=None, level=None):
+ if level is not None:
+ raise NotImplementedError("'level' argument is not supported")
+
+ this, other = self.align(other, join='outer', level=level, copy=False)
+ new_index, new_columns = this.index, this.columns
+
+ if self.empty and other.empty:
+ return self._constructor(index=new_index).__finalize__(self)
+
+ new_data = {}
+ if fill_value is not None:
+ # TODO: be a bit more intelligent here
+ for col in new_columns:
+ if col in this and col in other:
+ dleft = this[col].to_dense()
+ dright = other[col].to_dense()
+ result = dleft._binop(dright, func, fill_value=fill_value)
+ result = result.to_sparse(fill_value=this[col].fill_value)
+ new_data[col] = result
+ else:
+
+ for col in new_columns:
+ if col in this and col in other:
+ new_data[col] = func(this[col], other[col])
+
+ new_fill_value = self._get_op_result_fill_value(other, func)
+
+ return self._constructor(data=new_data, index=new_index,
+ columns=new_columns,
+ default_fill_value=new_fill_value
+ ).__finalize__(self)
+
+ def _combine_match_index(self, other, func, level=None):
+ new_data = {}
+
+ if level is not None:
+ raise NotImplementedError("'level' argument is not supported")
+
+ this, other = self.align(other, join='outer', axis=0, level=level,
+ copy=False)
+
+ for col, series in compat.iteritems(this):
+ new_data[col] = func(series.values, other.values)
+
+ fill_value = self._get_op_result_fill_value(other, func)
+
+ return self._constructor(
+ new_data, index=this.index, columns=self.columns,
+ default_fill_value=fill_value).__finalize__(self)
+
+ def _combine_match_columns(self, other, func, level=None):
+ # patched version of DataFrame._combine_match_columns to account for
+ # NumPy circumventing __rsub__ with float64 types, e.g.: 3.0 - series,
+ # where 3.0 is numpy.float64 and series is a SparseSeries. Still
+ # possible for this to happen, which is bothersome
+
+ if level is not None:
+ raise NotImplementedError("'level' argument is not supported")
+
+ left, right = self.align(other, join='outer', axis=1, level=level,
+ copy=False)
+ assert left.columns.equals(right.index)
+
+ new_data = {}
+
+ for col in left.columns:
+ new_data[col] = func(left[col], float(right[col]))
+
+ return self._constructor(
+ new_data, index=left.index, columns=left.columns,
+ default_fill_value=self.default_fill_value).__finalize__(self)
+
+ def _combine_const(self, other, func):
+ return self._apply_columns(lambda x: func(x, other))
+
+ def _get_op_result_fill_value(self, other, func):
+ own_default = self.default_fill_value
+
+ if isinstance(other, DataFrame):
+ # i.e. called from _combine_frame
+
+ other_default = getattr(other, 'default_fill_value', np.nan)
+
+ # if the fill values are the same use them? or use a valid one
+ if own_default == other_default:
+ # TODO: won't this evaluate as False if both are np.nan?
+ fill_value = own_default
+ elif np.isnan(own_default) and not np.isnan(other_default):
+ fill_value = other_default
+ elif not np.isnan(own_default) and np.isnan(other_default):
+ fill_value = own_default
+ else:
+ fill_value = None
+
+ elif isinstance(other, SparseSeries):
+ # i.e. called from _combine_match_index
+
+ # fill_value is a function of our operator
+ if isna(other.fill_value) or isna(own_default):
+ fill_value = np.nan
+ else:
+ fill_value = func(np.float64(own_default),
+ np.float64(other.fill_value))
+
+ else:
+ raise NotImplementedError(type(other))
+
+ return fill_value
+
+ def _reindex_index(self, index, method, copy, level, fill_value=np.nan,
+ limit=None, takeable=False):
+ if level is not None:
+ raise TypeError('Reindex by level not supported for sparse')
+
+ if self.index.equals(index):
+ if copy:
+ return self.copy()
+ else:
+ return self
+
+ if len(self.index) == 0:
+ return self._constructor(
+ index=index, columns=self.columns).__finalize__(self)
+
+ indexer = self.index.get_indexer(index, method, limit=limit)
+ indexer = ensure_platform_int(indexer)
+ mask = indexer == -1
+ need_mask = mask.any()
+
+ new_series = {}
+ for col, series in self.iteritems():
+ if mask.all():
+ continue
+
+ values = series.values
+ # .take returns SparseArray
+ new = values.take(indexer)
+ if need_mask:
+ new = new.values
+ # convert integer to float if necessary. need to do a lot
+ # more than that, handle boolean etc also
+ new, fill_value = maybe_upcast(new, fill_value=fill_value)
+ np.putmask(new, mask, fill_value)
+
+ new_series[col] = new
+
+ return self._constructor(
+ new_series, index=index, columns=self.columns,
+ default_fill_value=self._default_fill_value).__finalize__(self)
+
+ def _reindex_columns(self, columns, method, copy, level, fill_value=None,
+ limit=None, takeable=False):
+ if level is not None:
+ raise TypeError('Reindex by level not supported for sparse')
+
+ if notna(fill_value):
+ raise NotImplementedError("'fill_value' argument is not supported")
+
+ if limit:
+ raise NotImplementedError("'limit' argument is not supported")
+
+ if method is not None:
+ raise NotImplementedError("'method' argument is not supported")
+
+ # TODO: fill value handling
+ sdict = {k: v for k, v in compat.iteritems(self) if k in columns}
+ return self._constructor(
+ sdict, index=self.index, columns=columns,
+ default_fill_value=self._default_fill_value).__finalize__(self)
+
+ def _reindex_with_indexers(self, reindexers, method=None, fill_value=None,
+ limit=None, copy=False, allow_dups=False):
+
+ if method is not None or limit is not None:
+ raise NotImplementedError("cannot reindex with a method or limit "
+ "with sparse")
+
+ if fill_value is None:
+ fill_value = np.nan
+
+ reindexers = {self._get_axis_number(a): val
+ for (a, val) in compat.iteritems(reindexers)}
+
+ index, row_indexer = reindexers.get(0, (None, None))
+ columns, col_indexer = reindexers.get(1, (None, None))
+
+ if columns is None:
+ columns = self.columns
+
+ new_arrays = {}
+ for col in columns:
+ if col not in self:
+ continue
+ if row_indexer is not None:
+ new_arrays[col] = algos.take_1d(self[col].get_values(),
+ row_indexer,
+ fill_value=fill_value)
+ else:
+ new_arrays[col] = self[col]
+
+ return self._constructor(new_arrays, index=index,
+ columns=columns).__finalize__(self)
+
+ def _join_compat(self, other, on=None, how='left', lsuffix='', rsuffix='',
+ sort=False):
+ if on is not None:
+ raise NotImplementedError("'on' keyword parameter is not yet "
+ "implemented")
+ return self._join_index(other, how, lsuffix, rsuffix)
+
+ def _join_index(self, other, how, lsuffix, rsuffix):
+ if isinstance(other, Series):
+ if other.name is None:
+ raise ValueError('Other Series must have a name')
+
+ other = SparseDataFrame(
+ {other.name: other},
+ default_fill_value=self._default_fill_value)
+
+ join_index = self.index.join(other.index, how=how)
+
+ this = self.reindex(join_index)
+ other = other.reindex(join_index)
+
+ this, other = this._maybe_rename_join(other, lsuffix, rsuffix)
+
+ from pandas import concat
+ return concat([this, other], axis=1, verify_integrity=True)
+
+ def _maybe_rename_join(self, other, lsuffix, rsuffix):
+ to_rename = self.columns.intersection(other.columns)
+ if len(to_rename) > 0:
+ if not lsuffix and not rsuffix:
+ raise ValueError('columns overlap but no suffix specified: '
+ '{to_rename}'.format(to_rename=to_rename))
+
+ def lrenamer(x):
+ if x in to_rename:
+ return '{x}{lsuffix}'.format(x=x, lsuffix=lsuffix)
+ return x
+
+ def rrenamer(x):
+ if x in to_rename:
+ return '{x}{rsuffix}'.format(x=x, rsuffix=rsuffix)
+ return x
+
+ this = self.rename(columns=lrenamer)
+ other = other.rename(columns=rrenamer)
+ else:
+ this = self
+
+ return this, other
+
+ def transpose(self, *args, **kwargs):
+ """
+ Returns a DataFrame with the rows/columns switched.
+ """
+ nv.validate_transpose(args, kwargs)
+ return self._constructor(
+ self.values.T, index=self.columns, columns=self.index,
+ default_fill_value=self._default_fill_value,
+ default_kind=self._default_kind).__finalize__(self)
+
+ T = property(transpose)
+
+ @Appender(DataFrame.count.__doc__)
+ def count(self, axis=0, **kwds):
+ if axis is None:
+ axis = self._stat_axis_number
+
+ return self.apply(lambda x: x.count(), axis=axis)
+
+ def cumsum(self, axis=0, *args, **kwargs):
+ """
+ Return SparseDataFrame of cumulative sums over requested axis.
+
+ Parameters
+ ----------
+ axis : {0, 1}
+ 0 for row-wise, 1 for column-wise
+
+ Returns
+ -------
+ y : SparseDataFrame
+ """
+ nv.validate_cumsum(args, kwargs)
+
+ if axis is None:
+ axis = self._stat_axis_number
+
+ return self.apply(lambda x: x.cumsum(), axis=axis)
+
+ @Appender(generic._shared_docs['isna'] % _shared_doc_kwargs)
+ def isna(self):
+ return self._apply_columns(lambda x: x.isna())
+ isnull = isna
+
+ @Appender(generic._shared_docs['notna'] % _shared_doc_kwargs)
+ def notna(self):
+ return self._apply_columns(lambda x: x.notna())
+ notnull = notna
+
+ def apply(self, func, axis=0, broadcast=None, reduce=None,
+ result_type=None):
+ """
+ Analogous to DataFrame.apply, for SparseDataFrame
+
+ Parameters
+ ----------
+ func : function
+ Function to apply to each column
+ axis : {0, 1, 'index', 'columns'}
+ broadcast : bool, default False
+ For aggregation functions, return object of same size with values
+ propagated
+
+ .. deprecated:: 0.23.0
+ This argument will be removed in a future version, replaced
+ by result_type='broadcast'.
+
+ reduce : boolean or None, default None
+ Try to apply reduction procedures. If the DataFrame is empty,
+ apply will use reduce to determine whether the result should be a
+ Series or a DataFrame. If reduce is None (the default), apply's
+            return value will be guessed by calling func on an empty Series (note:
+ while guessing, exceptions raised by func will be ignored). If
+ reduce is True a Series will always be returned, and if False a
+ DataFrame will always be returned.
+
+ .. deprecated:: 0.23.0
+ This argument will be removed in a future version, replaced
+ by result_type='reduce'.
+
+        result_type : {'expand', 'reduce', 'broadcast', None}
+            These only act when ``axis=1`` (columns):
+
+ * 'expand' : list-like results will be turned into columns.
+ * 'reduce' : return a Series if possible rather than expanding
+ list-like results. This is the opposite to 'expand'.
+ * 'broadcast' : results will be broadcast to the original shape
+ of the frame, the original index & columns will be retained.
+
+ The default behaviour (None) depends on the return value of the
+ applied function: list-like results will be returned as a Series
+ of those. However if the apply function returns a Series these
+ are expanded to columns.
+
+ .. versionadded:: 0.23.0
+
+ Returns
+ -------
+ applied : Series or SparseDataFrame
+ """
+ if not len(self.columns):
+ return self
+ axis = self._get_axis_number(axis)
+
+ if isinstance(func, np.ufunc):
+ new_series = {}
+ for k, v in compat.iteritems(self):
+ applied = func(v)
+ applied.fill_value = func(v.fill_value)
+ new_series[k] = applied
+ return self._constructor(
+ new_series, index=self.index, columns=self.columns,
+ default_fill_value=self._default_fill_value,
+ default_kind=self._default_kind).__finalize__(self)
+
+ from pandas.core.apply import frame_apply
+ op = frame_apply(self,
+ func=func,
+ axis=axis,
+ reduce=reduce,
+ broadcast=broadcast,
+ result_type=result_type)
+ return op.get_result()
+
+ def applymap(self, func):
+ """
+ Apply a function to a DataFrame that is intended to operate
+ elementwise, i.e. like doing map(func, series) for each series in the
+ DataFrame
+
+ Parameters
+ ----------
+ func : function
+ Python function, returns a single value from a single value
+
+ Returns
+ -------
+ applied : DataFrame
+ """
+ return self.apply(lambda x: lmap(func, x))
+
+
+def to_manager(sdf, columns, index):
+ """ create and return the block manager from a dataframe of series,
+ columns, index
+ """
+
+ # from BlockManager perspective
+ axes = [ensure_index(columns), ensure_index(index)]
+
+ return create_block_manager_from_arrays(
+ [sdf[c] for c in columns], columns, axes)
+
+
+def stack_sparse_frame(frame):
+ """
+ Only makes sense when fill_value is NaN
+ """
+ lengths = [s.sp_index.npoints for _, s in compat.iteritems(frame)]
+ nobs = sum(lengths)
+
+ # this is pretty fast
+ minor_codes = np.repeat(np.arange(len(frame.columns)), lengths)
+
+ inds_to_concat = []
+ vals_to_concat = []
+ # TODO: Figure out whether this can be reached.
+ # I think this currently can't be reached because you can't build a
+ # SparseDataFrame with a non-np.NaN fill value (fails earlier).
+ for _, series in compat.iteritems(frame):
+ if not np.isnan(series.fill_value):
+ raise TypeError('This routine assumes NaN fill value')
+
+ int_index = series.sp_index.to_int_index()
+ inds_to_concat.append(int_index.indices)
+ vals_to_concat.append(series.sp_values)
+
+ major_codes = np.concatenate(inds_to_concat)
+ stacked_values = np.concatenate(vals_to_concat)
+ index = MultiIndex(levels=[frame.index, frame.columns],
+ codes=[major_codes, minor_codes],
+ verify_integrity=False)
+
+ lp = DataFrame(stacked_values.reshape((nobs, 1)), index=index,
+ columns=['foo'])
+ return lp.sort_index(level=0)
+
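+
+# Editor's illustrative sketch (not part of the original pandas sources,
+# never called at runtime): stack_sparse_frame keeps only the stored,
+# non-NaN points and labels them with a (row, column) MultiIndex in a
+# one-column frame.
+def _example_stack_sparse_frame():  # pragma: no cover - illustration only
+    sdf = SparseDataFrame({'A': [1.0, np.nan], 'B': [np.nan, 2.0]})
+    stacked = stack_sparse_frame(sdf)
+    # two stored points -> two rows, single 'foo' column
+    assert stacked.shape == (2, 1)
+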
+
+def homogenize(series_dict):
+ """
+ Conform a set of SparseSeries (with NaN fill_value) to a common SparseIndex
+ corresponding to the locations where they all have data
+
+ Parameters
+ ----------
+ series_dict : dict or DataFrame
+
+ Notes
+ -----
+ Using the dumbest algorithm I could think of. Should put some more thought
+ into this
+
+ Returns
+ -------
+ homogenized : dict of SparseSeries
+ """
+ index = None
+
+ need_reindex = False
+
+ for _, series in compat.iteritems(series_dict):
+ if not np.isnan(series.fill_value):
+ raise TypeError('this method is only valid with NaN fill values')
+
+ if index is None:
+ index = series.sp_index
+ elif not series.sp_index.equals(index):
+ need_reindex = True
+ index = index.intersect(series.sp_index)
+
+ if need_reindex:
+ output = {}
+ for name, series in compat.iteritems(series_dict):
+ if not series.sp_index.equals(index):
+ series = series.sparse_reindex(index)
+
+ output[name] = series
+ else:
+ output = series_dict
+
+ return output
+
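+
+# Editor's illustrative sketch (not part of the original pandas sources,
+# never called at runtime): the core of homogenize() above is intersecting
+# the sparse indexes of NaN-filled series so they line up on common points.
+def _example_sparse_index_intersection():  # pragma: no cover - illustration
+    a = SparseSeries([1.0, np.nan, 3.0])   # stored points at 0 and 2
+    b = SparseSeries([np.nan, 2.0, 3.0])   # stored points at 1 and 2
+    common = a.sp_index.intersect(b.sp_index)
+    assert common.to_int_index().indices.tolist() == [2]
+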
+
+# use unaccelerated ops for sparse objects
+ops.add_flex_arithmetic_methods(SparseDataFrame)
+ops.add_special_arithmetic_methods(SparseDataFrame)
diff --git a/contrib/python/pandas/py2/pandas/core/sparse/scipy_sparse.py b/contrib/python/pandas/py2/pandas/core/sparse/scipy_sparse.py
new file mode 100644
index 00000000000..2d0ce2d5e59
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/core/sparse/scipy_sparse.py
@@ -0,0 +1,131 @@
+"""
+Interaction with scipy.sparse matrices.
+
+Currently only includes SparseSeries.to_coo helpers.
+"""
+from pandas.compat import OrderedDict, lmap
+
+from pandas.core.index import Index, MultiIndex
+from pandas.core.series import Series
+
+
+def _check_is_partition(parts, whole):
+ whole = set(whole)
+ parts = [set(x) for x in parts]
+ if set.intersection(*parts) != set():
+ raise ValueError(
+ 'Is not a partition because intersection is not null.')
+ if set.union(*parts) != whole:
+ raise ValueError('Is not a partition because union is not the whole.')
+
+
+def _to_ijv(ss, row_levels=(0, ), column_levels=(1, ), sort_labels=False):
+ """ For arbitrary (MultiIndexed) SparseSeries return
+ (v, i, j, ilabels, jlabels) where (v, (i, j)) is suitable for
+ passing to scipy.sparse.coo constructor. """
+ # index and column levels must be a partition of the index
+ _check_is_partition([row_levels, column_levels], range(ss.index.nlevels))
+
+ # from the SparseSeries: get the labels and data for non-null entries
+ values = ss._data.internal_values()._valid_sp_values
+
+ nonnull_labels = ss.dropna()
+
+ def get_indexers(levels):
+ """ Return sparse coords and dense labels for subset levels """
+
+ # TODO: how to do this better? cleanly slice nonnull_labels given the
+ # coord
+ values_ilabels = [tuple(x[i] for i in levels)
+ for x in nonnull_labels.index]
+ if len(levels) == 1:
+ values_ilabels = [x[0] for x in values_ilabels]
+
+ # # performance issues with groupby ###################################
+        # TODO: these two lines can replace the code below but
+ # groupby is too slow (in some cases at least)
+ # labels_to_i = ss.groupby(level=levels, sort=sort_labels).first()
+ # labels_to_i[:] = np.arange(labels_to_i.shape[0])
+
+ def _get_label_to_i_dict(labels, sort_labels=False):
+ """ Return OrderedDict of unique labels to number.
+ Optionally sort by label.
+ """
+ labels = Index(lmap(tuple, labels)).unique().tolist() # squish
+ if sort_labels:
+ labels = sorted(list(labels))
+ d = OrderedDict((k, i) for i, k in enumerate(labels))
+ return (d)
+
+ def _get_index_subset_to_coord_dict(index, subset, sort_labels=False):
+ ilabels = list(zip(*[index._get_level_values(i) for i in subset]))
+ labels_to_i = _get_label_to_i_dict(ilabels,
+ sort_labels=sort_labels)
+ labels_to_i = Series(labels_to_i)
+ if len(subset) > 1:
+ labels_to_i.index = MultiIndex.from_tuples(labels_to_i.index)
+ labels_to_i.index.names = [index.names[i] for i in subset]
+ else:
+ labels_to_i.index = Index(x[0] for x in labels_to_i.index)
+ labels_to_i.index.name = index.names[subset[0]]
+
+ labels_to_i.name = 'value'
+ return (labels_to_i)
+
+ labels_to_i = _get_index_subset_to_coord_dict(ss.index, levels,
+ sort_labels=sort_labels)
+ # #####################################################################
+ # #####################################################################
+
+ i_coord = labels_to_i[values_ilabels].tolist()
+ i_labels = labels_to_i.index.tolist()
+
+ return i_coord, i_labels
+
+ i_coord, i_labels = get_indexers(row_levels)
+ j_coord, j_labels = get_indexers(column_levels)
+
+ return values, i_coord, j_coord, i_labels, j_labels
+
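+
+# Editor's illustrative sketch (not part of the original pandas sources,
+# never called at runtime): for a MultiIndexed SparseSeries, _to_ijv splits
+# the index levels into row/column coordinates plus the distinct axis labels.
+def _example_to_ijv():  # pragma: no cover - illustration only
+    import numpy as np
+    idx = MultiIndex.from_tuples([('a', 'x'), ('a', 'y'), ('b', 'x')])
+    ss = Series([1.0, np.nan, 3.0], index=idx).to_sparse()
+    v, i, j, rows, cols = _to_ijv(ss, row_levels=(0,), column_levels=(1,))
+    # two stored (non-NaN) values, each mapped to a (row, column) coordinate
+    assert len(v) == len(i) == len(j) == 2
+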
+
+def _sparse_series_to_coo(ss, row_levels=(0, ), column_levels=(1, ),
+ sort_labels=False):
+ """ Convert a SparseSeries to a scipy.sparse.coo_matrix using index
+ levels row_levels, column_levels as the row and column
+ labels respectively. Returns the sparse_matrix, row and column labels.
+ """
+
+ import scipy.sparse
+
+ if ss.index.nlevels < 2:
+        raise ValueError('to_coo requires MultiIndex with nlevels >= 2')
+ if not ss.index.is_unique:
+ raise ValueError('Duplicate index entries are not allowed in to_coo '
+ 'transformation.')
+
+ # to keep things simple, only rely on integer indexing (not labels)
+ row_levels = [ss.index._get_level_number(x) for x in row_levels]
+ column_levels = [ss.index._get_level_number(x) for x in column_levels]
+
+ v, i, j, rows, columns = _to_ijv(ss, row_levels=row_levels,
+ column_levels=column_levels,
+ sort_labels=sort_labels)
+ sparse_matrix = scipy.sparse.coo_matrix(
+ (v, (i, j)), shape=(len(rows), len(columns)))
+ return sparse_matrix, rows, columns
+
+
+def _coo_to_sparse_series(A, dense_index=False):
+ """ Convert a scipy.sparse.coo_matrix to a SparseSeries.
+ Use the defaults given in the SparseSeries constructor.
+ """
+ s = Series(A.data, MultiIndex.from_arrays((A.row, A.col)))
+ s = s.sort_index()
+ s = s.to_sparse() # TODO: specify kind?
+ if dense_index:
+ # is there a better constructor method to use here?
+ i = range(A.shape[0])
+ j = range(A.shape[1])
+ ind = MultiIndex.from_product([i, j])
+ s = s.reindex(ind)
+ return s
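+
+
+# Editor's illustrative sketch (not part of the original pandas sources,
+# never called at runtime, requires scipy): a SparseSeries with a two-level
+# MultiIndex round-trips through a scipy.sparse COO matrix.
+def _example_coo_round_trip():  # pragma: no cover - illustration only
+    import numpy as np
+    idx = MultiIndex.from_tuples([(0, 0), (0, 1), (1, 1)])
+    ss = Series([1.0, np.nan, 3.0], index=idx).to_sparse()
+    A, rows, cols = _sparse_series_to_coo(ss, row_levels=(0,),
+                                          column_levels=(1,))
+    back = _coo_to_sparse_series(A)
+    # only the two stored (non-NaN) points survive the round trip
+    assert A.nnz == 2 and len(back) == 2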
diff --git a/contrib/python/pandas/py2/pandas/core/sparse/series.py b/contrib/python/pandas/py2/pandas/core/sparse/series.py
new file mode 100644
index 00000000000..db4d3e876de
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/core/sparse/series.py
@@ -0,0 +1,592 @@
+"""
+Data structures for sparse float data. Life is made simpler by dealing only
+with float64 data
+"""
+
+# pylint: disable=E1101,E1103,W0231
+
+import warnings
+
+import numpy as np
+
+import pandas._libs.index as libindex
+import pandas._libs.sparse as splib
+from pandas._libs.sparse import BlockIndex, IntIndex
+import pandas.compat as compat
+from pandas.compat.numpy import function as nv
+from pandas.util._decorators import Appender, Substitution
+
+from pandas.core.dtypes.common import is_integer, is_scalar
+from pandas.core.dtypes.generic import ABCSeries, ABCSparseSeries
+from pandas.core.dtypes.missing import isna, notna
+
+from pandas.core import generic
+from pandas.core.arrays import SparseArray
+from pandas.core.arrays.sparse import SparseAccessor
+from pandas.core.index import Index
+from pandas.core.internals import SingleBlockManager
+import pandas.core.ops as ops
+from pandas.core.series import Series
+from pandas.core.sparse.scipy_sparse import (
+ _coo_to_sparse_series, _sparse_series_to_coo)
+
+_shared_doc_kwargs = dict(axes='index', klass='SparseSeries',
+ axes_single_arg="{0, 'index'}",
+ optional_labels='', optional_axis='')
+
+
+class SparseSeries(Series):
+ """Data structure for labeled, sparse floating point data
+
+ Parameters
+ ----------
+ data : {array-like, Series, SparseSeries, dict}
+        .. versionchanged:: 0.23.0
+ If data is a dict, argument order is maintained for Python 3.6
+ and later.
+
+ kind : {'block', 'integer'}
+ fill_value : float
+        Code for missing value. Default depends on dtype.
+ 0 for int dtype, False for bool dtype, and NaN for other dtypes
+ sparse_index : {BlockIndex, IntIndex}, optional
+ Only if you have one. Mainly used internally
+
+ Notes
+ -----
+ SparseSeries objects are immutable via the typical Python means. If you
+ must change values, convert to dense, make your changes, then convert back
+ to sparse
+ """
+ _subtyp = 'sparse_series'
+
+ def __init__(self, data=None, index=None, sparse_index=None, kind='block',
+ fill_value=None, name=None, dtype=None, copy=False,
+ fastpath=False):
+ # TODO: Most of this should be refactored and shared with Series
+ # 1. BlockManager -> array
+ # 2. Series.index, Series.name, index, name reconciliation
+ # 3. Implicit reindexing
+ # 4. Implicit broadcasting
+ # 5. Dict construction
+ if data is None:
+ data = []
+ elif isinstance(data, SingleBlockManager):
+ index = data.index
+ data = data.blocks[0].values
+ elif isinstance(data, (ABCSeries, ABCSparseSeries)):
+ index = data.index if index is None else index
+ dtype = data.dtype if dtype is None else dtype
+ name = data.name if name is None else name
+
+ if index is not None:
+ data = data.reindex(index)
+
+ elif isinstance(data, compat.Mapping):
+ data, index = Series()._init_dict(data, index=index)
+
+ elif is_scalar(data) and index is not None:
+ data = np.full(len(index), fill_value=data)
+
+ super(SparseSeries, self).__init__(
+ SparseArray(data,
+ sparse_index=sparse_index,
+ kind=kind,
+ dtype=dtype,
+ fill_value=fill_value,
+ copy=copy),
+ index=index, name=name,
+ copy=False, fastpath=fastpath
+ )
+
+ def __array_ufunc__(self, ufunc, method, *inputs, **kwargs):
+ # avoid infinite recursion for other SparseSeries inputs
+ inputs = tuple(
+ x.values if isinstance(x, type(self)) else x
+ for x in inputs
+ )
+ result = self.values.__array_ufunc__(ufunc, method, *inputs, **kwargs)
+ return self._constructor(result, index=self.index,
+ sparse_index=self.sp_index,
+ fill_value=result.fill_value,
+ copy=False).__finalize__(self)
+
+ def __array_wrap__(self, result, context=None):
+ """
+ Gets called prior to a ufunc (and after)
+
+ See SparseArray.__array_wrap__ for detail.
+ """
+ result = self.values.__array_wrap__(result, context=context)
+ return self._constructor(result, index=self.index,
+ sparse_index=self.sp_index,
+ fill_value=result.fill_value,
+ copy=False).__finalize__(self)
+
+ def __array_finalize__(self, obj):
+ """
+ Gets called after any ufunc or other array operations, necessary
+ to pass on the index.
+ """
+ self.name = getattr(obj, 'name', None)
+ self.fill_value = getattr(obj, 'fill_value', None)
+
+ # unary ops
+ # TODO: See if this can be shared
+ def __pos__(self):
+ result = self.values.__pos__()
+ return self._constructor(result, index=self.index,
+ sparse_index=self.sp_index,
+ fill_value=result.fill_value,
+ copy=False).__finalize__(self)
+
+ def __neg__(self):
+ result = self.values.__neg__()
+ return self._constructor(result, index=self.index,
+ sparse_index=self.sp_index,
+ fill_value=result.fill_value,
+ copy=False).__finalize__(self)
+
+ def __invert__(self):
+ result = self.values.__invert__()
+ return self._constructor(result, index=self.index,
+ sparse_index=self.sp_index,
+ fill_value=result.fill_value,
+ copy=False).__finalize__(self)
+
+ @property
+ def block(self):
+ warnings.warn("SparseSeries.block is deprecated.", FutureWarning,
+ stacklevel=2)
+ return self._data._block
+
+ @property
+ def fill_value(self):
+ return self.values.fill_value
+
+ @fill_value.setter
+ def fill_value(self, v):
+ self.values.fill_value = v
+
+ @property
+ def sp_index(self):
+ return self.values.sp_index
+
+ @property
+ def sp_values(self):
+ return self.values.sp_values
+
+ @property
+ def npoints(self):
+ return self.values.npoints
+
+ @classmethod
+ def from_array(cls, arr, index=None, name=None, copy=False,
+ fill_value=None, fastpath=False):
+ """Construct SparseSeries from array.
+
+ .. deprecated:: 0.23.0
+ Use the pd.SparseSeries(..) constructor instead.
+ """
+ warnings.warn("'from_array' is deprecated and will be removed in a "
+ "future version. Please use the pd.SparseSeries(..) "
+ "constructor instead.", FutureWarning, stacklevel=2)
+ return cls(arr, index=index, name=name, copy=copy,
+ fill_value=fill_value, fastpath=fastpath)
+
+ @property
+ def _constructor(self):
+ return SparseSeries
+
+ @property
+ def _constructor_expanddim(self):
+ from pandas.core.sparse.api import SparseDataFrame
+ return SparseDataFrame
+
+ @property
+ def kind(self):
+ if isinstance(self.sp_index, BlockIndex):
+ return 'block'
+ elif isinstance(self.sp_index, IntIndex):
+ return 'integer'
+
+ def as_sparse_array(self, kind=None, fill_value=None, copy=False):
+ """ return my self as a sparse array, do not copy by default """
+
+ if fill_value is None:
+ fill_value = self.fill_value
+ if kind is None:
+ kind = self.kind
+ return SparseArray(self.values, sparse_index=self.sp_index,
+ fill_value=fill_value, kind=kind, copy=copy)
+
+ def __unicode__(self):
+ # currently, unicode is same as repr...fixes infinite loop
+ series_rep = Series.__unicode__(self)
+ rep = '{series}\n{index!r}'.format(series=series_rep,
+ index=self.sp_index)
+ return rep
+
+ def _reduce(self, op, name, axis=0, skipna=True, numeric_only=None,
+ filter_type=None, **kwds):
+ """ perform a reduction operation """
+ return op(self.get_values(), skipna=skipna, **kwds)
+
+ def __getstate__(self):
+ # pickling
+ return dict(_typ=self._typ, _subtyp=self._subtyp, _data=self._data,
+ fill_value=self.fill_value, name=self.name)
+
+ def _unpickle_series_compat(self, state):
+
+ nd_state, own_state = state
+
+ # recreate the ndarray
+ data = np.empty(nd_state[1], dtype=nd_state[2])
+ np.ndarray.__setstate__(data, nd_state)
+
+ index, fill_value, sp_index = own_state[:3]
+ name = None
+ if len(own_state) > 3:
+ name = own_state[3]
+
+ # create a sparse array
+ if not isinstance(data, SparseArray):
+ data = SparseArray(data, sparse_index=sp_index,
+ fill_value=fill_value, copy=False)
+
+ # recreate
+ data = SingleBlockManager(data, index, fastpath=True)
+ generic.NDFrame.__init__(self, data)
+
+ self._set_axis(0, index)
+ self.name = name
+
+ def _set_subtyp(self, is_all_dates):
+ if is_all_dates:
+ object.__setattr__(self, '_subtyp', 'sparse_time_series')
+ else:
+ object.__setattr__(self, '_subtyp', 'sparse_series')
+
+ def _ixs(self, i, axis=0):
+ """
+ Return the i-th value or values in the SparseSeries by location
+
+ Parameters
+ ----------
+ i : int, slice, or sequence of integers
+
+ Returns
+ -------
+ value : scalar (int) or Series (slice, sequence)
+ """
+ label = self.index[i]
+ if isinstance(label, Index):
+ return self.take(i, axis=axis)
+ else:
+ return self._get_val_at(i)
+
+ def _get_val_at(self, loc):
+ """ forward to the array """
+ return self.values._get_val_at(loc)
+
+ def __getitem__(self, key):
+ # TODO: Document difference from Series.__getitem__, deprecate,
+ # and remove!
+ if is_integer(key) and key not in self.index:
+ return self._get_val_at(key)
+ else:
+ return super(SparseSeries, self).__getitem__(key)
+
+ def _get_values(self, indexer):
+ try:
+ return self._constructor(self._data.get_slice(indexer),
+ fastpath=True).__finalize__(self)
+ except Exception:
+ return self[indexer]
+
+ def _set_with_engine(self, key, value):
+ return self._set_value(key, value)
+
+ def abs(self):
+ """
+ Return an object with absolute value taken. Only applicable to objects
+ that are all numeric
+
+ Returns
+ -------
+ abs: same type as caller
+ """
+ return self._constructor(np.abs(self.values),
+ index=self.index).__finalize__(self)
+
+ def get(self, label, default=None):
+ """
+ Returns value occupying requested label, default to specified
+ missing value if not present. Analogous to dict.get
+
+ Parameters
+ ----------
+ label : object
+            Label value to look for
+ default : object, optional
+ Value to return if label not in index
+
+ Returns
+ -------
+ y : scalar
+ """
+ if label in self.index:
+ loc = self.index.get_loc(label)
+ return self._get_val_at(loc)
+ else:
+ return default
+
+ def get_value(self, label, takeable=False):
+ """
+ Retrieve single value at passed index label
+
+ .. deprecated:: 0.21.0
+
+ Please use .at[] or .iat[] accessors.
+
+ Parameters
+ ----------
+        label : object
+ takeable : interpret the index as indexers, default False
+
+ Returns
+ -------
+ value : scalar value
+ """
+ warnings.warn("get_value is deprecated and will be removed "
+ "in a future release. Please use "
+ ".at[] or .iat[] accessors instead", FutureWarning,
+ stacklevel=2)
+
+ return self._get_value(label, takeable=takeable)
+
+ def _get_value(self, label, takeable=False):
+ loc = label if takeable is True else self.index.get_loc(label)
+ return self._get_val_at(loc)
+ _get_value.__doc__ = get_value.__doc__
+
+ def set_value(self, label, value, takeable=False):
+ """
+ Quickly set single value at passed label. If label is not contained, a
+ new object is created with the label placed at the end of the result
+ index
+
+ .. deprecated:: 0.21.0
+
+ Please use .at[] or .iat[] accessors.
+
+ Parameters
+ ----------
+ label : object
+ Partial indexing with MultiIndex not allowed
+ value : object
+ Scalar value
+ takeable : interpret the index as indexers, default False
+
+ Notes
+ -----
+ This method *always* returns a new object. It is not particularly
+ efficient but is provided for API compatibility with Series
+
+ Returns
+ -------
+ series : SparseSeries
+ """
+ warnings.warn("set_value is deprecated and will be removed "
+ "in a future release. Please use "
+ ".at[] or .iat[] accessors instead", FutureWarning,
+ stacklevel=2)
+ return self._set_value(label, value, takeable=takeable)
+
+ def _set_value(self, label, value, takeable=False):
+ values = self.to_dense()
+
+ # if the label doesn't exist, we will create a new object here
+ # and possibly change the index
+ new_values = values._set_value(label, value, takeable=takeable)
+ if new_values is not None:
+ values = new_values
+ new_index = values.index
+ values = SparseArray(values, fill_value=self.fill_value,
+ kind=self.kind)
+ self._data = SingleBlockManager(values, new_index)
+ self._index = new_index
+ _set_value.__doc__ = set_value.__doc__
+
+ def _set_values(self, key, value):
+
+ # this might be inefficient as we have to recreate the sparse array
+ # rather than setting individual elements, but have to convert
+ # the passed slice/boolean that's in dense space into a sparse indexer
+ # not sure how to do that!
+ if isinstance(key, Series):
+ key = key.values
+
+ values = self.values.to_dense()
+ values[key] = libindex.convert_scalar(values, value)
+ values = SparseArray(values, fill_value=self.fill_value,
+ kind=self.kind)
+ self._data = SingleBlockManager(values, self.index)
+
+ def to_dense(self):
+ """
+ Convert SparseSeries to a Series.
+
+ Returns
+ -------
+ s : Series
+ """
+ return Series(self.values.to_dense(), index=self.index,
+ name=self.name)
+
+ @property
+ def density(self):
+ return self.values.density
+
+ def copy(self, deep=True):
+ """
+ Make a copy of the SparseSeries. Only the actual sparse values need to
+ be copied
+ """
+ # TODO: https://github.com/pandas-dev/pandas/issues/22314
+ # We skip the block manager till that is resolved.
+ new_data = self.values.copy(deep=deep)
+ return self._constructor(new_data, sparse_index=self.sp_index,
+ fill_value=self.fill_value,
+ index=self.index.copy(),
+ name=self.name).__finalize__(self)
+
+ @Substitution(**_shared_doc_kwargs)
+ @Appender(generic.NDFrame.reindex.__doc__)
+ def reindex(self, index=None, method=None, copy=True, limit=None,
+ **kwargs):
+ # TODO: remove?
+ return super(SparseSeries, self).reindex(index=index, method=method,
+ copy=copy, limit=limit,
+ **kwargs)
+
+ def sparse_reindex(self, new_index):
+ """
+ Conform sparse values to new SparseIndex
+
+ Parameters
+ ----------
+ new_index : {BlockIndex, IntIndex}
+
+ Returns
+ -------
+ reindexed : SparseSeries
+ """
+ if not isinstance(new_index, splib.SparseIndex):
+ raise TypeError("new index must be a SparseIndex")
+ values = self.values
+ values = values.sp_index.to_int_index().reindex(
+ values.sp_values.astype('float64'), values.fill_value, new_index)
+ values = SparseArray(values,
+ sparse_index=new_index,
+ fill_value=self.values.fill_value)
+ return self._constructor(values, index=self.index).__finalize__(self)
+
+ def cumsum(self, axis=0, *args, **kwargs):
+ """
+ Cumulative sum of non-NA/null values.
+
+        When performing the cumulative summation, any NA/null values will
+ be skipped. The resulting SparseSeries will preserve the locations of
+ NaN values, but the fill value will be `np.nan` regardless.
+
+ Parameters
+ ----------
+ axis : {0}
+
+ Returns
+ -------
+ cumsum : SparseSeries
+ """
+ nv.validate_cumsum(args, kwargs)
+ # Validate axis
+ if axis is not None:
+ self._get_axis_number(axis)
+
+ new_array = self.values.cumsum()
+
+ return self._constructor(
+ new_array, index=self.index,
+ sparse_index=new_array.sp_index).__finalize__(self)
+
+ # TODO: SparseSeries.isna is Sparse, while Series.isna is dense
+ @Appender(generic._shared_docs['isna'] % _shared_doc_kwargs)
+ def isna(self):
+ arr = SparseArray(isna(self.values.sp_values),
+ sparse_index=self.values.sp_index,
+ fill_value=isna(self.fill_value))
+ return self._constructor(arr, index=self.index).__finalize__(self)
+
+ isnull = isna
+
+ @Appender(generic._shared_docs['notna'] % _shared_doc_kwargs)
+ def notna(self):
+ arr = SparseArray(notna(self.values.sp_values),
+ sparse_index=self.values.sp_index,
+ fill_value=notna(self.fill_value))
+ return self._constructor(arr, index=self.index).__finalize__(self)
+ notnull = notna
+
+ def dropna(self, axis=0, inplace=False, **kwargs):
+ """
+ Analogous to Series.dropna. If fill_value=NaN, returns a dense Series
+ """
+ # TODO: make more efficient
+ # Validate axis
+ self._get_axis_number(axis or 0)
+ dense_valid = self.to_dense().dropna()
+ if inplace:
+ raise NotImplementedError("Cannot perform inplace dropna"
+ " operations on a SparseSeries")
+ if isna(self.fill_value):
+ return dense_valid
+ else:
+ dense_valid = dense_valid[dense_valid != self.fill_value]
+ return dense_valid.to_sparse(fill_value=self.fill_value)
+
+ def combine_first(self, other):
+ """
+ Combine Series values, choosing the calling Series's values
+ first. Result index will be the union of the two indexes
+
+ Parameters
+ ----------
+ other : Series
+
+ Returns
+ -------
+ y : Series
+ """
+ if isinstance(other, SparseSeries):
+ other = other.to_dense()
+
+ dense_combined = self.to_dense().combine_first(other)
+ return dense_combined.to_sparse(fill_value=self.fill_value)
+
+ @Appender(SparseAccessor.to_coo.__doc__)
+ def to_coo(self, row_levels=(0, ), column_levels=(1, ), sort_labels=False):
+ A, rows, columns = _sparse_series_to_coo(self, row_levels,
+ column_levels,
+ sort_labels=sort_labels)
+ return A, rows, columns
+
+ @classmethod
+ @Appender(SparseAccessor.from_coo.__doc__)
+ def from_coo(cls, A, dense_index=False):
+ return _coo_to_sparse_series(A, dense_index=dense_index)
+
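+
+# Editor's illustrative sketch (not part of the original pandas sources,
+# never called at runtime): only the non-fill points of a SparseSeries are
+# stored; to_dense() materialises the full Series again.
+def _example_sparse_series_basics():  # pragma: no cover - illustration only
+    s = SparseSeries([1.0, np.nan, np.nan, 3.0])
+    assert s.npoints == 2
+    assert list(s.sp_values) == [1.0, 3.0]
+    dense = s.to_dense()
+    assert len(dense) == 4 and dense.isna().sum() == 2
+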
+
+# overwrite series methods with unaccelerated Sparse-specific versions
+ops.add_flex_arithmetic_methods(SparseSeries)
+ops.add_special_arithmetic_methods(SparseSeries)
diff --git a/contrib/python/pandas/py2/pandas/core/strings.py b/contrib/python/pandas/py2/pandas/core/strings.py
new file mode 100644
index 00000000000..ca79dcd9408
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/core/strings.py
@@ -0,0 +1,3184 @@
+# -*- coding: utf-8 -*-
+import codecs
+import re
+import textwrap
+import warnings
+
+import numpy as np
+
+import pandas._libs.lib as lib
+import pandas._libs.ops as libops
+import pandas.compat as compat
+from pandas.compat import zip
+from pandas.util._decorators import Appender, deprecate_kwarg
+
+from pandas.core.dtypes.common import (
+ ensure_object, is_bool_dtype, is_categorical_dtype, is_integer,
+ is_list_like, is_object_dtype, is_re, is_scalar, is_string_like)
+from pandas.core.dtypes.generic import ABCIndexClass, ABCSeries
+from pandas.core.dtypes.missing import isna
+
+from pandas.core.algorithms import take_1d
+from pandas.core.base import NoNewAttributesMixin
+import pandas.core.common as com
+
+_cpython_optimized_encoders = (
+ "utf-8", "utf8", "latin-1", "latin1", "iso-8859-1", "mbcs", "ascii"
+)
+_cpython_optimized_decoders = _cpython_optimized_encoders + (
+ "utf-16", "utf-32"
+)
+
+_shared_docs = dict()
+
+
+def cat_core(list_of_columns, sep):
+ """
+ Auxiliary function for :meth:`str.cat`
+
+ Parameters
+ ----------
+ list_of_columns : list of numpy arrays
+ List of arrays to be concatenated with sep;
+ these arrays may not contain NaNs!
+ sep : string
+ The separator string for concatenating the columns
+
+ Returns
+ -------
+ nd.array
+ The concatenation of list_of_columns with sep
+ """
+ list_with_sep = [sep] * (2 * len(list_of_columns) - 1)
+ list_with_sep[::2] = list_of_columns
+ return np.sum(list_with_sep, axis=0)
+
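+
+# Editor's illustrative sketch (not part of the original pandas sources,
+# never called at runtime): the columns are interleaved with the separator
+# and then summed element-wise, which concatenates the object arrays
+# position by position.
+def _example_cat_core():  # pragma: no cover - illustration only
+    left = np.array(['a', 'b'], dtype=object)
+    right = np.array(['1', '2'], dtype=object)
+    assert list(cat_core([left, right], '-')) == ['a-1', 'b-2']
+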
+
+def _na_map(f, arr, na_result=np.nan, dtype=object):
+ # should really _check_ for NA
+ return _map(f, arr, na_mask=True, na_value=na_result, dtype=dtype)
+
+
+def _map(f, arr, na_mask=False, na_value=np.nan, dtype=object):
+ if not len(arr):
+ return np.ndarray(0, dtype=dtype)
+
+ if isinstance(arr, ABCSeries):
+ arr = arr.values
+ if not isinstance(arr, np.ndarray):
+ arr = np.asarray(arr, dtype=object)
+ if na_mask:
+ mask = isna(arr)
+ try:
+ convert = not all(mask)
+ result = lib.map_infer_mask(arr, f, mask.view(np.uint8), convert)
+ except (TypeError, AttributeError) as e:
+ # Reraise the exception if callable `f` got wrong number of args.
+ # The user may want to be warned by this, instead of getting NaN
+ if compat.PY2:
+ p_err = r'takes (no|(exactly|at (least|most)) ?\d+) arguments?'
+ else:
+ p_err = (r'((takes)|(missing)) (?(2)from \d+ to )?\d+ '
+ r'(?(3)required )positional arguments?')
+
+ if len(e.args) >= 1 and re.search(p_err, e.args[0]):
+ raise e
+
+ def g(x):
+ try:
+ return f(x)
+ except (TypeError, AttributeError):
+ return na_value
+
+ return _map(g, arr, dtype=dtype)
+ if na_value is not np.nan:
+ np.putmask(result, mask, na_value)
+ if result.dtype == object:
+ result = lib.maybe_convert_objects(result)
+ return result
+ else:
+ return lib.map_infer(arr, f)
+
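+
+# Editor's illustrative sketch (not part of the original pandas sources,
+# never called at runtime): _na_map applies a scalar function element-wise
+# while leaving missing values as NaN instead of passing them to the function.
+def _example_na_map():  # pragma: no cover - illustration only
+    arr = np.array(['ab', np.nan, 'c'], dtype=object)
+    out = _na_map(len, arr)
+    assert out[0] == 2 and np.isnan(out[1]) and out[2] == 1
+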
+
+def str_count(arr, pat, flags=0):
+ """
+ Count occurrences of pattern in each string of the Series/Index.
+
+ This function is used to count the number of times a particular regex
+ pattern is repeated in each of the string elements of the
+ :class:`~pandas.Series`.
+
+ Parameters
+ ----------
+ pat : str
+ Valid regular expression.
+ flags : int, default 0, meaning no flags
+ Flags for the `re` module. For a complete list, `see here
+ <https://docs.python.org/3/howto/regex.html#compilation-flags>`_.
+ **kwargs
+ For compatibility with other string methods. Not used.
+
+ Returns
+ -------
+ counts : Series or Index
+ Same type as the calling object containing the integer counts.
+
+ See Also
+ --------
+ re : Standard library module for regular expressions.
+ str.count : Standard library version, without regular expression support.
+
+ Notes
+ -----
+ Some characters need to be escaped when passing in `pat`.
+    e.g. ``'$'`` has a special meaning in regex and must be escaped when
+ finding this literal character.
+
+ Examples
+ --------
+ >>> s = pd.Series(['A', 'B', 'Aaba', 'Baca', np.nan, 'CABA', 'cat'])
+ >>> s.str.count('a')
+ 0 0.0
+ 1 0.0
+ 2 2.0
+ 3 2.0
+ 4 NaN
+ 5 0.0
+ 6 1.0
+ dtype: float64
+
+ Escape ``'$'`` to find the literal dollar sign.
+
+ >>> s = pd.Series(['$', 'B', 'Aab$', '$$ca', 'C$B$', 'cat'])
+ >>> s.str.count('\\$')
+ 0 1
+ 1 0
+ 2 1
+ 3 2
+ 4 2
+ 5 0
+ dtype: int64
+
+ This is also available on Index
+
+ >>> pd.Index(['A', 'A', 'Aaba', 'cat']).str.count('a')
+ Int64Index([0, 0, 2, 1], dtype='int64')
+ """
+ regex = re.compile(pat, flags=flags)
+ f = lambda x: len(regex.findall(x))
+ return _na_map(f, arr, dtype=int)
+
+
+def str_contains(arr, pat, case=True, flags=0, na=np.nan, regex=True):
+ """
+ Test if pattern or regex is contained within a string of a Series or Index.
+
+ Return boolean Series or Index based on whether a given pattern or regex is
+ contained within a string of a Series or Index.
+
+ Parameters
+ ----------
+ pat : str
+ Character sequence or regular expression.
+ case : bool, default True
+ If True, case sensitive.
+ flags : int, default 0 (no flags)
+ Flags to pass through to the re module, e.g. re.IGNORECASE.
+ na : default NaN
+ Fill value for missing values.
+ regex : bool, default True
+ If True, assumes the pat is a regular expression.
+
+ If False, treats the pat as a literal string.
+
+ Returns
+ -------
+ Series or Index of boolean values
+ A Series or Index of boolean values indicating whether the
+ given pattern is contained within the string of each element
+ of the Series or Index.
+
+ See Also
+ --------
+ match : Analogous, but stricter, relying on re.match instead of re.search.
+ Series.str.startswith : Test if the start of each string element matches a
+ pattern.
+ Series.str.endswith : Same as startswith, but tests the end of string.
+
+ Examples
+ --------
+
+ Returning a Series of booleans using only a literal pattern.
+
+ >>> s1 = pd.Series(['Mouse', 'dog', 'house and parrot', '23', np.NaN])
+ >>> s1.str.contains('og', regex=False)
+ 0 False
+ 1 True
+ 2 False
+ 3 False
+ 4 NaN
+ dtype: object
+
+ Returning an Index of booleans using only a literal pattern.
+
+ >>> ind = pd.Index(['Mouse', 'dog', 'house and parrot', '23.0', np.NaN])
+ >>> ind.str.contains('23', regex=False)
+ Index([False, False, False, True, nan], dtype='object')
+
+ Specifying case sensitivity using `case`.
+
+ >>> s1.str.contains('oG', case=True, regex=True)
+ 0 False
+ 1 False
+ 2 False
+ 3 False
+ 4 NaN
+ dtype: object
+
+ Specifying `na` to be `False` instead of `NaN` replaces NaN values
+ with `False`. If Series or Index does not contain NaN values
+ the resultant dtype will be `bool`, otherwise, an `object` dtype.
+
+ >>> s1.str.contains('og', na=False, regex=True)
+ 0 False
+ 1 True
+ 2 False
+ 3 False
+ 4 False
+ dtype: bool
+
+ Returning 'house' or 'dog' when either expression occurs in a string.
+
+ >>> s1.str.contains('house|dog', regex=True)
+ 0 False
+ 1 True
+ 2 True
+ 3 False
+ 4 NaN
+ dtype: object
+
+ Ignoring case sensitivity using `flags` with regex.
+
+ >>> import re
+ >>> s1.str.contains('PARROT', flags=re.IGNORECASE, regex=True)
+ 0 False
+ 1 False
+ 2 True
+ 3 False
+ 4 NaN
+ dtype: object
+
+ Returning any digit using regular expression.
+
+ >>> s1.str.contains('\\d', regex=True)
+ 0 False
+ 1 False
+ 2 False
+ 3 True
+ 4 NaN
+ dtype: object
+
+    Ensure `pat` is not a literal pattern when `regex` is set to True.
+ Note in the following example one might expect only `s2[1]` and `s2[3]` to
+ return `True`. However, '.0' as a regex matches any character
+ followed by a 0.
+
+ >>> s2 = pd.Series(['40','40.0','41','41.0','35'])
+ >>> s2.str.contains('.0', regex=True)
+ 0 True
+ 1 True
+ 2 False
+ 3 True
+ 4 False
+ dtype: bool
+ """
+ if regex:
+ if not case:
+ flags |= re.IGNORECASE
+
+ regex = re.compile(pat, flags=flags)
+
+ if regex.groups > 0:
+ warnings.warn("This pattern has match groups. To actually get the"
+ " groups, use str.extract.", UserWarning,
+ stacklevel=3)
+
+ f = lambda x: bool(regex.search(x))
+ else:
+ if case:
+ f = lambda x: pat in x
+ else:
+ upper_pat = pat.upper()
+ f = lambda x: upper_pat in x
+ uppered = _na_map(lambda x: x.upper(), arr)
+ return _na_map(f, uppered, na, dtype=bool)
+ return _na_map(f, arr, na, dtype=bool)
+
+
+def str_startswith(arr, pat, na=np.nan):
+ """
+ Test if the start of each string element matches a pattern.
+
+ Equivalent to :meth:`str.startswith`.
+
+ Parameters
+ ----------
+ pat : str
+ Character sequence. Regular expressions are not accepted.
+ na : object, default NaN
+ Object shown if element tested is not a string.
+
+ Returns
+ -------
+ Series or Index of bool
+ A Series of booleans indicating whether the given pattern matches
+ the start of each string element.
+
+ See Also
+ --------
+ str.startswith : Python standard library string method.
+ Series.str.endswith : Same as startswith, but tests the end of string.
+ Series.str.contains : Tests if string element contains a pattern.
+
+ Examples
+ --------
+ >>> s = pd.Series(['bat', 'Bear', 'cat', np.nan])
+ >>> s
+ 0 bat
+ 1 Bear
+ 2 cat
+ 3 NaN
+ dtype: object
+
+ >>> s.str.startswith('b')
+ 0 True
+ 1 False
+ 2 False
+ 3 NaN
+ dtype: object
+
+ Specifying `na` to be `False` instead of `NaN`.
+
+ >>> s.str.startswith('b', na=False)
+ 0 True
+ 1 False
+ 2 False
+ 3 False
+ dtype: bool
+ """
+ f = lambda x: x.startswith(pat)
+ return _na_map(f, arr, na, dtype=bool)
+
+
+def str_endswith(arr, pat, na=np.nan):
+ """
+ Test if the end of each string element matches a pattern.
+
+ Equivalent to :meth:`str.endswith`.
+
+ Parameters
+ ----------
+ pat : str
+ Character sequence. Regular expressions are not accepted.
+ na : object, default NaN
+ Object shown if element tested is not a string.
+
+ Returns
+ -------
+ Series or Index of bool
+ A Series of booleans indicating whether the given pattern matches
+ the end of each string element.
+
+ See Also
+ --------
+ str.endswith : Python standard library string method.
+ Series.str.startswith : Same as endswith, but tests the start of string.
+ Series.str.contains : Tests if string element contains a pattern.
+
+ Examples
+ --------
+ >>> s = pd.Series(['bat', 'bear', 'caT', np.nan])
+ >>> s
+ 0 bat
+ 1 bear
+ 2 caT
+ 3 NaN
+ dtype: object
+
+ >>> s.str.endswith('t')
+ 0 True
+ 1 False
+ 2 False
+ 3 NaN
+ dtype: object
+
+ Specifying `na` to be `False` instead of `NaN`.
+
+ >>> s.str.endswith('t', na=False)
+ 0 True
+ 1 False
+ 2 False
+ 3 False
+ dtype: bool
+ """
+ f = lambda x: x.endswith(pat)
+ return _na_map(f, arr, na, dtype=bool)
+
+
+def str_replace(arr, pat, repl, n=-1, case=None, flags=0, regex=True):
+ r"""
+ Replace occurrences of pattern/regex in the Series/Index with
+ some other string. Equivalent to :meth:`str.replace` or
+ :func:`re.sub`.
+
+ Parameters
+ ----------
+ pat : string or compiled regex
+ String can be a character sequence or regular expression.
+
+ .. versionadded:: 0.20.0
+ `pat` also accepts a compiled regex.
+
+ repl : string or callable
+ Replacement string or a callable. The callable is passed the regex
+ match object and must return a replacement string to be used.
+ See :func:`re.sub`.
+
+ .. versionadded:: 0.20.0
+ `repl` also accepts a callable.
+
+ n : int, default -1 (all)
+ Number of replacements to make from start
+ case : boolean, default None
+ - If True, case sensitive (the default if `pat` is a string)
+ - Set to False for case insensitive
+ - Cannot be set if `pat` is a compiled regex
+ flags : int, default 0 (no flags)
+ - re module flags, e.g. re.IGNORECASE
+ - Cannot be set if `pat` is a compiled regex
+ regex : boolean, default True
+ - If True, assumes the passed-in pattern is a regular expression.
+ - If False, treats the pattern as a literal string
+ - Cannot be set to False if `pat` is a compiled regex or `repl` is
+ a callable.
+
+ .. versionadded:: 0.23.0
+
+ Returns
+ -------
+ Series or Index of object
+ A copy of the object with all matching occurrences of `pat` replaced by
+ `repl`.
+
+ Raises
+ ------
+ ValueError
+ * if `regex` is False and `repl` is a callable or `pat` is a compiled
+ regex
+ * if `pat` is a compiled regex and `case` or `flags` is set
+
+ Notes
+ -----
+ When `pat` is a compiled regex, all flags should be included in the
+ compiled regex. Use of `case`, `flags`, or `regex=False` with a compiled
+ regex will raise an error.
+
+ Examples
+ --------
+ When `pat` is a string and `regex` is True (the default), the given `pat`
+ is compiled as a regex. When `repl` is a string, it replaces matching
+ regex patterns as with :meth:`re.sub`. NaN value(s) in the Series are
+ left as is:
+
+ >>> pd.Series(['foo', 'fuz', np.nan]).str.replace('f.', 'ba', regex=True)
+ 0 bao
+ 1 baz
+ 2 NaN
+ dtype: object
+
+ When `pat` is a string and `regex` is False, every `pat` is replaced with
+ `repl` as with :meth:`str.replace`:
+
+ >>> pd.Series(['f.o', 'fuz', np.nan]).str.replace('f.', 'ba', regex=False)
+ 0 bao
+ 1 fuz
+ 2 NaN
+ dtype: object
+
+ When `repl` is a callable, it is called on every `pat` using
+ :func:`re.sub`. The callable should expect one positional argument
+ (a regex object) and return a string.
+
+ To get the idea:
+
+ >>> pd.Series(['foo', 'fuz', np.nan]).str.replace('f', repr)
+ 0 <_sre.SRE_Match object; span=(0, 1), match='f'>oo
+ 1 <_sre.SRE_Match object; span=(0, 1), match='f'>uz
+ 2 NaN
+ dtype: object
+
+ Reverse every lowercase alphabetic word:
+
+ >>> repl = lambda m: m.group(0)[::-1]
+ >>> pd.Series(['foo 123', 'bar baz', np.nan]).str.replace(r'[a-z]+', repl)
+ 0 oof 123
+ 1 rab zab
+ 2 NaN
+ dtype: object
+
+ Using regex groups (extract second group and swap case):
+
+ >>> pat = r"(?P<one>\w+) (?P<two>\w+) (?P<three>\w+)"
+ >>> repl = lambda m: m.group('two').swapcase()
+ >>> pd.Series(['One Two Three', 'Foo Bar Baz']).str.replace(pat, repl)
+ 0 tWO
+ 1 bAR
+ dtype: object
+
+ Using a compiled regex with flags
+
+ >>> regex_pat = re.compile(r'FUZ', flags=re.IGNORECASE)
+ >>> pd.Series(['foo', 'fuz', np.nan]).str.replace(regex_pat, 'bar')
+ 0 foo
+ 1 bar
+ 2 NaN
+ dtype: object
+ """
+
+ # Check whether repl is valid (GH 13438, GH 15055)
+ if not (is_string_like(repl) or callable(repl)):
+ raise TypeError("repl must be a string or callable")
+
+ is_compiled_re = is_re(pat)
+ if regex:
+ if is_compiled_re:
+ if (case is not None) or (flags != 0):
+ raise ValueError("case and flags cannot be set"
+ " when pat is a compiled regex")
+ else:
+ # not a compiled regex
+ # set default case
+ if case is None:
+ case = True
+
+ # add case flag, if provided
+ if case is False:
+ flags |= re.IGNORECASE
+ if is_compiled_re or len(pat) > 1 or flags or callable(repl):
+ n = n if n >= 0 else 0
+ compiled = re.compile(pat, flags=flags)
+ f = lambda x: compiled.sub(repl=repl, string=x, count=n)
+ else:
+ f = lambda x: x.replace(pat, repl, n)
+ else:
+ if is_compiled_re:
+ raise ValueError("Cannot use a compiled regex as replacement "
+ "pattern with regex=False")
+ if callable(repl):
+ raise ValueError("Cannot use a callable replacement when "
+ "regex=False")
+ f = lambda x: x.replace(pat, repl, n)
+
+ return _na_map(f, arr)
+
+
+def str_repeat(arr, repeats):
+ """
+ Duplicate each string in the Series or Index.
+
+ Parameters
+ ----------
+ repeats : int or sequence of int
+        Same value for all (int) or a different value per element (sequence).
+
+ Returns
+ -------
+ Series or Index of object
+ Series or Index of repeated string objects specified by
+ input parameter repeats.
+
+ Examples
+ --------
+ >>> s = pd.Series(['a', 'b', 'c'])
+ >>> s
+ 0 a
+ 1 b
+ 2 c
+
+ Single int repeats string in Series
+
+ >>> s.str.repeat(repeats=2)
+ 0 aa
+ 1 bb
+ 2 cc
+
+ Sequence of int repeats corresponding string in Series
+
+ >>> s.str.repeat(repeats=[1, 2, 3])
+ 0 a
+ 1 bb
+ 2 ccc
+ """
+ if is_scalar(repeats):
+ def rep(x):
+ try:
+ return compat.binary_type.__mul__(x, repeats)
+ except TypeError:
+ return compat.text_type.__mul__(x, repeats)
+
+ return _na_map(rep, arr)
+ else:
+
+ def rep(x, r):
+ try:
+ return compat.binary_type.__mul__(x, r)
+ except TypeError:
+ return compat.text_type.__mul__(x, r)
+
+ repeats = np.asarray(repeats, dtype=object)
+ result = libops.vec_binop(com.values_from_object(arr), repeats, rep)
+ return result
+
+
+def str_match(arr, pat, case=True, flags=0, na=np.nan):
+ """
+ Determine if each string matches a regular expression.
+
+ Parameters
+ ----------
+ pat : string
+ Character sequence or regular expression
+ case : boolean, default True
+ If True, case sensitive
+ flags : int, default 0 (no flags)
+ re module flags, e.g. re.IGNORECASE
+ na : default NaN, fill value for missing values
+
+ Returns
+ -------
+ Series/array of boolean values
+
+ See Also
+ --------
+ contains : Analogous, but less strict, relying on re.search instead of
+ re.match.
+ extract : Extract matched groups.
+ """
+ if not case:
+ flags |= re.IGNORECASE
+
+ regex = re.compile(pat, flags=flags)
+
+ dtype = bool
+ f = lambda x: bool(regex.match(x))
+
+ return _na_map(f, arr, na, dtype=dtype)
+
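+
+# Editor's illustrative sketch (not part of the original pandas sources,
+# never called at runtime): unlike str_contains, str_match anchors the
+# pattern at the start of each string, and missing values stay NaN.
+def _example_str_match():  # pragma: no cover - illustration only
+    arr = np.array(['cat', 'scatter', np.nan], dtype=object)
+    out = str_match(arr, r'cat')
+    assert out[0] and not out[1] and np.isnan(out[2])
+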
+
+def _get_single_group_name(rx):
+ try:
+ return list(rx.groupindex.keys()).pop()
+ except IndexError:
+ return None
+
+
+def _groups_or_na_fun(regex):
+ """Used in both extract_noexpand and extract_frame"""
+ if regex.groups == 0:
+ raise ValueError("pattern contains no capture groups")
+ empty_row = [np.nan] * regex.groups
+
+ def f(x):
+ if not isinstance(x, compat.string_types):
+ return empty_row
+ m = regex.search(x)
+ if m:
+ return [np.nan if item is None else item for item in m.groups()]
+ else:
+ return empty_row
+ return f
+
+
+def _str_extract_noexpand(arr, pat, flags=0):
+ """
+ Find groups in each string in the Series using passed regular
+ expression. This function is called from
+ str_extract(expand=False), and can return Series, DataFrame, or
+ Index.
+
+ """
+ from pandas import DataFrame, Index
+
+ regex = re.compile(pat, flags=flags)
+ groups_or_na = _groups_or_na_fun(regex)
+
+ if regex.groups == 1:
+ result = np.array([groups_or_na(val)[0] for val in arr], dtype=object)
+ name = _get_single_group_name(regex)
+ else:
+ if isinstance(arr, Index):
+ raise ValueError("only one regex group is supported with Index")
+ name = None
+ names = dict(zip(regex.groupindex.values(), regex.groupindex.keys()))
+ columns = [names.get(1 + i, i) for i in range(regex.groups)]
+ if arr.empty:
+ result = DataFrame(columns=columns, dtype=object)
+ else:
+ result = DataFrame(
+ [groups_or_na(val) for val in arr],
+ columns=columns,
+ index=arr.index,
+ dtype=object)
+ return result, name
+
+
+def _str_extract_frame(arr, pat, flags=0):
+ """
+ For each subject string in the Series, extract groups from the
+ first match of regular expression pat. This function is called from
+ str_extract(expand=True), and always returns a DataFrame.
+
+ """
+ from pandas import DataFrame
+
+ regex = re.compile(pat, flags=flags)
+ groups_or_na = _groups_or_na_fun(regex)
+ names = dict(zip(regex.groupindex.values(), regex.groupindex.keys()))
+ columns = [names.get(1 + i, i) for i in range(regex.groups)]
+
+ if len(arr) == 0:
+ return DataFrame(columns=columns, dtype=object)
+ try:
+ result_index = arr.index
+ except AttributeError:
+ result_index = None
+ return DataFrame(
+ [groups_or_na(val) for val in arr],
+ columns=columns,
+ index=result_index,
+ dtype=object)
+
+
+def str_extract(arr, pat, flags=0, expand=True):
+ r"""
+ Extract capture groups in the regex `pat` as columns in a DataFrame.
+
+ For each subject string in the Series, extract groups from the
+ first match of regular expression `pat`.
+
+ Parameters
+ ----------
+ pat : string
+ Regular expression pattern with capturing groups.
+ flags : int, default 0 (no flags)
+ Flags from the ``re`` module, e.g. ``re.IGNORECASE``, that
+ modify regular expression matching for things like case,
+ spaces, etc. For more details, see :mod:`re`.
+ expand : bool, default True
+ If True, return DataFrame with one column per capture group.
+ If False, return a Series/Index if there is one capture group
+ or DataFrame if there are multiple capture groups.
+
+ .. versionadded:: 0.18.0
+
+ Returns
+ -------
+ DataFrame or Series or Index
+ A DataFrame with one row for each subject string, and one
+ column for each group. Any capture group names in regular
+ expression pat will be used for column names; otherwise
+ capture group numbers will be used. The dtype of each result
+ column is always object, even when no match is found. If
+ ``expand=False`` and pat has only one capture group, then
+ return a Series (if subject is a Series) or Index (if subject
+ is an Index).
+
+ See Also
+ --------
+ extractall : Returns all matches (not just the first match).
+
+ Examples
+ --------
+ A pattern with two groups will return a DataFrame with two columns.
+ Non-matches will be NaN.
+
+ >>> s = pd.Series(['a1', 'b2', 'c3'])
+ >>> s.str.extract(r'([ab])(\d)')
+ 0 1
+ 0 a 1
+ 1 b 2
+ 2 NaN NaN
+
+ A pattern may contain optional groups.
+
+ >>> s.str.extract(r'([ab])?(\d)')
+ 0 1
+ 0 a 1
+ 1 b 2
+ 2 NaN 3
+
+ Named groups will become column names in the result.
+
+ >>> s.str.extract(r'(?P<letter>[ab])(?P<digit>\d)')
+ letter digit
+ 0 a 1
+ 1 b 2
+ 2 NaN NaN
+
+ A pattern with one group will return a DataFrame with one column
+ if expand=True.
+
+ >>> s.str.extract(r'[ab](\d)', expand=True)
+ 0
+ 0 1
+ 1 2
+ 2 NaN
+
+ A pattern with one group will return a Series if expand=False.
+
+ >>> s.str.extract(r'[ab](\d)', expand=False)
+ 0 1
+ 1 2
+ 2 NaN
+ dtype: object
+ """
+ if not isinstance(expand, bool):
+ raise ValueError("expand must be True or False")
+ if expand:
+ return _str_extract_frame(arr._orig, pat, flags=flags)
+ else:
+ result, name = _str_extract_noexpand(arr._parent, pat, flags=flags)
+ return arr._wrap_result(result, name=name, expand=expand)
+
+
+def str_extractall(arr, pat, flags=0):
+ r"""
+ For each subject string in the Series, extract groups from all
+ matches of regular expression pat. When each subject string in the
+ Series has exactly one match, extractall(pat).xs(0, level='match')
+ is the same as extract(pat).
+
+ .. versionadded:: 0.18.0
+
+ Parameters
+ ----------
+ pat : str
+ Regular expression pattern with capturing groups.
+ flags : int, default 0 (no flags)
+        A ``re`` module flag, for example ``re.IGNORECASE``. These allow you
+ to modify regular expression matching for things like case, spaces,
+ etc. Multiple flags can be combined with the bitwise OR operator,
+ for example ``re.IGNORECASE | re.MULTILINE``.
+
+ Returns
+ -------
+ DataFrame
+ A ``DataFrame`` with one row for each match, and one column for each
+ group. Its rows have a ``MultiIndex`` with first levels that come from
+ the subject ``Series``. The last level is named 'match' and indexes the
+ matches in each item of the ``Series``. Any capture group names in
+ regular expression pat will be used for column names; otherwise capture
+ group numbers will be used.
+
+ See Also
+ --------
+ extract : Returns first match only (not all matches).
+
+ Examples
+ --------
+ A pattern with one group will return a DataFrame with one column.
+ Indices with no matches will not appear in the result.
+
+ >>> s = pd.Series(["a1a2", "b1", "c1"], index=["A", "B", "C"])
+ >>> s.str.extractall(r"[ab](\d)")
+ 0
+ match
+ A 0 1
+ 1 2
+ B 0 1
+
+ Capture group names are used for column names of the result.
+
+ >>> s.str.extractall(r"[ab](?P<digit>\d)")
+ digit
+ match
+ A 0 1
+ 1 2
+ B 0 1
+
+ A pattern with two groups will return a DataFrame with two columns.
+
+ >>> s.str.extractall(r"(?P<letter>[ab])(?P<digit>\d)")
+ letter digit
+ match
+ A 0 a 1
+ 1 a 2
+ B 0 b 1
+
+ Optional groups that do not match are NaN in the result.
+
+ >>> s.str.extractall(r"(?P<letter>[ab])?(?P<digit>\d)")
+ letter digit
+ match
+ A 0 a 1
+ 1 a 2
+ B 0 b 1
+ C 0 NaN 1
+ """
+
+ regex = re.compile(pat, flags=flags)
+ # the regex must contain capture groups.
+ if regex.groups == 0:
+ raise ValueError("pattern contains no capture groups")
+
+ if isinstance(arr, ABCIndexClass):
+ arr = arr.to_series().reset_index(drop=True)
+
+ names = dict(zip(regex.groupindex.values(), regex.groupindex.keys()))
+ columns = [names.get(1 + i, i) for i in range(regex.groups)]
+ match_list = []
+ index_list = []
+ is_mi = arr.index.nlevels > 1
+
+ for subject_key, subject in arr.iteritems():
+ if isinstance(subject, compat.string_types):
+
+ if not is_mi:
+ subject_key = (subject_key, )
+
+ for match_i, match_tuple in enumerate(regex.findall(subject)):
+ if isinstance(match_tuple, compat.string_types):
+ match_tuple = (match_tuple,)
+ na_tuple = [np.NaN if group == "" else group
+ for group in match_tuple]
+ match_list.append(na_tuple)
+ result_key = tuple(subject_key + (match_i, ))
+ index_list.append(result_key)
+
+ from pandas import MultiIndex
+ index = MultiIndex.from_tuples(
+ index_list, names=arr.index.names + ["match"])
+
+ result = arr._constructor_expanddim(match_list, index=index,
+ columns=columns)
+ return result
+
+
+def str_get_dummies(arr, sep='|'):
+ """
+ Split each string in the Series by sep and return a frame of
+ dummy/indicator variables.
+
+ Parameters
+ ----------
+ sep : string, default "|"
+ String to split on.
+
+ Returns
+ -------
+ dummies : DataFrame
+
+ See Also
+ --------
+ get_dummies
+
+ Examples
+ --------
+ >>> pd.Series(['a|b', 'a', 'a|c']).str.get_dummies()
+ a b c
+ 0 1 1 0
+ 1 1 0 0
+ 2 1 0 1
+
+ >>> pd.Series(['a|b', np.nan, 'a|c']).str.get_dummies()
+ a b c
+ 0 1 1 0
+ 1 0 0 0
+ 2 1 0 1
+ """
+ arr = arr.fillna('')
+ try:
+ arr = sep + arr + sep
+ except TypeError:
+ arr = sep + arr.astype(str) + sep
+
+ tags = set()
+ for ts in arr.str.split(sep):
+ tags.update(ts)
+ tags = sorted(tags - {""})
+
+ dummies = np.empty((len(arr), len(tags)), dtype=np.int64)
+
+ for i, t in enumerate(tags):
+ pat = sep + t + sep
+ dummies[:, i] = lib.map_infer(arr.values, lambda x: pat in x)
+ return dummies, tags
+
+
+def str_join(arr, sep):
+ """
+ Join lists contained as elements in the Series/Index with passed delimiter.
+
+ If the elements of a Series are lists themselves, join the content of these
+ lists using the delimiter passed to the function.
+ This function is an equivalent to :meth:`str.join`.
+
+ Parameters
+ ----------
+ sep : str
+ Delimiter to use between list entries.
+
+ Returns
+ -------
+ Series/Index: object
+ The list entries concatenated by intervening occurrences of the
+ delimiter.
+
+ Raises
+    ------
+ AttributeError
+ If the supplied Series contains neither strings nor lists.
+
+ See Also
+ --------
+ str.join : Standard library version of this method.
+ Series.str.split : Split strings around given separator/delimiter.
+
+ Notes
+ -----
+ If any of the list items is not a string object, the result of the join
+ will be `NaN`.
+
+ Examples
+ --------
+ Example with a list that contains non-string elements.
+
+ >>> s = pd.Series([['lion', 'elephant', 'zebra'],
+ ... [1.1, 2.2, 3.3],
+ ... ['cat', np.nan, 'dog'],
+ ... ['cow', 4.5, 'goat'],
+ ... ['duck', ['swan', 'fish'], 'guppy']])
+ >>> s
+ 0 [lion, elephant, zebra]
+ 1 [1.1, 2.2, 3.3]
+ 2 [cat, nan, dog]
+ 3 [cow, 4.5, goat]
+ 4 [duck, [swan, fish], guppy]
+ dtype: object
+
+ Join all lists using a '-'. The lists containing object(s) of types other
+ than str will produce a NaN.
+
+ >>> s.str.join('-')
+ 0 lion-elephant-zebra
+ 1 NaN
+ 2 NaN
+ 3 NaN
+ 4 NaN
+ dtype: object
+ """
+ return _na_map(sep.join, arr)
+
+
+def str_findall(arr, pat, flags=0):
+ """
+ Find all occurrences of pattern or regular expression in the Series/Index.
+
+ Equivalent to applying :func:`re.findall` to all the elements in the
+ Series/Index.
+
+ Parameters
+ ----------
+ pat : string
+ Pattern or regular expression.
+ flags : int, default 0
+ ``re`` module flags, e.g. `re.IGNORECASE` (default is 0, which means
+ no flags).
+
+ Returns
+ -------
+ Series/Index of lists of strings
+ All non-overlapping matches of pattern or regular expression in each
+ string of this Series/Index.
+
+ See Also
+ --------
+ count : Count occurrences of pattern or regular expression in each string
+ of the Series/Index.
+ extractall : For each string in the Series, extract groups from all matches
+ of regular expression and return a DataFrame with one row for each
+ match and one column for each group.
+    re.findall : The equivalent ``re`` function to return all non-overlapping
+        matches of pattern or regular expression in string, as a list of
+        strings.
+
+ Examples
+ --------
+
+ >>> s = pd.Series(['Lion', 'Monkey', 'Rabbit'])
+
+ The search for the pattern 'Monkey' returns one match:
+
+ >>> s.str.findall('Monkey')
+ 0 []
+ 1 [Monkey]
+ 2 []
+ dtype: object
+
+ On the other hand, the search for the pattern 'MONKEY' doesn't return any
+ match:
+
+ >>> s.str.findall('MONKEY')
+ 0 []
+ 1 []
+ 2 []
+ dtype: object
+
+ Flags can be added to the pattern or regular expression. For instance,
+ to find the pattern 'MONKEY' ignoring the case:
+
+ >>> import re
+ >>> s.str.findall('MONKEY', flags=re.IGNORECASE)
+ 0 []
+ 1 [Monkey]
+ 2 []
+ dtype: object
+
+ When the pattern matches more than one string in the Series, all matches
+ are returned:
+
+ >>> s.str.findall('on')
+ 0 [on]
+ 1 [on]
+ 2 []
+ dtype: object
+
+ Regular expressions are supported too. For instance, the search for all the
+ strings ending with the word 'on' is shown next:
+
+ >>> s.str.findall('on$')
+ 0 [on]
+ 1 []
+ 2 []
+ dtype: object
+
+ If the pattern is found more than once in the same string, then a list of
+ multiple strings is returned:
+
+ >>> s.str.findall('b')
+ 0 []
+ 1 []
+ 2 [b, b]
+ dtype: object
+ """
+ regex = re.compile(pat, flags=flags)
+ return _na_map(regex.findall, arr)
+
+
+def str_find(arr, sub, start=0, end=None, side='left'):
+ """
+    Return indexes in each string in the Series/Index where the
+ substring is fully contained between [start:end]. Return -1 on failure.
+
+ Parameters
+ ----------
+ sub : str
+ Substring being searched
+ start : int
+ Left edge index
+ end : int
+ Right edge index
+ side : {'left', 'right'}, default 'left'
+ Specifies a starting side, equivalent to ``find`` or ``rfind``
+
+ Returns
+ -------
+ found : Series/Index of integer values
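+
+ Examples
+ --------
+ A short illustrative example with sample data, using the accessor
+ methods that wrap this helper:
+
+ >>> s = pd.Series(['hat', 'banana'])
+ >>> s.str.find('a')
+ 0 1
+ 1 1
+ dtype: int64
+
+ >>> s.str.rfind('a')
+ 0 1
+ 1 5
+ dtype: int64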
+ """
+
+ if not isinstance(sub, compat.string_types):
+ msg = 'expected a string object, not {0}'
+ raise TypeError(msg.format(type(sub).__name__))
+
+ if side == 'left':
+ method = 'find'
+ elif side == 'right':
+ method = 'rfind'
+ else: # pragma: no cover
+ raise ValueError('Invalid side')
+
+ if end is None:
+ f = lambda x: getattr(x, method)(sub, start)
+ else:
+ f = lambda x: getattr(x, method)(sub, start, end)
+
+ return _na_map(f, arr, dtype=int)
+
+
+def str_index(arr, sub, start=0, end=None, side='left'):
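+ # Mirrors str_find above, but dispatches to str.index / str.rindex, which
+ # raise ValueError (rather than returning -1) when the substring is missing.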
+ if not isinstance(sub, compat.string_types):
+ msg = 'expected a string object, not {0}'
+ raise TypeError(msg.format(type(sub).__name__))
+
+ if side == 'left':
+ method = 'index'
+ elif side == 'right':
+ method = 'rindex'
+ else: # pragma: no cover
+ raise ValueError('Invalid side')
+
+ if end is None:
+ f = lambda x: getattr(x, method)(sub, start)
+ else:
+ f = lambda x: getattr(x, method)(sub, start, end)
+
+ return _na_map(f, arr, dtype=int)
+
+
+def str_pad(arr, width, side='left', fillchar=' '):
+ """
+ Pad strings in the Series/Index up to width.
+
+ Parameters
+ ----------
+ width : int
+ Minimum width of resulting string; additional characters will be filled
+ with the character defined in `fillchar`.
+ side : {'left', 'right', 'both'}, default 'left'
+ Side from which to fill resulting string.
+ fillchar : str, default ' '
+ Additional character for filling, default is whitespace.
+
+ Returns
+ -------
+ Series or Index of object
+ Returns Series or Index with strings of at least `width` characters.
+
+ See Also
+ --------
+ Series.str.rjust : Fills the left side of strings with an arbitrary
+ character. Equivalent to ``Series.str.pad(side='left')``.
+ Series.str.ljust : Fills the right side of strings with an arbitrary
+ character. Equivalent to ``Series.str.pad(side='right')``.
+ Series.str.center : Fills both sides of strings with an arbitrary
+ character. Equivalent to ``Series.str.pad(side='both')``.
+ Series.str.zfill : Pad strings in the Series/Index by prepending '0'
+ character. Equivalent to ``Series.str.pad(side='left', fillchar='0')``.
+
+ Examples
+ --------
+ >>> s = pd.Series(["caribou", "tiger"])
+ >>> s
+ 0 caribou
+ 1 tiger
+ dtype: object
+
+ >>> s.str.pad(width=10)
+ 0 caribou
+ 1 tiger
+ dtype: object
+
+ >>> s.str.pad(width=10, side='right', fillchar='-')
+ 0 caribou---
+ 1 tiger-----
+ dtype: object
+
+ >>> s.str.pad(width=10, side='both', fillchar='-')
+ 0 -caribou--
+ 1 --tiger---
+ dtype: object
+ """
+ if not isinstance(fillchar, compat.string_types):
+ msg = 'fillchar must be a character, not {0}'
+ raise TypeError(msg.format(type(fillchar).__name__))
+
+ if len(fillchar) != 1:
+ raise TypeError('fillchar must be a character, not str')
+
+ if not is_integer(width):
+ msg = 'width must be of integer type, not {0}'
+ raise TypeError(msg.format(type(width).__name__))
+
+ if side == 'left':
+ f = lambda x: x.rjust(width, fillchar)
+ elif side == 'right':
+ f = lambda x: x.ljust(width, fillchar)
+ elif side == 'both':
+ f = lambda x: x.center(width, fillchar)
+ else: # pragma: no cover
+ raise ValueError('Invalid side')
+
+ return _na_map(f, arr)
+
+
+def str_split(arr, pat=None, n=None):
+
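+ # Dispatch on the pattern: a missing or single-character `pat` uses
+ # Python's str.split, while longer patterns are compiled and split via the
+ # `re` module. Note the differing "no limit" sentinels: -1 for str.split,
+ # maxsplit=0 for re.split.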
+ if pat is None:
+ if n is None or n == 0:
+ n = -1
+ f = lambda x: x.split(pat, n)
+ else:
+ if len(pat) == 1:
+ if n is None or n == 0:
+ n = -1
+ f = lambda x: x.split(pat, n)
+ else:
+ if n is None or n == -1:
+ n = 0
+ regex = re.compile(pat)
+ f = lambda x: regex.split(x, maxsplit=n)
+ res = _na_map(f, arr)
+ return res
+
+
+def str_rsplit(arr, pat=None, n=None):
+
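+ # The `re` module has no rsplit counterpart, so `pat` is always treated as
+ # a literal separator here (unlike str_split above).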
+ if n is None or n == 0:
+ n = -1
+ f = lambda x: x.rsplit(pat, n)
+ res = _na_map(f, arr)
+ return res
+
+
+def str_slice(arr, start=None, stop=None, step=None):
+ """
+ Slice substrings from each element in the Series or Index.
+
+ Parameters
+ ----------
+ start : int, optional
+ Start position for slice operation.
+ stop : int, optional
+ Stop position for slice operation.
+ step : int, optional
+ Step size for slice operation.
+
+ Returns
+ -------
+ Series or Index of object
+ Series or Index from sliced substring from original string object.
+
+ See Also
+ --------
+ Series.str.slice_replace : Replace a slice with a string.
+ Series.str.get : Return element at position.
+ Equivalent to `Series.str.slice(start=i, stop=i+1)` with `i`
+ being the position.
+
+ Examples
+ --------
+ >>> s = pd.Series(["koala", "fox", "chameleon"])
+ >>> s
+ 0 koala
+ 1 fox
+ 2 chameleon
+ dtype: object
+
+ >>> s.str.slice(start=1)
+ 0 oala
+ 1 ox
+ 2 hameleon
+ dtype: object
+
+ >>> s.str.slice(stop=2)
+ 0 ko
+ 1 fo
+ 2 ch
+ dtype: object
+
+ >>> s.str.slice(step=2)
+ 0 kaa
+ 1 fx
+ 2 caeen
+ dtype: object
+
+ >>> s.str.slice(start=0, stop=5, step=3)
+ 0 kl
+ 1 f
+ 2 cm
+ dtype: object
+
+ Equivalent behaviour to:
+
+ >>> s.str[0:5:3]
+ 0 kl
+ 1 f
+ 2 cm
+ dtype: object
+ """
+ obj = slice(start, stop, step)
+ f = lambda x: x[obj]
+ return _na_map(f, arr)
+
+
+def str_slice_replace(arr, start=None, stop=None, repl=None):
+ """
+ Replace a positional slice of a string with another value.
+
+ Parameters
+ ----------
+ start : int, optional
+ Left index position to use for the slice. If not specified (None),
+ the slice is unbounded on the left, i.e. slice from the start
+ of the string.
+ stop : int, optional
+ Right index position to use for the slice. If not specified (None),
+ the slice is unbounded on the right, i.e. slice until the
+ end of the string.
+ repl : str, optional
+ String for replacement. If not specified (None), the sliced region
+ is replaced with an empty string.
+
+ Returns
+ -------
+ replaced : Series or Index
+ Same type as the original object.
+
+ See Also
+ --------
+ Series.str.slice : Just slicing without replacement.
+
+ Examples
+ --------
+ >>> s = pd.Series(['a', 'ab', 'abc', 'abdc', 'abcde'])
+ >>> s
+ 0 a
+ 1 ab
+ 2 abc
+ 3 abdc
+ 4 abcde
+ dtype: object
+
+ Specify just `start`, meaning replace `start` until the end of the
+ string with `repl`.
+
+ >>> s.str.slice_replace(1, repl='X')
+ 0 aX
+ 1 aX
+ 2 aX
+ 3 aX
+ 4 aX
+ dtype: object
+
+ Specify just `stop`, meaning the start of the string to `stop` is replaced
+ with `repl`, and the rest of the string is included.
+
+ >>> s.str.slice_replace(stop=2, repl='X')
+ 0 X
+ 1 X
+ 2 Xc
+ 3 Xdc
+ 4 Xcde
+ dtype: object
+
+ Specify `start` and `stop`, meaning the slice from `start` to `stop` is
+ replaced with `repl`. Everything before or after `start` and `stop` is
+ included as is.
+
+ >>> s.str.slice_replace(start=1, stop=3, repl='X')
+ 0 aX
+ 1 aX
+ 2 aX
+ 3 aXc
+ 4 aXde
+ dtype: object
+ """
+ if repl is None:
+ repl = ''
+
+ def f(x):
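+ # When the slice selects no characters (e.g. `start` is past the end of
+ # the string), resume the tail at `start` rather than `stop`, so `repl`
+ # is inserted without dropping any existing characters.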
+ if x[start:stop] == '':
+ local_stop = start
+ else:
+ local_stop = stop
+ y = ''
+ if start is not None:
+ y += x[:start]
+ y += repl
+ if stop is not None:
+ y += x[local_stop:]
+ return y
+
+ return _na_map(f, arr)
+
+
+def str_strip(arr, to_strip=None, side='both'):
+ """
+ Strip whitespace (including newlines) from each string in the
+ Series/Index.
+
+ Parameters
+ ----------
+ to_strip : str or unicode
+ side : {'left', 'right', 'both'}, default 'both'
+
+ Returns
+ -------
+ stripped : Series/Index of objects
+ """
+ if side == 'both':
+ f = lambda x: x.strip(to_strip)
+ elif side == 'left':
+ f = lambda x: x.lstrip(to_strip)
+ elif side == 'right':
+ f = lambda x: x.rstrip(to_strip)
+ else: # pragma: no cover
+ raise ValueError('Invalid side')
+ return _na_map(f, arr)
+
+
+def str_wrap(arr, width, **kwargs):
+ r"""
+ Wrap long strings in the Series/Index to be formatted in
+ paragraphs with length less than a given width.
+
+ This method has the same keyword parameters and defaults as
+ :class:`textwrap.TextWrapper`.
+
+ Parameters
+ ----------
+ width : int
+ Maximum line-width
+ expand_tabs : bool, optional
+ If true, tab characters will be expanded to spaces (default: True)
+ replace_whitespace : bool, optional
+ If true, each whitespace character (as defined by string.whitespace)
+ remaining after tab expansion will be replaced by a single space
+ (default: True)
+ drop_whitespace : bool, optional
+ If true, whitespace that, after wrapping, happens to end up at the
+ beginning or end of a line is dropped (default: True)
+ break_long_words : bool, optional
+ If true, then words longer than width will be broken in order to ensure
+ that no lines are longer than width. If it is false, long words will
+ not be broken, and some lines may be longer than width. (default: True)
+ break_on_hyphens : bool, optional
+ If true, wrapping will occur preferably on whitespace and right after
+ hyphens in compound words, as it is customary in English. If false,
+ only whitespaces will be considered as potentially good places for line
+ breaks, but you need to set break_long_words to false if you want truly
+ insecable words. (default: True)
+
+ Returns
+ -------
+ wrapped : Series/Index of objects
+
+ Notes
+ -----
+ Internally, this method uses a :class:`textwrap.TextWrapper` instance with
+ default settings. To achieve behavior matching R's stringr library str_wrap
+ function, use the arguments:
+
+ - expand_tabs = False
+ - replace_whitespace = True
+ - drop_whitespace = True
+ - break_long_words = False
+ - break_on_hyphens = False
+
+ Examples
+ --------
+
+ >>> s = pd.Series(['line to be wrapped', 'another line to be wrapped'])
+ >>> s.str.wrap(12)
+ 0 line to be\nwrapped
+ 1 another line\nto be\nwrapped
+ """
+ kwargs['width'] = width
+
+ tw = textwrap.TextWrapper(**kwargs)
+
+ return _na_map(lambda s: '\n'.join(tw.wrap(s)), arr)
+
+
+def str_translate(arr, table, deletechars=None):
+ """
+ Map all characters in the string through the given mapping table.
+ Equivalent to standard :meth:`str.translate`. Note that the optional
+ argument deletechars is only valid if you are using python 2. For python 3,
+ character deletion should be specified via the table argument.
+
+ Parameters
+ ----------
+ table : dict (python 3), str or None (python 2)
+ In python 3, table is a mapping of Unicode ordinals to Unicode
+ ordinals, strings, or None. Unmapped characters are left untouched.
+ Characters mapped to None are deleted. :meth:`str.maketrans` is a
+ helper function for making translation tables.
+ In python 2, table is either a string of length 256 or None. If the
+ table argument is None, no translation is applied and the operation
+ simply removes the characters in deletechars. :func:`string.maketrans`
+ is a helper function for making translation tables.
+ deletechars : str, optional (python 2)
+ A string of characters to delete. This argument is only valid
+ in python 2.
+
+ Returns
+ -------
+ translated : Series/Index of objects
+ """
+ if deletechars is None:
+ f = lambda x: x.translate(table)
+ else:
+ if compat.PY3:
+ raise ValueError("deletechars is not a valid argument for "
+ "str.translate in python 3. You should simply "
+ "specify character deletions in the table "
+ "argument")
+ f = lambda x: x.translate(table, deletechars)
+ return _na_map(f, arr)
+
+
+def str_get(arr, i):
+ """
+ Extract element from each component at specified position.
+
+ Extract element from lists, tuples, or strings in each element in the
+ Series/Index.
+
+ Parameters
+ ----------
+ i : int
+ Position of element to extract.
+
+ Returns
+ -------
+ items : Series/Index of objects
+
+ Examples
+ --------
+ >>> s = pd.Series(["String",
+ ... (1, 2, 3),
+ ... ["a", "b", "c"],
+ ... 123, -456,
+ ... {1: "Hello", "2": "World"}])
+ >>> s
+ 0 String
+ 1 (1, 2, 3)
+ 2 [a, b, c]
+ 3 123
+ 4 -456
+ 5 {1: 'Hello', '2': 'World'}
+ dtype: object
+
+ >>> s.str.get(1)
+ 0 t
+ 1 2
+ 2 b
+ 3 NaN
+ 4 NaN
+ 5 Hello
+ dtype: object
+
+ >>> s.str.get(-1)
+ 0 g
+ 1 3
+ 2 c
+ 3 NaN
+ 4 NaN
+ 5 NaN
+ dtype: object
+ """
+ def f(x):
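+ # Dict-like elements use .get; for sequences, return x[i] only when i is
+ # within bounds (positive or negative), otherwise fall back to NaN.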
+ if isinstance(x, dict):
+ return x.get(i)
+ elif len(x) > i >= -len(x):
+ return x[i]
+ return np.nan
+ return _na_map(f, arr)
+
+
+def str_decode(arr, encoding, errors="strict"):
+ """
+ Decode character string in the Series/Index using indicated encoding.
+ Equivalent to :meth:`str.decode` in python2 and :meth:`bytes.decode` in
+ python3.
+
+ Parameters
+ ----------
+ encoding : str
+ errors : str, optional
+
+ Returns
+ -------
+ decoded : Series/Index of objects
+ """
+ if encoding in _cpython_optimized_decoders:
+ # CPython optimized implementation
+ f = lambda x: x.decode(encoding, errors)
+ else:
+ decoder = codecs.getdecoder(encoding)
+ f = lambda x: decoder(x, errors)[0]
+ return _na_map(f, arr)
+
+
+def str_encode(arr, encoding, errors="strict"):
+ """
+ Encode character string in the Series/Index using indicated encoding.
+ Equivalent to :meth:`str.encode`.
+
+ Parameters
+ ----------
+ encoding : str
+ errors : str, optional
+
+ Returns
+ -------
+ encoded : Series/Index of objects
+ """
+ if encoding in _cpython_optimized_encoders:
+ # CPython optimized implementation
+ f = lambda x: x.encode(encoding, errors)
+ else:
+ encoder = codecs.getencoder(encoding)
+ f = lambda x: encoder(x, errors)[0]
+ return _na_map(f, arr)
+
+
+def _noarg_wrapper(f, docstring=None, **kargs):
+ def wrapper(self):
+ result = _na_map(f, self._parent, **kargs)
+ return self._wrap_result(result)
+
+ wrapper.__name__ = f.__name__
+ if docstring is not None:
+ wrapper.__doc__ = docstring
+ else:
+ raise ValueError('Provide docstring')
+
+ return wrapper
+
+
+def _pat_wrapper(f, flags=False, na=False, **kwargs):
+ def wrapper1(self, pat):
+ result = f(self._parent, pat)
+ return self._wrap_result(result)
+
+ def wrapper2(self, pat, flags=0, **kwargs):
+ result = f(self._parent, pat, flags=flags, **kwargs)
+ return self._wrap_result(result)
+
+ def wrapper3(self, pat, na=np.nan):
+ result = f(self._parent, pat, na=na)
+ return self._wrap_result(result)
+
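+ # Pick the wrapper whose signature matches the keywords the wrapped
+ # function accepts: `na` takes precedence over `flags`; otherwise only the
+ # pattern is forwarded.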
+ wrapper = wrapper3 if na else wrapper2 if flags else wrapper1
+
+ wrapper.__name__ = f.__name__
+ if f.__doc__:
+ wrapper.__doc__ = f.__doc__
+
+ return wrapper
+
+
+def copy(source):
+ "Copy a docstring from another source function (if present)"
+
+ def do_copy(target):
+ if source.__doc__:
+ target.__doc__ = source.__doc__
+ return target
+
+ return do_copy
+
+
+class StringMethods(NoNewAttributesMixin):
+ """
+ Vectorized string functions for Series and Index. NAs stay NA unless
+ handled otherwise by a particular method. Patterned after Python's string
+ methods, with some inspiration from R's stringr package.
+
+ Examples
+ --------
+ >>> s.str.split('_')
+ >>> s.str.replace('_', '')
+ """
+
+ def __init__(self, data):
+ self._validate(data)
+ self._is_categorical = is_categorical_dtype(data)
+
+ # .values.categories works for both Series/Index
+ self._parent = data.values.categories if self._is_categorical else data
+ # save orig to blow up categoricals to the right type
+ self._orig = data
+ self._freeze()
+
+ @staticmethod
+ def _validate(data):
+ from pandas.core.index import Index
+
+ if (isinstance(data, ABCSeries) and
+ not ((is_categorical_dtype(data.dtype) and
+ is_object_dtype(data.values.categories)) or
+ (is_object_dtype(data.dtype)))):
+ # it's neither a string series nor a categorical series with
+ # strings inside the categories.
+ # this really should exclude all series with any non-string values
+ # (instead of test for object dtype), but that isn't practical for
+ # performance reasons until we have a str dtype (GH 9343)
+ raise AttributeError("Can only use .str accessor with string "
+ "values, which use np.object_ dtype in "
+ "pandas")
+ elif isinstance(data, Index):
+ # can't use ABCIndex to exclude non-str
+
+ # see src/inference.pyx which can contain string values
+ allowed_types = ('string', 'unicode', 'mixed', 'mixed-integer')
+ if is_categorical_dtype(data.dtype):
+ inf_type = data.categories.inferred_type
+ else:
+ inf_type = data.inferred_type
+ if inf_type not in allowed_types:
+ message = ("Can only use .str accessor with string values "
+ "(i.e. inferred_type is 'string', 'unicode' or "
+ "'mixed')")
+ raise AttributeError(message)
+ if data.nlevels > 1:
+ message = ("Can only use .str accessor with Index, not "
+ "MultiIndex")
+ raise AttributeError(message)
+
+ def __getitem__(self, key):
+ if isinstance(key, slice):
+ return self.slice(start=key.start, stop=key.stop, step=key.step)
+ else:
+ return self.get(key)
+
+ def __iter__(self):
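+ # Yield the i-th element of every string (via .get) for increasing i,
+ # stopping once every entry at that position is NA, i.e. all strings
+ # have been exhausted.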
+ i = 0
+ g = self.get(i)
+ while g.notna().any():
+ yield g
+ i += 1
+ g = self.get(i)
+
+ def _wrap_result(self, result, use_codes=True,
+ name=None, expand=None, fill_value=np.nan):
+
+ from pandas import Index, Series, MultiIndex
+
+ # for category, we do the stuff on the categories, so blow it up
+ # to the full series again
+ # But for some operations, we have to do the stuff on the full values,
+ # so make it possible to skip this step as the method already did this
+ # before the transformation...
+ if use_codes and self._is_categorical:
+ # if self._orig is a CategoricalIndex, there is no .cat-accessor
+ result = take_1d(result, Series(self._orig, copy=False).cat.codes,
+ fill_value=fill_value)
+
+ if not hasattr(result, 'ndim') or not hasattr(result, 'dtype'):
+ return result
+ assert result.ndim < 3
+
+ if expand is None:
+ # infer from ndim if expand is not specified
+ expand = False if result.ndim == 1 else True
+
+ elif expand is True and not isinstance(self._orig, Index):
+ # required when expand=True is explicitly specified
+ # not needed when inferred
+
+ def cons_row(x):
+ if is_list_like(x):
+ return x
+ else:
+ return [x]
+
+ result = [cons_row(x) for x in result]
+ if result:
+ # propagate nan values to match longest sequence (GH 18450)
+ max_len = max(len(x) for x in result)
+ result = [x * max_len if len(x) == 0 or x[0] is np.nan
+ else x for x in result]
+
+ if not isinstance(expand, bool):
+ raise ValueError("expand must be True or False")
+
+ if expand is False:
+ # if expand is False, result should have the same name
+ # as the original otherwise specified
+ if name is None:
+ name = getattr(result, 'name', None)
+ if name is None:
+ # do not use logical or, _orig may be a DataFrame
+ # which has "name" column
+ name = self._orig.name
+
+ # Wait until we are sure result is a Series or Index before
+ # checking attributes (GH 12180)
+ if isinstance(self._orig, Index):
+ # if result is a boolean np.array, return the np.array
+ # instead of wrapping it into a boolean Index (GH 8875)
+ if is_bool_dtype(result):
+ return result
+
+ if expand:
+ result = list(result)
+ out = MultiIndex.from_tuples(result, names=name)
+ if out.nlevels == 1:
+ # We had all tuples of length-one, which are
+ # better represented as a regular Index.
+ out = out.get_level_values(0)
+ return out
+ else:
+ return Index(result, name=name)
+ else:
+ index = self._orig.index
+ if expand:
+ cons = self._orig._constructor_expanddim
+ return cons(result, columns=name, index=index)
+ else:
+ # Must be a Series
+ cons = self._orig._constructor
+ return cons(result, name=name, index=index)
+
+ def _get_series_list(self, others, ignore_index=False):
+ """
+ Auxiliary function for :meth:`str.cat`. Turn potentially mixed input
+ into a list of Series (elements without an index must match the length
+ of the calling Series/Index).
+
+ Parameters
+ ----------
+ others : Series, Index, DataFrame, np.ndarray, list-like or list-like
+ of objects that are Series, Index or np.ndarray (1-dim)
+ ignore_index : boolean, default False
+ Determines whether to forcefully align others with index of caller
+
+ Returns
+ -------
+ tuple : (others transformed into list of Series,
+ boolean whether FutureWarning should be raised)
+ """
+
+ # Once str.cat defaults to alignment, this function can be simplified;
+ # will not need `ignore_index` and the second boolean output anymore
+
+ from pandas import Index, Series, DataFrame
+
+ # self._orig is either Series or Index
+ idx = self._orig if isinstance(self._orig, Index) else self._orig.index
+
+ err_msg = ('others must be Series, Index, DataFrame, np.ndarray or '
+ 'list-like (either containing only strings or containing '
+ 'only objects of type Series/Index/list-like/np.ndarray)')
+
+ # Generally speaking, all objects without an index inherit the index
+ # `idx` of the calling Series/Index - i.e. must have matching length.
+ # Objects with an index (i.e. Series/Index/DataFrame) keep their own
+ # index, *unless* ignore_index is set to True.
+ if isinstance(others, Series):
+ warn = not others.index.equals(idx)
+ # only reconstruct Series when absolutely necessary
+ los = [Series(others.values, index=idx)
+ if ignore_index and warn else others]
+ return (los, warn)
+ elif isinstance(others, Index):
+ warn = not others.equals(idx)
+ los = [Series(others.values,
+ index=(idx if ignore_index else others))]
+ return (los, warn)
+ elif isinstance(others, DataFrame):
+ warn = not others.index.equals(idx)
+ if ignore_index and warn:
+ # without copy, this could change "others"
+ # that was passed to str.cat
+ others = others.copy()
+ others.index = idx
+ return ([others[x] for x in others], warn)
+ elif isinstance(others, np.ndarray) and others.ndim == 2:
+ others = DataFrame(others, index=idx)
+ return ([others[x] for x in others], False)
+ elif is_list_like(others, allow_sets=False):
+ others = list(others) # ensure iterators do not get read twice etc
+
+ # in case of list-like `others`, all elements must be
+ # either one-dimensional list-likes or scalars
+ if all(is_list_like(x, allow_sets=False) for x in others):
+ los = []
+ join_warn = False
+ depr_warn = False
+ # iterate through list and append list of series for each
+ # element (which we check to be one-dimensional and non-nested)
+ while others:
+ nxt = others.pop(0) # nxt is guaranteed list-like by above
+
+ # GH 21950 - DeprecationWarning
+ # only allowing Series/Index/np.ndarray[1-dim] will greatly
+ # simplify this function post-deprecation.
+ if not (isinstance(nxt, (Series, Index)) or
+ (isinstance(nxt, np.ndarray) and nxt.ndim == 1)):
+ depr_warn = True
+
+ if not isinstance(nxt, (DataFrame, Series,
+ Index, np.ndarray)):
+ # safety for non-persistent list-likes (e.g. iterators)
+ # do not map indexed/typed objects; info needed below
+ nxt = list(nxt)
+
+ # known types for which we can avoid deep inspection
+ no_deep = ((isinstance(nxt, np.ndarray) and nxt.ndim == 1)
+ or isinstance(nxt, (Series, Index)))
+ # nested list-likes are forbidden:
+ # -> elements of nxt must not be list-like
+ is_legal = ((no_deep and nxt.dtype == object)
+ or all(not is_list_like(x) for x in nxt))
+
+ # DataFrame is false positive of is_legal
+ # because "x in df" returns column names
+ if not is_legal or isinstance(nxt, DataFrame):
+ raise TypeError(err_msg)
+
+ nxt, wnx = self._get_series_list(nxt,
+ ignore_index=ignore_index)
+ los = los + nxt
+ join_warn = join_warn or wnx
+
+ if depr_warn:
+ warnings.warn('list-likes other than Series, Index, or '
+ 'np.ndarray WITHIN another list-like are '
+ 'deprecated and will be removed in a future '
+ 'version.', FutureWarning, stacklevel=3)
+ return (los, join_warn)
+ elif all(not is_list_like(x) for x in others):
+ return ([Series(others, index=idx)], False)
+ raise TypeError(err_msg)
+
+ def cat(self, others=None, sep=None, na_rep=None, join=None):
+ """
+ Concatenate strings in the Series/Index with given separator.
+
+ If `others` is specified, this function concatenates the Series/Index
+ and elements of `others` element-wise.
+ If `others` is not passed, then all values in the Series/Index are
+ concatenated into a single string with a given `sep`.
+
+ Parameters
+ ----------
+ others : Series, Index, DataFrame, np.ndarray or list-like
+ Series, Index, DataFrame, np.ndarray (one- or two-dimensional) and
+ other list-likes of strings must have the same length as the
+ calling Series/Index, with the exception of indexed objects (i.e.
+ Series/Index/DataFrame) if `join` is not None.
+
+ If others is a list-like that contains a combination of Series,
+ Index or np.ndarray (1-dim), then all elements will be unpacked and
+ must satisfy the above criteria individually.
+
+ If others is None, the method returns the concatenation of all
+ strings in the calling Series/Index.
+ sep : str, default ''
+ The separator between the different elements/columns. By default
+ the empty string `''` is used.
+ na_rep : str or None, default None
+ Representation that is inserted for all missing values:
+
+ - If `na_rep` is None, and `others` is None, missing values in the
+ Series/Index are omitted from the result.
+ - If `na_rep` is None, and `others` is not None, a row containing a
+ missing value in any of the columns (before concatenation) will
+ have a missing value in the result.
+ join : {'left', 'right', 'outer', 'inner'}, default None
+ Determines the join-style between the calling Series/Index and any
+ Series/Index/DataFrame in `others` (objects without an index need
+ to match the length of the calling Series/Index). If None,
+ alignment is disabled, but this option will be removed in a future
+ version of pandas and replaced with a default of `'left'`. To
+ disable alignment, use `.values` on any Series/Index/DataFrame in
+ `others`.
+
+ .. versionadded:: 0.23.0
+
+ Returns
+ -------
+ concat : str or Series/Index of objects
+ If `others` is None, `str` is returned, otherwise a `Series/Index`
+ (same type as caller) of objects is returned.
+
+ See Also
+ --------
+ split : Split each string in the Series/Index.
+ join : Join lists contained as elements in the Series/Index.
+
+ Examples
+ --------
+ When not passing `others`, all values are concatenated into a single
+ string:
+
+ >>> s = pd.Series(['a', 'b', np.nan, 'd'])
+ >>> s.str.cat(sep=' ')
+ 'a b d'
+
+ By default, NA values in the Series are ignored. Using `na_rep`, they
+ can be given a representation:
+
+ >>> s.str.cat(sep=' ', na_rep='?')
+ 'a b ? d'
+
+ If `others` is specified, corresponding values are concatenated with
+ the separator. Result will be a Series of strings.
+
+ >>> s.str.cat(['A', 'B', 'C', 'D'], sep=',')
+ 0 a,A
+ 1 b,B
+ 2 NaN
+ 3 d,D
+ dtype: object
+
+ Missing values will remain missing in the result, but can again be
+ represented using `na_rep`
+
+ >>> s.str.cat(['A', 'B', 'C', 'D'], sep=',', na_rep='-')
+ 0 a,A
+ 1 b,B
+ 2 -,C
+ 3 d,D
+ dtype: object
+
+ If `sep` is not specified, the values are concatenated without
+ separation.
+
+ >>> s.str.cat(['A', 'B', 'C', 'D'], na_rep='-')
+ 0 aA
+ 1 bB
+ 2 -C
+ 3 dD
+ dtype: object
+
+ Series with different indexes can be aligned before concatenation. The
+ `join`-keyword works as in other methods.
+
+ >>> t = pd.Series(['d', 'a', 'e', 'c'], index=[3, 0, 4, 2])
+ >>> s.str.cat(t, join='left', na_rep='-')
+ 0 aa
+ 1 b-
+ 2 -c
+ 3 dd
+ dtype: object
+ >>>
+ >>> s.str.cat(t, join='outer', na_rep='-')
+ 0 aa
+ 1 b-
+ 2 -c
+ 3 dd
+ 4 -e
+ dtype: object
+ >>>
+ >>> s.str.cat(t, join='inner', na_rep='-')
+ 0 aa
+ 2 -c
+ 3 dd
+ dtype: object
+ >>>
+ >>> s.str.cat(t, join='right', na_rep='-')
+ 3 dd
+ 0 aa
+ 4 -e
+ 2 -c
+ dtype: object
+
+ For more examples, see :ref:`here <text.concatenate>`.
+ """
+ from pandas import Index, Series, concat
+
+ if isinstance(others, compat.string_types):
+ raise ValueError("Did you mean to supply a `sep` keyword?")
+ if sep is None:
+ sep = ''
+
+ if isinstance(self._orig, Index):
+ data = Series(self._orig, index=self._orig)
+ else: # Series
+ data = self._orig
+
+ # concatenate Series/Index with itself if no "others"
+ if others is None:
+ data = ensure_object(data)
+ na_mask = isna(data)
+ if na_rep is None and na_mask.any():
+ data = data[~na_mask]
+ elif na_rep is not None and na_mask.any():
+ data = np.where(na_mask, na_rep, data)
+ return sep.join(data)
+
+ try:
+ # turn anything in "others" into lists of Series
+ others, warn = self._get_series_list(others,
+ ignore_index=(join is None))
+ except ValueError: # do not catch TypeError raised by _get_series_list
+ if join is None:
+ raise ValueError('All arrays must be same length, except '
+ 'those having an index if `join` is not None')
+ else:
+ raise ValueError('If `others` contains arrays or lists (or '
+ 'other list-likes without an index), these '
+ 'must all be of the same length as the '
+ 'calling Series/Index.')
+
+ if join is None and warn:
+ warnings.warn("A future version of pandas will perform index "
+ "alignment when `others` is a Series/Index/"
+ "DataFrame (or a list-like containing one). To "
+ "disable alignment (the behavior before v.0.23) and "
+ "silence this warning, use `.values` on any Series/"
+ "Index/DataFrame in `others`. To enable alignment "
+ "and silence this warning, pass `join='left'|"
+ "'outer'|'inner'|'right'`. The future default will "
+ "be `join='left'`.", FutureWarning, stacklevel=2)
+
+ # if join is None, _get_series_list already force-aligned indexes
+ join = 'left' if join is None else join
+
+ # align if required
+ if any(not data.index.equals(x.index) for x in others):
+ # Need to add keys for uniqueness in case of duplicate columns
+ others = concat(others, axis=1,
+ join=(join if join == 'inner' else 'outer'),
+ keys=range(len(others)), sort=False, copy=False)
+ data, others = data.align(others, join=join)
+ others = [others[x] for x in others] # again list of Series
+
+ all_cols = [ensure_object(x) for x in [data] + others]
+ na_masks = np.array([isna(x) for x in all_cols])
+ union_mask = np.logical_or.reduce(na_masks, axis=0)
+
+ if na_rep is None and union_mask.any():
+ # no na_rep means NaNs for all rows where any column has a NaN
+ # only necessary if there are actually any NaNs
+ result = np.empty(len(data), dtype=object)
+ np.putmask(result, union_mask, np.nan)
+
+ not_masked = ~union_mask
+ result[not_masked] = cat_core([x[not_masked] for x in all_cols],
+ sep)
+ elif na_rep is not None and union_mask.any():
+ # fill NaNs with na_rep in case there are actually any NaNs
+ all_cols = [np.where(nm, na_rep, col)
+ for nm, col in zip(na_masks, all_cols)]
+ result = cat_core(all_cols, sep)
+ else:
+ # no NaNs - can just concatenate
+ result = cat_core(all_cols, sep)
+
+ if isinstance(self._orig, Index):
+ # add dtype for case that result is all-NA
+ result = Index(result, dtype=object, name=self._orig.name)
+ else: # Series
+ result = Series(result, dtype=object, index=data.index,
+ name=self._orig.name)
+ return result
+
+ _shared_docs['str_split'] = ("""
+ Split strings around given separator/delimiter.
+
+ Splits the string in the Series/Index from the %(side)s,
+ at the specified delimiter string. Equivalent to :meth:`str.%(method)s`.
+
+ Parameters
+ ----------
+ pat : str, optional
+ String or regular expression to split on.
+ If not specified, split on whitespace.
+ n : int, default -1 (all)
+ Limit number of splits in output.
+ ``None``, 0 and -1 will be interpreted as return all splits.
+ expand : bool, default False
+ Expand the split strings into separate columns.
+
+ * If ``True``, return DataFrame/MultiIndex expanding dimensionality.
+ * If ``False``, return Series/Index, containing lists of strings.
+
+ Returns
+ -------
+ Series, Index, DataFrame or MultiIndex
+ Type matches caller unless ``expand=True`` (see Notes).
+
+ See Also
+ --------
+ Series.str.split : Split strings around given separator/delimiter.
+ Series.str.rsplit : Splits string around given separator/delimiter,
+ starting from the right.
+ Series.str.join : Join lists contained as elements in the Series/Index
+ with passed delimiter.
+ str.split : Standard library version for split.
+ str.rsplit : Standard library version for rsplit.
+
+ Notes
+ -----
+ The handling of the `n` keyword depends on the number of found splits:
+
+ - If found splits > `n`, make first `n` splits only
+ - If found splits <= `n`, make all splits
+ - If for a certain row the number of found splits < `n`,
+ append `None` for padding up to `n` if ``expand=True``
+
+ If using ``expand=True``, Series and Index callers return DataFrame and
+ MultiIndex objects, respectively.
+
+ Examples
+ --------
+ >>> s = pd.Series(["this is a regular sentence",
+ "https://docs.python.org/3/tutorial/index.html", np.nan])
+
+ In the default setting, the string is split by whitespace.
+
+ >>> s.str.split()
+ 0 [this, is, a, regular, sentence]
+ 1 [https://docs.python.org/3/tutorial/index.html]
+ 2 NaN
+ dtype: object
+
+ Without the `n` parameter, the outputs of `rsplit` and `split`
+ are identical.
+
+ >>> s.str.rsplit()
+ 0 [this, is, a, regular, sentence]
+ 1 [https://docs.python.org/3/tutorial/index.html]
+ 2 NaN
+ dtype: object
+
+ The `n` parameter can be used to limit the number of splits on the
+ delimiter. The outputs of `split` and `rsplit` are different.
+
+ >>> s.str.split(n=2)
+ 0 [this, is, a regular sentence]
+ 1 [https://docs.python.org/3/tutorial/index.html]
+ 2 NaN
+ dtype: object
+
+ >>> s.str.rsplit(n=2)
+ 0 [this is a, regular, sentence]
+ 1 [https://docs.python.org/3/tutorial/index.html]
+ 2 NaN
+ dtype: object
+
+ The `pat` parameter can be used to split by other characters.
+
+ >>> s.str.split(pat = "/")
+ 0 [this is a regular sentence]
+ 1 [https:, , docs.python.org, 3, tutorial, index...
+ 2 NaN
+ dtype: object
+
+ When using ``expand=True``, the split elements will expand out into
+ separate columns. If NaN is present, it is propagated throughout
+ the columns during the split.
+
+ >>> s.str.split(expand=True)
+ 0 1 2 3
+ 0 this is a regular
+ 1 https://docs.python.org/3/tutorial/index.html None None None
+ 2 NaN NaN NaN NaN \
+
+ 4
+ 0 sentence
+ 1 None
+ 2 NaN
+
+ For slightly more complex use cases like splitting the html document name
+ from a url, a combination of parameter settings can be used.
+
+ >>> s.str.rsplit("/", n=1, expand=True)
+ 0 1
+ 0 this is a regular sentence None
+ 1 https://docs.python.org/3/tutorial index.html
+ 2 NaN NaN
+ """)
+
+ @Appender(_shared_docs['str_split'] % {
+ 'side': 'beginning',
+ 'method': 'split'})
+ def split(self, pat=None, n=-1, expand=False):
+ result = str_split(self._parent, pat, n=n)
+ return self._wrap_result(result, expand=expand)
+
+ @Appender(_shared_docs['str_split'] % {
+ 'side': 'end',
+ 'method': 'rsplit'})
+ def rsplit(self, pat=None, n=-1, expand=False):
+ result = str_rsplit(self._parent, pat, n=n)
+ return self._wrap_result(result, expand=expand)
+
+ _shared_docs['str_partition'] = ("""
+ Split the string at the %(side)s occurrence of `sep`.
+
+ This method splits the string at the %(side)s occurrence of `sep`,
+ and returns 3 elements containing the part before the separator,
+ the separator itself, and the part after the separator.
+ If the separator is not found, return %(return)s.
+
+ Parameters
+ ----------
+ sep : str, default whitespace
+ String to split on.
+ pat : str, default whitespace
+ .. deprecated:: 0.24.0
+ Use ``sep`` instead
+ expand : bool, default True
+ If True, return DataFrame/MultiIndex expanding dimensionality.
+ If False, return Series/Index.
+
+ Returns
+ -------
+ DataFrame/MultiIndex or Series/Index of objects
+
+ See Also
+ --------
+ %(also)s
+ Series.str.split : Split strings around given separators.
+ str.partition : Standard library version.
+
+ Examples
+ --------
+
+ >>> s = pd.Series(['Linda van der Berg', 'George Pitt-Rivers'])
+ >>> s
+ 0 Linda van der Berg
+ 1 George Pitt-Rivers
+ dtype: object
+
+ >>> s.str.partition()
+ 0 1 2
+ 0 Linda van der Berg
+ 1 George Pitt-Rivers
+
+ To partition by the last space instead of the first one:
+
+ >>> s.str.rpartition()
+ 0 1 2
+ 0 Linda van der Berg
+ 1 George Pitt-Rivers
+
+ To partition by something different than a space:
+
+ >>> s.str.partition('-')
+ 0 1 2
+ 0 Linda van der Berg
+ 1 George Pitt - Rivers
+
+ To return a Series containing tuples instead of a DataFrame:
+
+ >>> s.str.partition('-', expand=False)
+ 0 (Linda van der Berg, , )
+ 1 (George Pitt, -, Rivers)
+ dtype: object
+
+ Also available on indices:
+
+ >>> idx = pd.Index(['X 123', 'Y 999'])
+ >>> idx
+ Index(['X 123', 'Y 999'], dtype='object')
+
+ Which will create a MultiIndex:
+
+ >>> idx.str.partition()
+ MultiIndex(levels=[['X', 'Y'], [' '], ['123', '999']],
+ codes=[[0, 1], [0, 0], [0, 1]])
+
+ Or an index with tuples with ``expand=False``:
+
+ >>> idx.str.partition(expand=False)
+ Index([('X', ' ', '123'), ('Y', ' ', '999')], dtype='object')
+ """)
+
+ @Appender(_shared_docs['str_partition'] % {
+ 'side': 'first',
+ 'return': '3 elements containing the string itself, followed by two '
+ 'empty strings',
+ 'also': 'rpartition : Split the string at the last occurrence of '
+ '`sep`.'
+ })
+ @deprecate_kwarg(old_arg_name='pat', new_arg_name='sep')
+ def partition(self, sep=' ', expand=True):
+ f = lambda x: x.partition(sep)
+ result = _na_map(f, self._parent)
+ return self._wrap_result(result, expand=expand)
+
+ @Appender(_shared_docs['str_partition'] % {
+ 'side': 'last',
+ 'return': '3 elements containing two empty strings, followed by the '
+ 'string itself',
+ 'also': 'partition : Split the string at the first occurrence of '
+ '`sep`.'
+ })
+ @deprecate_kwarg(old_arg_name='pat', new_arg_name='sep')
+ def rpartition(self, sep=' ', expand=True):
+ f = lambda x: x.rpartition(sep)
+ result = _na_map(f, self._parent)
+ return self._wrap_result(result, expand=expand)
+
+ @copy(str_get)
+ def get(self, i):
+ result = str_get(self._parent, i)
+ return self._wrap_result(result)
+
+ @copy(str_join)
+ def join(self, sep):
+ result = str_join(self._parent, sep)
+ return self._wrap_result(result)
+
+ @copy(str_contains)
+ def contains(self, pat, case=True, flags=0, na=np.nan, regex=True):
+ result = str_contains(self._parent, pat, case=case, flags=flags, na=na,
+ regex=regex)
+ return self._wrap_result(result, fill_value=na)
+
+ @copy(str_match)
+ def match(self, pat, case=True, flags=0, na=np.nan):
+ result = str_match(self._parent, pat, case=case, flags=flags, na=na)
+ return self._wrap_result(result, fill_value=na)
+
+ @copy(str_replace)
+ def replace(self, pat, repl, n=-1, case=None, flags=0, regex=True):
+ result = str_replace(self._parent, pat, repl, n=n, case=case,
+ flags=flags, regex=regex)
+ return self._wrap_result(result)
+
+ @copy(str_repeat)
+ def repeat(self, repeats):
+ result = str_repeat(self._parent, repeats)
+ return self._wrap_result(result)
+
+ @copy(str_pad)
+ def pad(self, width, side='left', fillchar=' '):
+ result = str_pad(self._parent, width, side=side, fillchar=fillchar)
+ return self._wrap_result(result)
+
+ _shared_docs['str_pad'] = ("""
+ Filling %(side)s side of strings in the Series/Index with an
+ additional character. Equivalent to :meth:`str.%(method)s`.
+
+ Parameters
+ ----------
+ width : int
+ Minimum width of resulting string; additional characters will be filled
+ with ``fillchar``
+ fillchar : str
+ Additional character for filling, default is whitespace
+
+ Returns
+ -------
+ filled : Series/Index of objects
+ """)
+
+ @Appender(_shared_docs['str_pad'] % dict(side='left and right',
+ method='center'))
+ def center(self, width, fillchar=' '):
+ return self.pad(width, side='both', fillchar=fillchar)
+
+ @Appender(_shared_docs['str_pad'] % dict(side='right', method='ljust'))
+ def ljust(self, width, fillchar=' '):
+ return self.pad(width, side='right', fillchar=fillchar)
+
+ @Appender(_shared_docs['str_pad'] % dict(side='left', method='rjust'))
+ def rjust(self, width, fillchar=' '):
+ return self.pad(width, side='left', fillchar=fillchar)
+
+ def zfill(self, width):
+ """
+ Pad strings in the Series/Index by prepending '0' characters.
+
+ Strings in the Series/Index are padded with '0' characters on the
+ left of the string to reach a total string length `width`. Strings
+ in the Series/Index with length greater or equal to `width` are
+ unchanged.
+
+ Parameters
+ ----------
+ width : int
+ Minimum length of resulting string; strings with length less
+ than `width` will be prepended with '0' characters.
+
+ Returns
+ -------
+ Series/Index of objects
+
+ See Also
+ --------
+ Series.str.rjust : Fills the left side of strings with an arbitrary
+ character.
+ Series.str.ljust : Fills the right side of strings with an arbitrary
+ character.
+ Series.str.pad : Fills the specified sides of strings with an arbitrary
+ character.
+ Series.str.center : Fills both sides of strings with an arbitrary
+ character.
+
+ Notes
+ -----
+ Differs from :meth:`str.zfill` which has special handling
+ for '+'/'-' in the string.
+
+ Examples
+ --------
+ >>> s = pd.Series(['-1', '1', '1000', 10, np.nan])
+ >>> s
+ 0 -1
+ 1 1
+ 2 1000
+ 3 10
+ 4 NaN
+ dtype: object
+
+ Note that ``10`` and ``NaN`` are not strings, therefore they are
+ converted to ``NaN``. The minus sign in ``'-1'`` is treated as a
+ regular character and the zero is added to the left of it
+ (:meth:`str.zfill` would have moved it to the left). ``1000``
+ remains unchanged as it is longer than `width`.
+
+ >>> s.str.zfill(3)
+ 0 0-1
+ 1 001
+ 2 1000
+ 3 NaN
+ 4 NaN
+ dtype: object
+ """
+ result = str_pad(self._parent, width, side='left', fillchar='0')
+ return self._wrap_result(result)
+
+ @copy(str_slice)
+ def slice(self, start=None, stop=None, step=None):
+ result = str_slice(self._parent, start, stop, step)
+ return self._wrap_result(result)
+
+ @copy(str_slice_replace)
+ def slice_replace(self, start=None, stop=None, repl=None):
+ result = str_slice_replace(self._parent, start, stop, repl)
+ return self._wrap_result(result)
+
+ @copy(str_decode)
+ def decode(self, encoding, errors="strict"):
+ result = str_decode(self._parent, encoding, errors)
+ return self._wrap_result(result)
+
+ @copy(str_encode)
+ def encode(self, encoding, errors="strict"):
+ result = str_encode(self._parent, encoding, errors)
+ return self._wrap_result(result)
+
+ _shared_docs['str_strip'] = (r"""
+ Remove leading and trailing characters.
+
+ Strip whitespaces (including newlines) or a set of specified characters
+ from each string in the Series/Index from %(side)s.
+ Equivalent to :meth:`str.%(method)s`.
+
+ Parameters
+ ----------
+ to_strip : str or None, default None
+ Specifying the set of characters to be removed.
+ All combinations of this set of characters will be stripped.
+ If None then whitespaces are removed.
+
+ Returns
+ -------
+ Series/Index of objects
+
+ See Also
+ --------
+ Series.str.strip : Remove leading and trailing characters in Series/Index.
+ Series.str.lstrip : Remove leading characters in Series/Index.
+ Series.str.rstrip : Remove trailing characters in Series/Index.
+
+ Examples
+ --------
+ >>> s = pd.Series(['1. Ant. ', '2. Bee!\n', '3. Cat?\t', np.nan])
+ >>> s
+ 0 1. Ant.
+ 1 2. Bee!\n
+ 2 3. Cat?\t
+ 3 NaN
+ dtype: object
+
+ >>> s.str.strip()
+ 0 1. Ant.
+ 1 2. Bee!
+ 2 3. Cat?
+ 3 NaN
+ dtype: object
+
+ >>> s.str.lstrip('123.')
+ 0 Ant.
+ 1 Bee!\n
+ 2 Cat?\t
+ 3 NaN
+ dtype: object
+
+ >>> s.str.rstrip('.!? \n\t')
+ 0 1. Ant
+ 1 2. Bee
+ 2 3. Cat
+ 3 NaN
+ dtype: object
+
+ >>> s.str.strip('123.!? \n\t')
+ 0 Ant
+ 1 Bee
+ 2 Cat
+ 3 NaN
+ dtype: object
+ """)
+
+ @Appender(_shared_docs['str_strip'] % dict(side='left and right sides',
+ method='strip'))
+ def strip(self, to_strip=None):
+ result = str_strip(self._parent, to_strip, side='both')
+ return self._wrap_result(result)
+
+ @Appender(_shared_docs['str_strip'] % dict(side='left side',
+ method='lstrip'))
+ def lstrip(self, to_strip=None):
+ result = str_strip(self._parent, to_strip, side='left')
+ return self._wrap_result(result)
+
+ @Appender(_shared_docs['str_strip'] % dict(side='right side',
+ method='rstrip'))
+ def rstrip(self, to_strip=None):
+ result = str_strip(self._parent, to_strip, side='right')
+ return self._wrap_result(result)
+
+ @copy(str_wrap)
+ def wrap(self, width, **kwargs):
+ result = str_wrap(self._parent, width, **kwargs)
+ return self._wrap_result(result)
+
+ @copy(str_get_dummies)
+ def get_dummies(self, sep='|'):
+ # we need to cast to Series of strings as only that has all
+ # methods available for making the dummies...
+ data = self._orig.astype(str) if self._is_categorical else self._parent
+ result, name = str_get_dummies(data, sep)
+ return self._wrap_result(result, use_codes=(not self._is_categorical),
+ name=name, expand=True)
+
+ @copy(str_translate)
+ def translate(self, table, deletechars=None):
+ result = str_translate(self._parent, table, deletechars)
+ return self._wrap_result(result)
+
+ count = _pat_wrapper(str_count, flags=True)
+ startswith = _pat_wrapper(str_startswith, na=True)
+ endswith = _pat_wrapper(str_endswith, na=True)
+ findall = _pat_wrapper(str_findall, flags=True)
+
+ @copy(str_extract)
+ def extract(self, pat, flags=0, expand=True):
+ return str_extract(self, pat, flags=flags, expand=expand)
+
+ @copy(str_extractall)
+ def extractall(self, pat, flags=0):
+ return str_extractall(self._orig, pat, flags=flags)
+
+ _shared_docs['find'] = ("""
+ Return %(side)s indexes in each string in the Series/Index
+ where the substring is fully contained between [start:end].
+ Return -1 on failure. Equivalent to standard :meth:`str.%(method)s`.
+
+ Parameters
+ ----------
+ sub : str
+ Substring being searched
+ start : int
+ Left edge index
+ end : int
+ Right edge index
+
+ Returns
+ -------
+ found : Series/Index of integer values
+
+ See Also
+ --------
+ %(also)s
+ """)
+
+ @Appender(_shared_docs['find'] %
+ dict(side='lowest', method='find',
+ also='rfind : Return highest indexes in each string.'))
+ def find(self, sub, start=0, end=None):
+ result = str_find(self._parent, sub, start=start, end=end, side='left')
+ return self._wrap_result(result)
+
+ @Appender(_shared_docs['find'] %
+ dict(side='highest', method='rfind',
+ also='find : Return lowest indexes in each string.'))
+ def rfind(self, sub, start=0, end=None):
+ result = str_find(self._parent, sub,
+ start=start, end=end, side='right')
+ return self._wrap_result(result)
+
+ def normalize(self, form):
+ """
+ Return the Unicode normal form for the strings in the Series/Index.
+ For more information on the forms, see
+ :func:`unicodedata.normalize`.
+
+ Parameters
+ ----------
+ form : {'NFC', 'NFKC', 'NFD', 'NFKD'}
+ Unicode form
+
+ Returns
+ -------
+ normalized : Series/Index of objects
+ """
+ import unicodedata
+ f = lambda x: unicodedata.normalize(form, compat.u_safe(x))
+ result = _na_map(f, self._parent)
+ return self._wrap_result(result)
+
+ _shared_docs['index'] = ("""
+ Return %(side)s indexes in each string where the substring is
+ fully contained between [start:end]. This is the same as
+ ``str.%(similar)s`` except instead of returning -1, it raises a ValueError
+ when the substring is not found. Equivalent to standard ``str.%(method)s``.
+
+ Parameters
+ ----------
+ sub : str
+ Substring being searched
+ start : int
+ Left edge index
+ end : int
+ Right edge index
+
+ Returns
+ -------
+ found : Series/Index of objects
+
+ See Also
+ --------
+ %(also)s
+ """)
+
+ @Appender(_shared_docs['index'] %
+ dict(side='lowest', similar='find', method='index',
+ also='rindex : Return highest indexes in each string.'))
+ def index(self, sub, start=0, end=None):
+ result = str_index(self._parent, sub,
+ start=start, end=end, side='left')
+ return self._wrap_result(result)
+
+ @Appender(_shared_docs['index'] %
+ dict(side='highest', similar='rfind', method='rindex',
+ also='index : Return lowest indexes in each string.'))
+ def rindex(self, sub, start=0, end=None):
+ result = str_index(self._parent, sub,
+ start=start, end=end, side='right')
+ return self._wrap_result(result)
+
+ _shared_docs['len'] = ("""
+ Computes the length of each element in the Series/Index. The element may be
+ a sequence (such as a string, tuple or list) or a collection
+ (such as a dictionary).
+
+ Returns
+ -------
+ Series or Index of int
+ A Series or Index of integer values indicating the length of each
+ element in the Series or Index.
+
+ See Also
+ --------
+ str.len : Python built-in function returning the length of an object.
+ Series.size : Returns the length of the Series.
+
+ Examples
+ --------
+ Returns the length (number of characters) in a string. Returns the
+ number of entries for dictionaries, lists or tuples.
+
+ >>> s = pd.Series(['dog',
+ ... '',
+ ... 5,
+ ... {'foo' : 'bar'},
+ ... [2, 3, 5, 7],
+ ... ('one', 'two', 'three')])
+ >>> s
+ 0 dog
+ 1
+ 2 5
+ 3 {'foo': 'bar'}
+ 4 [2, 3, 5, 7]
+ 5 (one, two, three)
+ dtype: object
+ >>> s.str.len()
+ 0 3.0
+ 1 0.0
+ 2 NaN
+ 3 1.0
+ 4 4.0
+ 5 3.0
+ dtype: float64
+ """)
+ len = _noarg_wrapper(len, docstring=_shared_docs['len'], dtype=int)
+
+ _shared_docs['casemethods'] = ("""
+ Convert strings in the Series/Index to %(type)s.
+
+ Equivalent to :meth:`str.%(method)s`.
+
+ Returns
+ -------
+ Series/Index of objects
+
+ See Also
+ --------
+ Series.str.lower : Converts all characters to lowercase.
+ Series.str.upper : Converts all characters to uppercase.
+ Series.str.title : Converts first character of each word to uppercase and
+ remaining to lowercase.
+ Series.str.capitalize : Converts first character to uppercase and
+ remaining to lowercase.
+ Series.str.swapcase : Converts uppercase to lowercase and lowercase to
+ uppercase.
+
+ Examples
+ --------
+ >>> s = pd.Series(['lower', 'CAPITALS', 'this is a sentence', 'SwApCaSe'])
+ >>> s
+ 0 lower
+ 1 CAPITALS
+ 2 this is a sentence
+ 3 SwApCaSe
+ dtype: object
+
+ >>> s.str.lower()
+ 0 lower
+ 1 capitals
+ 2 this is a sentence
+ 3 swapcase
+ dtype: object
+
+ >>> s.str.upper()
+ 0 LOWER
+ 1 CAPITALS
+ 2 THIS IS A SENTENCE
+ 3 SWAPCASE
+ dtype: object
+
+ >>> s.str.title()
+ 0 Lower
+ 1 Capitals
+ 2 This Is A Sentence
+ 3 Swapcase
+ dtype: object
+
+ >>> s.str.capitalize()
+ 0 Lower
+ 1 Capitals
+ 2 This is a sentence
+ 3 Swapcase
+ dtype: object
+
+ >>> s.str.swapcase()
+ 0 LOWER
+ 1 capitals
+ 2 THIS IS A SENTENCE
+ 3 sWaPcAsE
+ dtype: object
+ """)
+ _shared_docs['lower'] = dict(type='lowercase', method='lower')
+ _shared_docs['upper'] = dict(type='uppercase', method='upper')
+ _shared_docs['title'] = dict(type='titlecase', method='title')
+ _shared_docs['capitalize'] = dict(type='be capitalized',
+ method='capitalize')
+ _shared_docs['swapcase'] = dict(type='be swapcased', method='swapcase')
+ lower = _noarg_wrapper(lambda x: x.lower(),
+ docstring=_shared_docs['casemethods'] %
+ _shared_docs['lower'])
+ upper = _noarg_wrapper(lambda x: x.upper(),
+ docstring=_shared_docs['casemethods'] %
+ _shared_docs['upper'])
+ title = _noarg_wrapper(lambda x: x.title(),
+ docstring=_shared_docs['casemethods'] %
+ _shared_docs['title'])
+ capitalize = _noarg_wrapper(lambda x: x.capitalize(),
+ docstring=_shared_docs['casemethods'] %
+ _shared_docs['capitalize'])
+ swapcase = _noarg_wrapper(lambda x: x.swapcase(),
+ docstring=_shared_docs['casemethods'] %
+ _shared_docs['swapcase'])
+
+ _shared_docs['ismethods'] = ("""
+ Check whether all characters in each string are %(type)s.
+
+ This is equivalent to running the Python string method
+ :meth:`str.%(method)s` for each element of the Series/Index. If a string
+ has zero characters, ``False`` is returned for that check.
+
+ Returns
+ -------
+ Series or Index of bool
+ Series or Index of boolean values with the same length as the original
+ Series/Index.
+
+ See Also
+ --------
+ Series.str.isalpha : Check whether all characters are alphabetic.
+ Series.str.isnumeric : Check whether all characters are numeric.
+ Series.str.isalnum : Check whether all characters are alphanumeric.
+ Series.str.isdigit : Check whether all characters are digits.
+ Series.str.isdecimal : Check whether all characters are decimal.
+ Series.str.isspace : Check whether all characters are whitespace.
+ Series.str.islower : Check whether all characters are lowercase.
+ Series.str.isupper : Check whether all characters are uppercase.
+ Series.str.istitle : Check whether all characters are titlecase.
+
+ Examples
+ --------
+ **Checks for Alphabetic and Numeric Characters**
+
+ >>> s1 = pd.Series(['one', 'one1', '1', ''])
+
+ >>> s1.str.isalpha()
+ 0 True
+ 1 False
+ 2 False
+ 3 False
+ dtype: bool
+
+ >>> s1.str.isnumeric()
+ 0 False
+ 1 False
+ 2 True
+ 3 False
+ dtype: bool
+
+ >>> s1.str.isalnum()
+ 0 True
+ 1 True
+ 2 True
+ 3 False
+ dtype: bool
+
+ Note that checks against characters mixed with any additional punctuation
+ or whitespace will evaluate to false for an alphanumeric check.
+
+ >>> s2 = pd.Series(['A B', '1.5', '3,000'])
+ >>> s2.str.isalnum()
+ 0 False
+ 1 False
+ 2 False
+ dtype: bool
+
+ **More Detailed Checks for Numeric Characters**
+
+ There are several different but overlapping sets of numeric characters that
+ can be checked for.
+
+ >>> s3 = pd.Series(['23', '³', '⅕', ''])
+
+ The ``s3.str.isdecimal`` method checks for characters used to form numbers
+ in base 10.
+
+ >>> s3.str.isdecimal()
+ 0 True
+ 1 False
+ 2 False
+ 3 False
+ dtype: bool
+
+ The ``s3.str.isdigit`` method is the same as ``s3.str.isdecimal`` but also
+ includes special digits, like superscripted and subscripted digits in
+ unicode.
+
+ >>> s3.str.isdigit()
+ 0 True
+ 1 True
+ 2 False
+ 3 False
+ dtype: bool
+
+ The ``s3.str.isnumeric`` method is the same as ``s3.str.isdigit`` but also
+ includes other characters that can represent quantities such as unicode
+ fractions.
+
+ >>> s3.str.isnumeric()
+ 0 True
+ 1 True
+ 2 True
+ 3 False
+ dtype: bool
+
+ **Checks for Whitespace**
+
+ >>> s4 = pd.Series([' ', '\\t\\r\\n ', ''])
+ >>> s4.str.isspace()
+ 0 True
+ 1 True
+ 2 False
+ dtype: bool
+
+ **Checks for Character Case**
+
+ >>> s5 = pd.Series(['leopard', 'Golden Eagle', 'SNAKE', ''])
+
+ >>> s5.str.islower()
+ 0 True
+ 1 False
+ 2 False
+ 3 False
+ dtype: bool
+
+ >>> s5.str.isupper()
+ 0 False
+ 1 False
+ 2 True
+ 3 False
+ dtype: bool
+
+ The ``s5.str.istitle`` method checks whether all words are in title
+ case (whether only the first letter of each word is capitalized). Words are
+ assumed to be any sequence of non-numeric characters separated by
+ whitespace characters.
+
+ >>> s5.str.istitle()
+ 0 False
+ 1 True
+ 2 False
+ 3 False
+ dtype: bool
+ """)
+ _shared_docs['isalnum'] = dict(type='alphanumeric', method='isalnum')
+ _shared_docs['isalpha'] = dict(type='alphabetic', method='isalpha')
+ _shared_docs['isdigit'] = dict(type='digits', method='isdigit')
+ _shared_docs['isspace'] = dict(type='whitespace', method='isspace')
+ _shared_docs['islower'] = dict(type='lowercase', method='islower')
+ _shared_docs['isupper'] = dict(type='uppercase', method='isupper')
+ _shared_docs['istitle'] = dict(type='titlecase', method='istitle')
+ _shared_docs['isnumeric'] = dict(type='numeric', method='isnumeric')
+ _shared_docs['isdecimal'] = dict(type='decimal', method='isdecimal')
+ isalnum = _noarg_wrapper(lambda x: x.isalnum(),
+ docstring=_shared_docs['ismethods'] %
+ _shared_docs['isalnum'])
+ isalpha = _noarg_wrapper(lambda x: x.isalpha(),
+ docstring=_shared_docs['ismethods'] %
+ _shared_docs['isalpha'])
+ isdigit = _noarg_wrapper(lambda x: x.isdigit(),
+ docstring=_shared_docs['ismethods'] %
+ _shared_docs['isdigit'])
+ isspace = _noarg_wrapper(lambda x: x.isspace(),
+ docstring=_shared_docs['ismethods'] %
+ _shared_docs['isspace'])
+ islower = _noarg_wrapper(lambda x: x.islower(),
+ docstring=_shared_docs['ismethods'] %
+ _shared_docs['islower'])
+ isupper = _noarg_wrapper(lambda x: x.isupper(),
+ docstring=_shared_docs['ismethods'] %
+ _shared_docs['isupper'])
+ istitle = _noarg_wrapper(lambda x: x.istitle(),
+ docstring=_shared_docs['ismethods'] %
+ _shared_docs['istitle'])
+ isnumeric = _noarg_wrapper(lambda x: compat.u_safe(x).isnumeric(),
+ docstring=_shared_docs['ismethods'] %
+ _shared_docs['isnumeric'])
+ isdecimal = _noarg_wrapper(lambda x: compat.u_safe(x).isdecimal(),
+ docstring=_shared_docs['ismethods'] %
+ _shared_docs['isdecimal'])
+
+ @classmethod
+ def _make_accessor(cls, data):
+ cls._validate(data)
+ return cls(data)
diff --git a/contrib/python/pandas/py2/pandas/core/tools/__init__.py b/contrib/python/pandas/py2/pandas/core/tools/__init__.py
new file mode 100644
index 00000000000..e69de29bb2d
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/core/tools/__init__.py
diff --git a/contrib/python/pandas/py2/pandas/core/tools/datetimes.py b/contrib/python/pandas/py2/pandas/core/tools/datetimes.py
new file mode 100644
index 00000000000..e6478da400d
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/core/tools/datetimes.py
@@ -0,0 +1,903 @@
+from datetime import datetime, time
+from functools import partial
+
+import numpy as np
+
+from pandas._libs import tslib, tslibs
+from pandas._libs.tslibs import Timestamp, conversion, parsing
+from pandas._libs.tslibs.parsing import ( # noqa
+ DateParseError, _format_is_iso, _guess_datetime_format, parse_time_string)
+from pandas._libs.tslibs.strptime import array_strptime
+from pandas.compat import zip
+
+from pandas.core.dtypes.common import (
+ ensure_object, is_datetime64_dtype, is_datetime64_ns_dtype,
+ is_datetime64tz_dtype, is_float, is_integer, is_integer_dtype,
+ is_list_like, is_numeric_dtype, is_object_dtype, is_scalar)
+from pandas.core.dtypes.generic import ABCDataFrame, ABCIndexClass, ABCSeries
+from pandas.core.dtypes.missing import notna
+
+from pandas import compat
+from pandas.core import algorithms
+
+
+def _guess_datetime_format_for_array(arr, **kwargs):
+ # Try to guess the format based on the first non-NaN element
+ non_nan_elements = notna(arr).nonzero()[0]
+ if len(non_nan_elements):
+ return _guess_datetime_format(arr[non_nan_elements[0]], **kwargs)
+
+
+def _maybe_cache(arg, format, cache, convert_listlike):
+ """
+ Create a cache of unique dates from an array of dates
+
+ Parameters
+ ----------
+ arg : integer, float, string, datetime, list, tuple, 1-d array, Series
+ format : string
+ Strftime format to parse time
+ cache : boolean
+ True attempts to create a cache of converted values
+ convert_listlike : function
+ Conversion function to apply on dates
+
+ Returns
+ -------
+ cache_array : Series
+ Cache of converted, unique dates. Can be empty
+ """
+ from pandas import Series
+ cache_array = Series()
+ if cache:
+ # Perform a quicker unique check
+ from pandas import Index
+ if not Index(arg).is_unique:
+ unique_dates = algorithms.unique(arg)
+ cache_dates = convert_listlike(unique_dates, True, format)
+ cache_array = Series(cache_dates, index=unique_dates)
+ return cache_array
+
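+# A minimal sketch (not part of the original module) of the caching idea
+# above: unique values are converted once and then mapped back onto the
+# full input. Assumes pandas is importable as ``pd``; the lambda stands in
+# for the real ``convert_listlike`` callable.
+#
+#     import pandas as pd
+#
+#     dates = ['2019-01-01', '2019-01-02', '2019-01-01', '2019-01-02']
+#     cache = _maybe_cache(dates, None, True,
+#                          lambda values, _box, _fmt: pd.to_datetime(values))
+#     # ``cache`` should be a Series keyed by the unique strings;
+#     # pd.Series(dates).map(cache) then matches converting each element.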
+
+def _convert_and_box_cache(arg, cache_array, box, errors, name=None):
+ """
+ Convert array of dates with a cache and box the result
+
+ Parameters
+ ----------
+ arg : integer, float, string, datetime, list, tuple, 1-d array, Series
+ cache_array : Series
+ Cache of converted, unique dates
+ box : boolean
+ True boxes result as an Index-like, False returns an ndarray
+ errors : string
+ 'ignore' plus box=True will convert result to Index
+ name : string, default None
+ Name for a DatetimeIndex
+
+ Returns
+ -------
+ result : datetime of converted dates
+ Returns:
+
+ - Index-like if box=True
+ - ndarray if box=False
+ """
+ from pandas import Series, DatetimeIndex, Index
+ result = Series(arg).map(cache_array)
+ if box:
+ if errors == 'ignore':
+ return Index(result, name=name)
+ else:
+ return DatetimeIndex(result, name=name)
+ return result.values
+
+
+def _return_parsed_timezone_results(result, timezones, box, tz, name):
+ """
+ Return results from array_strptime if a %z or %Z directive was passed.
+
+ Parameters
+ ----------
+ result : ndarray
+ int64 date representations of the dates
+ timezones : ndarray
+ pytz timezone objects
+ box : boolean
+ True boxes result as an Index-like, False returns an ndarray
+ tz : object
+ None or pytz timezone object
+ name : string, default None
+ Name for a DatetimeIndex
+
+ Returns
+ -------
+ tz_result : ndarray of parsed dates with timezone
+ Returns:
+
+ - Index-like if box=True
+ - ndarray of Timestamps if box=False
+
+ """
+ if tz is not None:
+ raise ValueError("Cannot pass a tz argument when "
+ "parsing strings with timezone "
+ "information.")
+ tz_results = np.array([Timestamp(res).tz_localize(zone) for res, zone
+ in zip(result, timezones)])
+ if box:
+ from pandas import Index
+ return Index(tz_results, name=name)
+ return tz_results
+
+
+def _convert_listlike_datetimes(arg, box, format, name=None, tz=None,
+ unit=None, errors=None,
+ infer_datetime_format=None, dayfirst=None,
+ yearfirst=None, exact=None):
+ """
+ Helper function for to_datetime. Performs the conversions of 1D listlike
+ of dates
+
+ Parameters
+ ----------
+ arg : list, tuple, ndarray, Series, Index
+        dates to be parsed
+ box : boolean
+ True boxes result as an Index-like, False returns an ndarray
+ name : object
+ None or string for the Index name
+ tz : object
+ None or 'utc'
+ unit : string
+ None or string of the frequency of the passed data
+ errors : string
+        error handling behaviors from to_datetime, 'raise', 'coerce', 'ignore'
+ infer_datetime_format : boolean
+ inferring format behavior from to_datetime
+ dayfirst : boolean
+ dayfirst parsing behavior from to_datetime
+ yearfirst : boolean
+ yearfirst parsing behavior from to_datetime
+ exact : boolean
+ exact format matching behavior from to_datetime
+
+ Returns
+ -------
+ ndarray of parsed dates
+ Returns:
+
+ - Index-like if box=True
+ - ndarray of Timestamps if box=False
+ """
+ from pandas import DatetimeIndex
+ from pandas.core.arrays import DatetimeArray
+ from pandas.core.arrays.datetimes import (
+ maybe_convert_dtype, objects_to_datetime64ns)
+
+ if isinstance(arg, (list, tuple)):
+ arg = np.array(arg, dtype='O')
+
+ # these are shortcutable
+ if is_datetime64tz_dtype(arg):
+ if not isinstance(arg, (DatetimeArray, DatetimeIndex)):
+ return DatetimeIndex(arg, tz=tz, name=name)
+ if tz == 'utc':
+ arg = arg.tz_convert(None).tz_localize(tz)
+ return arg
+
+ elif is_datetime64_ns_dtype(arg):
+ if box and not isinstance(arg, (DatetimeArray, DatetimeIndex)):
+ try:
+ return DatetimeIndex(arg, tz=tz, name=name)
+ except ValueError:
+ pass
+
+ return arg
+
+ elif unit is not None:
+ if format is not None:
+ raise ValueError("cannot specify both format and unit")
+ arg = getattr(arg, 'values', arg)
+ result = tslib.array_with_unit_to_datetime(arg, unit,
+ errors=errors)
+ if box:
+ if errors == 'ignore':
+ from pandas import Index
+ result = Index(result, name=name)
+ # GH 23758: We may still need to localize the result with tz
+ try:
+ return result.tz_localize(tz)
+ except AttributeError:
+ return result
+
+ return DatetimeIndex(result, tz=tz, name=name)
+ return result
+ elif getattr(arg, 'ndim', 1) > 1:
+ raise TypeError('arg must be a string, datetime, list, tuple, '
+ '1-d array, or Series')
+
+ # warn if passing timedelta64, raise for PeriodDtype
+ # NB: this must come after unit transformation
+ orig_arg = arg
+ arg, _ = maybe_convert_dtype(arg, copy=False)
+
+ arg = ensure_object(arg)
+ require_iso8601 = False
+
+ if infer_datetime_format and format is None:
+ format = _guess_datetime_format_for_array(arg, dayfirst=dayfirst)
+
+ if format is not None:
+        # There is a special fast-path for iso8601-formatted
+        # datetime strings, so in those cases don't use the inferred
+        # format, because using it would make this special case
+        # slower
+ format_is_iso8601 = _format_is_iso(format)
+ if format_is_iso8601:
+ require_iso8601 = not infer_datetime_format
+ format = None
+
+ tz_parsed = None
+ result = None
+
+ if format is not None:
+ try:
+ # shortcut formatting here
+ if format == '%Y%m%d':
+ try:
+ # pass orig_arg as float-dtype may have been converted to
+ # datetime64[ns]
+ orig_arg = ensure_object(orig_arg)
+ result = _attempt_YYYYMMDD(orig_arg, errors=errors)
+ except (ValueError, TypeError, tslibs.OutOfBoundsDatetime):
+ raise ValueError("cannot convert the input to "
+ "'%Y%m%d' date format")
+
+ # fallback
+ if result is None:
+ try:
+ result, timezones = array_strptime(
+ arg, format, exact=exact, errors=errors)
+ if '%Z' in format or '%z' in format:
+ return _return_parsed_timezone_results(
+ result, timezones, box, tz, name)
+ except tslibs.OutOfBoundsDatetime:
+ if errors == 'raise':
+ raise
+ elif errors == 'coerce':
+ result = np.empty(arg.shape, dtype='M8[ns]')
+ iresult = result.view('i8')
+ iresult.fill(tslibs.iNaT)
+ else:
+ result = arg
+ except ValueError:
+ # if format was inferred, try falling back
+ # to array_to_datetime - terminate here
+ # for specified formats
+ if not infer_datetime_format:
+ if errors == 'raise':
+ raise
+ elif errors == 'coerce':
+ result = np.empty(arg.shape, dtype='M8[ns]')
+ iresult = result.view('i8')
+ iresult.fill(tslibs.iNaT)
+ else:
+ result = arg
+ except ValueError as e:
+ # Fallback to try to convert datetime objects if timezone-aware
+ # datetime objects are found without passing `utc=True`
+ try:
+ values, tz = conversion.datetime_to_datetime64(arg)
+ return DatetimeIndex._simple_new(values, name=name, tz=tz)
+ except (ValueError, TypeError):
+ raise e
+
+ if result is None:
+ assert format is None or infer_datetime_format
+ utc = tz == 'utc'
+ result, tz_parsed = objects_to_datetime64ns(
+ arg, dayfirst=dayfirst, yearfirst=yearfirst,
+ utc=utc, errors=errors, require_iso8601=require_iso8601,
+ allow_object=True)
+
+ if tz_parsed is not None:
+ if box:
+ # We can take a shortcut since the datetime64 numpy array
+ # is in UTC
+ return DatetimeIndex._simple_new(result, name=name,
+ tz=tz_parsed)
+ else:
+ # Convert the datetime64 numpy array to an numpy array
+ # of datetime objects
+ result = [Timestamp(ts, tz=tz_parsed).to_pydatetime()
+ for ts in result]
+ return np.array(result, dtype=object)
+
+ if box:
+ # Ensure we return an Index in all cases where box=True
+ if is_datetime64_dtype(result):
+ return DatetimeIndex(result, tz=tz, name=name)
+ elif is_object_dtype(result):
+ # e.g. an Index of datetime objects
+ from pandas import Index
+ return Index(result, name=name)
+ return result
+
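+# Illustrative sketch (assumes pandas as ``pd``) of how the ``box`` flag
+# handled above surfaces through the public ``to_datetime`` in this
+# vendored version, where ``box`` is still a keyword argument:
+#
+#     import pandas as pd
+#
+#     pd.to_datetime(['2019-01-01', '2019-01-02'])             # DatetimeIndex
+#     pd.to_datetime(['2019-01-01', '2019-01-02'], box=False)  # ndarray of
+#                                                              # datetime64[ns]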
+
+def _adjust_to_origin(arg, origin, unit):
+ """
+ Helper function for to_datetime.
+ Adjust input argument to the specified origin
+
+ Parameters
+ ----------
+ arg : list, tuple, ndarray, Series, Index
+ date to be adjusted
+ origin : 'julian' or Timestamp
+ origin offset for the arg
+ unit : string
+ passed unit from to_datetime, must be 'D'
+
+ Returns
+ -------
+ ndarray or scalar of adjusted date(s)
+ """
+ if origin == 'julian':
+ original = arg
+ j0 = Timestamp(0).to_julian_date()
+ if unit != 'D':
+ raise ValueError("unit must be 'D' for origin='julian'")
+ try:
+ arg = arg - j0
+ except TypeError:
+ raise ValueError("incompatible 'arg' type for given "
+ "'origin'='julian'")
+
+        # preemptively check this for a nice range
+ j_max = Timestamp.max.to_julian_date() - j0
+ j_min = Timestamp.min.to_julian_date() - j0
+ if np.any(arg > j_max) or np.any(arg < j_min):
+ raise tslibs.OutOfBoundsDatetime(
+ "{original} is Out of Bounds for "
+ "origin='julian'".format(original=original))
+ else:
+ # arg must be numeric
+ if not ((is_scalar(arg) and (is_integer(arg) or is_float(arg))) or
+ is_numeric_dtype(np.asarray(arg))):
+ raise ValueError(
+ "'{arg}' is not compatible with origin='{origin}'; "
+ "it must be numeric with a unit specified ".format(
+ arg=arg,
+ origin=origin))
+
+ # we are going to offset back to unix / epoch time
+ try:
+ offset = Timestamp(origin)
+ except tslibs.OutOfBoundsDatetime:
+ raise tslibs.OutOfBoundsDatetime(
+ "origin {origin} is Out of Bounds".format(origin=origin))
+ except ValueError:
+ raise ValueError("origin {origin} cannot be converted "
+ "to a Timestamp".format(origin=origin))
+
+ if offset.tz is not None:
+ raise ValueError(
+ "origin offset {} must be tz-naive".format(offset))
+ offset -= Timestamp(0)
+
+ # convert the offset to the unit of the arg
+ # this should be lossless in terms of precision
+ offset = offset // tslibs.Timedelta(1, unit=unit)
+
+ # scalars & ndarray-like can handle the addition
+ if is_list_like(arg) and not isinstance(
+ arg, (ABCSeries, ABCIndexClass, np.ndarray)):
+ arg = np.asarray(arg)
+ arg = arg + offset
+ return arg
+
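+# Illustrative sketch of the origin adjustment (assumes pandas as ``pd``):
+# a Timestamp origin offsets numeric input relative to that date, while
+# origin='julian' requires unit='D'.
+#
+#     import pandas as pd
+#
+#     pd.to_datetime([1, 2, 3], unit='D', origin=pd.Timestamp('1960-01-01'))
+#     # expected: 1960-01-02, 1960-01-03, 1960-01-04
+#     pd.to_datetime(2451545, unit='D', origin='julian')
+#     # expected: 2000-01-01 12:00:00 (Julian day 2451545 is noon that day)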
+
+def to_datetime(arg, errors='raise', dayfirst=False, yearfirst=False,
+ utc=None, box=True, format=None, exact=True,
+ unit=None, infer_datetime_format=False, origin='unix',
+ cache=False):
+ """
+ Convert argument to datetime.
+
+ Parameters
+ ----------
+ arg : integer, float, string, datetime, list, tuple, 1-d array, Series
+
+ .. versionadded:: 0.18.1
+
+ or DataFrame/dict-like
+
+ errors : {'ignore', 'raise', 'coerce'}, default 'raise'
+
+ - If 'raise', then invalid parsing will raise an exception
+ - If 'coerce', then invalid parsing will be set as NaT
+ - If 'ignore', then invalid parsing will return the input
+ dayfirst : boolean, default False
+ Specify a date parse order if `arg` is str or its list-likes.
+ If True, parses dates with the day first, eg 10/11/12 is parsed as
+ 2012-11-10.
+ Warning: dayfirst=True is not strict, but will prefer to parse
+ with day first (this is a known bug, based on dateutil behavior).
+ yearfirst : boolean, default False
+ Specify a date parse order if `arg` is str or its list-likes.
+
+ - If True parses dates with the year first, eg 10/11/12 is parsed as
+ 2010-11-12.
+        - If both dayfirst and yearfirst are True, yearfirst takes precedence
+          (same as dateutil).
+
+ Warning: yearfirst=True is not strict, but will prefer to parse
+ with year first (this is a known bug, based on dateutil behavior).
+
+ .. versionadded:: 0.16.1
+
+ utc : boolean, default None
+ Return UTC DatetimeIndex if True (converting any tz-aware
+ datetime.datetime objects as well).
+ box : boolean, default True
+
+ - If True returns a DatetimeIndex or Index-like object
+ - If False returns ndarray of values.
+ format : string, default None
+ strftime to parse time, eg "%d/%m/%Y", note that "%f" will parse
+ all the way up to nanoseconds.
+ exact : boolean, True by default
+
+ - If True, require an exact format match.
+ - If False, allow the format to match anywhere in the target string.
+
+ unit : string, default 'ns'
+ unit of the arg (D,s,ms,us,ns) denote the unit, which is an
+ integer or float number. This will be based off the origin.
+ Example, with unit='ms' and origin='unix' (the default), this
+ would calculate the number of milliseconds to the unix epoch start.
+ infer_datetime_format : boolean, default False
+ If True and no `format` is given, attempt to infer the format of the
+ datetime strings, and if it can be inferred, switch to a faster
+ method of parsing them. In some cases this can increase the parsing
+ speed by ~5-10x.
+ origin : scalar, default is 'unix'
+ Define the reference date. The numeric values would be parsed as number
+ of units (defined by `unit`) since this reference date.
+
+ - If 'unix' (or POSIX) time; origin is set to 1970-01-01.
+ - If 'julian', unit must be 'D', and origin is set to beginning of
+ Julian Calendar. Julian day number 0 is assigned to the day starting
+ at noon on January 1, 4713 BC.
+ - If Timestamp convertible, origin is set to Timestamp identified by
+ origin.
+
+ .. versionadded:: 0.20.0
+ cache : boolean, default False
+ If True, use a cache of unique, converted dates to apply the datetime
+ conversion. May produce significant speed-up when parsing duplicate
+ date strings, especially ones with timezone offsets.
+
+ .. versionadded:: 0.23.0
+
+ Returns
+ -------
+ ret : datetime if parsing succeeded.
+ Return type depends on input:
+
+ - list-like: DatetimeIndex
+ - Series: Series of datetime64 dtype
+ - scalar: Timestamp
+
+        When it is not possible to return the designated types (e.g. when
+        any element of the input is before Timestamp.min or after
+        Timestamp.max), the return will have datetime.datetime type (or the
+        corresponding array/Series).
+
+ See Also
+ --------
+ pandas.DataFrame.astype : Cast argument to a specified dtype.
+ pandas.to_timedelta : Convert argument to timedelta.
+
+ Examples
+ --------
+ Assembling a datetime from multiple columns of a DataFrame. The keys can be
+ common abbreviations like ['year', 'month', 'day', 'minute', 'second',
+    'ms', 'us', 'ns'] or plurals of the same
+
+ >>> df = pd.DataFrame({'year': [2015, 2016],
+ 'month': [2, 3],
+ 'day': [4, 5]})
+ >>> pd.to_datetime(df)
+ 0 2015-02-04
+ 1 2016-03-05
+ dtype: datetime64[ns]
+
+ If a date does not meet the `timestamp limitations
+ <http://pandas.pydata.org/pandas-docs/stable/timeseries.html
+ #timeseries-timestamp-limits>`_, passing errors='ignore'
+ will return the original input instead of raising any exception.
+
+ Passing errors='coerce' will force an out-of-bounds date to NaT,
+ in addition to forcing non-dates (or non-parseable dates) to NaT.
+
+ >>> pd.to_datetime('13000101', format='%Y%m%d', errors='ignore')
+ datetime.datetime(1300, 1, 1, 0, 0)
+ >>> pd.to_datetime('13000101', format='%Y%m%d', errors='coerce')
+ NaT
+
+    Passing infer_datetime_format=True can often speed up parsing when the
+    strings are not exactly ISO 8601 but still follow a regular format.
+
+ >>> s = pd.Series(['3/11/2000', '3/12/2000', '3/13/2000']*1000)
+
+ >>> s.head()
+ 0 3/11/2000
+ 1 3/12/2000
+ 2 3/13/2000
+ 3 3/11/2000
+ 4 3/12/2000
+ dtype: object
+
+ >>> %timeit pd.to_datetime(s,infer_datetime_format=True)
+ 100 loops, best of 3: 10.4 ms per loop
+
+ >>> %timeit pd.to_datetime(s,infer_datetime_format=False)
+ 1 loop, best of 3: 471 ms per loop
+
+ Using a unix epoch time
+
+ >>> pd.to_datetime(1490195805, unit='s')
+ Timestamp('2017-03-22 15:16:45')
+ >>> pd.to_datetime(1490195805433502912, unit='ns')
+ Timestamp('2017-03-22 15:16:45.433502912')
+
+ .. warning:: For float arg, precision rounding might happen. To prevent
+ unexpected behavior use a fixed-width exact type.
+
+ Using a non-unix epoch origin
+
+ >>> pd.to_datetime([1, 2, 3], unit='D',
+ origin=pd.Timestamp('1960-01-01'))
+ 0 1960-01-02
+ 1 1960-01-03
+ 2 1960-01-04
+ """
+ if arg is None:
+ return None
+
+ if origin != 'unix':
+ arg = _adjust_to_origin(arg, origin, unit)
+
+ tz = 'utc' if utc else None
+ convert_listlike = partial(_convert_listlike_datetimes, tz=tz, unit=unit,
+ dayfirst=dayfirst, yearfirst=yearfirst,
+ errors=errors, exact=exact,
+ infer_datetime_format=infer_datetime_format)
+
+ if isinstance(arg, Timestamp):
+ result = arg
+ if tz is not None:
+ if arg.tz is not None:
+ result = result.tz_convert(tz)
+ else:
+ result = result.tz_localize(tz)
+ elif isinstance(arg, ABCSeries):
+ cache_array = _maybe_cache(arg, format, cache, convert_listlike)
+ if not cache_array.empty:
+ result = arg.map(cache_array)
+ else:
+ from pandas import Series
+ values = convert_listlike(arg._values, True, format)
+ result = Series(values, index=arg.index, name=arg.name)
+ elif isinstance(arg, (ABCDataFrame, compat.MutableMapping)):
+ result = _assemble_from_unit_mappings(arg, errors, box, tz)
+ elif isinstance(arg, ABCIndexClass):
+ cache_array = _maybe_cache(arg, format, cache, convert_listlike)
+ if not cache_array.empty:
+ result = _convert_and_box_cache(arg, cache_array, box, errors,
+ name=arg.name)
+ else:
+ convert_listlike = partial(convert_listlike, name=arg.name)
+ result = convert_listlike(arg, box, format)
+ elif is_list_like(arg):
+ cache_array = _maybe_cache(arg, format, cache, convert_listlike)
+ if not cache_array.empty:
+ result = _convert_and_box_cache(arg, cache_array, box, errors)
+ else:
+ result = convert_listlike(arg, box, format)
+ else:
+ result = convert_listlike(np.array([arg]), box, format)[0]
+
+ return result
+
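+# A small sketch of the dispatch above (illustrative, assumes pandas as
+# ``pd``): the return type follows the input type.
+#
+#     import pandas as pd
+#
+#     pd.to_datetime('2019-01-01')                  # scalar   -> Timestamp
+#     pd.to_datetime(pd.Series(['2019-01-01']))     # Series   -> Series
+#     pd.to_datetime(['2019-01-01', '2019-01-02'])  # listlike -> DatetimeIndex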
+
+# mappings for assembling units
+_unit_map = {'year': 'year',
+ 'years': 'year',
+ 'month': 'month',
+ 'months': 'month',
+ 'day': 'day',
+ 'days': 'day',
+ 'hour': 'h',
+ 'hours': 'h',
+ 'minute': 'm',
+ 'minutes': 'm',
+ 'second': 's',
+ 'seconds': 's',
+ 'ms': 'ms',
+ 'millisecond': 'ms',
+ 'milliseconds': 'ms',
+ 'us': 'us',
+ 'microsecond': 'us',
+ 'microseconds': 'us',
+ 'ns': 'ns',
+ 'nanosecond': 'ns',
+ 'nanoseconds': 'ns'
+ }
+
+
+def _assemble_from_unit_mappings(arg, errors, box, tz):
+ """
+ assemble the unit specified fields from the arg (DataFrame)
+ Return a Series for actual parsing
+
+ Parameters
+ ----------
+ arg : DataFrame
+ errors : {'ignore', 'raise', 'coerce'}, default 'raise'
+
+ - If 'raise', then invalid parsing will raise an exception
+ - If 'coerce', then invalid parsing will be set as NaT
+ - If 'ignore', then invalid parsing will return the input
+ box : boolean
+
+ - If True, return a DatetimeIndex
+ - If False, return an array
+ tz : None or 'utc'
+
+ Returns
+ -------
+ Series
+ """
+ from pandas import to_timedelta, to_numeric, DataFrame
+ arg = DataFrame(arg)
+ if not arg.columns.is_unique:
+ raise ValueError("cannot assemble with duplicate keys")
+
+ # replace passed unit with _unit_map
+ def f(value):
+ if value in _unit_map:
+ return _unit_map[value]
+
+ # m is case significant
+ if value.lower() in _unit_map:
+ return _unit_map[value.lower()]
+
+ return value
+
+ unit = {k: f(k) for k in arg.keys()}
+ unit_rev = {v: k for k, v in unit.items()}
+
+ # we require at least Ymd
+ required = ['year', 'month', 'day']
+ req = sorted(list(set(required) - set(unit_rev.keys())))
+ if len(req):
+ raise ValueError("to assemble mappings requires at least that "
+ "[year, month, day] be specified: [{required}] "
+ "is missing".format(required=','.join(req)))
+
+ # keys we don't recognize
+ excess = sorted(list(set(unit_rev.keys()) - set(_unit_map.values())))
+ if len(excess):
+ raise ValueError("extra keys have been passed "
+ "to the datetime assemblage: "
+ "[{excess}]".format(excess=','.join(excess)))
+
+ def coerce(values):
+        # we allow coercion if errors allows it
+ values = to_numeric(values, errors=errors)
+
+ # prevent overflow in case of int8 or int16
+ if is_integer_dtype(values):
+ values = values.astype('int64', copy=False)
+ return values
+
+ values = (coerce(arg[unit_rev['year']]) * 10000 +
+ coerce(arg[unit_rev['month']]) * 100 +
+ coerce(arg[unit_rev['day']]))
+ try:
+ values = to_datetime(values, format='%Y%m%d', errors=errors, utc=tz)
+ except (TypeError, ValueError) as e:
+ raise ValueError("cannot assemble the "
+ "datetimes: {error}".format(error=e))
+
+ for u in ['h', 'm', 's', 'ms', 'us', 'ns']:
+ value = unit_rev.get(u)
+ if value is not None and value in arg:
+ try:
+ values += to_timedelta(coerce(arg[value]),
+ unit=u,
+ errors=errors)
+ except (TypeError, ValueError) as e:
+ raise ValueError("cannot assemble the datetimes [{value}]: "
+ "{error}".format(value=value, error=e))
+ if not box:
+ return values.values
+ return values
+
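+# Illustrative sketch of unit-mapping assembly (assumes pandas as ``pd``):
+# year/month/day columns are required, and recognised time columns such as
+# 'hour' are added on via to_timedelta.
+#
+#     import pandas as pd
+#
+#     df = pd.DataFrame({'year': [2015, 2016],
+#                        'month': [2, 3],
+#                        'day': [4, 5],
+#                        'hour': [10, 11]})
+#     pd.to_datetime(df)
+#     # expected: 2015-02-04 10:00:00 and 2016-03-05 11:00:00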
+
+def _attempt_YYYYMMDD(arg, errors):
+ """
+    Try to parse the YYYYMMDD/%Y%m%d format while handling NaT-like values.
+    arg is passed in as object dtype, but could really be ints/strings with
+    nan-like values, or floats (e.g. with nan).
+
+ Parameters
+ ----------
+ arg : passed value
+ errors : 'raise','ignore','coerce'
+ """
+
+ def calc(carg):
+ # calculate the actual result
+ carg = carg.astype(object)
+ parsed = parsing.try_parse_year_month_day(carg / 10000,
+ carg / 100 % 100,
+ carg % 100)
+ return tslib.array_to_datetime(parsed, errors=errors)[0]
+
+ def calc_with_mask(carg, mask):
+ result = np.empty(carg.shape, dtype='M8[ns]')
+ iresult = result.view('i8')
+ iresult[~mask] = tslibs.iNaT
+
+ masked_result = calc(carg[mask].astype(np.float64).astype(np.int64))
+ result[mask] = masked_result.astype('M8[ns]')
+ return result
+
+ # try intlike / strings that are ints
+ try:
+ return calc(arg.astype(np.int64))
+ except ValueError:
+ pass
+
+ # a float with actual np.nan
+ try:
+ carg = arg.astype(np.float64)
+ return calc_with_mask(carg, notna(carg))
+ except ValueError:
+ pass
+
+ # string with NaN-like
+ try:
+ mask = ~algorithms.isin(arg, list(tslib.nat_strings))
+ return calc_with_mask(arg, mask)
+ except ValueError:
+ pass
+
+ return None
+
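+# Illustrative sketch of the '%Y%m%d' fast path (assumes pandas as ``pd``
+# and numpy as ``np``): int-like, float-with-NaN and string inputs are all
+# handled, with NaN-like entries expected to become NaT.
+#
+#     import numpy as np
+#     import pandas as pd
+#
+#     pd.to_datetime(pd.Series([20190101, 20190102]), format='%Y%m%d')
+#     pd.to_datetime(pd.Series([20190101.0, np.nan]), format='%Y%m%d')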
+
+# Fixed time formats for time parsing
+_time_formats = ["%H:%M", "%H%M", "%I:%M%p", "%I%M%p",
+ "%H:%M:%S", "%H%M%S", "%I:%M:%S%p", "%I%M%S%p"]
+
+
+def _guess_time_format_for_array(arr):
+ # Try to guess the format based on the first non-NaN element
+ non_nan_elements = notna(arr).nonzero()[0]
+ if len(non_nan_elements):
+ element = arr[non_nan_elements[0]]
+ for time_format in _time_formats:
+ try:
+ datetime.strptime(element, time_format)
+ return time_format
+ except ValueError:
+ pass
+
+ return None
+
+
+def to_time(arg, format=None, infer_time_format=False, errors='raise'):
+ """
+ Parse time strings to time objects using fixed strptime formats ("%H:%M",
+ "%H%M", "%I:%M%p", "%I%M%p", "%H:%M:%S", "%H%M%S", "%I:%M:%S%p",
+ "%I%M%S%p")
+
+ Use infer_time_format if all the strings are in the same format to speed
+ up conversion.
+
+ Parameters
+ ----------
+ arg : string in time format, datetime.time, list, tuple, 1-d array, Series
+ format : str, default None
+ Format used to convert arg into a time object. If None, fixed formats
+ are used.
+    infer_time_format : bool, default False
+ Infer the time format based on the first non-NaN element. If all
+ strings are in the same format, this will speed up conversion.
+ errors : {'ignore', 'raise', 'coerce'}, default 'raise'
+ - If 'raise', then invalid parsing will raise an exception
+ - If 'coerce', then invalid parsing will be set as None
+ - If 'ignore', then invalid parsing will return the input
+
+ Returns
+ -------
+ datetime.time
+ """
+ from pandas.core.series import Series
+
+ def _convert_listlike(arg, format):
+
+ if isinstance(arg, (list, tuple)):
+ arg = np.array(arg, dtype='O')
+
+ elif getattr(arg, 'ndim', 1) > 1:
+ raise TypeError('arg must be a string, datetime, list, tuple, '
+ '1-d array, or Series')
+
+ arg = ensure_object(arg)
+
+ if infer_time_format and format is None:
+ format = _guess_time_format_for_array(arg)
+
+ times = []
+ if format is not None:
+ for element in arg:
+ try:
+ times.append(datetime.strptime(element, format).time())
+ except (ValueError, TypeError):
+ if errors == 'raise':
+ msg = ("Cannot convert {element} to a time with given "
+ "format {format}").format(element=element,
+ format=format)
+ raise ValueError(msg)
+ elif errors == 'ignore':
+ return arg
+ else:
+ times.append(None)
+ else:
+ formats = _time_formats[:]
+ format_found = False
+ for element in arg:
+ time_object = None
+ for time_format in formats:
+ try:
+ time_object = datetime.strptime(element,
+ time_format).time()
+ if not format_found:
+ # Put the found format in front
+ fmt = formats.pop(formats.index(time_format))
+ formats.insert(0, fmt)
+ format_found = True
+ break
+ except (ValueError, TypeError):
+ continue
+
+ if time_object is not None:
+ times.append(time_object)
+ elif errors == 'raise':
+ raise ValueError("Cannot convert arg {arg} to "
+ "a time".format(arg=arg))
+ elif errors == 'ignore':
+ return arg
+ else:
+ times.append(None)
+
+ return times
+
+ if arg is None:
+ return arg
+ elif isinstance(arg, time):
+ return arg
+ elif isinstance(arg, Series):
+ values = _convert_listlike(arg._values, format)
+ return Series(values, index=arg.index, name=arg.name)
+ elif isinstance(arg, ABCIndexClass):
+ return _convert_listlike(arg, format)
+ elif is_list_like(arg):
+ return _convert_listlike(arg, format)
+
+ return _convert_listlike(np.array([arg]), format)[0]
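+# Illustrative usage sketch for to_time. Note that in this version the
+# function lives in pandas.core.tools.datetimes rather than the top-level
+# namespace, so the import path below is an assumption about how a caller
+# would reach it:
+#
+#     from pandas.core.tools.datetimes import to_time
+#
+#     to_time('14:30')                        # datetime.time(14, 30)
+#     to_time(['14:30:15', '2:15PM'])         # list of datetime.time objects
+#     to_time('not a time', errors='coerce')  # expected to yield None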
diff --git a/contrib/python/pandas/py2/pandas/core/tools/numeric.py b/contrib/python/pandas/py2/pandas/core/tools/numeric.py
new file mode 100644
index 00000000000..803723dab46
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/core/tools/numeric.py
@@ -0,0 +1,179 @@
+import numpy as np
+
+from pandas._libs import lib
+
+from pandas.core.dtypes.cast import maybe_downcast_to_dtype
+from pandas.core.dtypes.common import (
+ ensure_object, is_datetime_or_timedelta_dtype, is_decimal, is_number,
+ is_numeric_dtype, is_scalar)
+from pandas.core.dtypes.generic import ABCIndexClass, ABCSeries
+
+import pandas as pd
+
+
+def to_numeric(arg, errors='raise', downcast=None):
+ """
+ Convert argument to a numeric type.
+
+ The default return dtype is `float64` or `int64`
+ depending on the data supplied. Use the `downcast` parameter
+ to obtain other dtypes.
+
+ Parameters
+ ----------
+ arg : list, tuple, 1-d array, or Series
+ errors : {'ignore', 'raise', 'coerce'}, default 'raise'
+ - If 'raise', then invalid parsing will raise an exception
+ - If 'coerce', then invalid parsing will be set as NaN
+ - If 'ignore', then invalid parsing will return the input
+ downcast : {'integer', 'signed', 'unsigned', 'float'} , default None
+ If not None, and if the data has been successfully cast to a
+ numerical dtype (or if the data was numeric to begin with),
+ downcast that resulting data to the smallest numerical dtype
+ possible according to the following rules:
+
+ - 'integer' or 'signed': smallest signed int dtype (min.: np.int8)
+ - 'unsigned': smallest unsigned int dtype (min.: np.uint8)
+ - 'float': smallest float dtype (min.: np.float32)
+
+ As this behaviour is separate from the core conversion to
+ numeric values, any errors raised during the downcasting
+ will be surfaced regardless of the value of the 'errors' input.
+
+ In addition, downcasting will only occur if the size
+ of the resulting data's dtype is strictly larger than
+ the dtype it is to be cast to, so if none of the dtypes
+ checked satisfy that specification, no downcasting will be
+ performed on the data.
+
+ .. versionadded:: 0.19.0
+
+ Returns
+ -------
+ ret : numeric if parsing succeeded.
+ Return type depends on input. Series if Series, otherwise ndarray
+
+ See Also
+ --------
+ pandas.DataFrame.astype : Cast argument to a specified dtype.
+ pandas.to_datetime : Convert argument to datetime.
+ pandas.to_timedelta : Convert argument to timedelta.
+ numpy.ndarray.astype : Cast a numpy array to a specified type.
+
+ Examples
+ --------
+ Take separate series and convert to numeric, coercing when told to
+
+ >>> s = pd.Series(['1.0', '2', -3])
+ >>> pd.to_numeric(s)
+ 0 1.0
+ 1 2.0
+ 2 -3.0
+ dtype: float64
+ >>> pd.to_numeric(s, downcast='float')
+ 0 1.0
+ 1 2.0
+ 2 -3.0
+ dtype: float32
+ >>> pd.to_numeric(s, downcast='signed')
+ 0 1
+ 1 2
+ 2 -3
+ dtype: int8
+ >>> s = pd.Series(['apple', '1.0', '2', -3])
+ >>> pd.to_numeric(s, errors='ignore')
+ 0 apple
+ 1 1.0
+ 2 2
+ 3 -3
+ dtype: object
+ >>> pd.to_numeric(s, errors='coerce')
+ 0 NaN
+ 1 1.0
+ 2 2.0
+ 3 -3.0
+ dtype: float64
+ """
+ if downcast not in (None, 'integer', 'signed', 'unsigned', 'float'):
+ raise ValueError('invalid downcasting method provided')
+
+ is_series = False
+ is_index = False
+ is_scalars = False
+
+ if isinstance(arg, ABCSeries):
+ is_series = True
+ values = arg.values
+ elif isinstance(arg, ABCIndexClass):
+ is_index = True
+ values = arg.asi8
+ if values is None:
+ values = arg.values
+ elif isinstance(arg, (list, tuple)):
+ values = np.array(arg, dtype='O')
+ elif is_scalar(arg):
+ if is_decimal(arg):
+ return float(arg)
+ if is_number(arg):
+ return arg
+ is_scalars = True
+ values = np.array([arg], dtype='O')
+ elif getattr(arg, 'ndim', 1) > 1:
+ raise TypeError('arg must be a list, tuple, 1-d array, or Series')
+ else:
+ values = arg
+
+ try:
+ if is_numeric_dtype(values):
+ pass
+ elif is_datetime_or_timedelta_dtype(values):
+ values = values.astype(np.int64)
+ else:
+ values = ensure_object(values)
+ coerce_numeric = False if errors in ('ignore', 'raise') else True
+ values = lib.maybe_convert_numeric(values, set(),
+ coerce_numeric=coerce_numeric)
+
+ except Exception:
+ if errors == 'raise':
+ raise
+
+ # attempt downcast only if the data has been successfully converted
+ # to a numerical dtype and if a downcast method has been specified
+ if downcast is not None and is_numeric_dtype(values):
+ typecodes = None
+
+ if downcast in ('integer', 'signed'):
+ typecodes = np.typecodes['Integer']
+ elif downcast == 'unsigned' and np.min(values) >= 0:
+ typecodes = np.typecodes['UnsignedInteger']
+ elif downcast == 'float':
+ typecodes = np.typecodes['Float']
+
+ # pandas support goes only to np.float32,
+ # as float dtypes smaller than that are
+ # extremely rare and not well supported
+ float_32_char = np.dtype(np.float32).char
+ float_32_ind = typecodes.index(float_32_char)
+ typecodes = typecodes[float_32_ind:]
+
+ if typecodes is not None:
+ # from smallest to largest
+ for dtype in typecodes:
+ if np.dtype(dtype).itemsize <= values.dtype.itemsize:
+ values = maybe_downcast_to_dtype(values, dtype)
+
+ # successful conversion
+ if values.dtype == dtype:
+ break
+
+ if is_series:
+ return pd.Series(values, index=arg.index, name=arg.name)
+ elif is_index:
+ # because we want to coerce to numeric if possible,
+ # do not use _shallow_copy_with_infer
+ return pd.Index(values, name=arg.name)
+ elif is_scalars:
+ return values[0]
+ else:
+ return values
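+# Illustrative sketch of the downcast rules above (assumes pandas as
+# ``pd``): downcasting only ever moves to a smaller dtype, and 'unsigned'
+# is skipped when negative values are present.
+#
+#     import pandas as pd
+#
+#     pd.to_numeric(pd.Series([1, 2, 3]), downcast='integer')    # int8
+#     pd.to_numeric(pd.Series([-1, 2, 3]), downcast='unsigned')  # stays int64
+#     pd.to_numeric(pd.Series([1.0, 2.0]), downcast='float')     # float32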
diff --git a/contrib/python/pandas/py2/pandas/core/tools/timedeltas.py b/contrib/python/pandas/py2/pandas/core/tools/timedeltas.py
new file mode 100644
index 00000000000..ddd21d0f62d
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/core/tools/timedeltas.py
@@ -0,0 +1,166 @@
+"""
+timedelta support tools
+"""
+
+import numpy as np
+
+from pandas._libs.tslibs.timedeltas import Timedelta, parse_timedelta_unit
+
+from pandas.core.dtypes.common import is_list_like
+from pandas.core.dtypes.generic import ABCIndexClass, ABCSeries
+
+import pandas as pd
+from pandas.core.arrays.timedeltas import sequence_to_td64ns
+
+
+def to_timedelta(arg, unit='ns', box=True, errors='raise'):
+ """
+ Convert argument to timedelta.
+
+ Timedeltas are absolute differences in times, expressed in difference
+ units (e.g. days, hours, minutes, seconds). This method converts
+ an argument from a recognized timedelta format / value into
+ a Timedelta type.
+
+ Parameters
+ ----------
+ arg : str, timedelta, list-like or Series
+ The data to be converted to timedelta.
+ unit : str, default 'ns'
+ Denotes the unit of the arg. Possible values:
+        ('Y', 'M', 'W', 'D', 'days', 'day', 'hours', 'hour', 'hr',
+ 'h', 'm', 'minute', 'min', 'minutes', 'T', 'S', 'seconds',
+ 'sec', 'second', 'ms', 'milliseconds', 'millisecond',
+ 'milli', 'millis', 'L', 'us', 'microseconds', 'microsecond',
+ 'micro', 'micros', 'U', 'ns', 'nanoseconds', 'nano', 'nanos',
+ 'nanosecond', 'N').
+ box : bool, default True
+ - If True returns a Timedelta/TimedeltaIndex of the results.
+        - If False returns a numpy.timedelta64 or numpy.ndarray of
+ values of dtype timedelta64[ns].
+ errors : {'ignore', 'raise', 'coerce'}, default 'raise'
+ - If 'raise', then invalid parsing will raise an exception.
+ - If 'coerce', then invalid parsing will be set as NaT.
+ - If 'ignore', then invalid parsing will return the input.
+
+ Returns
+ -------
+ timedelta64 or numpy.array of timedelta64
+ Output type returned if parsing succeeded.
+
+ See Also
+ --------
+ DataFrame.astype : Cast argument to a specified dtype.
+ to_datetime : Convert argument to datetime.
+
+ Examples
+ --------
+
+ Parsing a single string to a Timedelta:
+
+ >>> pd.to_timedelta('1 days 06:05:01.00003')
+ Timedelta('1 days 06:05:01.000030')
+ >>> pd.to_timedelta('15.5us')
+ Timedelta('0 days 00:00:00.000015')
+
+ Parsing a list or array of strings:
+
+ >>> pd.to_timedelta(['1 days 06:05:01.00003', '15.5us', 'nan'])
+ TimedeltaIndex(['1 days 06:05:01.000030', '0 days 00:00:00.000015', NaT],
+ dtype='timedelta64[ns]', freq=None)
+
+ Converting numbers by specifying the `unit` keyword argument:
+
+ >>> pd.to_timedelta(np.arange(5), unit='s')
+ TimedeltaIndex(['00:00:00', '00:00:01', '00:00:02',
+ '00:00:03', '00:00:04'],
+ dtype='timedelta64[ns]', freq=None)
+ >>> pd.to_timedelta(np.arange(5), unit='d')
+ TimedeltaIndex(['0 days', '1 days', '2 days', '3 days', '4 days'],
+ dtype='timedelta64[ns]', freq=None)
+
+ Returning an ndarray by using the 'box' keyword argument:
+
+ >>> pd.to_timedelta(np.arange(5), box=False)
+ array([0, 1, 2, 3, 4], dtype='timedelta64[ns]')
+ """
+ unit = parse_timedelta_unit(unit)
+
+ if errors not in ('ignore', 'raise', 'coerce'):
+ raise ValueError("errors must be one of 'ignore', "
+ "'raise', or 'coerce'}")
+
+ if arg is None:
+ return arg
+ elif isinstance(arg, ABCSeries):
+ from pandas import Series
+ values = _convert_listlike(arg._values, unit=unit,
+ box=False, errors=errors)
+ return Series(values, index=arg.index, name=arg.name)
+ elif isinstance(arg, ABCIndexClass):
+ return _convert_listlike(arg, unit=unit, box=box,
+ errors=errors, name=arg.name)
+ elif isinstance(arg, np.ndarray) and arg.ndim == 0:
+ # extract array scalar and process below
+ arg = arg.item()
+ elif is_list_like(arg) and getattr(arg, 'ndim', 1) == 1:
+ return _convert_listlike(arg, unit=unit, box=box, errors=errors)
+ elif getattr(arg, 'ndim', 1) > 1:
+ raise TypeError('arg must be a string, timedelta, list, tuple, '
+ '1-d array, or Series')
+
+ # ...so it must be a scalar value. Return scalar.
+ return _coerce_scalar_to_timedelta_type(arg, unit=unit,
+ box=box, errors=errors)
+
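+# Illustrative sketch of the dispatch and error handling above (assumes
+# pandas as ``pd``):
+#
+#     import pandas as pd
+#
+#     pd.to_timedelta('1 days 2 hours')        # scalar  -> Timedelta
+#     pd.to_timedelta('foo', errors='coerce')  # invalid -> NaT
+#     pd.to_timedelta('1 days', box=False)     # unboxed -> numpy.timedelta64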
+
+def _coerce_scalar_to_timedelta_type(r, unit='ns', box=True, errors='raise'):
+ """Convert string 'r' to a timedelta object."""
+
+ try:
+ result = Timedelta(r, unit)
+ if not box:
+ # explicitly view as timedelta64 for case when result is pd.NaT
+ result = result.asm8.view('timedelta64[ns]')
+ except ValueError:
+ if errors == 'raise':
+ raise
+ elif errors == 'ignore':
+ return r
+
+ # coerce
+ result = pd.NaT
+
+ return result
+
+
+def _convert_listlike(arg, unit='ns', box=True, errors='raise', name=None):
+ """Convert a list of objects to a timedelta index object."""
+
+ if isinstance(arg, (list, tuple)) or not hasattr(arg, 'dtype'):
+ # This is needed only to ensure that in the case where we end up
+ # returning arg (errors == "ignore"), and where the input is a
+ # generator, we return a useful list-like instead of a
+ # used-up generator
+ arg = np.array(list(arg), dtype=object)
+
+ try:
+ value = sequence_to_td64ns(arg, unit=unit,
+ errors=errors, copy=False)[0]
+ except ValueError:
+ if errors == 'ignore':
+ return arg
+ else:
+ # This else-block accounts for the cases when errors='raise'
+ # and errors='coerce'. If errors == 'raise', these errors
+ # should be raised. If errors == 'coerce', we shouldn't
+ # expect any errors to be raised, since all parsing errors
+ # cause coercion to pd.NaT. However, if an error / bug is
+ # introduced that causes an Exception to be raised, we would
+ # like to surface it.
+ raise
+
+ if box:
+ from pandas import TimedeltaIndex
+ value = TimedeltaIndex(value, unit='ns', name=name)
+ return value
diff --git a/contrib/python/pandas/py2/pandas/core/util/__init__.py b/contrib/python/pandas/py2/pandas/core/util/__init__.py
new file mode 100644
index 00000000000..e69de29bb2d
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/core/util/__init__.py
diff --git a/contrib/python/pandas/py2/pandas/core/util/hashing.py b/contrib/python/pandas/py2/pandas/core/util/hashing.py
new file mode 100644
index 00000000000..29fc1e3671a
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/core/util/hashing.py
@@ -0,0 +1,333 @@
+"""
+data hash pandas / numpy objects
+"""
+import itertools
+
+import numpy as np
+
+from pandas._libs import hashing, tslibs
+
+from pandas.core.dtypes.cast import infer_dtype_from_scalar
+from pandas.core.dtypes.common import (
+ is_categorical_dtype, is_extension_array_dtype, is_list_like)
+from pandas.core.dtypes.generic import (
+ ABCDataFrame, ABCIndexClass, ABCMultiIndex, ABCSeries)
+from pandas.core.dtypes.missing import isna
+
+# 16 byte long hashing key
+_default_hash_key = '0123456789123456'
+
+
+def _combine_hash_arrays(arrays, num_items):
+ """
+ Parameters
+ ----------
+ arrays : generator
+ num_items : int
+
+ Should be the same as CPython's tupleobject.c
+ """
+ try:
+ first = next(arrays)
+ except StopIteration:
+ return np.array([], dtype=np.uint64)
+
+ arrays = itertools.chain([first], arrays)
+
+ mult = np.uint64(1000003)
+ out = np.zeros_like(first) + np.uint64(0x345678)
+ for i, a in enumerate(arrays):
+ inverse_i = num_items - i
+ out ^= a
+ out *= mult
+ mult += np.uint64(82520 + inverse_i + inverse_i)
+ assert i + 1 == num_items, 'Fed in wrong num_items'
+ out += np.uint64(97531)
+ return out
+
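+# A minimal sketch (not from the original module) of how the combiner is
+# used below: per-column uint64 hash arrays are fed in as a generator
+# together with the number of items. Assumes numpy as ``np``.
+#
+#     import numpy as np
+#
+#     a = np.array([1, 2, 3], dtype='uint64')
+#     b = np.array([4, 5, 6], dtype='uint64')
+#     combined = _combine_hash_arrays(iter([a, b]), 2)
+#     # ``combined`` should be a length-3 uint64 array mixing the two
+#     # inputs element-wise, tuple-hash style.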
+
+def hash_pandas_object(obj, index=True, encoding='utf8', hash_key=None,
+ categorize=True):
+ """
+ Return a data hash of the Index/Series/DataFrame
+
+ .. versionadded:: 0.19.2
+
+ Parameters
+ ----------
+ index : boolean, default True
+ include the index in the hash (if Series/DataFrame)
+ encoding : string, default 'utf8'
+ encoding for data & key when strings
+ hash_key : string key to encode, default to _default_hash_key
+ categorize : bool, default True
+ Whether to first categorize object arrays before hashing. This is more
+ efficient when the array contains duplicate values.
+
+ .. versionadded:: 0.20.0
+
+ Returns
+ -------
+ Series of uint64, same length as the object
+ """
+ from pandas import Series
+ if hash_key is None:
+ hash_key = _default_hash_key
+
+ if isinstance(obj, ABCMultiIndex):
+ return Series(hash_tuples(obj, encoding, hash_key),
+ dtype='uint64', copy=False)
+
+ if isinstance(obj, ABCIndexClass):
+ h = hash_array(obj.values, encoding, hash_key,
+ categorize).astype('uint64', copy=False)
+ h = Series(h, index=obj, dtype='uint64', copy=False)
+ elif isinstance(obj, ABCSeries):
+ h = hash_array(obj.values, encoding, hash_key,
+ categorize).astype('uint64', copy=False)
+ if index:
+ index_iter = (hash_pandas_object(obj.index,
+ index=False,
+ encoding=encoding,
+ hash_key=hash_key,
+ categorize=categorize).values
+ for _ in [None])
+ arrays = itertools.chain([h], index_iter)
+ h = _combine_hash_arrays(arrays, 2)
+
+ h = Series(h, index=obj.index, dtype='uint64', copy=False)
+
+ elif isinstance(obj, ABCDataFrame):
+ hashes = (hash_array(series.values) for _, series in obj.iteritems())
+ num_items = len(obj.columns)
+ if index:
+ index_hash_generator = (hash_pandas_object(obj.index,
+ index=False,
+ encoding=encoding,
+ hash_key=hash_key,
+ categorize=categorize).values # noqa
+ for _ in [None])
+ num_items += 1
+ hashes = itertools.chain(hashes, index_hash_generator)
+ h = _combine_hash_arrays(hashes, num_items)
+
+ h = Series(h, index=obj.index, dtype='uint64', copy=False)
+ else:
+ raise TypeError("Unexpected type for hashing %s" % type(obj))
+ return h
+
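+# Illustrative usage sketch (assumes pandas as ``pd``; hash_pandas_object
+# is also exposed as pd.util.hash_pandas_object): the hash is deterministic
+# for a given hash_key and encoding.
+#
+#     import pandas as pd
+#
+#     df = pd.DataFrame({'a': [1, 2, 3], 'b': ['x', 'y', 'z']})
+#     h = pd.util.hash_pandas_object(df)          # uint64 Series, one per row
+#     h_no_idx = pd.util.hash_pandas_object(df, index=False)
+#     # equal rows (including the index when index=True) should produce
+#     # identical uint64 values.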
+
+def hash_tuples(vals, encoding='utf8', hash_key=None):
+ """
+    Hash a MultiIndex / list-of-tuples efficiently
+
+ .. versionadded:: 0.20.0
+
+ Parameters
+ ----------
+ vals : MultiIndex, list-of-tuples, or single tuple
+ encoding : string, default 'utf8'
+ hash_key : string key to encode, default to _default_hash_key
+
+ Returns
+ -------
+    ndarray of hashed values
+ """
+ is_tuple = False
+ if isinstance(vals, tuple):
+ vals = [vals]
+ is_tuple = True
+ elif not is_list_like(vals):
+ raise TypeError("must be convertible to a list-of-tuples")
+
+ from pandas import Categorical, MultiIndex
+
+ if not isinstance(vals, ABCMultiIndex):
+ vals = MultiIndex.from_tuples(vals)
+
+ # create a list-of-Categoricals
+ vals = [Categorical(vals.codes[level],
+ vals.levels[level],
+ ordered=False,
+ fastpath=True)
+ for level in range(vals.nlevels)]
+
+ # hash the list-of-ndarrays
+ hashes = (_hash_categorical(cat,
+ encoding=encoding,
+ hash_key=hash_key)
+ for cat in vals)
+ h = _combine_hash_arrays(hashes, len(vals))
+ if is_tuple:
+ h = h[0]
+
+ return h
+
+
+def hash_tuple(val, encoding='utf8', hash_key=None):
+ """
+ Hash a single tuple efficiently
+
+ Parameters
+ ----------
+ val : single tuple
+ encoding : string, default 'utf8'
+ hash_key : string key to encode, default to _default_hash_key
+
+ Returns
+ -------
+ hash
+
+ """
+ hashes = (_hash_scalar(v, encoding=encoding, hash_key=hash_key)
+ for v in val)
+
+ h = _combine_hash_arrays(hashes, len(val))[0]
+
+ return h
+
+
+def _hash_categorical(c, encoding, hash_key):
+ """
+ Hash a Categorical by hashing its categories, and then mapping the codes
+ to the hashes
+
+ Parameters
+ ----------
+ c : Categorical
+ encoding : string, default 'utf8'
+ hash_key : string key to encode, default to _default_hash_key
+
+ Returns
+ -------
+    ndarray of hashed values, same size as len(c)
+ """
+ # Convert ExtensionArrays to ndarrays
+ values = np.asarray(c.categories.values)
+ hashed = hash_array(values, encoding, hash_key,
+ categorize=False)
+
+ # we have uint64, as we don't directly support missing values
+ # we don't want to use take_nd which will coerce to float
+ # instead, directly construct the result with a
+ # max(np.uint64) as the missing value indicator
+ #
+ # TODO: GH 15362
+
+ mask = c.isna()
+ if len(hashed):
+ result = hashed.take(c.codes)
+ else:
+ result = np.zeros(len(mask), dtype='uint64')
+
+ if mask.any():
+ result[mask] = np.iinfo(np.uint64).max
+
+ return result
+
+
+def hash_array(vals, encoding='utf8', hash_key=None, categorize=True):
+ """
+ Given a 1d array, return an array of deterministic integers.
+
+ .. versionadded:: 0.19.2
+
+ Parameters
+ ----------
+ vals : ndarray, Categorical
+ encoding : string, default 'utf8'
+ encoding for data & key when strings
+ hash_key : string key to encode, default to _default_hash_key
+ categorize : bool, default True
+ Whether to first categorize object arrays before hashing. This is more
+ efficient when the array contains duplicate values.
+
+ .. versionadded:: 0.20.0
+
+ Returns
+ -------
+ 1d uint64 numpy array of hash values, same length as the vals
+ """
+
+ if not hasattr(vals, 'dtype'):
+ raise TypeError("must pass a ndarray-like")
+ dtype = vals.dtype
+
+ if hash_key is None:
+ hash_key = _default_hash_key
+
+ # For categoricals, we hash the categories, then remap the codes to the
+ # hash values. (This check is above the complex check so that we don't ask
+ # numpy if categorical is a subdtype of complex, as it will choke).
+ if is_categorical_dtype(dtype):
+ return _hash_categorical(vals, encoding, hash_key)
+ elif is_extension_array_dtype(dtype):
+ vals, _ = vals._values_for_factorize()
+ dtype = vals.dtype
+
+ # we'll be working with everything as 64-bit values, so handle this
+ # 128-bit value early
+ if np.issubdtype(dtype, np.complex128):
+ return hash_array(vals.real) + 23 * hash_array(vals.imag)
+
+ # First, turn whatever array this is into unsigned 64-bit ints, if we can
+ # manage it.
+ elif isinstance(dtype, np.bool):
+ vals = vals.astype('u8')
+ elif issubclass(dtype.type, (np.datetime64, np.timedelta64)):
+ vals = vals.view('i8').astype('u8', copy=False)
+ elif issubclass(dtype.type, np.number) and dtype.itemsize <= 8:
+ vals = vals.view('u{}'.format(vals.dtype.itemsize)).astype('u8')
+ else:
+        # With repeated values, it's MUCH faster to categorize object dtypes,
+ # then hash and rename categories. We allow skipping the categorization
+ # when the values are known/likely to be unique.
+ if categorize:
+ from pandas import factorize, Categorical, Index
+ codes, categories = factorize(vals, sort=False)
+ cat = Categorical(codes, Index(categories),
+ ordered=False, fastpath=True)
+ return _hash_categorical(cat, encoding, hash_key)
+
+ try:
+ vals = hashing.hash_object_array(vals, hash_key, encoding)
+ except TypeError:
+ # we have mixed types
+ vals = hashing.hash_object_array(vals.astype(str).astype(object),
+ hash_key, encoding)
+
+ # Then, redistribute these 64-bit ints within the space of 64-bit ints
+ vals ^= vals >> 30
+ vals *= np.uint64(0xbf58476d1ce4e5b9)
+ vals ^= vals >> 27
+ vals *= np.uint64(0x94d049bb133111eb)
+ vals ^= vals >> 31
+ return vals
+
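+# Illustrative sketch for hash_array (also exposed as pd.util.hash_array):
+# numeric and datetime-like arrays go through the 64-bit mixing step above,
+# while object arrays are either categorized first or hashed directly via
+# hashing.hash_object_array. Assumes numpy as ``np`` and pandas as ``pd``.
+#
+#     import numpy as np
+#     import pandas as pd
+#
+#     pd.util.hash_array(np.array([1, 2, 3]))                     # uint64 array
+#     pd.util.hash_array(np.array(['a', 'b', 'a'], dtype=object),
+#                        categorize=False)                        # uint64 array
+#     # equal input values should map to equal hash values.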
+
+def _hash_scalar(val, encoding='utf8', hash_key=None):
+ """
+ Hash scalar value
+
+ Returns
+ -------
+ 1d uint64 numpy array of hash value, of length 1
+ """
+
+ if isna(val):
+ # this is to be consistent with the _hash_categorical implementation
+ return np.array([np.iinfo(np.uint64).max], dtype='u8')
+
+ if getattr(val, 'tzinfo', None) is not None:
+ # for tz-aware datetimes, we need the underlying naive UTC value and
+ # not the tz aware object or pd extension type (as
+ # infer_dtype_from_scalar would do)
+ if not isinstance(val, tslibs.Timestamp):
+ val = tslibs.Timestamp(val)
+ val = val.tz_convert(None)
+
+ dtype, val = infer_dtype_from_scalar(val)
+ vals = np.array([val], dtype=dtype)
+
+ return hash_array(vals, hash_key=hash_key, encoding=encoding,
+ categorize=False)
diff --git a/contrib/python/pandas/py2/pandas/core/window.py b/contrib/python/pandas/py2/pandas/core/window.py
new file mode 100644
index 00000000000..5a9157b43ec
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/core/window.py
@@ -0,0 +1,2649 @@
+"""
+Provide a generic structure to support window functions,
+similar to how we have a Groupby object.
+"""
+from __future__ import division
+
+from collections import defaultdict
+from datetime import timedelta
+from textwrap import dedent
+import warnings
+
+import numpy as np
+
+import pandas._libs.window as libwindow
+import pandas.compat as compat
+from pandas.compat.numpy import function as nv
+from pandas.util._decorators import Appender, Substitution, cache_readonly
+
+from pandas.core.dtypes.common import (
+ ensure_float64, is_bool, is_float_dtype, is_integer, is_integer_dtype,
+ is_list_like, is_scalar, is_timedelta64_dtype, needs_i8_conversion)
+from pandas.core.dtypes.generic import (
+ ABCDataFrame, ABCDateOffset, ABCDatetimeIndex, ABCPeriodIndex, ABCSeries,
+ ABCTimedeltaIndex)
+
+from pandas.core.base import PandasObject, SelectionMixin
+import pandas.core.common as com
+from pandas.core.generic import _shared_docs
+from pandas.core.groupby.base import GroupByMixin
+
+_shared_docs = dict(**_shared_docs)
+_doc_template = """
+ Returns
+ -------
+ Series or DataFrame
+ Return type is determined by the caller.
+
+ See Also
+ --------
+ Series.%(name)s : Series %(name)s.
+ DataFrame.%(name)s : DataFrame %(name)s.
+"""
+
+
+class _Window(PandasObject, SelectionMixin):
+ _attributes = ['window', 'min_periods', 'center', 'win_type',
+ 'axis', 'on', 'closed']
+ exclusions = set()
+
+ def __init__(self, obj, window=None, min_periods=None,
+ center=False, win_type=None, axis=0, on=None, closed=None,
+ **kwargs):
+
+ self.__dict__.update(kwargs)
+ self.blocks = []
+ self.obj = obj
+ self.on = on
+ self.closed = closed
+ self.window = window
+ self.min_periods = min_periods
+ self.center = center
+ self.win_type = win_type
+ self.win_freq = None
+ self.axis = obj._get_axis_number(axis) if axis is not None else None
+ self.validate()
+
+ @property
+ def _constructor(self):
+ return Window
+
+ @property
+ def is_datetimelike(self):
+ return None
+
+ @property
+ def _on(self):
+ return None
+
+ @property
+ def is_freq_type(self):
+ return self.win_type == 'freq'
+
+ def validate(self):
+ if self.center is not None and not is_bool(self.center):
+ raise ValueError("center must be a boolean")
+ if (self.min_periods is not None and
+ not is_integer(self.min_periods)):
+ raise ValueError("min_periods must be an integer")
+ if (self.closed is not None and
+ self.closed not in ['right', 'both', 'left', 'neither']):
+ raise ValueError("closed must be 'right', 'left', 'both' or "
+ "'neither'")
+
+ def _convert_freq(self):
+ """
+ Resample according to the how, return a new object.
+ """
+ obj = self._selected_obj
+ index = None
+ return obj, index
+
+ def _create_blocks(self):
+ """
+ Split data into blocks & return conformed data.
+ """
+
+ obj, index = self._convert_freq()
+ if index is not None:
+ index = self._on
+
+ # filter out the on from the object
+ if self.on is not None:
+ if obj.ndim == 2:
+ obj = obj.reindex(columns=obj.columns.difference([self.on]),
+ copy=False)
+ blocks = obj._to_dict_of_blocks(copy=False).values()
+
+ return blocks, obj, index
+
+ def _gotitem(self, key, ndim, subset=None):
+ """
+ Sub-classes to define. Return a sliced object.
+
+ Parameters
+ ----------
+ key : str / list of selections
+ ndim : 1,2
+ requested ndim of result
+ subset : object, default None
+ subset to act on
+ """
+
+ # create a new object to prevent aliasing
+ if subset is None:
+ subset = self.obj
+ self = self._shallow_copy(subset)
+ self._reset_cache()
+ if subset.ndim == 2:
+ if is_scalar(key) and key in subset or is_list_like(key):
+ self._selection = key
+ return self
+
+ def __getattr__(self, attr):
+ if attr in self._internal_names_set:
+ return object.__getattribute__(self, attr)
+ if attr in self.obj:
+ return self[attr]
+
+ raise AttributeError("%r object has no attribute %r" %
+ (type(self).__name__, attr))
+
+ def _dir_additions(self):
+ return self.obj._dir_additions()
+
+ def _get_window(self, other=None):
+ return self.window
+
+ @property
+ def _window_type(self):
+ return self.__class__.__name__
+
+ def __unicode__(self):
+ """
+ Provide a nice str repr of our rolling object.
+ """
+
+ attrs = ["{k}={v}".format(k=k, v=getattr(self, k))
+ for k in self._attributes
+ if getattr(self, k, None) is not None]
+ return "{klass} [{attrs}]".format(klass=self._window_type,
+ attrs=','.join(attrs))
+
+ def __iter__(self):
+ url = 'https://github.com/pandas-dev/pandas/issues/11704'
+ raise NotImplementedError('See issue #11704 {url}'.format(url=url))
+
+ def _get_index(self, index=None):
+ """
+ Return index as ndarrays.
+
+ Returns
+ -------
+ tuple of (index, index_as_ndarray)
+ """
+
+ if self.is_freq_type:
+ if index is None:
+ index = self._on
+ return index, index.asi8
+ return index, index
+
+ def _prep_values(self, values=None, kill_inf=True):
+
+ if values is None:
+ values = getattr(self._selected_obj, 'values', self._selected_obj)
+
+ # GH #12373 : rolling functions error on float32 data
+ # make sure the data is coerced to float64
+ if is_float_dtype(values.dtype):
+ values = ensure_float64(values)
+ elif is_integer_dtype(values.dtype):
+ values = ensure_float64(values)
+ elif needs_i8_conversion(values.dtype):
+ raise NotImplementedError("ops for {action} for this "
+ "dtype {dtype} are not "
+ "implemented".format(
+ action=self._window_type,
+ dtype=values.dtype))
+ else:
+ try:
+ values = ensure_float64(values)
+ except (ValueError, TypeError):
+ raise TypeError("cannot handle this type -> {0}"
+ "".format(values.dtype))
+
+ if kill_inf:
+ values = values.copy()
+ values[np.isinf(values)] = np.NaN
+
+ return values
+
+ def _wrap_result(self, result, block=None, obj=None):
+ """
+ Wrap a single result.
+ """
+
+ if obj is None:
+ obj = self._selected_obj
+ index = obj.index
+
+ if isinstance(result, np.ndarray):
+
+ # coerce if necessary
+ if block is not None:
+ if is_timedelta64_dtype(block.values.dtype):
+ from pandas import to_timedelta
+ result = to_timedelta(
+ result.ravel(), unit='ns').values.reshape(result.shape)
+
+ if result.ndim == 1:
+ from pandas import Series
+ return Series(result, index, name=obj.name)
+
+ return type(obj)(result, index=index, columns=block.columns)
+ return result
+
+ def _wrap_results(self, results, blocks, obj):
+ """
+ Wrap the results.
+
+ Parameters
+ ----------
+ results : list of ndarrays
+ blocks : list of blocks
+ obj : conformed data (may be resampled)
+ """
+
+ from pandas import Series, concat
+ from pandas.core.index import ensure_index
+
+ final = []
+ for result, block in zip(results, blocks):
+
+ result = self._wrap_result(result, block=block, obj=obj)
+ if result.ndim == 1:
+ return result
+ final.append(result)
+
+ # if we have an 'on' column
+ # we want to put it back into the results
+ # in the same location
+ columns = self._selected_obj.columns
+ if self.on is not None and not self._on.equals(obj.index):
+
+ name = self._on.name
+ final.append(Series(self._on, index=obj.index, name=name))
+
+ if self._selection is not None:
+
+ selection = ensure_index(self._selection)
+
+ # need to reorder to include original location of
+ # the on column (if its not already there)
+ if name not in selection:
+ columns = self.obj.columns
+ indexer = columns.get_indexer(selection.tolist() + [name])
+ columns = columns.take(sorted(indexer))
+
+ if not len(final):
+ return obj.astype('float64')
+ return concat(final, axis=1).reindex(columns=columns, copy=False)
+
+ def _center_window(self, result, window):
+ """
+ Center the result in the window.
+ """
+ if self.axis > result.ndim - 1:
+ raise ValueError("Requested axis is larger then no. of argument "
+ "dimensions")
+
+ offset = _offset(window, True)
+ if offset > 0:
+ if isinstance(result, (ABCSeries, ABCDataFrame)):
+ result = result.slice_shift(-offset, axis=self.axis)
+ else:
+ lead_indexer = [slice(None)] * result.ndim
+ lead_indexer[self.axis] = slice(offset, None)
+ result = np.copy(result[tuple(lead_indexer)])
+ return result
+
+ def aggregate(self, arg, *args, **kwargs):
+ result, how = self._aggregate(arg, *args, **kwargs)
+ if result is None:
+ return self.apply(arg, raw=False, args=args, kwargs=kwargs)
+ return result
+
+ agg = aggregate
+
+ _shared_docs['sum'] = dedent("""
+ Calculate %(name)s sum of given DataFrame or Series.
+
+ Parameters
+ ----------
+ *args, **kwargs
+ For compatibility with other %(name)s methods. Has no effect
+ on the computed value.
+
+ Returns
+ -------
+ Series or DataFrame
+ Same type as the input, with the same index, containing the
+ %(name)s sum.
+
+ See Also
+ --------
+ Series.sum : Reducing sum for Series.
+ DataFrame.sum : Reducing sum for DataFrame.
+
+ Examples
+ --------
+ >>> s = pd.Series([1, 2, 3, 4, 5])
+ >>> s
+ 0 1
+ 1 2
+ 2 3
+ 3 4
+ 4 5
+ dtype: int64
+
+ >>> s.rolling(3).sum()
+ 0 NaN
+ 1 NaN
+ 2 6.0
+ 3 9.0
+ 4 12.0
+ dtype: float64
+
+ >>> s.expanding(3).sum()
+ 0 NaN
+ 1 NaN
+ 2 6.0
+ 3 10.0
+ 4 15.0
+ dtype: float64
+
+ >>> s.rolling(3, center=True).sum()
+ 0 NaN
+ 1 6.0
+ 2 9.0
+ 3 12.0
+ 4 NaN
+ dtype: float64
+
+ For DataFrame, each %(name)s sum is computed column-wise.
+
+ >>> df = pd.DataFrame({"A": s, "B": s ** 2})
+ >>> df
+ A B
+ 0 1 1
+ 1 2 4
+ 2 3 9
+ 3 4 16
+ 4 5 25
+
+ >>> df.rolling(3).sum()
+ A B
+ 0 NaN NaN
+ 1 NaN NaN
+ 2 6.0 14.0
+ 3 9.0 29.0
+ 4 12.0 50.0
+ """)
+
+ _shared_docs['mean'] = dedent("""
+ Calculate the %(name)s mean of the values.
+
+ Parameters
+ ----------
+ *args
+ Under Review.
+ **kwargs
+ Under Review.
+
+ Returns
+ -------
+ Series or DataFrame
+ Returned object type is determined by the caller of the %(name)s
+ calculation.
+
+ See Also
+ --------
+ Series.%(name)s : Calling object with Series data.
+ DataFrame.%(name)s : Calling object with DataFrames.
+ Series.mean : Equivalent method for Series.
+ DataFrame.mean : Equivalent method for DataFrame.
+
+ Examples
+ --------
+    The examples below show rolling mean calculations with window sizes of
+    two and three, respectively.
+
+ >>> s = pd.Series([1, 2, 3, 4])
+ >>> s.rolling(2).mean()
+ 0 NaN
+ 1 1.5
+ 2 2.5
+ 3 3.5
+ dtype: float64
+
+ >>> s.rolling(3).mean()
+ 0 NaN
+ 1 NaN
+ 2 2.0
+ 3 3.0
+ dtype: float64
+ """)
+
+
+class Window(_Window):
+ """
+ Provides rolling window calculations.
+
+ .. versionadded:: 0.18.0
+
+ Parameters
+ ----------
+ window : int, or offset
+ Size of the moving window. This is the number of observations used for
+ calculating the statistic. Each window will be a fixed size.
+
+        If it's an offset then this will be the time period of each window.
+        Each window will be variably sized based on the observations included
+        in the time period. This is only valid for datetimelike indexes. This
+        is new in 0.19.0.
+ min_periods : int, default None
+ Minimum number of observations in window required to have a value
+ (otherwise result is NA). For a window that is specified by an offset,
+ `min_periods` will default to 1. Otherwise, `min_periods` will default
+ to the size of the window.
+ center : bool, default False
+ Set the labels at the center of the window.
+ win_type : str, default None
+ Provide a window type. If ``None``, all points are evenly weighted.
+ See the notes below for further information.
+ on : str, optional
+        For a DataFrame, a column on which to calculate
+        the rolling window, rather than the index.
+ axis : int or str, default 0
+ closed : str, default None
+ Make the interval closed on the 'right', 'left', 'both' or
+ 'neither' endpoints.
+ For offset-based windows, it defaults to 'right'.
+ For fixed windows, defaults to 'both'. Remaining cases not implemented
+ for fixed windows.
+
+ .. versionadded:: 0.20.0
+
+ Returns
+ -------
+ a Window or Rolling sub-classed for the particular operation
+
+ See Also
+ --------
+ expanding : Provides expanding transformations.
+ ewm : Provides exponential weighted functions.
+
+ Notes
+ -----
+ By default, the result is set to the right edge of the window. This can be
+ changed to the center of the window by setting ``center=True``.
+
+ To learn more about the offsets & frequency strings, please see `this link
+ <http://pandas.pydata.org/pandas-docs/stable/timeseries.html#offset-aliases>`__.
+
+ The recognized win_types are:
+
+ * ``boxcar``
+ * ``triang``
+ * ``blackman``
+ * ``hamming``
+ * ``bartlett``
+ * ``parzen``
+ * ``bohman``
+ * ``blackmanharris``
+ * ``nuttall``
+ * ``barthann``
+ * ``kaiser`` (needs beta)
+ * ``gaussian`` (needs std)
+ * ``general_gaussian`` (needs power, width)
+ * ``slepian`` (needs width).
+
+ If ``win_type=None`` all points are evenly weighted. To learn more about
+ different window types see `scipy.signal window functions
+ <https://docs.scipy.org/doc/scipy/reference/signal.html#window-functions>`__.
+
+ Examples
+ --------
+
+ >>> df = pd.DataFrame({'B': [0, 1, 2, np.nan, 4]})
+ >>> df
+ B
+ 0 0.0
+ 1 1.0
+ 2 2.0
+ 3 NaN
+ 4 4.0
+
+ Rolling sum with a window length of 2, using the 'triang'
+ window type.
+
+ >>> df.rolling(2, win_type='triang').sum()
+ B
+ 0 NaN
+ 1 1.0
+ 2 2.5
+ 3 NaN
+ 4 NaN
+
+    Rolling sum with a window length of 2; min_periods defaults
+    to the window length.
+
+ >>> df.rolling(2).sum()
+ B
+ 0 NaN
+ 1 1.0
+ 2 3.0
+ 3 NaN
+ 4 NaN
+
+    Same as above, but explicitly setting min_periods to 1.
+
+ >>> df.rolling(2, min_periods=1).sum()
+ B
+ 0 0.0
+ 1 1.0
+ 2 3.0
+ 3 2.0
+ 4 4.0
+
+ A ragged (meaning not-a-regular frequency), time-indexed DataFrame
+
+ >>> df = pd.DataFrame({'B': [0, 1, 2, np.nan, 4]},
+ ... index = [pd.Timestamp('20130101 09:00:00'),
+ ... pd.Timestamp('20130101 09:00:02'),
+ ... pd.Timestamp('20130101 09:00:03'),
+ ... pd.Timestamp('20130101 09:00:05'),
+ ... pd.Timestamp('20130101 09:00:06')])
+
+ >>> df
+ B
+ 2013-01-01 09:00:00 0.0
+ 2013-01-01 09:00:02 1.0
+ 2013-01-01 09:00:03 2.0
+ 2013-01-01 09:00:05 NaN
+ 2013-01-01 09:00:06 4.0
+
+ Contrasting to an integer rolling window, this will roll a variable
+ length window corresponding to the time period.
+ The default for min_periods is 1.
+
+ >>> df.rolling('2s').sum()
+ B
+ 2013-01-01 09:00:00 0.0
+ 2013-01-01 09:00:02 1.0
+ 2013-01-01 09:00:03 3.0
+ 2013-01-01 09:00:05 NaN
+ 2013-01-01 09:00:06 4.0
+ """
+
+ def validate(self):
+ super(Window, self).validate()
+
+ window = self.window
+ if isinstance(window, (list, tuple, np.ndarray)):
+ pass
+ elif is_integer(window):
+ if window <= 0:
+ raise ValueError("window must be > 0 ")
+ try:
+ import scipy.signal as sig
+ except ImportError:
+ raise ImportError('Please install scipy to generate window '
+ 'weight')
+
+ if not isinstance(self.win_type, compat.string_types):
+ raise ValueError('Invalid win_type {0}'.format(self.win_type))
+ if getattr(sig, self.win_type, None) is None:
+ raise ValueError('Invalid win_type {0}'.format(self.win_type))
+ else:
+ raise ValueError('Invalid window {0}'.format(window))
+
+ def _prep_window(self, **kwargs):
+ """
+        Provide validation for the window type and return the window
+        that we have already validated.
+ """
+
+ window = self._get_window()
+ if isinstance(window, (list, tuple, np.ndarray)):
+ return com.asarray_tuplesafe(window).astype(float)
+ elif is_integer(window):
+ import scipy.signal as sig
+
+ # the below may pop from kwargs
+ def _validate_win_type(win_type, kwargs):
+ arg_map = {'kaiser': ['beta'],
+ 'gaussian': ['std'],
+ 'general_gaussian': ['power', 'width'],
+ 'slepian': ['width']}
+ if win_type in arg_map:
+ return tuple([win_type] + _pop_args(win_type,
+ arg_map[win_type],
+ kwargs))
+ return win_type
+
+ def _pop_args(win_type, arg_names, kwargs):
+ msg = '%s window requires %%s' % win_type
+ all_args = []
+ for n in arg_names:
+ if n not in kwargs:
+ raise ValueError(msg % n)
+ all_args.append(kwargs.pop(n))
+ return all_args
+
+ win_type = _validate_win_type(self.win_type, kwargs)
+ # GH #15662. `False` makes symmetric window, rather than periodic.
+ return sig.get_window(win_type, window, False).astype(float)
+
+ def _apply_window(self, mean=True, **kwargs):
+ """
+        Applies a moving window of type ``win_type`` to the data.
+
+ Parameters
+ ----------
+ mean : bool, default True
+            If True, computes the weighted mean; otherwise the weighted sum.
+
+ Returns
+ -------
+ y : same type as input argument
+
+ """
+ window = self._prep_window(**kwargs)
+ center = self.center
+
+ blocks, obj, index = self._create_blocks()
+ results = []
+ for b in blocks:
+ try:
+ values = self._prep_values(b.values)
+ except TypeError:
+ results.append(b.values.copy())
+ continue
+
+ if values.size == 0:
+ results.append(values.copy())
+ continue
+
+ offset = _offset(window, center)
+ additional_nans = np.array([np.NaN] * offset)
+
+            def f(arg, *args, **kwargs):
+                minp = _use_window(self.min_periods, len(window))
+                if center:
+                    arg = np.concatenate((arg, additional_nans))
+                return libwindow.roll_window(arg, window, minp, avg=mean)
+
+ result = np.apply_along_axis(f, self.axis, values)
+
+ if center:
+ result = self._center_window(result, window)
+ results.append(result)
+
+ return self._wrap_results(results, blocks, obj)
+
+ _agg_see_also_doc = dedent("""
+ See Also
+ --------
+ pandas.DataFrame.rolling.aggregate
+ pandas.DataFrame.aggregate
+ """)
+
+ _agg_examples_doc = dedent("""
+ Examples
+ --------
+
+ >>> df = pd.DataFrame(np.random.randn(10, 3), columns=['A', 'B', 'C'])
+ >>> df
+ A B C
+ 0 -2.385977 -0.102758 0.438822
+ 1 -1.004295 0.905829 -0.954544
+ 2 0.735167 -0.165272 -1.619346
+ 3 -0.702657 -1.340923 -0.706334
+ 4 -0.246845 0.211596 -0.901819
+ 5 2.463718 3.157577 -1.380906
+ 6 -1.142255 2.340594 -0.039875
+ 7 1.396598 -1.647453 1.677227
+ 8 -0.543425 1.761277 -0.220481
+ 9 -0.640505 0.289374 -1.550670
+
+ >>> df.rolling(3, win_type='boxcar').agg('mean')
+ A B C
+ 0 NaN NaN NaN
+ 1 NaN NaN NaN
+ 2 -0.885035 0.212600 -0.711689
+ 3 -0.323928 -0.200122 -1.093408
+ 4 -0.071445 -0.431533 -1.075833
+ 5 0.504739 0.676083 -0.996353
+ 6 0.358206 1.903256 -0.774200
+ 7 0.906020 1.283573 0.085482
+ 8 -0.096361 0.818139 0.472290
+ 9 0.070889 0.134399 -0.031308
+ """)
+
+ @Substitution(see_also=_agg_see_also_doc,
+ examples=_agg_examples_doc,
+ versionadded='',
+ klass='Series/DataFrame',
+ axis='')
+ @Appender(_shared_docs['aggregate'])
+ def aggregate(self, arg, *args, **kwargs):
+ result, how = self._aggregate(arg, *args, **kwargs)
+ if result is None:
+
+ # these must apply directly
+ result = arg(self)
+
+ return result
+
+ agg = aggregate
+
+ @Substitution(name='window')
+ @Appender(_shared_docs['sum'])
+ def sum(self, *args, **kwargs):
+ nv.validate_window_func('sum', args, kwargs)
+ return self._apply_window(mean=False, **kwargs)
+
+ @Substitution(name='window')
+ @Appender(_shared_docs['mean'])
+ def mean(self, *args, **kwargs):
+ nv.validate_window_func('mean', args, kwargs)
+ return self._apply_window(mean=True, **kwargs)
+
+
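+# Illustrative sketch (hypothetical helper): roughly how ``win_type`` plus its
+# extra arguments reach ``scipy.signal.get_window`` in Window._prep_window
+# above. Assumes scipy is installed; 'kaiser' is one of the win_types that
+# needs an extra parameter (``beta``).
+def _example_window_weights():
+    import scipy.signal as sig
+    # symmetric (non-periodic) 5-tap kaiser window, as used for weighting
+    return sig.get_window(('kaiser', 14.0), 5, False).astype(float)
+
+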
+class _GroupByMixin(GroupByMixin):
+ """
+ Provide the groupby facilities.
+ """
+
+ def __init__(self, obj, *args, **kwargs):
+ parent = kwargs.pop('parent', None) # noqa
+ groupby = kwargs.pop('groupby', None)
+ if groupby is None:
+ groupby, obj = obj, obj.obj
+ self._groupby = groupby
+ self._groupby.mutated = True
+ self._groupby.grouper.mutated = True
+ super(GroupByMixin, self).__init__(obj, *args, **kwargs)
+
+ count = GroupByMixin._dispatch('count')
+ corr = GroupByMixin._dispatch('corr', other=None, pairwise=None)
+ cov = GroupByMixin._dispatch('cov', other=None, pairwise=None)
+
+ def _apply(self, func, name, window=None, center=None,
+ check_minp=None, **kwargs):
+ """
+ Dispatch to apply; we are stripping all of the _apply kwargs and
+ performing the original function call on the grouped object.
+ """
+
+ def f(x, name=name, *args):
+ x = self._shallow_copy(x)
+
+ if isinstance(name, compat.string_types):
+ return getattr(x, name)(*args, **kwargs)
+
+ return x.apply(name, *args, **kwargs)
+
+ return self._groupby.apply(f)
+
+
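+# Usage sketch (hypothetical helper, made-up data): _GroupByMixin re-dispatches
+# window methods through ``groupby.apply`` so that each group gets its own
+# independent window, which is what ``df.groupby(...).rolling(...)`` does.
+def _example_groupby_rolling():
+    from pandas import DataFrame
+    df = DataFrame({'g': ['a', 'a', 'b', 'b'], 'v': [1., 2., 3., 4.]})
+    # each group is rolled separately; windows never cross group boundaries
+    return df.groupby('g')['v'].rolling(2).sum()
+
+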
+class _Rolling(_Window):
+
+ @property
+ def _constructor(self):
+ return Rolling
+
+ def _apply(self, func, name=None, window=None, center=None,
+ check_minp=None, **kwargs):
+ """
+ Rolling statistical measure using supplied function.
+
+ Designed to be used with passed-in Cython array-based functions.
+
+ Parameters
+ ----------
+ func : str/callable to apply
+ name : str, optional
+ name of this function
+        window : int/array, defaults to _get_window()
+        center : bool, defaults to self.center
+        check_minp : function, defaults to _use_window
+
+ Returns
+ -------
+ y : type of input
+ """
+ if center is None:
+ center = self.center
+ if window is None:
+ window = self._get_window()
+
+ if check_minp is None:
+ check_minp = _use_window
+
+ blocks, obj, index = self._create_blocks()
+ index, indexi = self._get_index(index=index)
+ results = []
+ for b in blocks:
+ values = self._prep_values(b.values)
+
+ if values.size == 0:
+ results.append(values.copy())
+ continue
+
+ # if we have a string function name, wrap it
+ if isinstance(func, compat.string_types):
+ cfunc = getattr(libwindow, func, None)
+ if cfunc is None:
+ raise ValueError("we do not support this function "
+ "in libwindow.{func}".format(func=func))
+
+ def func(arg, window, min_periods=None, closed=None):
+ minp = check_minp(min_periods, window)
+ # ensure we are only rolling on floats
+ arg = ensure_float64(arg)
+ return cfunc(arg,
+ window, minp, indexi, closed, **kwargs)
+
+ # calculation function
+ if center:
+ offset = _offset(window, center)
+ additional_nans = np.array([np.NaN] * offset)
+
+ def calc(x):
+ return func(np.concatenate((x, additional_nans)),
+ window, min_periods=self.min_periods,
+ closed=self.closed)
+ else:
+
+ def calc(x):
+ return func(x, window, min_periods=self.min_periods,
+ closed=self.closed)
+
+ with np.errstate(all='ignore'):
+ if values.ndim > 1:
+ result = np.apply_along_axis(calc, self.axis, values)
+ else:
+ result = calc(values)
+
+ if center:
+ result = self._center_window(result, window)
+
+ results.append(result)
+
+ return self._wrap_results(results, blocks, obj)
+
+
+class _Rolling_and_Expanding(_Rolling):
+
+ _shared_docs['count'] = dedent(r"""
+ The %(name)s count of any non-NaN observations inside the window.
+
+ Returns
+ -------
+ Series or DataFrame
+ Returned object type is determined by the caller of the %(name)s
+ calculation.
+
+ See Also
+ --------
+ pandas.Series.%(name)s : Calling object with Series data.
+ pandas.DataFrame.%(name)s : Calling object with DataFrames.
+ pandas.DataFrame.count : Count of the full DataFrame.
+
+ Examples
+ --------
+ >>> s = pd.Series([2, 3, np.nan, 10])
+ >>> s.rolling(2).count()
+ 0 1.0
+ 1 2.0
+ 2 1.0
+ 3 1.0
+ dtype: float64
+ >>> s.rolling(3).count()
+ 0 1.0
+ 1 2.0
+ 2 2.0
+ 3 2.0
+ dtype: float64
+ >>> s.rolling(4).count()
+ 0 1.0
+ 1 2.0
+ 2 2.0
+ 3 3.0
+ dtype: float64
+ """)
+
+ def count(self):
+
+ blocks, obj, index = self._create_blocks()
+ # Validate the index
+ self._get_index(index=index)
+
+ window = self._get_window()
+ window = min(window, len(obj)) if not self.center else window
+
+ results = []
+ for b in blocks:
+ result = b.notna().astype(int)
+ result = self._constructor(result, window=window, min_periods=0,
+ center=self.center,
+ closed=self.closed).sum()
+ results.append(result)
+
+ return self._wrap_results(results, blocks, obj)
+
+ _shared_docs['apply'] = dedent(r"""
+    Apply an arbitrary function to each %(name)s window.
+
+ Parameters
+ ----------
+ func : function
+ Must produce a single value from an ndarray input if ``raw=True``
+ or a Series if ``raw=False``.
+ raw : bool, default None
+ * ``False`` : passes each row or column as a Series to the
+ function.
+ * ``True`` or ``None`` : the passed function will receive ndarray
+ objects instead.
+ If you are just applying a NumPy reduction function this will
+ achieve much better performance.
+
+ The `raw` parameter is required and will show a FutureWarning if
+ not passed. In the future `raw` will default to False.
+
+ .. versionadded:: 0.23.0
+ *args, **kwargs
+ Arguments and keyword arguments to be passed into func.
+
+ Returns
+ -------
+ Series or DataFrame
+ Return type is determined by the caller.
+
+ See Also
+ --------
+ Series.%(name)s : Series %(name)s.
+ DataFrame.%(name)s : DataFrame %(name)s.
+ """)
+
+ def apply(self, func, raw=None, args=(), kwargs={}):
+ from pandas import Series
+
+ # TODO: _level is unused?
+ _level = kwargs.pop('_level', None) # noqa
+ window = self._get_window()
+ offset = _offset(window, self.center)
+ index, indexi = self._get_index()
+
+ # TODO: default is for backward compat
+ # change to False in the future
+ if raw is None:
+ warnings.warn(
+ "Currently, 'apply' passes the values as ndarrays to the "
+ "applied function. In the future, this will change to passing "
+ "it as Series objects. You need to specify 'raw=True' to keep "
+ "the current behaviour, and you can pass 'raw=False' to "
+ "silence this warning", FutureWarning, stacklevel=3)
+ raw = True
+
+ def f(arg, window, min_periods, closed):
+ minp = _use_window(min_periods, window)
+ if not raw:
+ arg = Series(arg, index=self.obj.index)
+ return libwindow.roll_generic(
+ arg, window, minp, indexi,
+ closed, offset, func, raw, args, kwargs)
+
+ return self._apply(f, func, args=args, kwargs=kwargs,
+ center=False, raw=raw)
+
+ def sum(self, *args, **kwargs):
+ nv.validate_window_func('sum', args, kwargs)
+ return self._apply('roll_sum', 'sum', **kwargs)
+
+ _shared_docs['max'] = dedent("""
+ Calculate the %(name)s maximum.
+
+ Parameters
+ ----------
+ *args, **kwargs
+ Arguments and keyword arguments to be passed into func.
+ """)
+
+ def max(self, *args, **kwargs):
+ nv.validate_window_func('max', args, kwargs)
+ return self._apply('roll_max', 'max', **kwargs)
+
+ _shared_docs['min'] = dedent("""
+ Calculate the %(name)s minimum.
+
+ Parameters
+ ----------
+    **kwargs
+        For compatibility with other %(name)s methods. Has no effect
+        on the computed value.
+
+ Returns
+ -------
+ Series or DataFrame
+ Returned object type is determined by the caller of the %(name)s
+ calculation.
+
+ See Also
+ --------
+ Series.%(name)s : Calling object with a Series.
+ DataFrame.%(name)s : Calling object with a DataFrame.
+ Series.min : Similar method for Series.
+ DataFrame.min : Similar method for DataFrame.
+
+ Examples
+ --------
+ Performing a rolling minimum with a window size of 3.
+
+ >>> s = pd.Series([4, 3, 5, 2, 6])
+ >>> s.rolling(3).min()
+ 0 NaN
+ 1 NaN
+ 2 3.0
+ 3 2.0
+ 4 2.0
+ dtype: float64
+ """)
+
+ def min(self, *args, **kwargs):
+ nv.validate_window_func('min', args, kwargs)
+ return self._apply('roll_min', 'min', **kwargs)
+
+ def mean(self, *args, **kwargs):
+ nv.validate_window_func('mean', args, kwargs)
+ return self._apply('roll_mean', 'mean', **kwargs)
+
+ _shared_docs['median'] = dedent("""
+ Calculate the %(name)s median.
+
+ Parameters
+ ----------
+ **kwargs
+ For compatibility with other %(name)s methods. Has no effect
+ on the computed median.
+
+ Returns
+ -------
+ Series or DataFrame
+ Returned type is the same as the original object.
+
+ See Also
+ --------
+ Series.%(name)s : Calling object with Series data.
+ DataFrame.%(name)s : Calling object with DataFrames.
+ Series.median : Equivalent method for Series.
+ DataFrame.median : Equivalent method for DataFrame.
+
+ Examples
+ --------
+ Compute the rolling median of a series with a window size of 3.
+
+ >>> s = pd.Series([0, 1, 2, 3, 4])
+ >>> s.rolling(3).median()
+ 0 NaN
+ 1 NaN
+ 2 1.0
+ 3 2.0
+ 4 3.0
+ dtype: float64
+ """)
+
+ def median(self, **kwargs):
+ return self._apply('roll_median_c', 'median', **kwargs)
+
+ _shared_docs['std'] = dedent("""
+ Calculate %(name)s standard deviation.
+
+ Normalized by N-1 by default. This can be changed using the `ddof`
+ argument.
+
+ Parameters
+ ----------
+ ddof : int, default 1
+ Delta Degrees of Freedom. The divisor used in calculations
+ is ``N - ddof``, where ``N`` represents the number of elements.
+ *args, **kwargs
+ For NumPy compatibility. No additional arguments are used.
+
+ Returns
+ -------
+ Series or DataFrame
+ Returns the same object type as the caller of the %(name)s calculation.
+
+ See Also
+ --------
+ Series.%(name)s : Calling object with Series data.
+ DataFrame.%(name)s : Calling object with DataFrames.
+ Series.std : Equivalent method for Series.
+ DataFrame.std : Equivalent method for DataFrame.
+ numpy.std : Equivalent method for Numpy array.
+
+ Notes
+ -----
+    The default `ddof` of 1 used in :meth:`Series.std` is different from the
+    default `ddof` of 0 in :func:`numpy.std`.
+
+ A minimum of one period is required for the rolling calculation.
+
+ Examples
+ --------
+ >>> s = pd.Series([5, 5, 6, 7, 5, 5, 5])
+ >>> s.rolling(3).std()
+ 0 NaN
+ 1 NaN
+ 2 0.577350
+ 3 1.000000
+ 4 1.000000
+ 5 1.154701
+ 6 0.000000
+ dtype: float64
+
+ >>> s.expanding(3).std()
+ 0 NaN
+ 1 NaN
+ 2 0.577350
+ 3 0.957427
+ 4 0.894427
+ 5 0.836660
+ 6 0.786796
+ dtype: float64
+ """)
+
+ def std(self, ddof=1, *args, **kwargs):
+ nv.validate_window_func('std', args, kwargs)
+ window = self._get_window()
+ index, indexi = self._get_index()
+
+ def f(arg, *args, **kwargs):
+ minp = _require_min_periods(1)(self.min_periods, window)
+ return _zsqrt(libwindow.roll_var(arg, window, minp, indexi,
+ self.closed, ddof))
+
+ return self._apply(f, 'std', check_minp=_require_min_periods(1),
+ ddof=ddof, **kwargs)
+
+ _shared_docs['var'] = dedent("""
+ Calculate unbiased %(name)s variance.
+
+ Normalized by N-1 by default. This can be changed using the `ddof`
+ argument.
+
+ Parameters
+ ----------
+ ddof : int, default 1
+ Delta Degrees of Freedom. The divisor used in calculations
+ is ``N - ddof``, where ``N`` represents the number of elements.
+ *args, **kwargs
+ For NumPy compatibility. No additional arguments are used.
+
+ Returns
+ -------
+ Series or DataFrame
+ Returns the same object type as the caller of the %(name)s calculation.
+
+ See Also
+ --------
+ Series.%(name)s : Calling object with Series data.
+ DataFrame.%(name)s : Calling object with DataFrames.
+ Series.var : Equivalent method for Series.
+ DataFrame.var : Equivalent method for DataFrame.
+ numpy.var : Equivalent method for Numpy array.
+
+ Notes
+ -----
+ The default `ddof` of 1 used in :meth:`Series.var` is different than the
+ default `ddof` of 0 in :func:`numpy.var`.
+
+ A minimum of 1 period is required for the rolling calculation.
+
+ Examples
+ --------
+ >>> s = pd.Series([5, 5, 6, 7, 5, 5, 5])
+ >>> s.rolling(3).var()
+ 0 NaN
+ 1 NaN
+ 2 0.333333
+ 3 1.000000
+ 4 1.000000
+ 5 1.333333
+ 6 0.000000
+ dtype: float64
+
+ >>> s.expanding(3).var()
+ 0 NaN
+ 1 NaN
+ 2 0.333333
+ 3 0.916667
+ 4 0.800000
+ 5 0.700000
+ 6 0.619048
+ dtype: float64
+ """)
+
+ def var(self, ddof=1, *args, **kwargs):
+ nv.validate_window_func('var', args, kwargs)
+ return self._apply('roll_var', 'var',
+ check_minp=_require_min_periods(1), ddof=ddof,
+ **kwargs)
+
+ _shared_docs['skew'] = """
+ Unbiased %(name)s skewness.
+
+ Parameters
+ ----------
+ **kwargs
+ Keyword arguments to be passed into func.
+ """
+
+ def skew(self, **kwargs):
+ return self._apply('roll_skew', 'skew',
+ check_minp=_require_min_periods(3), **kwargs)
+
+ _shared_docs['kurt'] = dedent("""
+ Calculate unbiased %(name)s kurtosis.
+
+ This function uses Fisher's definition of kurtosis without bias.
+
+ Parameters
+ ----------
+    **kwargs
+        For compatibility with other %(name)s methods. Has no effect
+        on the computed value.
+
+ Returns
+ -------
+ Series or DataFrame
+ Returned object type is determined by the caller of the %(name)s
+        calculation.
+
+ See Also
+ --------
+ Series.%(name)s : Calling object with Series data.
+ DataFrame.%(name)s : Calling object with DataFrames.
+ Series.kurt : Equivalent method for Series.
+ DataFrame.kurt : Equivalent method for DataFrame.
+ scipy.stats.skew : Third moment of a probability density.
+ scipy.stats.kurtosis : Reference SciPy method.
+
+ Notes
+ -----
+ A minimum of 4 periods is required for the %(name)s calculation.
+ """)
+
+ def kurt(self, **kwargs):
+ return self._apply('roll_kurt', 'kurt',
+ check_minp=_require_min_periods(4), **kwargs)
+
+ _shared_docs['quantile'] = dedent("""
+ Calculate the %(name)s quantile.
+
+ Parameters
+ ----------
+ quantile : float
+ Quantile to compute. 0 <= quantile <= 1.
+ interpolation : {'linear', 'lower', 'higher', 'midpoint', 'nearest'}
+ .. versionadded:: 0.23.0
+
+ This optional parameter specifies the interpolation method to use,
+ when the desired quantile lies between two data points `i` and `j`:
+
+ * linear: `i + (j - i) * fraction`, where `fraction` is the
+ fractional part of the index surrounded by `i` and `j`.
+ * lower: `i`.
+ * higher: `j`.
+ * nearest: `i` or `j` whichever is nearest.
+ * midpoint: (`i` + `j`) / 2.
+    **kwargs
+ For compatibility with other %(name)s methods. Has no effect on
+ the result.
+
+ Returns
+ -------
+ Series or DataFrame
+ Returned object type is determined by the caller of the %(name)s
+ calculation.
+
+ See Also
+ --------
+ pandas.Series.quantile : Computes value at the given quantile over all data
+ in Series.
+ pandas.DataFrame.quantile : Computes values at the given quantile over
+ requested axis in DataFrame.
+
+ Examples
+ --------
+ >>> s = pd.Series([1, 2, 3, 4])
+ >>> s.rolling(2).quantile(.4, interpolation='lower')
+ 0 NaN
+ 1 1.0
+ 2 2.0
+ 3 3.0
+ dtype: float64
+
+ >>> s.rolling(2).quantile(.4, interpolation='midpoint')
+ 0 NaN
+ 1 1.5
+ 2 2.5
+ 3 3.5
+ dtype: float64
+ """)
+
+ def quantile(self, quantile, interpolation='linear', **kwargs):
+ window = self._get_window()
+ index, indexi = self._get_index()
+
+ def f(arg, *args, **kwargs):
+ minp = _use_window(self.min_periods, window)
+ if quantile == 1.0:
+ return libwindow.roll_max(arg, window, minp, indexi,
+ self.closed)
+ elif quantile == 0.0:
+ return libwindow.roll_min(arg, window, minp, indexi,
+ self.closed)
+ else:
+ return libwindow.roll_quantile(arg, window, minp, indexi,
+ self.closed, quantile,
+ interpolation)
+
+ return self._apply(f, 'quantile', quantile=quantile,
+ **kwargs)
+
+ _shared_docs['cov'] = """
+ Calculate the %(name)s sample covariance.
+
+ Parameters
+ ----------
+ other : Series, DataFrame, or ndarray, optional
+ If not supplied then will default to self and produce pairwise
+ output.
+ pairwise : bool, default None
+ If False then only matching columns between self and other will be
+ used and the output will be a DataFrame.
+ If True then all pairwise combinations will be calculated and the
+ output will be a MultiIndexed DataFrame in the case of DataFrame
+ inputs. In the case of missing elements, only complete pairwise
+ observations will be used.
+ ddof : int, default 1
+ Delta Degrees of Freedom. The divisor used in calculations
+ is ``N - ddof``, where ``N`` represents the number of elements.
+ **kwargs
+ Keyword arguments to be passed into func.
+ """
+
+ def cov(self, other=None, pairwise=None, ddof=1, **kwargs):
+ if other is None:
+ other = self._selected_obj
+ # only default unset
+ pairwise = True if pairwise is None else pairwise
+ other = self._shallow_copy(other)
+
+ # GH 16058: offset window
+ if self.is_freq_type:
+ window = self.win_freq
+ else:
+ window = self._get_window(other)
+
+ def _get_cov(X, Y):
+ # GH #12373 : rolling functions error on float32 data
+ # to avoid potential overflow, cast the data to float64
+ X = X.astype('float64')
+ Y = Y.astype('float64')
+ mean = lambda x: x.rolling(window, self.min_periods,
+ center=self.center).mean(**kwargs)
+ count = (X + Y).rolling(window=window,
+ center=self.center).count(**kwargs)
+ bias_adj = count / (count - ddof)
+ return (mean(X * Y) - mean(X) * mean(Y)) * bias_adj
+
+ return _flex_binary_moment(self._selected_obj, other._selected_obj,
+ _get_cov, pairwise=bool(pairwise))
+
+ _shared_docs['corr'] = dedent("""
+ Calculate %(name)s correlation.
+
+ Parameters
+ ----------
+ other : Series, DataFrame, or ndarray, optional
+ If not supplied then will default to self.
+ pairwise : bool, default None
+ Calculate pairwise combinations of columns within a
+ DataFrame. If `other` is not specified, defaults to `True`,
+ otherwise defaults to `False`.
+ Not relevant for :class:`~pandas.Series`.
+ **kwargs
+ Unused.
+
+ Returns
+ -------
+ Series or DataFrame
+ Returned object type is determined by the caller of the
+ %(name)s calculation.
+
+ See Also
+ --------
+ Series.%(name)s : Calling object with Series data.
+ DataFrame.%(name)s : Calling object with DataFrames.
+ Series.corr : Equivalent method for Series.
+ DataFrame.corr : Equivalent method for DataFrame.
+ %(name)s.cov : Similar method to calculate covariance.
+ numpy.corrcoef : NumPy Pearson's correlation calculation.
+
+ Notes
+ -----
+ This function uses Pearson's definition of correlation
+ (https://en.wikipedia.org/wiki/Pearson_correlation_coefficient).
+
+ When `other` is not specified, the output will be self correlation (e.g.
+ all 1's), except for :class:`~pandas.DataFrame` inputs with `pairwise`
+ set to `True`.
+
+ Function will return ``NaN`` for correlations of equal valued sequences;
+ this is the result of a 0/0 division error.
+
+ When `pairwise` is set to `False`, only matching columns between `self` and
+ `other` will be used.
+
+ When `pairwise` is set to `True`, the output will be a MultiIndex DataFrame
+ with the original index on the first level, and the `other` DataFrame
+ columns on the second level.
+
+ In the case of missing elements, only complete pairwise observations
+ will be used.
+
+ Examples
+ --------
+ The below example shows a rolling calculation with a window size of
+ four matching the equivalent function call using :meth:`numpy.corrcoef`.
+
+ >>> v1 = [3, 3, 3, 5, 8]
+ >>> v2 = [3, 4, 4, 4, 8]
+ >>> fmt = "{0:.6f}" # limit the printed precision to 6 digits
+    >>> # numpy returns a 2x2 array; the correlation coefficient
+ >>> # is the number at entry [0][1]
+ >>> print(fmt.format(np.corrcoef(v1[:-1], v2[:-1])[0][1]))
+ 0.333333
+ >>> print(fmt.format(np.corrcoef(v1[1:], v2[1:])[0][1]))
+ 0.916949
+ >>> s1 = pd.Series(v1)
+ >>> s2 = pd.Series(v2)
+ >>> s1.rolling(4).corr(s2)
+ 0 NaN
+ 1 NaN
+ 2 NaN
+ 3 0.333333
+ 4 0.916949
+ dtype: float64
+
+ The below example shows a similar rolling calculation on a
+ DataFrame using the pairwise option.
+
+ >>> matrix = np.array([[51., 35.], [49., 30.], [47., 32.],\
+ [46., 31.], [50., 36.]])
+ >>> print(np.corrcoef(matrix[:-1,0], matrix[:-1,1]).round(7))
+ [[1. 0.6263001]
+ [0.6263001 1. ]]
+ >>> print(np.corrcoef(matrix[1:,0], matrix[1:,1]).round(7))
+ [[1. 0.5553681]
+ [0.5553681 1. ]]
+ >>> df = pd.DataFrame(matrix, columns=['X','Y'])
+ >>> df
+ X Y
+ 0 51.0 35.0
+ 1 49.0 30.0
+ 2 47.0 32.0
+ 3 46.0 31.0
+ 4 50.0 36.0
+ >>> df.rolling(4).corr(pairwise=True)
+ X Y
+ 0 X NaN NaN
+ Y NaN NaN
+ 1 X NaN NaN
+ Y NaN NaN
+ 2 X NaN NaN
+ Y NaN NaN
+ 3 X 1.000000 0.626300
+ Y 0.626300 1.000000
+ 4 X 1.000000 0.555368
+ Y 0.555368 1.000000
+ """)
+
+ def corr(self, other=None, pairwise=None, **kwargs):
+ if other is None:
+ other = self._selected_obj
+ # only default unset
+ pairwise = True if pairwise is None else pairwise
+ other = self._shallow_copy(other)
+ window = self._get_window(other)
+
+ def _get_corr(a, b):
+ a = a.rolling(window=window, min_periods=self.min_periods,
+ center=self.center)
+ b = b.rolling(window=window, min_periods=self.min_periods,
+ center=self.center)
+
+ return a.cov(b, **kwargs) / (a.std(**kwargs) * b.std(**kwargs))
+
+ return _flex_binary_moment(self._selected_obj, other._selected_obj,
+ _get_corr, pairwise=bool(pairwise))
+
+
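+# Worked sketch (hypothetical helper, plain NumPy): the rolling covariance in
+# _Rolling_and_Expanding.cov above relies on the moment identity
+# cov(X, Y) = (E[XY] - E[X]E[Y]) * n / (n - ddof). Shown here over one full
+# window with no missing values.
+def _example_moment_covariance(x, y, ddof=1):
+    x = np.asarray(x, dtype='float64')
+    y = np.asarray(y, dtype='float64')
+    n = len(x)
+    bias_adj = n / float(n - ddof)
+    return (np.mean(x * y) - np.mean(x) * np.mean(y)) * bias_adj
+
+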
+class Rolling(_Rolling_and_Expanding):
+
+ @cache_readonly
+ def is_datetimelike(self):
+ return isinstance(self._on,
+ (ABCDatetimeIndex,
+ ABCTimedeltaIndex,
+ ABCPeriodIndex))
+
+ @cache_readonly
+ def _on(self):
+
+ if self.on is None:
+ return self.obj.index
+ elif (isinstance(self.obj, ABCDataFrame) and
+ self.on in self.obj.columns):
+ from pandas import Index
+ return Index(self.obj[self.on])
+ else:
+ raise ValueError("invalid on specified as {0}, "
+ "must be a column (if DataFrame) "
+ "or None".format(self.on))
+
+ def validate(self):
+ super(Rolling, self).validate()
+
+ # we allow rolling on a datetimelike index
+ if ((self.obj.empty or self.is_datetimelike) and
+ isinstance(self.window, (compat.string_types, ABCDateOffset,
+ timedelta))):
+
+ self._validate_monotonic()
+ freq = self._validate_freq()
+
+ # we don't allow center
+ if self.center:
+ raise NotImplementedError("center is not implemented "
+ "for datetimelike and offset "
+ "based windows")
+
+ # this will raise ValueError on non-fixed freqs
+ self.win_freq = self.window
+ self.window = freq.nanos
+ self.win_type = 'freq'
+
+ # min_periods must be an integer
+ if self.min_periods is None:
+ self.min_periods = 1
+
+ elif not is_integer(self.window):
+ raise ValueError("window must be an integer")
+ elif self.window < 0:
+ raise ValueError("window must be non-negative")
+
+ if not self.is_datetimelike and self.closed is not None:
+ raise ValueError("closed only implemented for datetimelike "
+ "and offset based windows")
+
+ def _validate_monotonic(self):
+ """
+        Validate that 'on' is monotonic.
+ """
+ if not self._on.is_monotonic:
+ formatted = self.on or 'index'
+ raise ValueError("{0} must be "
+ "monotonic".format(formatted))
+
+ def _validate_freq(self):
+ """
+ Validate & return window frequency.
+ """
+ from pandas.tseries.frequencies import to_offset
+ try:
+ return to_offset(self.window)
+ except (TypeError, ValueError):
+ raise ValueError("passed window {0} is not "
+ "compatible with a datetimelike "
+ "index".format(self.window))
+
+ _agg_see_also_doc = dedent("""
+ See Also
+ --------
+ pandas.Series.rolling
+ pandas.DataFrame.rolling
+ """)
+
+ _agg_examples_doc = dedent("""
+ Examples
+ --------
+
+ >>> df = pd.DataFrame(np.random.randn(10, 3), columns=['A', 'B', 'C'])
+ >>> df
+ A B C
+ 0 -2.385977 -0.102758 0.438822
+ 1 -1.004295 0.905829 -0.954544
+ 2 0.735167 -0.165272 -1.619346
+ 3 -0.702657 -1.340923 -0.706334
+ 4 -0.246845 0.211596 -0.901819
+ 5 2.463718 3.157577 -1.380906
+ 6 -1.142255 2.340594 -0.039875
+ 7 1.396598 -1.647453 1.677227
+ 8 -0.543425 1.761277 -0.220481
+ 9 -0.640505 0.289374 -1.550670
+
+ >>> df.rolling(3).sum()
+ A B C
+ 0 NaN NaN NaN
+ 1 NaN NaN NaN
+ 2 -2.655105 0.637799 -2.135068
+ 3 -0.971785 -0.600366 -3.280224
+ 4 -0.214334 -1.294599 -3.227500
+ 5 1.514216 2.028250 -2.989060
+ 6 1.074618 5.709767 -2.322600
+ 7 2.718061 3.850718 0.256446
+ 8 -0.289082 2.454418 1.416871
+ 9 0.212668 0.403198 -0.093924
+
+ >>> df.rolling(3).agg({'A':'sum', 'B':'min'})
+ A B
+ 0 NaN NaN
+ 1 NaN NaN
+ 2 -2.655105 -0.165272
+ 3 -0.971785 -1.340923
+ 4 -0.214334 -1.340923
+ 5 1.514216 -1.340923
+ 6 1.074618 0.211596
+ 7 2.718061 -1.647453
+ 8 -0.289082 -1.647453
+ 9 0.212668 -1.647453
+ """)
+
+ @Substitution(see_also=_agg_see_also_doc,
+ examples=_agg_examples_doc,
+ versionadded='',
+                  klass='Series/DataFrame',
+ axis='')
+ @Appender(_shared_docs['aggregate'])
+ def aggregate(self, arg, *args, **kwargs):
+ return super(Rolling, self).aggregate(arg, *args, **kwargs)
+
+ agg = aggregate
+
+ @Substitution(name='rolling')
+ @Appender(_shared_docs['count'])
+ def count(self):
+
+ # different impl for freq counting
+ if self.is_freq_type:
+ return self._apply('roll_count', 'count')
+
+ return super(Rolling, self).count()
+
+ @Substitution(name='rolling')
+ @Appender(_shared_docs['apply'])
+ def apply(self, func, raw=None, args=(), kwargs={}):
+ return super(Rolling, self).apply(
+ func, raw=raw, args=args, kwargs=kwargs)
+
+ @Substitution(name='rolling')
+ @Appender(_shared_docs['sum'])
+ def sum(self, *args, **kwargs):
+ nv.validate_rolling_func('sum', args, kwargs)
+ return super(Rolling, self).sum(*args, **kwargs)
+
+ @Substitution(name='rolling')
+ @Appender(_doc_template)
+ @Appender(_shared_docs['max'])
+ def max(self, *args, **kwargs):
+ nv.validate_rolling_func('max', args, kwargs)
+ return super(Rolling, self).max(*args, **kwargs)
+
+ @Substitution(name='rolling')
+ @Appender(_shared_docs['min'])
+ def min(self, *args, **kwargs):
+ nv.validate_rolling_func('min', args, kwargs)
+ return super(Rolling, self).min(*args, **kwargs)
+
+ @Substitution(name='rolling')
+ @Appender(_shared_docs['mean'])
+ def mean(self, *args, **kwargs):
+ nv.validate_rolling_func('mean', args, kwargs)
+ return super(Rolling, self).mean(*args, **kwargs)
+
+ @Substitution(name='rolling')
+ @Appender(_shared_docs['median'])
+ def median(self, **kwargs):
+ return super(Rolling, self).median(**kwargs)
+
+ @Substitution(name='rolling')
+ @Appender(_shared_docs['std'])
+ def std(self, ddof=1, *args, **kwargs):
+ nv.validate_rolling_func('std', args, kwargs)
+ return super(Rolling, self).std(ddof=ddof, **kwargs)
+
+ @Substitution(name='rolling')
+ @Appender(_shared_docs['var'])
+ def var(self, ddof=1, *args, **kwargs):
+ nv.validate_rolling_func('var', args, kwargs)
+ return super(Rolling, self).var(ddof=ddof, **kwargs)
+
+ @Substitution(name='rolling')
+ @Appender(_doc_template)
+ @Appender(_shared_docs['skew'])
+ def skew(self, **kwargs):
+ return super(Rolling, self).skew(**kwargs)
+
+ _agg_doc = dedent("""
+ Examples
+ --------
+
+ The example below will show a rolling calculation with a window size of
+ four matching the equivalent function call using `scipy.stats`.
+
+ >>> arr = [1, 2, 3, 4, 999]
+ >>> fmt = "{0:.6f}" # limit the printed precision to 6 digits
+ >>> import scipy.stats
+ >>> print(fmt.format(scipy.stats.kurtosis(arr[:-1], bias=False)))
+ -1.200000
+ >>> print(fmt.format(scipy.stats.kurtosis(arr[1:], bias=False)))
+ 3.999946
+ >>> s = pd.Series(arr)
+ >>> s.rolling(4).kurt()
+ 0 NaN
+ 1 NaN
+ 2 NaN
+ 3 -1.200000
+ 4 3.999946
+ dtype: float64
+ """)
+
+ @Appender(_agg_doc)
+ @Substitution(name='rolling')
+ @Appender(_shared_docs['kurt'])
+ def kurt(self, **kwargs):
+ return super(Rolling, self).kurt(**kwargs)
+
+ @Substitution(name='rolling')
+ @Appender(_shared_docs['quantile'])
+ def quantile(self, quantile, interpolation='linear', **kwargs):
+ return super(Rolling, self).quantile(quantile=quantile,
+ interpolation=interpolation,
+ **kwargs)
+
+ @Substitution(name='rolling')
+ @Appender(_doc_template)
+ @Appender(_shared_docs['cov'])
+ def cov(self, other=None, pairwise=None, ddof=1, **kwargs):
+ return super(Rolling, self).cov(other=other, pairwise=pairwise,
+ ddof=ddof, **kwargs)
+
+ @Substitution(name='rolling')
+ @Appender(_shared_docs['corr'])
+ def corr(self, other=None, pairwise=None, **kwargs):
+ return super(Rolling, self).corr(other=other, pairwise=pairwise,
+ **kwargs)
+
+
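+# Illustrative sketch (hypothetical helper): how an offset window string is
+# normalised in Rolling.validate above -- it is parsed with ``to_offset`` and
+# the effective window becomes its span in nanoseconds.
+def _example_offset_window_nanos():
+    from pandas.tseries.frequencies import to_offset
+    return to_offset('2s').nanos  # 2 seconds -> 2000000000 nanoseconds
+
+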
+class RollingGroupby(_GroupByMixin, Rolling):
+ """
+ Provides a rolling groupby implementation.
+
+ .. versionadded:: 0.18.1
+
+ """
+ @property
+ def _constructor(self):
+ return Rolling
+
+ def _gotitem(self, key, ndim, subset=None):
+
+ # we are setting the index on the actual object
+ # here so our index is carried thru to the selected obj
+ # when we do the splitting for the groupby
+ if self.on is not None:
+ self._groupby.obj = self._groupby.obj.set_index(self._on)
+ self.on = None
+ return super(RollingGroupby, self)._gotitem(key, ndim, subset=subset)
+
+ def _validate_monotonic(self):
+ """
+        Validate that 'on' is monotonic; this is not needed for
+        groupby.rolling because we have already validated at a
+        higher level.
+ """
+ pass
+
+
+class Expanding(_Rolling_and_Expanding):
+ """
+ Provides expanding transformations.
+
+ .. versionadded:: 0.18.0
+
+ Parameters
+ ----------
+ min_periods : int, default 1
+ Minimum number of observations in window required to have a value
+ (otherwise result is NA).
+ center : bool, default False
+ Set the labels at the center of the window.
+ axis : int or str, default 0
+
+ Returns
+ -------
+ a Window sub-classed for the particular operation
+
+ See Also
+ --------
+ rolling : Provides rolling window calculations.
+ ewm : Provides exponential weighted functions.
+
+ Notes
+ -----
+ By default, the result is set to the right edge of the window. This can be
+ changed to the center of the window by setting ``center=True``.
+
+ Examples
+ --------
+
+    >>> df = pd.DataFrame({'B': [0, 1, 2, np.nan, 4]})
+    >>> df
+       B
+ 0 0.0
+ 1 1.0
+ 2 2.0
+ 3 NaN
+ 4 4.0
+
+ >>> df.expanding(2).sum()
+ B
+ 0 NaN
+ 1 1.0
+ 2 3.0
+ 3 3.0
+ 4 7.0
+ """
+
+ _attributes = ['min_periods', 'center', 'axis']
+
+ def __init__(self, obj, min_periods=1, center=False, axis=0,
+ **kwargs):
+ super(Expanding, self).__init__(obj=obj, min_periods=min_periods,
+ center=center, axis=axis)
+
+ @property
+ def _constructor(self):
+ return Expanding
+
+ def _get_window(self, other=None):
+ """
+ Get the window length over which to perform some operation.
+
+ Parameters
+ ----------
+ other : object, default None
+ The other object that is involved in the operation.
+ Such an object is involved for operations like covariance.
+
+ Returns
+ -------
+ window : int
+ The window length.
+ """
+        axis = self.obj._get_axis(self.axis)
+        length = len(axis) + (other is not None) * len(axis)
+
+        floor = self.min_periods or -1
+        return max(length, floor)
+
+ _agg_see_also_doc = dedent("""
+ See Also
+ --------
+ pandas.DataFrame.expanding.aggregate
+ pandas.DataFrame.rolling.aggregate
+ pandas.DataFrame.aggregate
+ """)
+
+ _agg_examples_doc = dedent("""
+ Examples
+ --------
+
+ >>> df = pd.DataFrame(np.random.randn(10, 3), columns=['A', 'B', 'C'])
+ >>> df
+ A B C
+ 0 -2.385977 -0.102758 0.438822
+ 1 -1.004295 0.905829 -0.954544
+ 2 0.735167 -0.165272 -1.619346
+ 3 -0.702657 -1.340923 -0.706334
+ 4 -0.246845 0.211596 -0.901819
+ 5 2.463718 3.157577 -1.380906
+ 6 -1.142255 2.340594 -0.039875
+ 7 1.396598 -1.647453 1.677227
+ 8 -0.543425 1.761277 -0.220481
+ 9 -0.640505 0.289374 -1.550670
+
+ >>> df.ewm(alpha=0.5).mean()
+ A B C
+ 0 -2.385977 -0.102758 0.438822
+ 1 -1.464856 0.569633 -0.490089
+ 2 -0.207700 0.149687 -1.135379
+ 3 -0.471677 -0.645305 -0.906555
+ 4 -0.355635 -0.203033 -0.904111
+ 5 1.076417 1.503943 -1.146293
+ 6 -0.041654 1.925562 -0.588728
+ 7 0.680292 0.132049 0.548693
+ 8 0.067236 0.948257 0.163353
+ 9 -0.286980 0.618493 -0.694496
+ """)
+
+ @Substitution(see_also=_agg_see_also_doc,
+ examples=_agg_examples_doc,
+ versionadded='',
+                  klass='Series/DataFrame',
+ axis='')
+ @Appender(_shared_docs['aggregate'])
+ def aggregate(self, arg, *args, **kwargs):
+ return super(Expanding, self).aggregate(arg, *args, **kwargs)
+
+ agg = aggregate
+
+ @Substitution(name='expanding')
+ @Appender(_shared_docs['count'])
+ def count(self, **kwargs):
+ return super(Expanding, self).count(**kwargs)
+
+ @Substitution(name='expanding')
+ @Appender(_shared_docs['apply'])
+ def apply(self, func, raw=None, args=(), kwargs={}):
+ return super(Expanding, self).apply(
+ func, raw=raw, args=args, kwargs=kwargs)
+
+ @Substitution(name='expanding')
+ @Appender(_shared_docs['sum'])
+ def sum(self, *args, **kwargs):
+ nv.validate_expanding_func('sum', args, kwargs)
+ return super(Expanding, self).sum(*args, **kwargs)
+
+ @Substitution(name='expanding')
+ @Appender(_doc_template)
+ @Appender(_shared_docs['max'])
+ def max(self, *args, **kwargs):
+ nv.validate_expanding_func('max', args, kwargs)
+ return super(Expanding, self).max(*args, **kwargs)
+
+ @Substitution(name='expanding')
+ @Appender(_shared_docs['min'])
+ def min(self, *args, **kwargs):
+ nv.validate_expanding_func('min', args, kwargs)
+ return super(Expanding, self).min(*args, **kwargs)
+
+ @Substitution(name='expanding')
+ @Appender(_shared_docs['mean'])
+ def mean(self, *args, **kwargs):
+ nv.validate_expanding_func('mean', args, kwargs)
+ return super(Expanding, self).mean(*args, **kwargs)
+
+ @Substitution(name='expanding')
+ @Appender(_shared_docs['median'])
+ def median(self, **kwargs):
+ return super(Expanding, self).median(**kwargs)
+
+ @Substitution(name='expanding')
+ @Appender(_shared_docs['std'])
+ def std(self, ddof=1, *args, **kwargs):
+ nv.validate_expanding_func('std', args, kwargs)
+ return super(Expanding, self).std(ddof=ddof, **kwargs)
+
+ @Substitution(name='expanding')
+ @Appender(_shared_docs['var'])
+ def var(self, ddof=1, *args, **kwargs):
+ nv.validate_expanding_func('var', args, kwargs)
+ return super(Expanding, self).var(ddof=ddof, **kwargs)
+
+ @Substitution(name='expanding')
+ @Appender(_doc_template)
+ @Appender(_shared_docs['skew'])
+ def skew(self, **kwargs):
+ return super(Expanding, self).skew(**kwargs)
+
+ _agg_doc = dedent("""
+ Examples
+ --------
+
+ The example below will show an expanding calculation with a window size of
+ four matching the equivalent function call using `scipy.stats`.
+
+ >>> arr = [1, 2, 3, 4, 999]
+ >>> import scipy.stats
+ >>> fmt = "{0:.6f}" # limit the printed precision to 6 digits
+ >>> print(fmt.format(scipy.stats.kurtosis(arr[:-1], bias=False)))
+ -1.200000
+ >>> print(fmt.format(scipy.stats.kurtosis(arr, bias=False)))
+ 4.999874
+ >>> s = pd.Series(arr)
+ >>> s.expanding(4).kurt()
+ 0 NaN
+ 1 NaN
+ 2 NaN
+ 3 -1.200000
+ 4 4.999874
+ dtype: float64
+ """)
+
+ @Appender(_agg_doc)
+ @Substitution(name='expanding')
+ @Appender(_shared_docs['kurt'])
+ def kurt(self, **kwargs):
+ return super(Expanding, self).kurt(**kwargs)
+
+ @Substitution(name='expanding')
+ @Appender(_shared_docs['quantile'])
+ def quantile(self, quantile, interpolation='linear', **kwargs):
+ return super(Expanding, self).quantile(quantile=quantile,
+ interpolation=interpolation,
+ **kwargs)
+
+ @Substitution(name='expanding')
+ @Appender(_doc_template)
+ @Appender(_shared_docs['cov'])
+ def cov(self, other=None, pairwise=None, ddof=1, **kwargs):
+ return super(Expanding, self).cov(other=other, pairwise=pairwise,
+ ddof=ddof, **kwargs)
+
+ @Substitution(name='expanding')
+ @Appender(_shared_docs['corr'])
+ def corr(self, other=None, pairwise=None, **kwargs):
+ return super(Expanding, self).corr(other=other, pairwise=pairwise,
+ **kwargs)
+
+
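+# Rough equivalence sketch (hypothetical helper): obj.expanding(min_periods=m)
+# behaves like obj.rolling(window=len(obj), min_periods=m), which is what
+# Expanding._get_window above encodes by returning the full axis length.
+def _example_expanding_as_rolling(s, min_periods=1):
+    # ``s`` is assumed to be a pandas Series
+    return s.rolling(window=len(s), min_periods=min_periods).sum()
+
+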
+class ExpandingGroupby(_GroupByMixin, Expanding):
+ """
+    Provides an expanding groupby implementation.
+
+ .. versionadded:: 0.18.1
+
+ """
+ @property
+ def _constructor(self):
+ return Expanding
+
+
+_bias_template = """
+ Parameters
+ ----------
+ bias : bool, default False
+ Use a standard estimation bias correction.
+ *args, **kwargs
+ Arguments and keyword arguments to be passed into func.
+"""
+
+_pairwise_template = """
+ Parameters
+ ----------
+ other : Series, DataFrame, or ndarray, optional
+ If not supplied then will default to self and produce pairwise
+ output.
+ pairwise : bool, default None
+ If False then only matching columns between self and other will be
+ used and the output will be a DataFrame.
+ If True then all pairwise combinations will be calculated and the
+ output will be a MultiIndex DataFrame in the case of DataFrame
+ inputs. In the case of missing elements, only complete pairwise
+ observations will be used.
+ bias : bool, default False
+ Use a standard estimation bias correction.
+ **kwargs
+ Keyword arguments to be passed into func.
+"""
+
+
+class EWM(_Rolling):
+ r"""
+ Provides exponential weighted functions.
+
+ .. versionadded:: 0.18.0
+
+ Parameters
+ ----------
+ com : float, optional
+ Specify decay in terms of center of mass,
+ :math:`\alpha = 1 / (1 + com),\text{ for } com \geq 0`
+ span : float, optional
+ Specify decay in terms of span,
+ :math:`\alpha = 2 / (span + 1),\text{ for } span \geq 1`
+ halflife : float, optional
+ Specify decay in terms of half-life,
+        :math:`\alpha = 1 - \exp(\log(0.5) / halflife),\text{ for } halflife > 0`
+ alpha : float, optional
+ Specify smoothing factor :math:`\alpha` directly,
+ :math:`0 < \alpha \leq 1`
+
+ .. versionadded:: 0.18.0
+
+ min_periods : int, default 0
+ Minimum number of observations in window required to have a value
+ (otherwise result is NA).
+ adjust : bool, default True
+ Divide by decaying adjustment factor in beginning periods to account
+ for imbalance in relative weightings (viewing EWMA as a moving average)
+ ignore_na : bool, default False
+ Ignore missing values when calculating weights;
+        specify True to reproduce pre-0.15.0 behavior.
+
+ Returns
+ -------
+ a Window sub-classed for the particular operation
+
+ See Also
+ --------
+ rolling : Provides rolling window calculations.
+ expanding : Provides expanding transformations.
+
+ Notes
+ -----
+ Exactly one of center of mass, span, half-life, and alpha must be provided.
+ Allowed values and relationship between the parameters are specified in the
+ parameter descriptions above; see the link at the end of this section for
+ a detailed explanation.
+
+ When adjust is True (default), weighted averages are calculated using
+ weights (1-alpha)**(n-1), (1-alpha)**(n-2), ..., 1-alpha, 1.
+
+ When adjust is False, weighted averages are calculated recursively as:
+ weighted_average[0] = arg[0];
+ weighted_average[i] = (1-alpha)*weighted_average[i-1] + alpha*arg[i].
+
+ When ignore_na is False (default), weights are based on absolute positions.
+ For example, the weights of x and y used in calculating the final weighted
+ average of [x, None, y] are (1-alpha)**2 and 1 (if adjust is True), and
+ (1-alpha)**2 and alpha (if adjust is False).
+
+ When ignore_na is True (reproducing pre-0.15.0 behavior), weights are based
+ on relative positions. For example, the weights of x and y used in
+ calculating the final weighted average of [x, None, y] are 1-alpha and 1
+ (if adjust is True), and 1-alpha and alpha (if adjust is False).
+
+ More details can be found at
+ http://pandas.pydata.org/pandas-docs/stable/computation.html#exponentially-weighted-windows
+
+ Examples
+ --------
+
+    >>> df = pd.DataFrame({'B': [0, 1, 2, np.nan, 4]})
+    >>> df
+       B
+ 0 0.0
+ 1 1.0
+ 2 2.0
+ 3 NaN
+ 4 4.0
+
+ >>> df.ewm(com=0.5).mean()
+ B
+ 0 0.000000
+ 1 0.750000
+ 2 1.615385
+ 3 1.615385
+ 4 3.670213
+ """
+ _attributes = ['com', 'min_periods', 'adjust', 'ignore_na', 'axis']
+
+ def __init__(self, obj, com=None, span=None, halflife=None, alpha=None,
+ min_periods=0, adjust=True, ignore_na=False,
+ axis=0):
+ self.obj = obj
+ self.com = _get_center_of_mass(com, span, halflife, alpha)
+ self.min_periods = min_periods
+ self.adjust = adjust
+ self.ignore_na = ignore_na
+ self.axis = axis
+ self.on = None
+
+ @property
+ def _constructor(self):
+ return EWM
+
+ _agg_see_also_doc = dedent("""
+ See Also
+ --------
+ pandas.DataFrame.rolling.aggregate
+ """)
+
+ _agg_examples_doc = dedent("""
+ Examples
+ --------
+
+ >>> df = pd.DataFrame(np.random.randn(10, 3), columns=['A', 'B', 'C'])
+ >>> df
+ A B C
+ 0 -2.385977 -0.102758 0.438822
+ 1 -1.004295 0.905829 -0.954544
+ 2 0.735167 -0.165272 -1.619346
+ 3 -0.702657 -1.340923 -0.706334
+ 4 -0.246845 0.211596 -0.901819
+ 5 2.463718 3.157577 -1.380906
+ 6 -1.142255 2.340594 -0.039875
+ 7 1.396598 -1.647453 1.677227
+ 8 -0.543425 1.761277 -0.220481
+ 9 -0.640505 0.289374 -1.550670
+
+ >>> df.ewm(alpha=0.5).mean()
+ A B C
+ 0 -2.385977 -0.102758 0.438822
+ 1 -1.464856 0.569633 -0.490089
+ 2 -0.207700 0.149687 -1.135379
+ 3 -0.471677 -0.645305 -0.906555
+ 4 -0.355635 -0.203033 -0.904111
+ 5 1.076417 1.503943 -1.146293
+ 6 -0.041654 1.925562 -0.588728
+ 7 0.680292 0.132049 0.548693
+ 8 0.067236 0.948257 0.163353
+ 9 -0.286980 0.618493 -0.694496
+ """)
+
+ @Substitution(see_also=_agg_see_also_doc,
+ examples=_agg_examples_doc,
+ versionadded='',
+                  klass='Series/DataFrame',
+ axis='')
+ @Appender(_shared_docs['aggregate'])
+ def aggregate(self, arg, *args, **kwargs):
+ return super(EWM, self).aggregate(arg, *args, **kwargs)
+
+ agg = aggregate
+
+ def _apply(self, func, **kwargs):
+ """
+ Rolling statistical measure using supplied function. Designed to be
+ used with passed-in Cython array-based functions.
+
+ Parameters
+ ----------
+ func : str/callable to apply
+
+ Returns
+ -------
+ y : same type as input argument
+ """
+ blocks, obj, index = self._create_blocks()
+ results = []
+ for b in blocks:
+ try:
+ values = self._prep_values(b.values)
+ except TypeError:
+ results.append(b.values.copy())
+ continue
+
+ if values.size == 0:
+ results.append(values.copy())
+ continue
+
+ # if we have a string function name, wrap it
+ if isinstance(func, compat.string_types):
+ cfunc = getattr(libwindow, func, None)
+ if cfunc is None:
+ raise ValueError("we do not support this function "
+ "in libwindow.{func}".format(func=func))
+
+ def func(arg):
+ return cfunc(arg, self.com, int(self.adjust),
+ int(self.ignore_na), int(self.min_periods))
+
+ results.append(np.apply_along_axis(func, self.axis, values))
+
+ return self._wrap_results(results, blocks, obj)
+
+ @Substitution(name='ewm')
+ @Appender(_doc_template)
+ def mean(self, *args, **kwargs):
+ """
+ Exponential weighted moving average.
+
+ Parameters
+ ----------
+ *args, **kwargs
+ Arguments and keyword arguments to be passed into func.
+ """
+ nv.validate_window_func('mean', args, kwargs)
+ return self._apply('ewma', **kwargs)
+
+ @Substitution(name='ewm')
+ @Appender(_doc_template)
+ @Appender(_bias_template)
+ def std(self, bias=False, *args, **kwargs):
+ """
+ Exponential weighted moving stddev.
+ """
+ nv.validate_window_func('std', args, kwargs)
+ return _zsqrt(self.var(bias=bias, **kwargs))
+
+ vol = std
+
+ @Substitution(name='ewm')
+ @Appender(_doc_template)
+ @Appender(_bias_template)
+ def var(self, bias=False, *args, **kwargs):
+ """
+ Exponential weighted moving variance.
+ """
+ nv.validate_window_func('var', args, kwargs)
+
+ def f(arg):
+ return libwindow.ewmcov(arg, arg, self.com, int(self.adjust),
+ int(self.ignore_na), int(self.min_periods),
+ int(bias))
+
+ return self._apply(f, **kwargs)
+
+ @Substitution(name='ewm')
+ @Appender(_doc_template)
+ @Appender(_pairwise_template)
+ def cov(self, other=None, pairwise=None, bias=False, **kwargs):
+ """
+ Exponential weighted sample covariance.
+ """
+ if other is None:
+ other = self._selected_obj
+ # only default unset
+ pairwise = True if pairwise is None else pairwise
+ other = self._shallow_copy(other)
+
+ def _get_cov(X, Y):
+ X = self._shallow_copy(X)
+ Y = self._shallow_copy(Y)
+ cov = libwindow.ewmcov(X._prep_values(), Y._prep_values(),
+ self.com, int(self.adjust),
+ int(self.ignore_na), int(self.min_periods),
+ int(bias))
+ return X._wrap_result(cov)
+
+ return _flex_binary_moment(self._selected_obj, other._selected_obj,
+ _get_cov, pairwise=bool(pairwise))
+
+ @Substitution(name='ewm')
+ @Appender(_doc_template)
+ @Appender(_pairwise_template)
+ def corr(self, other=None, pairwise=None, **kwargs):
+ """
+ Exponential weighted sample correlation.
+ """
+ if other is None:
+ other = self._selected_obj
+ # only default unset
+ pairwise = True if pairwise is None else pairwise
+ other = self._shallow_copy(other)
+
+ def _get_corr(X, Y):
+ X = self._shallow_copy(X)
+ Y = self._shallow_copy(Y)
+
+ def _cov(x, y):
+ return libwindow.ewmcov(x, y, self.com, int(self.adjust),
+ int(self.ignore_na),
+ int(self.min_periods),
+ 1)
+
+ x_values = X._prep_values()
+ y_values = Y._prep_values()
+ with np.errstate(all='ignore'):
+ cov = _cov(x_values, y_values)
+ x_var = _cov(x_values, x_values)
+ y_var = _cov(y_values, y_values)
+ corr = cov / _zsqrt(x_var * y_var)
+ return X._wrap_result(corr)
+
+ return _flex_binary_moment(self._selected_obj, other._selected_obj,
+ _get_corr, pairwise=bool(pairwise))
+
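+
+# Worked sketch (hypothetical helper, plain NumPy): the adjusted vs unadjusted
+# EWMA recursions described in the EWM Notes above, ignoring missing values.
+def _example_ewma_recursions(values, alpha):
+    values = np.asarray(values, dtype='float64')
+    # adjust=True: weights (1-alpha)**(n-1), ..., (1-alpha), 1 on x[0]..x[-1]
+    weights = (1 - alpha) ** np.arange(len(values) - 1, -1, -1)
+    adjusted_last = (weights * values).sum() / weights.sum()
+    # adjust=False: y[i] = (1-alpha)*y[i-1] + alpha*x[i]
+    y = values[0]
+    for x in values[1:]:
+        y = (1 - alpha) * y + alpha * x
+    return adjusted_last, y
+
+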
+# Helper Funcs
+
+
+def _flex_binary_moment(arg1, arg2, f, pairwise=False):
+
+ if not (isinstance(arg1, (np.ndarray, ABCSeries, ABCDataFrame)) and
+ isinstance(arg2, (np.ndarray, ABCSeries, ABCDataFrame))):
+ raise TypeError("arguments to moment function must be of type "
+ "np.ndarray/Series/DataFrame")
+
+ if (isinstance(arg1, (np.ndarray, ABCSeries)) and
+ isinstance(arg2, (np.ndarray, ABCSeries))):
+ X, Y = _prep_binary(arg1, arg2)
+ return f(X, Y)
+
+ elif isinstance(arg1, ABCDataFrame):
+ from pandas import DataFrame
+
+ def dataframe_from_int_dict(data, frame_template):
+ result = DataFrame(data, index=frame_template.index)
+ if len(result.columns) > 0:
+ result.columns = frame_template.columns[result.columns]
+ return result
+
+ results = {}
+ if isinstance(arg2, ABCDataFrame):
+ if pairwise is False:
+ if arg1 is arg2:
+ # special case in order to handle duplicate column names
+ for i, col in enumerate(arg1.columns):
+ results[i] = f(arg1.iloc[:, i], arg2.iloc[:, i])
+ return dataframe_from_int_dict(results, arg1)
+ else:
+ if not arg1.columns.is_unique:
+ raise ValueError("'arg1' columns are not unique")
+ if not arg2.columns.is_unique:
+ raise ValueError("'arg2' columns are not unique")
+ with warnings.catch_warnings(record=True):
+ warnings.simplefilter("ignore", RuntimeWarning)
+ X, Y = arg1.align(arg2, join='outer')
+ X = X + 0 * Y
+ Y = Y + 0 * X
+
+ with warnings.catch_warnings(record=True):
+ warnings.simplefilter("ignore", RuntimeWarning)
+ res_columns = arg1.columns.union(arg2.columns)
+ for col in res_columns:
+ if col in X and col in Y:
+ results[col] = f(X[col], Y[col])
+ return DataFrame(results, index=X.index,
+ columns=res_columns)
+ elif pairwise is True:
+ results = defaultdict(dict)
+ for i, k1 in enumerate(arg1.columns):
+ for j, k2 in enumerate(arg2.columns):
+ if j < i and arg2 is arg1:
+ # Symmetric case
+ results[i][j] = results[j][i]
+ else:
+ results[i][j] = f(*_prep_binary(arg1.iloc[:, i],
+ arg2.iloc[:, j]))
+
+ from pandas import MultiIndex, concat
+
+ result_index = arg1.index.union(arg2.index)
+ if len(result_index):
+
+ # construct result frame
+ result = concat(
+ [concat([results[i][j]
+ for j, c in enumerate(arg2.columns)],
+ ignore_index=True)
+ for i, c in enumerate(arg1.columns)],
+ ignore_index=True,
+ axis=1)
+ result.columns = arg1.columns
+
+ # set the index and reorder
+ if arg2.columns.nlevels > 1:
+ result.index = MultiIndex.from_product(
+ arg2.columns.levels + [result_index])
+ result = result.reorder_levels([2, 0, 1]).sort_index()
+ else:
+ result.index = MultiIndex.from_product(
+ [range(len(arg2.columns)),
+ range(len(result_index))])
+ result = result.swaplevel(1, 0).sort_index()
+ result.index = MultiIndex.from_product(
+ [result_index] + [arg2.columns])
+ else:
+
+ # empty result
+ result = DataFrame(
+ index=MultiIndex(levels=[arg1.index, arg2.columns],
+ codes=[[], []]),
+ columns=arg2.columns,
+ dtype='float64')
+
+ # reset our index names to arg1 names
+ # reset our column names to arg2 names
+ # careful not to mutate the original names
+ result.columns = result.columns.set_names(
+ arg1.columns.names)
+ result.index = result.index.set_names(
+ result_index.names + arg2.columns.names)
+
+ return result
+
+ else:
+ raise ValueError("'pairwise' is not True/False")
+ else:
+ results = {i: f(*_prep_binary(arg1.iloc[:, i], arg2))
+ for i, col in enumerate(arg1.columns)}
+ return dataframe_from_int_dict(results, arg1)
+
+ else:
+ return _flex_binary_moment(arg2, arg1, f)
+
+
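+# Usage sketch (hypothetical helper, made-up data): _flex_binary_moment above
+# aligns its two inputs and applies ``f`` to the aligned pair (element-wise for
+# Series, column-pair-wise for DataFrames when ``pairwise=True``).
+def _example_flex_binary_moment():
+    from pandas import Series
+    s1 = Series([1., 2., 3.])
+    s2 = Series([2., 4., 6.])
+    # for two Series the aligned pair is handed straight to ``f``
+    return _flex_binary_moment(s1, s2, lambda x, y: (x * y).mean())
+
+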
+def _get_center_of_mass(comass, span, halflife, alpha):
+ valid_count = com.count_not_none(comass, span, halflife, alpha)
+ if valid_count > 1:
+ raise ValueError("comass, span, halflife, and alpha "
+ "are mutually exclusive")
+
+ # Convert to center of mass; domain checks ensure 0 < alpha <= 1
+ if comass is not None:
+ if comass < 0:
+ raise ValueError("comass must satisfy: comass >= 0")
+ elif span is not None:
+ if span < 1:
+ raise ValueError("span must satisfy: span >= 1")
+ comass = (span - 1) / 2.
+ elif halflife is not None:
+ if halflife <= 0:
+ raise ValueError("halflife must satisfy: halflife > 0")
+ decay = 1 - np.exp(np.log(0.5) / halflife)
+ comass = 1 / decay - 1
+ elif alpha is not None:
+ if alpha <= 0 or alpha > 1:
+ raise ValueError("alpha must satisfy: 0 < alpha <= 1")
+ comass = (1.0 - alpha) / alpha
+ else:
+ raise ValueError("Must pass one of comass, span, halflife, or alpha")
+
+ return float(comass)
+
+
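+# The conversions above follow the standard EWM identities:
+#   span:     comass = (span - 1) / 2
+#   halflife: comass = 1 / (1 - exp(log(0.5) / halflife)) - 1
+#   alpha:    comass = (1 - alpha) / alpha
+# Illustrative checks (values follow directly from the formulas above):
+#
+# >>> _get_center_of_mass(comass=None, span=10, halflife=None, alpha=None)
+# 4.5
+# >>> _get_center_of_mass(comass=None, span=None, halflife=None, alpha=0.5)
+# 1.0
+
+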
+def _offset(window, center):
+ if not is_integer(window):
+ window = len(window)
+ offset = (window - 1) / 2. if center else 0
+ try:
+ return int(offset)
+ except TypeError:
+ return offset.astype(int)
+
+
+def _require_min_periods(p):
+ def _check_func(minp, window):
+ if minp is None:
+ return window
+ else:
+ return max(p, minp)
+
+ return _check_func
+
+
+def _use_window(minp, window):
+ if minp is None:
+ return window
+ else:
+ return minp
+
+
+def _zsqrt(x):
+ with np.errstate(all='ignore'):
+ result = np.sqrt(x)
+ mask = x < 0
+
+ if isinstance(x, ABCDataFrame):
+ if mask.values.any():
+ result[mask] = 0
+ else:
+ if mask.any():
+ result[mask] = 0
+
+ return result
+
+
+def _prep_binary(arg1, arg2):
+ if not isinstance(arg2, type(arg1)):
+ raise Exception('Input arrays must be of the same type!')
+
+ # mask out values, this also makes a common index...
+ X = arg1 + 0 * arg2
+ Y = arg2 + 0 * arg1
+
+ return X, Y
+
+
+# Top-level exports
+
+
+def rolling(obj, win_type=None, **kwds):
+ if not isinstance(obj, (ABCSeries, ABCDataFrame)):
+ raise TypeError('invalid type: %s' % type(obj))
+
+ if win_type is not None:
+ return Window(obj, win_type=win_type, **kwds)
+
+ return Rolling(obj, **kwds)
+
+
+rolling.__doc__ = Window.__doc__
+
+
+def expanding(obj, **kwds):
+ if not isinstance(obj, (ABCSeries, ABCDataFrame)):
+ raise TypeError('invalid type: %s' % type(obj))
+
+ return Expanding(obj, **kwds)
+
+
+expanding.__doc__ = Expanding.__doc__
+
+
+def ewm(obj, **kwds):
+ if not isinstance(obj, (ABCSeries, ABCDataFrame)):
+ raise TypeError('invalid type: %s' % type(obj))
+
+ return EWM(obj, **kwds)
+
+
+ewm.__doc__ = EWM.__doc__
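+
+
+# These module-level helpers mirror the Series/DataFrame accessors, which are
+# the usual entry points in user code. An illustrative sketch (``s`` is an
+# assumption for illustration only):
+#
+# >>> import pandas as pd
+# >>> s = pd.Series(range(10), dtype=float)
+# >>> s.rolling(window=3).mean()        # same as rolling(s, window=3).mean()
+# >>> s.expanding(min_periods=1).sum()  # same as expanding(s, min_periods=1).sum()
+# >>> s.ewm(span=10).mean()             # same as ewm(s, span=10).mean()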
diff --git a/contrib/python/pandas/py2/pandas/errors/__init__.py b/contrib/python/pandas/py2/pandas/errors/__init__.py
new file mode 100644
index 00000000000..eb6a4674a74
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/errors/__init__.py
@@ -0,0 +1,183 @@
+# flake8: noqa
+
+"""
+Expose public exceptions & warnings
+"""
+
+from pandas._libs.tslibs import OutOfBoundsDatetime
+
+
+class PerformanceWarning(Warning):
+ """
+ Warning raised when there is a possible
+ performance impact.
+ """
+
+class UnsupportedFunctionCall(ValueError):
+ """
+ Exception raised when attempting to call a numpy function
+ on a pandas object, but that function is not supported by
+ the object, e.g. ``np.cumsum(groupby_object)``.
+ """
+
+class UnsortedIndexError(KeyError):
+ """
+ Error raised when attempting to get a slice of a MultiIndex,
+ and the index has not been lexsorted. Subclass of `KeyError`.
+
+ .. versionadded:: 0.20.0
+ """
+
+
+class ParserError(ValueError):
+ """
+ Exception that is raised by an error encountered in `pd.read_csv`.
+ """
+
+
+class DtypeWarning(Warning):
+ """
+ Warning raised when reading different dtypes in a column from a file.
+
+ Raised for a dtype incompatibility. This can happen whenever `read_csv`
+ or `read_table` encounters non-uniform dtypes in one or more columns of a
+ given CSV file.
+
+ See Also
+ --------
+ pandas.read_csv : Read CSV (comma-separated) file into a DataFrame.
+ pandas.read_table : Read general delimited file into a DataFrame.
+
+ Notes
+ -----
+ This warning is issued when dealing with larger files because the dtype
+ checking happens per chunk read.
+
+ Despite the warning, the CSV file is read with mixed types in a single
+ column, which will be of object type. See the examples below to better
+ understand this issue.
+
+ Examples
+ --------
+ This example creates and reads a large CSV file with a column that contains
+ `int` and `str`.
+
+ >>> df = pd.DataFrame({'a': (['1'] * 100000 + ['X'] * 100000 +
+ ... ['1'] * 100000),
+ ... 'b': ['b'] * 300000})
+ >>> df.to_csv('test.csv', index=False)
+ >>> df2 = pd.read_csv('test.csv')
+ ... # DtypeWarning: Columns (0) have mixed types
+
+ It is important to note that ``df2`` will contain both `str` and `int` for
+ the same input, '1'.
+
+ >>> df2.iloc[262140, 0]
+ '1'
+ >>> type(df2.iloc[262140, 0])
+ <class 'str'>
+ >>> df2.iloc[262150, 0]
+ 1
+ >>> type(df2.iloc[262150, 0])
+ <class 'int'>
+
+ One way to solve this issue is to use the `dtype` parameter in the
+ `read_csv` and `read_table` functions to make the conversion explicit:
+
+ >>> df2 = pd.read_csv('test.csv', sep=',', dtype={'a': str})
+
+ No warning was issued.
+
+ >>> import os
+ >>> os.remove('test.csv')
+ """
+
+
+class EmptyDataError(ValueError):
+ """
+ Exception that is thrown in `pd.read_csv` (by both the C and
+ Python engines) when empty data or header is encountered.
+ """
+
+
+class ParserWarning(Warning):
+ """
+ Warning raised when pandas falls back from the default 'c' parser while reading a file.
+
+ Raised by `pd.read_csv` and `pd.read_table` when it is necessary to change
+ parsers, generally from the default 'c' parser to 'python'.
+
+ It happens due to a lack of support or functionality for parsing a
+ particular attribute of a CSV file with the requested engine.
+
+ Options currently unsupported by the 'c' engine include the following parameters:
+
+ 1. `sep` other than a single character (e.g. regex separators)
+ 2. `skipfooter` higher than 0
+ 3. `sep=None` with `delim_whitespace=False`
+
+ The warning can be avoided by adding `engine='python'` as a parameter in
+ `pd.read_csv` and `pd.read_table` methods.
+
+ See Also
+ --------
+ pd.read_csv : Read CSV (comma-separated) file into DataFrame.
+ pd.read_table : Read general delimited file into DataFrame.
+
+ Examples
+ --------
+ Using a `sep` in `pd.read_csv` other than a single character:
+
+ >>> import io
+ >>> csv = u'''a;b;c
+ ... 1;1,8
+ ... 1;2,1'''
+ >>> df = pd.read_csv(io.StringIO(csv), sep='[;,]') # doctest: +SKIP
+ ... # ParserWarning: Falling back to the 'python' engine...
+
+ Adding `engine='python'` to `pd.read_csv` removes the Warning:
+
+ >>> df = pd.read_csv(io.StringIO(csv), sep='[;,]', engine='python')
+ """
+
+
+class MergeError(ValueError):
+ """
+ Error raised when merging fails due to problems with the input data.
+ Subclass of `ValueError`.
+ """
+
+
+class NullFrequencyError(ValueError):
+ """
+ Error raised when a null `freq` attribute is used in an operation
+ that needs a non-null frequency, particularly `DatetimeIndex.shift`,
+ `TimedeltaIndex.shift`, `PeriodIndex.shift`.
+ """
+
+
+class AccessorRegistrationWarning(Warning):
+ """Warning for attribute conflicts in accessor registration."""
+
+
+class AbstractMethodError(NotImplementedError):
+ """Raise this error instead of NotImplementedError for abstract methods
+ while keeping compatibility with Python 2 and Python 3.
+ """
+
+ def __init__(self, class_instance, methodtype='method'):
+ types = {'method', 'classmethod', 'staticmethod', 'property'}
+ if methodtype not in types:
+ msg = 'methodtype must be one of {}, got {} instead.'.format(
+ types, methodtype)
+ raise ValueError(msg)
+ self.methodtype = methodtype
+ self.class_instance = class_instance
+
+ def __str__(self):
+ if self.methodtype == 'classmethod':
+ name = self.class_instance.__name__
+ else:
+ name = self.class_instance.__class__.__name__
+ msg = "This {methodtype} must be defined in the concrete class {name}"
+ return (msg.format(methodtype=self.methodtype, name=name))
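+
+
+# A minimal, illustrative sketch of how AbstractMethodError is meant to be
+# used (the Base/Concrete classes below are assumptions for illustration):
+#
+# >>> class Base(object):
+# ...     def method(self):
+# ...         raise AbstractMethodError(self)
+# >>> class Concrete(Base):
+# ...     def method(self):
+# ...         return 'ok'
+# >>> Concrete().method()
+# 'ok'
+# >>> Base().method()  # doctest: +SKIP
+# AbstractMethodError: This method must be defined in the concrete class Base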
diff --git a/contrib/python/pandas/py2/pandas/io/__init__.py b/contrib/python/pandas/py2/pandas/io/__init__.py
new file mode 100644
index 00000000000..e69de29bb2d
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/io/__init__.py
diff --git a/contrib/python/pandas/py2/pandas/io/api.py b/contrib/python/pandas/py2/pandas/io/api.py
new file mode 100644
index 00000000000..8c8d7cf73b3
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/io/api.py
@@ -0,0 +1,20 @@
+"""
+Data IO api
+"""
+
+# flake8: noqa
+
+from pandas.io.clipboards import read_clipboard
+from pandas.io.excel import ExcelFile, ExcelWriter, read_excel
+from pandas.io.feather_format import read_feather
+from pandas.io.gbq import read_gbq
+from pandas.io.html import read_html
+from pandas.io.json import read_json
+from pandas.io.packers import read_msgpack, to_msgpack
+from pandas.io.parquet import read_parquet
+from pandas.io.parsers import read_csv, read_fwf, read_table
+from pandas.io.pickle import read_pickle, to_pickle
+from pandas.io.pytables import HDFStore, read_hdf
+from pandas.io.sas import read_sas
+from pandas.io.sql import read_sql, read_sql_query, read_sql_table
+from pandas.io.stata import read_stata
diff --git a/contrib/python/pandas/py2/pandas/io/clipboard/__init__.py b/contrib/python/pandas/py2/pandas/io/clipboard/__init__.py
new file mode 100644
index 00000000000..b76a843e3e7
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/io/clipboard/__init__.py
@@ -0,0 +1,125 @@
+"""
+Pyperclip
+
+A cross-platform clipboard module for Python. (only handles plain text for now)
+By Al Sweigart [email protected]
+BSD License
+
+Usage:
+ import pyperclip
+ pyperclip.copy('The text to be copied to the clipboard.')
+ spam = pyperclip.paste()
+
+ if not pyperclip.copy:
+ print("Copy functionality unavailable!")
+
+On Windows, no additional modules are needed.
+On Mac, the module uses pbcopy and pbpaste, which should come with the OS.
+On Linux, install xclip or xsel via package manager. For example, in Debian:
+sudo apt-get install xclip
+
+Otherwise on Linux, you will need the gtk, qtpy or PyQt modules installed.
+qtpy also requires a python-qt-bindings module: PyQt4, PyQt5, PySide, PySide2
+
+gtk and PyQt4 modules are not available for Python 3,
+and this module does not work with PyGObject yet.
+"""
+__version__ = '1.5.27'
+
+import platform
+import os
+import subprocess
+from .clipboards import (init_osx_clipboard,
+ init_gtk_clipboard, init_qt_clipboard,
+ init_xclip_clipboard, init_xsel_clipboard,
+ init_klipper_clipboard, init_no_clipboard)
+from .windows import init_windows_clipboard
+
+# `import qtpy` sys.exit()s if DISPLAY is not in the environment.
+# Thus, we need to detect the presence of $DISPLAY manually
+# and not load qtpy if it is absent.
+HAS_DISPLAY = os.getenv("DISPLAY", False)
+CHECK_CMD = "where" if platform.system() == "Windows" else "which"
+
+
+def _executable_exists(name):
+ return subprocess.call([CHECK_CMD, name],
+ stdout=subprocess.PIPE, stderr=subprocess.PIPE) == 0
+
+
+def determine_clipboard():
+ # Determine the OS/platform and set
+ # the copy() and paste() functions accordingly.
+ if 'cygwin' in platform.system().lower():
+ # FIXME: pyperclip currently does not support Cygwin,
+ # see https://github.com/asweigart/pyperclip/issues/55
+ pass
+ elif os.name == 'nt' or platform.system() == 'Windows':
+ return init_windows_clipboard()
+ if os.name == 'mac' or platform.system() == 'Darwin':
+ return init_osx_clipboard()
+ if HAS_DISPLAY:
+ # Determine which command/module is installed, if any.
+ try:
+ # Check if gtk is installed
+ import gtk # noqa
+ except ImportError:
+ pass
+ else:
+ return init_gtk_clipboard()
+
+ try:
+ # qtpy is a small abstraction layer that lets you write
+ # applications using a single api call to either PyQt or PySide
+ # https://pypi.org/project/QtPy
+ import qtpy # noqa
+ except ImportError:
+ # If qtpy isn't installed, fall back on importing PyQt5, then PyQt4
+ try:
+ import PyQt5 # noqa
+ except ImportError:
+ try:
+ import PyQt4 # noqa
+ except ImportError:
+ pass # fail fast for all non-ImportError exceptions.
+ else:
+ return init_qt_clipboard()
+ else:
+ return init_qt_clipboard()
+ pass
+ else:
+ return init_qt_clipboard()
+
+ if _executable_exists("xclip"):
+ return init_xclip_clipboard()
+ if _executable_exists("xsel"):
+ return init_xsel_clipboard()
+ if _executable_exists("klipper") and _executable_exists("qdbus"):
+ return init_klipper_clipboard()
+
+ return init_no_clipboard()
+
+
+def set_clipboard(clipboard):
+ global copy, paste
+
+ clipboard_types = {'osx': init_osx_clipboard,
+ 'gtk': init_gtk_clipboard,
+ 'qt': init_qt_clipboard,
+ 'xclip': init_xclip_clipboard,
+ 'xsel': init_xsel_clipboard,
+ 'klipper': init_klipper_clipboard,
+ 'windows': init_windows_clipboard,
+ 'no': init_no_clipboard}
+
+ copy, paste = clipboard_types[clipboard]()
+
+
+copy, paste = determine_clipboard()
+
+__all__ = ["copy", "paste"]
+
+
+# pandas aliases
+clipboard_get = paste
+clipboard_set = copy
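+
+
+# An illustrative sketch of the public surface of this module; the text below
+# is an assumption for illustration only:
+#
+# >>> from pandas.io.clipboard import clipboard_get, clipboard_set
+# >>> clipboard_set(u'copied via pandas')  # doctest: +SKIP
+# >>> clipboard_get()                      # doctest: +SKIP
+# u'copied via pandas'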
diff --git a/contrib/python/pandas/py2/pandas/io/clipboard/clipboards.py b/contrib/python/pandas/py2/pandas/io/clipboard/clipboards.py
new file mode 100644
index 00000000000..d6d0ba0a560
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/io/clipboard/clipboards.py
@@ -0,0 +1,145 @@
+import subprocess
+
+from pandas.compat import PY2, text_type
+
+from .exceptions import PyperclipException
+
+EXCEPT_MSG = """
+ Pyperclip could not find a copy/paste mechanism for your system.
+ For more information, please visit https://pyperclip.readthedocs.org """
+
+
+def init_osx_clipboard():
+ def copy_osx(text):
+ p = subprocess.Popen(['pbcopy', 'w'],
+ stdin=subprocess.PIPE, close_fds=True)
+ p.communicate(input=text.encode('utf-8'))
+
+ def paste_osx():
+ p = subprocess.Popen(['pbpaste', 'r'],
+ stdout=subprocess.PIPE, close_fds=True)
+ stdout, stderr = p.communicate()
+ return stdout.decode('utf-8')
+
+ return copy_osx, paste_osx
+
+
+def init_gtk_clipboard():
+ import gtk
+
+ def copy_gtk(text):
+ global cb
+ cb = gtk.Clipboard()
+ cb.set_text(text)
+ cb.store()
+
+ def paste_gtk():
+ clipboardContents = gtk.Clipboard().wait_for_text()
+ # for python 2, returns None if the clipboard is blank.
+ if clipboardContents is None:
+ return ''
+ else:
+ return clipboardContents
+
+ return copy_gtk, paste_gtk
+
+
+def init_qt_clipboard():
+ # $DISPLAY should exist
+
+ # Try to import from qtpy, but if that fails try PyQt5 then PyQt4
+ try:
+ from qtpy.QtWidgets import QApplication
+ except ImportError:
+ try:
+ from PyQt5.QtWidgets import QApplication
+ except ImportError:
+ from PyQt4.QtGui import QApplication
+
+ app = QApplication.instance()
+ if app is None:
+ app = QApplication([])
+
+ def copy_qt(text):
+ cb = app.clipboard()
+ cb.setText(text)
+
+ def paste_qt():
+ cb = app.clipboard()
+ return text_type(cb.text())
+
+ return copy_qt, paste_qt
+
+
+def init_xclip_clipboard():
+ def copy_xclip(text):
+ p = subprocess.Popen(['xclip', '-selection', 'c'],
+ stdin=subprocess.PIPE, close_fds=True)
+ p.communicate(input=text.encode('utf-8'))
+
+ def paste_xclip():
+ p = subprocess.Popen(['xclip', '-selection', 'c', '-o'],
+ stdout=subprocess.PIPE, close_fds=True)
+ stdout, stderr = p.communicate()
+ return stdout.decode('utf-8')
+
+ return copy_xclip, paste_xclip
+
+
+def init_xsel_clipboard():
+ def copy_xsel(text):
+ p = subprocess.Popen(['xsel', '-b', '-i'],
+ stdin=subprocess.PIPE, close_fds=True)
+ p.communicate(input=text.encode('utf-8'))
+
+ def paste_xsel():
+ p = subprocess.Popen(['xsel', '-b', '-o'],
+ stdout=subprocess.PIPE, close_fds=True)
+ stdout, stderr = p.communicate()
+ return stdout.decode('utf-8')
+
+ return copy_xsel, paste_xsel
+
+
+def init_klipper_clipboard():
+ def copy_klipper(text):
+ p = subprocess.Popen(
+ ['qdbus', 'org.kde.klipper', '/klipper', 'setClipboardContents',
+ text.encode('utf-8')],
+ stdin=subprocess.PIPE, close_fds=True)
+ p.communicate(input=None)
+
+ def paste_klipper():
+ p = subprocess.Popen(
+ ['qdbus', 'org.kde.klipper', '/klipper', 'getClipboardContents'],
+ stdout=subprocess.PIPE, close_fds=True)
+ stdout, stderr = p.communicate()
+
+ # Workaround for https://bugs.kde.org/show_bug.cgi?id=342874
+ # TODO: https://github.com/asweigart/pyperclip/issues/43
+ clipboardContents = stdout.decode('utf-8')
+ # even if blank, Klipper will append a newline at the end
+ assert len(clipboardContents) > 0
+ # make sure that newline is there
+ assert clipboardContents.endswith('\n')
+ if clipboardContents.endswith('\n'):
+ clipboardContents = clipboardContents[:-1]
+ return clipboardContents
+
+ return copy_klipper, paste_klipper
+
+
+def init_no_clipboard():
+ class ClipboardUnavailable(object):
+
+ def __call__(self, *args, **kwargs):
+ raise PyperclipException(EXCEPT_MSG)
+
+ if PY2:
+ def __nonzero__(self):
+ return False
+ else:
+ def __bool__(self):
+ return False
+
+ return ClipboardUnavailable(), ClipboardUnavailable()
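+
+
+# Each ``init_*_clipboard`` factory above returns a ``(copy, paste)`` pair of
+# callables; ``determine_clipboard``/``set_clipboard`` in ``__init__.py`` pick
+# one of them. An illustrative sketch, assuming xclip is installed:
+#
+# >>> copy, paste = init_xclip_clipboard()
+# >>> copy(u'hello')  # doctest: +SKIP
+# >>> paste()         # doctest: +SKIP
+# u'hello'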
diff --git a/contrib/python/pandas/py2/pandas/io/clipboard/exceptions.py b/contrib/python/pandas/py2/pandas/io/clipboard/exceptions.py
new file mode 100644
index 00000000000..d948ad41432
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/io/clipboard/exceptions.py
@@ -0,0 +1,12 @@
+import ctypes
+
+
+class PyperclipException(RuntimeError):
+ pass
+
+
+class PyperclipWindowsException(PyperclipException):
+
+ def __init__(self, message):
+ message += " ({err})".format(err=ctypes.WinError())
+ super(PyperclipWindowsException, self).__init__(message)
diff --git a/contrib/python/pandas/py2/pandas/io/clipboard/windows.py b/contrib/python/pandas/py2/pandas/io/clipboard/windows.py
new file mode 100644
index 00000000000..3d979a61b5f
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/io/clipboard/windows.py
@@ -0,0 +1,154 @@
+"""
+This module implements clipboard handling on Windows using ctypes.
+"""
+import contextlib
+import ctypes
+from ctypes import c_size_t, c_wchar, c_wchar_p, get_errno, sizeof
+import time
+
+from .exceptions import PyperclipWindowsException
+
+
+class CheckedCall(object):
+
+ def __init__(self, f):
+ super(CheckedCall, self).__setattr__("f", f)
+
+ def __call__(self, *args):
+ ret = self.f(*args)
+ if not ret and get_errno():
+ raise PyperclipWindowsException("Error calling " + self.f.__name__)
+ return ret
+
+ def __setattr__(self, key, value):
+ setattr(self.f, key, value)
+
+
+def init_windows_clipboard():
+ from ctypes.wintypes import (HGLOBAL, LPVOID, DWORD, LPCSTR, INT, HWND,
+ HINSTANCE, HMENU, BOOL, UINT, HANDLE)
+
+ windll = ctypes.windll
+
+ safeCreateWindowExA = CheckedCall(windll.user32.CreateWindowExA)
+ safeCreateWindowExA.argtypes = [DWORD, LPCSTR, LPCSTR, DWORD, INT, INT,
+ INT, INT, HWND, HMENU, HINSTANCE, LPVOID]
+ safeCreateWindowExA.restype = HWND
+
+ safeDestroyWindow = CheckedCall(windll.user32.DestroyWindow)
+ safeDestroyWindow.argtypes = [HWND]
+ safeDestroyWindow.restype = BOOL
+
+ OpenClipboard = windll.user32.OpenClipboard
+ OpenClipboard.argtypes = [HWND]
+ OpenClipboard.restype = BOOL
+
+ safeCloseClipboard = CheckedCall(windll.user32.CloseClipboard)
+ safeCloseClipboard.argtypes = []
+ safeCloseClipboard.restype = BOOL
+
+ safeEmptyClipboard = CheckedCall(windll.user32.EmptyClipboard)
+ safeEmptyClipboard.argtypes = []
+ safeEmptyClipboard.restype = BOOL
+
+ safeGetClipboardData = CheckedCall(windll.user32.GetClipboardData)
+ safeGetClipboardData.argtypes = [UINT]
+ safeGetClipboardData.restype = HANDLE
+
+ safeSetClipboardData = CheckedCall(windll.user32.SetClipboardData)
+ safeSetClipboardData.argtypes = [UINT, HANDLE]
+ safeSetClipboardData.restype = HANDLE
+
+ safeGlobalAlloc = CheckedCall(windll.kernel32.GlobalAlloc)
+ safeGlobalAlloc.argtypes = [UINT, c_size_t]
+ safeGlobalAlloc.restype = HGLOBAL
+
+ safeGlobalLock = CheckedCall(windll.kernel32.GlobalLock)
+ safeGlobalLock.argtypes = [HGLOBAL]
+ safeGlobalLock.restype = LPVOID
+
+ safeGlobalUnlock = CheckedCall(windll.kernel32.GlobalUnlock)
+ safeGlobalUnlock.argtypes = [HGLOBAL]
+ safeGlobalUnlock.restype = BOOL
+
+ GMEM_MOVEABLE = 0x0002
+ CF_UNICODETEXT = 13
+
+ @contextlib.contextmanager
+ def window():
+ """
+ Context that provides a valid Windows hwnd.
+ """
+ # we really just need the hwnd, so setting "STATIC"
+ # as predefined lpClass is just fine.
+ hwnd = safeCreateWindowExA(0, b"STATIC", None, 0, 0, 0, 0, 0,
+ None, None, None, None)
+ try:
+ yield hwnd
+ finally:
+ safeDestroyWindow(hwnd)
+
+ @contextlib.contextmanager
+ def clipboard(hwnd):
+ """
+ Context manager that opens the clipboard and prevents
+ other applications from modifying the clipboard content.
+ """
+ # We may not get the clipboard handle immediately because
+ # some other application is accessing it (?)
+ # We try for at least 500ms to get the clipboard.
+ t = time.time() + 0.5
+ success = False
+ while time.time() < t:
+ success = OpenClipboard(hwnd)
+ if success:
+ break
+ time.sleep(0.01)
+ if not success:
+ raise PyperclipWindowsException("Error calling OpenClipboard")
+
+ try:
+ yield
+ finally:
+ safeCloseClipboard()
+
+ def copy_windows(text):
+ # This function is heavily based on
+ # http://msdn.com/ms649016#_win32_Copying_Information_to_the_Clipboard
+ with window() as hwnd:
+ # http://msdn.com/ms649048
+ # If an application calls OpenClipboard with hwnd set to NULL,
+ # EmptyClipboard sets the clipboard owner to NULL;
+ # this causes SetClipboardData to fail.
+ # => We need a valid hwnd to copy something.
+ with clipboard(hwnd):
+ safeEmptyClipboard()
+
+ if text:
+ # http://msdn.com/ms649051
+ # If the hMem parameter identifies a memory object,
+ # the object must have been allocated using the
+ # function with the GMEM_MOVEABLE flag.
+ count = len(text) + 1
+ handle = safeGlobalAlloc(GMEM_MOVEABLE,
+ count * sizeof(c_wchar))
+ locked_handle = safeGlobalLock(handle)
+
+ ctypes.memmove(c_wchar_p(locked_handle),
+ c_wchar_p(text), count * sizeof(c_wchar))
+
+ safeGlobalUnlock(handle)
+ safeSetClipboardData(CF_UNICODETEXT, handle)
+
+ def paste_windows():
+ with clipboard(None):
+ handle = safeGetClipboardData(CF_UNICODETEXT)
+ if not handle:
+ # GetClipboardData may return NULL with errno == NO_ERROR
+ # if the clipboard is empty.
+ # (Also, it may return a handle to an empty buffer,
+ # but technically that's not empty)
+ return ""
+ return c_wchar_p(handle).value
+
+ return copy_windows, paste_windows
diff --git a/contrib/python/pandas/py2/pandas/io/clipboards.py b/contrib/python/pandas/py2/pandas/io/clipboards.py
new file mode 100644
index 00000000000..23a2b04214e
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/io/clipboards.py
@@ -0,0 +1,145 @@
+""" io on the clipboard """
+import warnings
+
+import pandas.compat as compat
+from pandas.compat import PY2, PY3, StringIO
+
+from pandas.core.dtypes.generic import ABCDataFrame
+
+from pandas import get_option, option_context
+
+
+def read_clipboard(sep=r'\s+', **kwargs): # pragma: no cover
+ r"""
+ Read text from clipboard and pass to read_csv. See read_csv for the
+ full argument list.
+
+ Parameters
+ ----------
+ sep : str, default '\s+'
+ A string or regex delimiter. The default of '\s+' denotes
+ one or more whitespace characters.
+
+ Returns
+ -------
+ parsed : DataFrame
+ """
+ encoding = kwargs.pop('encoding', 'utf-8')
+
+ # only utf-8 is valid for passed value because that's what clipboard
+ # supports
+ if encoding is not None and encoding.lower().replace('-', '') != 'utf8':
+ raise NotImplementedError(
+ 'reading from clipboard only supports utf-8 encoding')
+
+ from pandas.io.clipboard import clipboard_get
+ from pandas.io.parsers import read_csv
+ text = clipboard_get()
+
+ # try to decode (if needed on PY3)
+ # Strange. linux py33 doesn't complain, win py33 does
+ if PY3:
+ try:
+ text = compat.bytes_to_str(
+ text, encoding=(kwargs.get('encoding') or
+ get_option('display.encoding'))
+ )
+ except AttributeError:
+ pass
+
+ # Excel copies into the clipboard with \t separation.
+ # Inspect no more than the first 10 lines; if they
+ # all contain an equal number (>0) of tabs, infer
+ # that this came from Excel and set 'sep' accordingly
+ lines = text[:10000].split('\n')[:-1][:10]
+
+ # Need to remove leading white space, since read_csv
+ # accepts:
+ # a b
+ # 0 1 2
+ # 1 3 4
+
+ counts = {x.lstrip().count('\t') for x in lines}
+ if len(lines) > 1 and len(counts) == 1 and counts.pop() != 0:
+ sep = '\t'
+
+ # Edge case where sep is specified to be None, return to default
+ if sep is None and kwargs.get('delim_whitespace') is None:
+ sep = r'\s+'
+
+ # Regex separator currently only works with python engine.
+ # Default to python if separator is multi-character (regex)
+ if len(sep) > 1 and kwargs.get('engine') is None:
+ kwargs['engine'] = 'python'
+ elif len(sep) > 1 and kwargs.get('engine') == 'c':
+ warnings.warn('read_clipboard with regex separator does not work'
+ ' properly with c engine')
+
+ # In PY2, the c table reader first encodes text with UTF-8 but Python
+ # table reader uses the format of the passed string. For consistency,
+ # encode strings for python engine so that output from python and c
+ # engines produce consistent results
+ if kwargs.get('engine') == 'python' and PY2:
+ text = text.encode('utf-8')
+
+ return read_csv(StringIO(text), sep=sep, **kwargs)
+
+
+def to_clipboard(obj, excel=True, sep=None, **kwargs): # pragma: no cover
+ """
+ Attempt to write a text representation of the object to the system
+ clipboard. The clipboard contents can then be pasted into Excel, for
+ example.
+
+ Parameters
+ ----------
+ obj : the object to write to the clipboard
+ excel : boolean, defaults to True
+ if True, use the provided separator and write in a CSV
+ format to allow easy pasting into Excel.
+ if False, write a string representation of the object
+ to the clipboard
+ sep : optional, defaults to tab
+ other keywords are passed to to_csv
+
+ Notes
+ -----
+ Requirements for your platform
+ - Linux: xclip, or xsel (with gtk or PyQt4 modules)
+ - Windows: none
+ - OS X: none
+ """
+ encoding = kwargs.pop('encoding', 'utf-8')
+
+ # testing if an invalid encoding is passed to clipboard
+ if encoding is not None and encoding.lower().replace('-', '') != 'utf8':
+ raise ValueError('clipboard only supports utf-8 encoding')
+
+ from pandas.io.clipboard import clipboard_set
+ if excel is None:
+ excel = True
+
+ if excel:
+ try:
+ if sep is None:
+ sep = '\t'
+ buf = StringIO()
+ # clipboard_set (pyperclip) expects unicode
+ obj.to_csv(buf, sep=sep, encoding='utf-8', **kwargs)
+ text = buf.getvalue()
+ if PY2:
+ text = text.decode('utf-8')
+ clipboard_set(text)
+ return
+ except TypeError:
+ warnings.warn('to_clipboard in excel mode requires a single '
+ 'character separator.')
+ elif sep is not None:
+ warnings.warn('to_clipboard with excel=False ignores the sep argument')
+
+ if isinstance(obj, ABCDataFrame):
+ # str(df) has various unhelpful defaults, like truncation
+ with option_context('display.max_colwidth', 999999):
+ objstr = obj.to_string(**kwargs)
+ else:
+ objstr = str(obj)
+ clipboard_set(objstr)
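+
+
+# These two functions back ``pd.read_clipboard`` and ``DataFrame.to_clipboard``.
+# An illustrative round trip (``df`` is an assumption for illustration only):
+#
+# >>> import pandas as pd
+# >>> df = pd.DataFrame({'a': [1, 2], 'b': [3, 4]})
+# >>> df.to_clipboard(excel=True, index=False)  # doctest: +SKIP
+# >>> pd.read_clipboard()                       # doctest: +SKIP
+#    a  b
+# 0  1  3
+# 1  2  4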
diff --git a/contrib/python/pandas/py2/pandas/io/common.py b/contrib/python/pandas/py2/pandas/io/common.py
new file mode 100644
index 00000000000..c1cacf39c5b
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/io/common.py
@@ -0,0 +1,617 @@
+"""Common IO api utilities"""
+
+import codecs
+from contextlib import closing, contextmanager
+import csv
+import mmap
+import os
+import zipfile
+
+import pandas.compat as compat
+from pandas.compat import BytesIO, StringIO, string_types, text_type
+from pandas.errors import ( # noqa
+ AbstractMethodError, DtypeWarning, EmptyDataError, ParserError,
+ ParserWarning)
+
+from pandas.core.dtypes.common import is_file_like, is_number
+
+from pandas.io.formats.printing import pprint_thing
+
+# gh-12665: Alias for now and remove later.
+CParserError = ParserError
+
+# common NA values
+# no longer excluding inf representations
+# '1.#INF','-1.#INF', '1.#INF000000',
+_NA_VALUES = {'-1.#IND', '1.#QNAN', '1.#IND', '-1.#QNAN', '#N/A N/A', '#N/A',
+ 'N/A', 'n/a', 'NA', '#NA', 'NULL', 'null', 'NaN', '-NaN', 'nan',
+ '-nan', ''}
+
+
+if compat.PY3:
+ from urllib.request import urlopen, pathname2url
+ _urlopen = urlopen
+ from urllib.parse import urlparse as parse_url
+ from urllib.parse import (uses_relative, uses_netloc, uses_params,
+ urlencode, urljoin)
+ from urllib.error import URLError
+ from http.client import HTTPException # noqa
+else:
+ from urllib2 import urlopen as _urlopen
+ from urllib import urlencode, pathname2url # noqa
+ from urlparse import urlparse as parse_url
+ from urlparse import uses_relative, uses_netloc, uses_params, urljoin
+ from urllib2 import URLError # noqa
+ from httplib import HTTPException # noqa
+ from contextlib import contextmanager, closing # noqa
+ from functools import wraps # noqa
+
+ # @wraps(_urlopen)
+ @contextmanager
+ def urlopen(*args, **kwargs):
+ with closing(_urlopen(*args, **kwargs)) as f:
+ yield f
+
+
+_VALID_URLS = set(uses_relative + uses_netloc + uses_params)
+_VALID_URLS.discard('')
+
+
+class BaseIterator(object):
+ """Subclass this and provide a "__next__()" method to obtain an iterator.
+ Useful only when the object being iterated is non-reusable (e.g. OK for a
+ parser, not for an in-memory table, yes for its iterator)."""
+
+ def __iter__(self):
+ return self
+
+ def __next__(self):
+ raise AbstractMethodError(self)
+
+
+if not compat.PY3:
+ BaseIterator.next = lambda self: self.__next__()
+
+
+def _is_url(url):
+ """Check to see if a URL has a valid protocol.
+
+ Parameters
+ ----------
+ url : str or unicode
+
+ Returns
+ -------
+ isurl : bool
+ If `url` has a valid protocol return True otherwise False.
+ """
+ try:
+ return parse_url(url).scheme in _VALID_URLS
+ except Exception:
+ return False
+
+
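+# Illustrative examples of the check above (not executed here):
+#
+# >>> _is_url('https://example.com/data.csv')
+# True
+# >>> _is_url('relative/path/data.csv')
+# False
+
+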
+def _expand_user(filepath_or_buffer):
+ """Return the argument with an initial component of ~ or ~user
+ replaced by that user's home directory.
+
+ Parameters
+ ----------
+ filepath_or_buffer : object to be converted if possible
+
+ Returns
+ -------
+ expanded_filepath_or_buffer : an expanded filepath or the
+ input if not expandable
+ """
+ if isinstance(filepath_or_buffer, string_types):
+ return os.path.expanduser(filepath_or_buffer)
+ return filepath_or_buffer
+
+
+def _validate_header_arg(header):
+ if isinstance(header, bool):
+ raise TypeError("Passing a bool to header is invalid. "
+ "Use header=None for no header or "
+ "header=int or list-like of ints to specify "
+ "the row(s) making up the column names")
+
+
+def _stringify_path(filepath_or_buffer):
+ """Attempt to convert a path-like object to a string.
+
+ Parameters
+ ----------
+ filepath_or_buffer : object to be converted
+
+ Returns
+ -------
+ str_filepath_or_buffer : maybe a string version of the object
+
+ Notes
+ -----
+ Objects supporting the fspath protocol (python 3.6+) are coerced
+ according to its __fspath__ method.
+
+ For backwards compatibility with older pythons, pathlib.Path and
+ py.path objects are specially coerced.
+
+ Any other object is passed through unchanged, which includes bytes,
+ strings, buffers, or anything else that's not even path-like.
+ """
+ try:
+ import pathlib
+ _PATHLIB_INSTALLED = True
+ except ImportError:
+ _PATHLIB_INSTALLED = False
+
+ try:
+ from py.path import local as LocalPath
+ _PY_PATH_INSTALLED = True
+ except ImportError:
+ _PY_PATH_INSTALLED = False
+
+ if hasattr(filepath_or_buffer, '__fspath__'):
+ return filepath_or_buffer.__fspath__()
+ if _PATHLIB_INSTALLED and isinstance(filepath_or_buffer, pathlib.Path):
+ return text_type(filepath_or_buffer)
+ if _PY_PATH_INSTALLED and isinstance(filepath_or_buffer, LocalPath):
+ return filepath_or_buffer.strpath
+ return _expand_user(filepath_or_buffer)
+
+
+def is_s3_url(url):
+ """Check for an s3, s3n, or s3a url"""
+ try:
+ return parse_url(url).scheme in ['s3', 's3n', 's3a']
+ except Exception:
+ return False
+
+
+def is_gcs_url(url):
+ """Check for a gcs url"""
+ try:
+ return parse_url(url).scheme in ['gcs', 'gs']
+ except Exception:
+ return False
+
+
+def get_filepath_or_buffer(filepath_or_buffer, encoding=None,
+ compression=None, mode=None):
+ """
+ If the filepath_or_buffer is a url, translate and return the buffer.
+ Otherwise passthrough.
+
+ Parameters
+ ----------
+ filepath_or_buffer : a url, filepath (str, py.path.local or pathlib.Path),
+ or buffer
+ encoding : the encoding to use to decode py3 bytes, default is 'utf-8'
+ mode : str, optional
+
+ Returns
+ -------
+ tuple of ({a filepath_or_buffer or S3File instance},
+ encoding, str,
+ compression, str,
+ should_close, bool)
+ """
+ filepath_or_buffer = _stringify_path(filepath_or_buffer)
+
+ if _is_url(filepath_or_buffer):
+ req = _urlopen(filepath_or_buffer)
+ content_encoding = req.headers.get('Content-Encoding', None)
+ if content_encoding == 'gzip':
+ # Override compression based on Content-Encoding header
+ compression = 'gzip'
+ reader = BytesIO(req.read())
+ req.close()
+ return reader, encoding, compression, True
+
+ if is_s3_url(filepath_or_buffer):
+ from pandas.io import s3
+ return s3.get_filepath_or_buffer(filepath_or_buffer,
+ encoding=encoding,
+ compression=compression,
+ mode=mode)
+
+ if is_gcs_url(filepath_or_buffer):
+ from pandas.io import gcs
+ return gcs.get_filepath_or_buffer(filepath_or_buffer,
+ encoding=encoding,
+ compression=compression,
+ mode=mode)
+
+ if isinstance(filepath_or_buffer, (compat.string_types,
+ compat.binary_type,
+ mmap.mmap)):
+ return _expand_user(filepath_or_buffer), None, compression, False
+
+ if not is_file_like(filepath_or_buffer):
+ msg = "Invalid file path or buffer object type: {_type}"
+ raise ValueError(msg.format(_type=type(filepath_or_buffer)))
+
+ return filepath_or_buffer, None, compression, False
+
+
+def file_path_to_url(path):
+ """
+ converts an absolute native path to a FILE URL.
+
+ Parameters
+ ----------
+ path : a path in native format
+
+ Returns
+ -------
+ a valid FILE URL
+ """
+ return urljoin('file:', pathname2url(path))
+
+
+_compression_to_extension = {
+ 'gzip': '.gz',
+ 'bz2': '.bz2',
+ 'zip': '.zip',
+ 'xz': '.xz',
+}
+
+
+def _infer_compression(filepath_or_buffer, compression):
+ """
+ Get the compression method for filepath_or_buffer. If compression='infer',
+ the inferred compression method is returned. Otherwise, the input
+ compression method is returned unchanged, unless it's invalid, in which
+ case an error is raised.
+
+ Parameters
+ ----------
+ filepath_or_buffer :
+ a path (str) or buffer
+ compression : {'infer', 'gzip', 'bz2', 'zip', 'xz', None}
+ If 'infer' and `filepath_or_buffer` is path-like, then detect
+ compression from the following extensions: '.gz', '.bz2', '.zip',
+ or '.xz' (otherwise no compression).
+
+ Returns
+ -------
+ string or None :
+ compression method
+
+ Raises
+ ------
+ ValueError on invalid compression specified
+ """
+
+ # No compression has been explicitly specified
+ if compression is None:
+ return None
+
+ # Infer compression
+ if compression == 'infer':
+ # Convert all path types (e.g. pathlib.Path) to strings
+ filepath_or_buffer = _stringify_path(filepath_or_buffer)
+ if not isinstance(filepath_or_buffer, compat.string_types):
+ # Cannot infer compression of a buffer, assume no compression
+ return None
+
+ # Infer compression from the filename/URL extension
+ for compression, extension in _compression_to_extension.items():
+ if filepath_or_buffer.endswith(extension):
+ return compression
+ return None
+
+ # Compression has been specified. Check that it's valid
+ if compression in _compression_to_extension:
+ return compression
+
+ msg = 'Unrecognized compression type: {}'.format(compression)
+ valid = ['infer', None] + sorted(_compression_to_extension)
+ msg += '\nValid compression types are {}'.format(valid)
+ raise ValueError(msg)
+
+
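+# Illustrative behaviour of the inference above (not executed here):
+#
+# >>> _infer_compression('data.csv.gz', 'infer')
+# 'gzip'
+# >>> _infer_compression('data.csv', 'infer') is None
+# True
+# >>> _infer_compression(BytesIO(b'abc'), 'infer') is None
+# True
+
+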
+def _get_handle(path_or_buf, mode, encoding=None, compression=None,
+ memory_map=False, is_text=True):
+ """
+ Get file handle for given path/buffer and mode.
+
+ Parameters
+ ----------
+ path_or_buf :
+ a path (str) or buffer
+ mode : str
+ mode to open path_or_buf with
+ encoding : str or None
+ compression : {'infer', 'gzip', 'bz2', 'zip', 'xz', None}, default None
+ If 'infer' and `filepath_or_buffer` is path-like, then detect
+ compression from the following extensions: '.gz', '.bz2', '.zip',
+ or '.xz' (otherwise no compression).
+ memory_map : boolean, default False
+ See parsers._parser_params for more information.
+ is_text : boolean, default True
+ whether file/buffer is in text format (csv, json, etc.), or in binary
+ mode (pickle, etc.)
+
+ Returns
+ -------
+ f : file-like
+ A file-like object
+ handles : list of file-like objects
+ A list of file-like object that were opened in this function.
+ """
+ try:
+ from s3fs import S3File
+ need_text_wrapping = (BytesIO, S3File)
+ except ImportError:
+ need_text_wrapping = (BytesIO,)
+
+ handles = list()
+ f = path_or_buf
+
+ # Convert pathlib.Path/py.path.local or string
+ path_or_buf = _stringify_path(path_or_buf)
+ is_path = isinstance(path_or_buf, compat.string_types)
+
+ if is_path:
+ compression = _infer_compression(path_or_buf, compression)
+
+ if compression:
+
+ if compat.PY2 and not is_path and encoding:
+ msg = 'compression with encoding is not yet supported in Python 2'
+ raise ValueError(msg)
+
+ # GZ Compression
+ if compression == 'gzip':
+ import gzip
+ if is_path:
+ f = gzip.open(path_or_buf, mode)
+ else:
+ f = gzip.GzipFile(fileobj=path_or_buf)
+
+ # BZ Compression
+ elif compression == 'bz2':
+ import bz2
+ if is_path:
+ f = bz2.BZ2File(path_or_buf, mode)
+ elif compat.PY2:
+ # Python 2's bz2 module can't take file objects, so have to
+ # run through decompress manually
+ f = StringIO(bz2.decompress(path_or_buf.read()))
+ path_or_buf.close()
+ else:
+ f = bz2.BZ2File(path_or_buf)
+
+ # ZIP Compression
+ elif compression == 'zip':
+ zf = BytesZipFile(path_or_buf, mode)
+ # Ensure the container is closed as well.
+ handles.append(zf)
+ if zf.mode == 'w':
+ f = zf
+ elif zf.mode == 'r':
+ zip_names = zf.namelist()
+ if len(zip_names) == 1:
+ f = zf.open(zip_names.pop())
+ elif len(zip_names) == 0:
+ raise ValueError('Zero files found in ZIP file {}'
+ .format(path_or_buf))
+ else:
+ raise ValueError('Multiple files found in ZIP file.'
+ ' Only one file per ZIP: {}'
+ .format(zip_names))
+
+ # XZ Compression
+ elif compression == 'xz':
+ lzma = compat.import_lzma()
+ f = lzma.LZMAFile(path_or_buf, mode)
+
+ # Unrecognized Compression
+ else:
+ msg = 'Unrecognized compression type: {}'.format(compression)
+ raise ValueError(msg)
+
+ handles.append(f)
+
+ elif is_path:
+ if compat.PY2:
+ # Python 2
+ mode = "wb" if mode == "w" else mode
+ f = open(path_or_buf, mode)
+ elif encoding:
+ # Python 3 and encoding
+ f = open(path_or_buf, mode, encoding=encoding, newline="")
+ elif is_text:
+ # Python 3 and no explicit encoding
+ f = open(path_or_buf, mode, errors='replace', newline="")
+ else:
+ # Python 3 and binary mode
+ f = open(path_or_buf, mode)
+ handles.append(f)
+
+ # in Python 3, convert BytesIO or fileobjects passed with an encoding
+ if (compat.PY3 and is_text and
+ (compression or isinstance(f, need_text_wrapping))):
+ from io import TextIOWrapper
+ f = TextIOWrapper(f, encoding=encoding, newline='')
+ handles.append(f)
+
+ if memory_map and hasattr(f, 'fileno'):
+ try:
+ g = MMapWrapper(f)
+ f.close()
+ f = g
+ except Exception:
+ # we catch any errors that may have occurred
+ # because that is consistent with the lower-level
+ # functionality of the C engine (pd.read_csv); in that
+ # case, leave the file handle as-is
+ pass
+
+ return f, handles
+
+
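+# A minimal, illustrative sketch of calling _get_handle directly
+# ('data.csv.gz' is an assumption for illustration only); callers are
+# responsible for closing every handle that was opened:
+#
+# >>> f, handles = _get_handle('data.csv.gz', 'r', encoding='utf-8',
+# ...                          compression='infer')  # doctest: +SKIP
+# >>> try:
+# ...     first_line = f.readline()
+# ... finally:
+# ...     for h in handles:
+# ...         h.close()
+
+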
+class BytesZipFile(zipfile.ZipFile, BytesIO):
+ """
+ Wrapper for the standard library class ZipFile that allows the returned
+ file-like handle to accept byte strings via the `write` method.
+
+ BytesIO provides the attributes of a file-like object, and ZipFile.writestr
+ writes byte strings into a member of the archive.
+ """
+ # GH 17778
+ def __init__(self, file, mode, compression=zipfile.ZIP_DEFLATED, **kwargs):
+ if mode in ['wb', 'rb']:
+ mode = mode.replace('b', '')
+ super(BytesZipFile, self).__init__(file, mode, compression, **kwargs)
+
+ def write(self, data):
+ super(BytesZipFile, self).writestr(self.filename, data)
+
+ @property
+ def closed(self):
+ return self.fp is None
+
+
+class MMapWrapper(BaseIterator):
+ """
+ Wrapper for Python's mmap class so that it can be properly read in
+ by Python's csv.reader class.
+
+ Parameters
+ ----------
+ f : file object
+ File object to be mapped onto memory. Must support the 'fileno'
+ method or have an equivalent attribute
+
+ """
+
+ def __init__(self, f):
+ self.mmap = mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ)
+
+ def __getattr__(self, name):
+ return getattr(self.mmap, name)
+
+ def __iter__(self):
+ return self
+
+ def __next__(self):
+ newline = self.mmap.readline()
+
+ # readline returns bytes, not str, in Python 3,
+ # but Python's CSV reader expects str, so convert
+ # the output to str before continuing
+ if compat.PY3:
+ newline = compat.bytes_to_str(newline)
+
+ # mmap doesn't raise if reading past the allocated
+ # data but instead returns an empty string, so raise
+ # if that is returned
+ if newline == '':
+ raise StopIteration
+ return newline
+
+
+if not compat.PY3:
+ MMapWrapper.next = lambda self: self.__next__()
+
+
+class UTF8Recoder(BaseIterator):
+
+ """
+ Iterator that reads an encoded stream and reencodes the input to UTF-8
+ """
+
+ def __init__(self, f, encoding):
+ self.reader = codecs.getreader(encoding)(f)
+
+ def read(self, bytes=-1):
+ return self.reader.read(bytes).encode("utf-8")
+
+ def readline(self):
+ return self.reader.readline().encode("utf-8")
+
+ def next(self):
+ return next(self.reader).encode("utf-8")
+
+
+if compat.PY3: # pragma: no cover
+ def UnicodeReader(f, dialect=csv.excel, encoding="utf-8", **kwds):
+ # ignore encoding
+ return csv.reader(f, dialect=dialect, **kwds)
+
+ def UnicodeWriter(f, dialect=csv.excel, encoding="utf-8", **kwds):
+ return csv.writer(f, dialect=dialect, **kwds)
+else:
+ class UnicodeReader(BaseIterator):
+
+ """
+ A CSV reader which will iterate over lines in the CSV file "f",
+ which is encoded in the given encoding.
+
+ On Python 3, this is replaced (below) by csv.reader, which handles
+ unicode.
+ """
+
+ def __init__(self, f, dialect=csv.excel, encoding="utf-8", **kwds):
+ f = UTF8Recoder(f, encoding)
+ self.reader = csv.reader(f, dialect=dialect, **kwds)
+
+ def __next__(self):
+ row = next(self.reader)
+ return [compat.text_type(s, "utf-8") for s in row]
+
+ class UnicodeWriter(object):
+
+ """
+ A CSV writer which will write rows to CSV file "f",
+ which is encoded in the given encoding.
+ """
+
+ def __init__(self, f, dialect=csv.excel, encoding="utf-8", **kwds):
+ # Redirect output to a queue
+ self.queue = StringIO()
+ self.writer = csv.writer(self.queue, dialect=dialect, **kwds)
+ self.stream = f
+ self.encoder = codecs.getincrementalencoder(encoding)()
+ self.quoting = kwds.get("quoting", None)
+
+ def writerow(self, row):
+ def _check_as_is(x):
+ return (self.quoting == csv.QUOTE_NONNUMERIC and
+ is_number(x)) or isinstance(x, str)
+
+ row = [x if _check_as_is(x)
+ else pprint_thing(x).encode("utf-8") for x in row]
+
+ self.writer.writerow([s for s in row])
+ # Fetch UTF-8 output from the queue ...
+ data = self.queue.getvalue()
+ data = data.decode("utf-8")
+ # ... and re-encode it into the target encoding
+ data = self.encoder.encode(data)
+ # write to the target stream
+ self.stream.write(data)
+ # empty queue
+ self.queue.truncate(0)
+
+ def writerows(self, rows):
+ def _check_as_is(x):
+ return (self.quoting == csv.QUOTE_NONNUMERIC and
+ is_number(x)) or isinstance(x, str)
+
+ for i, row in enumerate(rows):
+ rows[i] = [x if _check_as_is(x)
+ else pprint_thing(x).encode("utf-8") for x in row]
+
+ self.writer.writerows([[s for s in row] for row in rows])
+ # Fetch UTF-8 output from the queue ...
+ data = self.queue.getvalue()
+ data = data.decode("utf-8")
+ # ... and re-encode it into the target encoding
+ data = self.encoder.encode(data)
+ # write to the target stream
+ self.stream.write(data)
+ # empty queue
+ self.queue.truncate(0)
diff --git a/contrib/python/pandas/py2/pandas/io/date_converters.py b/contrib/python/pandas/py2/pandas/io/date_converters.py
new file mode 100644
index 00000000000..1a22ee7240d
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/io/date_converters.py
@@ -0,0 +1,64 @@
+"""This module is designed for community supported date conversion functions"""
+import numpy as np
+
+from pandas._libs.tslibs import parsing
+from pandas.compat import map, range
+
+
+def parse_date_time(date_col, time_col):
+ date_col = _maybe_cast(date_col)
+ time_col = _maybe_cast(time_col)
+ return parsing.try_parse_date_and_time(date_col, time_col)
+
+
+def parse_date_fields(year_col, month_col, day_col):
+ year_col = _maybe_cast(year_col)
+ month_col = _maybe_cast(month_col)
+ day_col = _maybe_cast(day_col)
+ return parsing.try_parse_year_month_day(year_col, month_col, day_col)
+
+
+def parse_all_fields(year_col, month_col, day_col, hour_col, minute_col,
+ second_col):
+ year_col = _maybe_cast(year_col)
+ month_col = _maybe_cast(month_col)
+ day_col = _maybe_cast(day_col)
+ hour_col = _maybe_cast(hour_col)
+ minute_col = _maybe_cast(minute_col)
+ second_col = _maybe_cast(second_col)
+ return parsing.try_parse_datetime_components(year_col, month_col, day_col,
+ hour_col, minute_col,
+ second_col)
+
+
+def generic_parser(parse_func, *cols):
+ N = _check_columns(cols)
+ results = np.empty(N, dtype=object)
+
+ for i in range(N):
+ args = [c[i] for c in cols]
+ results[i] = parse_func(*args)
+
+ return results
+
+
+def _maybe_cast(arr):
+ if not arr.dtype.type == np.object_:
+ arr = np.array(arr, dtype=object)
+ return arr
+
+
+def _check_columns(cols):
+ if not len(cols):
+ raise AssertionError("There must be at least 1 column")
+
+ head, tail = cols[0], cols[1:]
+
+ N = len(head)
+
+ for i, n in enumerate(map(len, tail)):
+ if n != N:
+ raise AssertionError('All columns must have the same length: {0}; '
+ 'column {1} has length {2}'.format(N, i, n))
+
+ return N
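+
+
+# An illustrative sketch of ``generic_parser`` with a user-supplied function;
+# the columns below are assumptions for illustration only:
+#
+# >>> import numpy as np
+# >>> dates = np.array(['2019-01-01', '2019-01-02'], dtype=object)
+# >>> times = np.array(['10:00', '11:30'], dtype=object)
+# >>> generic_parser(lambda d, t: d + ' ' + t, dates, times)
+# array(['2019-01-01 10:00', '2019-01-02 11:30'], dtype=object)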
diff --git a/contrib/python/pandas/py2/pandas/io/excel.py b/contrib/python/pandas/py2/pandas/io/excel.py
new file mode 100644
index 00000000000..3a7c39ec653
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/io/excel.py
@@ -0,0 +1,1996 @@
+"""
+Module to parse to/from Excel
+"""
+
+# ---------------------------------------------------------------------
+# ExcelFile class
+import abc
+from datetime import date, datetime, time, timedelta
+from distutils.version import LooseVersion
+from io import UnsupportedOperation
+import os
+from textwrap import fill
+import warnings
+
+import numpy as np
+
+import pandas._libs.json as json
+import pandas.compat as compat
+from pandas.compat import (
+ OrderedDict, add_metaclass, lrange, map, range, string_types, u, zip)
+from pandas.errors import EmptyDataError
+from pandas.util._decorators import Appender, deprecate_kwarg
+
+from pandas.core.dtypes.common import (
+ is_bool, is_float, is_integer, is_list_like)
+
+from pandas.core import config
+from pandas.core.frame import DataFrame
+
+from pandas.io.common import (
+ _NA_VALUES, _is_url, _stringify_path, _urlopen, _validate_header_arg,
+ get_filepath_or_buffer)
+from pandas.io.formats.printing import pprint_thing
+from pandas.io.parsers import TextParser
+
+__all__ = ["read_excel", "ExcelWriter", "ExcelFile"]
+
+_writer_extensions = ["xlsx", "xls", "xlsm"]
+_writers = {}
+
+_read_excel_doc = """
+Read an Excel file into a pandas DataFrame.
+
+Support both `xls` and `xlsx` file extensions from a local filesystem or URL.
+Support an option to read a single sheet or a list of sheets.
+
+Parameters
+----------
+io : str, file descriptor, pathlib.Path, ExcelFile or xlrd.Book
+ The string could be a URL. Valid URL schemes include http, ftp, s3,
+ gcs, and file. For file URLs, a host is expected. For instance, a local
+ file could be /path/to/workbook.xlsx.
+sheet_name : str, int, list, or None, default 0
+ Strings are used for sheet names. Integers are used in zero-indexed
+ sheet positions. Lists of strings/integers are used to request
+ multiple sheets. Specify None to get all sheets.
+
+ Available cases:
+
+ * Defaults to ``0``: 1st sheet as a `DataFrame`
+ * ``1``: 2nd sheet as a `DataFrame`
+ * ``"Sheet1"``: Load sheet with name "Sheet1"
+ * ``[0, 1, "Sheet5"]``: Load first, second and sheet named "Sheet5"
+ as a dict of `DataFrame`
+ * None: All sheets.
+
+header : int, list of int, default 0
+ Row (0-indexed) to use for the column labels of the parsed
+ DataFrame. If a list of integers is passed those row positions will
+ be combined into a ``MultiIndex``. Use None if there is no header.
+names : array-like, default None
+ List of column names to use. If file contains no header row,
+ then you should explicitly pass header=None.
+index_col : int, list of int, default None
+ Column (0-indexed) to use as the row labels of the DataFrame.
+ Pass None if there is no such column. If a list is passed,
+ those columns will be combined into a ``MultiIndex``. If a
+ subset of data is selected with ``usecols``, index_col
+ is based on the subset.
+parse_cols : int or list, default None
+ Alias of `usecols`.
+
+ .. deprecated:: 0.21.0
+ Use `usecols` instead.
+
+usecols : int, str, list-like, or callable default None
+ Return a subset of the columns.
+ * If None, then parse all columns.
+ * If int, then indicates last column to be parsed.
+
+ .. deprecated:: 0.24.0
+ Pass in a list of int from 0 to `usecols` inclusive instead.
+
+ * If str, then indicates comma separated list of Excel column letters
+ and column ranges (e.g. "A:E" or "A,C,E:F"). Ranges are inclusive of
+ both sides.
+ * If list of int, then indicates list of column numbers to be parsed.
+ * If list of string, then indicates list of column names to be parsed.
+
+ .. versionadded:: 0.24.0
+
+ * If callable, then evaluate each column name against it and parse the
+ column if the callable returns ``True``.
+
+ .. versionadded:: 0.24.0
+
+squeeze : bool, default False
+ If the parsed data only contains one column then return a Series.
+dtype : Type name or dict of column -> type, default None
+ Data type for data or columns. E.g. {'a': np.float64, 'b': np.int32}
+ Use `object` to preserve data as stored in Excel and not interpret dtype.
+ If converters are specified, they will be applied INSTEAD
+ of dtype conversion.
+
+ .. versionadded:: 0.20.0
+
+engine : str, default None
+ If io is not a buffer or path, this must be set to identify io.
+ Acceptable values are None or xlrd.
+converters : dict, default None
+ Dict of functions for converting values in certain columns. Keys can
+ either be integers or column labels, values are functions that take one
+ input argument, the Excel cell content, and return the transformed
+ content.
+true_values : list, default None
+ Values to consider as True.
+
+ .. versionadded:: 0.19.0
+
+false_values : list, default None
+ Values to consider as False.
+
+ .. versionadded:: 0.19.0
+
+skiprows : list-like
+ Rows to skip at the beginning (0-indexed).
+nrows : int, default None
+ Number of rows to parse.
+
+ .. versionadded:: 0.23.0
+
+na_values : scalar, str, list-like, or dict, default None
+ Additional strings to recognize as NA/NaN. If dict passed, specific
+ per-column NA values. By default the following values are interpreted
+ as NaN: '""" + fill("', '".join(sorted(_NA_VALUES)), 70, subsequent_indent=" ") + """'.
+keep_default_na : bool, default True
+ If na_values are specified and keep_default_na is False the default NaN
+ values are overridden, otherwise they're appended to.
+verbose : bool, default False
+ Indicate number of NA values placed in non-numeric columns.
+parse_dates : bool, list-like, or dict, default False
+ The behavior is as follows:
+
+ * bool. If True -> try parsing the index.
+ * list of int or names. e.g. If [1, 2, 3] -> try parsing columns 1, 2, 3
+ each as a separate date column.
+ * list of lists. e.g. If [[1, 3]] -> combine columns 1 and 3 and parse as
+ a single date column.
+ * dict, e.g. {{'foo' : [1, 3]}} -> parse columns 1, 3 as date and call
+ result 'foo'
+
+ If a column or index contains an unparseable date, the entire column or
+ index will be returned unaltered as an object data type. For non-standard
+ datetime parsing, use ``pd.to_datetime`` after ``pd.read_csv``
+
+ Note: A fast-path exists for iso8601-formatted dates.
+date_parser : function, optional
+ Function to use for converting a sequence of string columns to an array of
+ datetime instances. The default uses ``dateutil.parser.parser`` to do the
+ conversion. Pandas will try to call `date_parser` in three different ways,
+ advancing to the next if an exception occurs: 1) Pass one or more arrays
+ (as defined by `parse_dates`) as arguments; 2) concatenate (row-wise) the
+ string values from the columns defined by `parse_dates` into a single array
+ and pass that; and 3) call `date_parser` once for each row using one or
+ more strings (corresponding to the columns defined by `parse_dates`) as
+ arguments.
+thousands : str, default None
+ Thousands separator for parsing string columns to numeric. Note that
+ this parameter is only necessary for columns stored as TEXT in Excel,
+ any numeric columns will automatically be parsed, regardless of display
+ format.
+comment : str, default None
+ Comments out remainder of line. Pass a character or characters to this
+ argument to indicate comments in the input file. Any data between the
+ comment string and the end of the current line is ignored.
+skip_footer : int, default 0
+ Alias of `skipfooter`.
+
+ .. deprecated:: 0.23.0
+ Use `skipfooter` instead.
+skipfooter : int, default 0
+ Rows at the end to skip (0-indexed).
+convert_float : bool, default True
+ Convert integral floats to int (i.e., 1.0 --> 1). If False, all numeric
+ data will be read in as floats: Excel stores all numbers as floats
+ internally.
+mangle_dupe_cols : bool, default True
+ Duplicate columns will be specified as 'X', 'X.1', ...'X.N', rather than
+ 'X'...'X'. Passing in False will cause data to be overwritten if there
+ are duplicate names in the columns.
+**kwds : optional
+ Optional keyword arguments can be passed to ``TextFileReader``.
+
+Returns
+-------
+DataFrame or dict of DataFrames
+ DataFrame from the passed in Excel file. See notes in sheet_name
+ argument for more information on when a dict of DataFrames is returned.
+
+See Also
+--------
+to_excel : Write DataFrame to an Excel file.
+to_csv : Write DataFrame to a comma-separated values (csv) file.
+read_csv : Read a comma-separated values (csv) file into DataFrame.
+read_fwf : Read a table of fixed-width formatted lines into DataFrame.
+
+Examples
+--------
+The file can be read using the file name as string or an open file object:
+
+>>> pd.read_excel('tmp.xlsx', index_col=0) # doctest: +SKIP
+ Name Value
+0 string1 1
+1 string2 2
+2 #Comment 3
+
+>>> pd.read_excel(open('tmp.xlsx', 'rb'),
+... sheet_name='Sheet3') # doctest: +SKIP
+ Unnamed: 0 Name Value
+0 0 string1 1
+1 1 string2 2
+2 2 #Comment 3
+
+Index and header can be specified via the `index_col` and `header` arguments
+
+>>> pd.read_excel('tmp.xlsx', index_col=None, header=None) # doctest: +SKIP
+ 0 1 2
+0 NaN Name Value
+1 0.0 string1 1
+2 1.0 string2 2
+3 2.0 #Comment 3
+
+Column types are inferred but can be explicitly specified
+
+>>> pd.read_excel('tmp.xlsx', index_col=0,
+... dtype={'Name': str, 'Value': float}) # doctest: +SKIP
+ Name Value
+0 string1 1.0
+1 string2 2.0
+2 #Comment 3.0
+
+True, False, and NA values, and thousands separators have defaults,
+but can be explicitly specified, too. Supply the values you would like
+as strings or lists of strings!
+
+>>> pd.read_excel('tmp.xlsx', index_col=0,
+... na_values=['string1', 'string2']) # doctest: +SKIP
+ Name Value
+0 NaN 1
+1 NaN 2
+2 #Comment 3
+
+Comment lines in the excel input file can be skipped using the `comment` kwarg
+
+>>> pd.read_excel('tmp.xlsx', index_col=0, comment='#') # doctest: +SKIP
+ Name Value
+0 string1 1.0
+1 string2 2.0
+2 None NaN
+"""
+
+
+def register_writer(klass):
+ """Adds engine to the excel writer registry. You must use this method to
+ integrate with ``to_excel``. Also adds config options for any new
+ ``supported_extensions`` defined on the writer."""
+ if not compat.callable(klass):
+ raise ValueError("Can only register callables as engines")
+ engine_name = klass.engine
+ _writers[engine_name] = klass
+ for ext in klass.supported_extensions:
+ if ext.startswith('.'):
+ ext = ext[1:]
+ if ext not in _writer_extensions:
+ config.register_option("io.excel.{ext}.writer".format(ext=ext),
+ engine_name, validator=str)
+ _writer_extensions.append(ext)
+
+
+def _get_default_writer(ext):
+ _default_writers = {'xlsx': 'openpyxl', 'xlsm': 'openpyxl', 'xls': 'xlwt'}
+ try:
+ import xlsxwriter # noqa
+ _default_writers['xlsx'] = 'xlsxwriter'
+ except ImportError:
+ pass
+ return _default_writers[ext]
+
+
+def get_writer(engine_name):
+ try:
+ return _writers[engine_name]
+ except KeyError:
+ raise ValueError("No Excel writer '{engine}'"
+ .format(engine=engine_name))
+
+
+@Appender(_read_excel_doc)
+@deprecate_kwarg("parse_cols", "usecols")
+@deprecate_kwarg("skip_footer", "skipfooter")
+def read_excel(io,
+ sheet_name=0,
+ header=0,
+ names=None,
+ index_col=None,
+ parse_cols=None,
+ usecols=None,
+ squeeze=False,
+ dtype=None,
+ engine=None,
+ converters=None,
+ true_values=None,
+ false_values=None,
+ skiprows=None,
+ nrows=None,
+ na_values=None,
+ keep_default_na=True,
+ verbose=False,
+ parse_dates=False,
+ date_parser=None,
+ thousands=None,
+ comment=None,
+ skip_footer=0,
+ skipfooter=0,
+ convert_float=True,
+ mangle_dupe_cols=True,
+ **kwds):
+
+ # Can't use _deprecate_kwarg since sheetname=None has a special meaning
+ if is_integer(sheet_name) and sheet_name == 0 and 'sheetname' in kwds:
+ warnings.warn("The `sheetname` keyword is deprecated, use "
+ "`sheet_name` instead", FutureWarning, stacklevel=2)
+ sheet_name = kwds.pop("sheetname")
+
+ if 'sheet' in kwds:
+ raise TypeError("read_excel() got an unexpected keyword argument "
+ "`sheet`")
+
+ if not isinstance(io, ExcelFile):
+ io = ExcelFile(io, engine=engine)
+
+ return io.parse(
+ sheet_name=sheet_name,
+ header=header,
+ names=names,
+ index_col=index_col,
+ usecols=usecols,
+ squeeze=squeeze,
+ dtype=dtype,
+ converters=converters,
+ true_values=true_values,
+ false_values=false_values,
+ skiprows=skiprows,
+ nrows=nrows,
+ na_values=na_values,
+ keep_default_na=keep_default_na,
+ verbose=verbose,
+ parse_dates=parse_dates,
+ date_parser=date_parser,
+ thousands=thousands,
+ comment=comment,
+ skipfooter=skipfooter,
+ convert_float=convert_float,
+ mangle_dupe_cols=mangle_dupe_cols,
+ **kwds)
+
+
+class _XlrdReader(object):
+
+ def __init__(self, filepath_or_buffer):
+ """Reader using xlrd engine.
+
+ Parameters
+ ----------
+ filepath_or_buffer : string, path object or Workbook
+ Object to be parsed.
+ """
+ err_msg = "Install xlrd >= 1.0.0 for Excel support"
+
+ try:
+ import xlrd
+ except ImportError:
+ raise ImportError(err_msg)
+ else:
+ if xlrd.__VERSION__ < LooseVersion("1.0.0"):
+ raise ImportError(err_msg +
+ ". Current version " + xlrd.__VERSION__)
+
+ # If filepath_or_buffer is a url, want to keep the data as bytes so
+ # can't pass to get_filepath_or_buffer()
+ if _is_url(filepath_or_buffer):
+ filepath_or_buffer = _urlopen(filepath_or_buffer)
+ elif not isinstance(filepath_or_buffer, (ExcelFile, xlrd.Book)):
+ filepath_or_buffer, _, _, _ = get_filepath_or_buffer(
+ filepath_or_buffer)
+
+ if isinstance(filepath_or_buffer, xlrd.Book):
+ self.book = filepath_or_buffer
+ elif not isinstance(filepath_or_buffer, xlrd.Book) and hasattr(
+ filepath_or_buffer, "read"):
+ # N.B. xlrd.Book has a read attribute too
+ if hasattr(filepath_or_buffer, 'seek'):
+ try:
+ # GH 19779
+ filepath_or_buffer.seek(0)
+ except UnsupportedOperation:
+ # HTTPResponse does not support seek()
+ # GH 20434
+ pass
+
+ data = filepath_or_buffer.read()
+ self.book = xlrd.open_workbook(file_contents=data)
+ elif isinstance(filepath_or_buffer, compat.string_types):
+ self.book = xlrd.open_workbook(filepath_or_buffer)
+ else:
+ raise ValueError('Must explicitly set engine if not passing in'
+ ' buffer or path for io.')
+
+ @property
+ def sheet_names(self):
+ return self.book.sheet_names()
+
+ def parse(self,
+ sheet_name=0,
+ header=0,
+ names=None,
+ index_col=None,
+ usecols=None,
+ squeeze=False,
+ dtype=None,
+ true_values=None,
+ false_values=None,
+ skiprows=None,
+ nrows=None,
+ na_values=None,
+ verbose=False,
+ parse_dates=False,
+ date_parser=None,
+ thousands=None,
+ comment=None,
+ skipfooter=0,
+ convert_float=True,
+ mangle_dupe_cols=True,
+ **kwds):
+
+ _validate_header_arg(header)
+
+ from xlrd import (xldate, XL_CELL_DATE,
+ XL_CELL_ERROR, XL_CELL_BOOLEAN,
+ XL_CELL_NUMBER)
+
+ epoch1904 = self.book.datemode
+
+ def _parse_cell(cell_contents, cell_typ):
+ """converts the contents of the cell into a pandas
+ appropriate object"""
+
+ if cell_typ == XL_CELL_DATE:
+
+ # Use the newer xlrd datetime handling.
+ try:
+ cell_contents = xldate.xldate_as_datetime(
+ cell_contents, epoch1904)
+ except OverflowError:
+ return cell_contents
+
+ # Excel doesn't distinguish between dates and time,
+ # so we treat dates on the epoch as times only.
+ # Also, Excel supports 1900 and 1904 epochs.
+ year = (cell_contents.timetuple())[0:3]
+ if ((not epoch1904 and year == (1899, 12, 31)) or
+ (epoch1904 and year == (1904, 1, 1))):
+ cell_contents = time(cell_contents.hour,
+ cell_contents.minute,
+ cell_contents.second,
+ cell_contents.microsecond)
+
+ elif cell_typ == XL_CELL_ERROR:
+ cell_contents = np.nan
+ elif cell_typ == XL_CELL_BOOLEAN:
+ cell_contents = bool(cell_contents)
+ elif convert_float and cell_typ == XL_CELL_NUMBER:
+ # GH5394 - Excel 'numbers' are always floats
+ # it's a minimal perf hit and less surprising
+ val = int(cell_contents)
+ if val == cell_contents:
+ cell_contents = val
+ return cell_contents
+
+ ret_dict = False
+
+ # Keep sheetname to maintain backwards compatibility.
+ if isinstance(sheet_name, list):
+ sheets = sheet_name
+ ret_dict = True
+ elif sheet_name is None:
+ sheets = self.book.sheet_names()
+ ret_dict = True
+ else:
+ sheets = [sheet_name]
+
+ # handle same-type duplicates.
+ sheets = list(OrderedDict.fromkeys(sheets).keys())
+
+ output = OrderedDict()
+
+ for asheetname in sheets:
+ if verbose:
+ print("Reading sheet {sheet}".format(sheet=asheetname))
+
+ if isinstance(asheetname, compat.string_types):
+ sheet = self.book.sheet_by_name(asheetname)
+ else: # assume an integer if not a string
+ sheet = self.book.sheet_by_index(asheetname)
+
+ data = []
+ usecols = _maybe_convert_usecols(usecols)
+
+ for i in range(sheet.nrows):
+ row = [_parse_cell(value, typ)
+ for value, typ in zip(sheet.row_values(i),
+ sheet.row_types(i))]
+ data.append(row)
+
+ if sheet.nrows == 0:
+ output[asheetname] = DataFrame()
+ continue
+
+ if is_list_like(header) and len(header) == 1:
+ header = header[0]
+
+ # forward fill and pull out names for MultiIndex column
+ header_names = None
+ if header is not None and is_list_like(header):
+ header_names = []
+ control_row = [True] * len(data[0])
+
+ for row in header:
+ if is_integer(skiprows):
+ row += skiprows
+
+ data[row], control_row = _fill_mi_header(data[row],
+ control_row)
+
+ if index_col is not None:
+ header_name, _ = _pop_header_name(data[row], index_col)
+ header_names.append(header_name)
+
+ if is_list_like(index_col):
+ # Forward fill values for MultiIndex index.
+ if not is_list_like(header):
+ offset = 1 + header
+ else:
+ offset = 1 + max(header)
+
+ # Check if we have an empty dataset
+ # before trying to collect data.
+ if offset < len(data):
+ for col in index_col:
+ last = data[offset][col]
+
+ for row in range(offset + 1, len(data)):
+ if data[row][col] == '' or data[row][col] is None:
+ data[row][col] = last
+ else:
+ last = data[row][col]
+
+ has_index_names = is_list_like(header) and len(header) > 1
+
+ # GH 12292 : error when read one empty column from excel file
+ try:
+ parser = TextParser(data,
+ names=names,
+ header=header,
+ index_col=index_col,
+ has_index_names=has_index_names,
+ squeeze=squeeze,
+ dtype=dtype,
+ true_values=true_values,
+ false_values=false_values,
+ skiprows=skiprows,
+ nrows=nrows,
+ na_values=na_values,
+ parse_dates=parse_dates,
+ date_parser=date_parser,
+ thousands=thousands,
+ comment=comment,
+ skipfooter=skipfooter,
+ usecols=usecols,
+ mangle_dupe_cols=mangle_dupe_cols,
+ **kwds)
+
+ output[asheetname] = parser.read(nrows=nrows)
+
+ if not squeeze or isinstance(output[asheetname], DataFrame):
+ if header_names:
+ output[asheetname].columns = output[
+ asheetname].columns.set_names(header_names)
+ elif compat.PY2:
+ output[asheetname].columns = _maybe_convert_to_string(
+ output[asheetname].columns)
+
+ except EmptyDataError:
+ # No Data, return an empty DataFrame
+ output[asheetname] = DataFrame()
+
+ if ret_dict:
+ return output
+ else:
+ return output[asheetname]
+
+
+class ExcelFile(object):
+ """
+ Class for parsing tabular excel sheets into DataFrame objects.
+ Uses xlrd. See read_excel for more documentation
+
+ Parameters
+ ----------
+ io : string, path object (pathlib.Path or py._path.local.LocalPath),
+ file-like object or xlrd workbook
+ If a string or path object, expected to be a path to xls or xlsx file.
+ engine : string, default None
+ If io is not a buffer or path, this must be set to identify io.
+ Acceptable values are None or ``xlrd``.
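+
+    Examples
+    --------
+    A minimal illustration (``'tmp.xlsx'`` and the sheet name here are
+    hypothetical):
+
+    >>> with pd.ExcelFile('tmp.xlsx') as xls:  # doctest: +SKIP
+    ...     df = xls.parse('Sheet1')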
+ """
+
+ _engines = {
+ 'xlrd': _XlrdReader,
+ }
+
+ def __init__(self, io, engine=None):
+ if engine is None:
+ engine = 'xlrd'
+ if engine not in self._engines:
+ raise ValueError("Unknown engine: {engine}".format(engine=engine))
+
+ # could be a str, ExcelFile, Book, etc.
+ self.io = io
+ # Always a string
+ self._io = _stringify_path(io)
+
+ self._reader = self._engines[engine](self._io)
+
+ def __fspath__(self):
+ return self._io
+
+ def parse(self,
+ sheet_name=0,
+ header=0,
+ names=None,
+ index_col=None,
+ usecols=None,
+ squeeze=False,
+ converters=None,
+ true_values=None,
+ false_values=None,
+ skiprows=None,
+ nrows=None,
+ na_values=None,
+ parse_dates=False,
+ date_parser=None,
+ thousands=None,
+ comment=None,
+ skipfooter=0,
+ convert_float=True,
+ mangle_dupe_cols=True,
+ **kwds):
+ """
+ Parse specified sheet(s) into a DataFrame
+
+        Equivalent to read_excel(ExcelFile, ...). See the read_excel
+        docstring for more info on accepted parameters.
+ """
+
+ # Can't use _deprecate_kwarg since sheetname=None has a special meaning
+ if is_integer(sheet_name) and sheet_name == 0 and 'sheetname' in kwds:
+ warnings.warn("The `sheetname` keyword is deprecated, use "
+ "`sheet_name` instead", FutureWarning, stacklevel=2)
+ sheet_name = kwds.pop("sheetname")
+ elif 'sheetname' in kwds:
+ raise TypeError("Cannot specify both `sheet_name` "
+ "and `sheetname`. Use just `sheet_name`")
+
+ if 'chunksize' in kwds:
+ raise NotImplementedError("chunksize keyword of read_excel "
+ "is not implemented")
+
+ return self._reader.parse(sheet_name=sheet_name,
+ header=header,
+ names=names,
+ index_col=index_col,
+ usecols=usecols,
+ squeeze=squeeze,
+ converters=converters,
+ true_values=true_values,
+ false_values=false_values,
+ skiprows=skiprows,
+ nrows=nrows,
+ na_values=na_values,
+ parse_dates=parse_dates,
+ date_parser=date_parser,
+ thousands=thousands,
+ comment=comment,
+ skipfooter=skipfooter,
+ convert_float=convert_float,
+ mangle_dupe_cols=mangle_dupe_cols,
+ **kwds)
+
+ @property
+ def book(self):
+ return self._reader.book
+
+ @property
+ def sheet_names(self):
+ return self._reader.sheet_names
+
+ def close(self):
+ """close io if necessary"""
+ if hasattr(self.io, 'close'):
+ self.io.close()
+
+ def __enter__(self):
+ return self
+
+ def __exit__(self, exc_type, exc_value, traceback):
+ self.close()
+
+
+def _excel2num(x):
+ """
+ Convert Excel column name like 'AB' to 0-based column index.
+
+ Parameters
+ ----------
+ x : str
+ The Excel column name to convert to a 0-based column index.
+
+ Returns
+ -------
+ num : int
+ The column index corresponding to the name.
+
+ Raises
+ ------
+ ValueError
+ Part of the Excel column name was invalid.
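+
+    Examples
+    --------
+    >>> _excel2num('A')
+    0
+    >>> _excel2num('AB')
+    27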
+ """
+ index = 0
+
+ for c in x.upper().strip():
+ cp = ord(c)
+
+ if cp < ord("A") or cp > ord("Z"):
+ raise ValueError("Invalid column name: {x}".format(x=x))
+
+ index = index * 26 + cp - ord("A") + 1
+
+ return index - 1
+
+
+def _range2cols(areas):
+ """
+ Convert comma separated list of column names and ranges to indices.
+
+ Parameters
+ ----------
+ areas : str
+ A string containing a sequence of column ranges (or areas).
+
+ Returns
+ -------
+ cols : list
+ A list of 0-based column indices.
+
+ Examples
+ --------
+ >>> _range2cols('A:E')
+ [0, 1, 2, 3, 4]
+ >>> _range2cols('A,C,Z:AB')
+ [0, 2, 25, 26, 27]
+ """
+ cols = []
+
+ for rng in areas.split(","):
+ if ":" in rng:
+ rng = rng.split(":")
+ cols.extend(lrange(_excel2num(rng[0]), _excel2num(rng[1]) + 1))
+ else:
+ cols.append(_excel2num(rng))
+
+ return cols
+
+
+def _maybe_convert_usecols(usecols):
+ """
+ Convert `usecols` into a compatible format for parsing in `parsers.py`.
+
+ Parameters
+ ----------
+ usecols : object
+ The use-columns object to potentially convert.
+
+ Returns
+ -------
+ converted : object
+ The compatible format of `usecols`.
+ """
+ if usecols is None:
+ return usecols
+
+ if is_integer(usecols):
+ warnings.warn(("Passing in an integer for `usecols` has been "
+ "deprecated. Please pass in a list of int from "
+ "0 to `usecols` inclusive instead."),
+ FutureWarning, stacklevel=2)
+ return lrange(usecols + 1)
+
+ if isinstance(usecols, compat.string_types):
+ return _range2cols(usecols)
+
+ return usecols
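+
+# Illustrative mapping of the accepted ``usecols`` forms (a sketch, not a
+# doctest):
+#
+#     _maybe_convert_usecols(None)        -> None
+#     _maybe_convert_usecols('A,C,Z:AB')  -> [0, 2, 25, 26, 27]
+#     _maybe_convert_usecols([0, 2, 3])   -> [0, 2, 3]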
+
+
+def _validate_freeze_panes(freeze_panes):
+ if freeze_panes is not None:
+ if (
+ len(freeze_panes) == 2 and
+ all(isinstance(item, int) for item in freeze_panes)
+ ):
+ return True
+
+ raise ValueError("freeze_panes must be of form (row, column)"
+ " where row and column are integers")
+
+ # freeze_panes wasn't specified, return False so it won't be applied
+ # to output sheet
+ return False
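+
+# For example (sketch): freeze_panes=(1, 0) freezes the first row of the
+# output sheet, while freeze_panes=None leaves the sheet untouched.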
+
+
+def _trim_excel_header(row):
+ # trim header row so auto-index inference works
+ # xlrd uses '' , openpyxl None
+ while len(row) > 0 and (row[0] == '' or row[0] is None):
+ row = row[1:]
+ return row
+
+
+def _maybe_convert_to_string(row):
+ """
+ Convert elements in a row to string from Unicode.
+
+ This is purely a Python 2.x patch and is performed ONLY when all
+ elements of the row are string-like.
+
+ Parameters
+ ----------
+ row : array-like
+ The row of data to convert.
+
+ Returns
+ -------
+ converted : array-like
+ """
+ if compat.PY2:
+ converted = []
+
+ for i in range(len(row)):
+ if isinstance(row[i], compat.string_types):
+ try:
+ converted.append(str(row[i]))
+ except UnicodeEncodeError:
+ break
+ else:
+ break
+ else:
+ row = converted
+
+ return row
+
+
+def _fill_mi_header(row, control_row):
+ """Forward fills blank entries in row, but only inside the same parent index
+
+    Used for creating headers in a MultiIndex.
+
+    Parameters
+ ----------
+ row : list
+ List of items in a single row.
+ control_row : list of bool
+ Helps to determine if particular column is in same parent index as the
+ previous value. Used to stop propagation of empty cells between
+ different indexes.
+
+ Returns
+    -------
+    Returns changed row and control_row.
+ """
+ last = row[0]
+ for i in range(1, len(row)):
+ if not control_row[i]:
+ last = row[i]
+
+ if row[i] == '' or row[i] is None:
+ row[i] = last
+ else:
+ control_row[i] = False
+ last = row[i]
+
+ return _maybe_convert_to_string(row), control_row
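+
+# For illustration (sketch): with control_row all True, the row
+# ['a', '', '', 'b', ''] is filled to ['a', 'a', 'a', 'b', 'b']; a False
+# entry in control_row stops blanks from being filled across parent index
+# boundaries.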
+
+# fill blank if index_col not None
+
+
+def _pop_header_name(row, index_col):
+ """
+ Pop the header name for MultiIndex parsing.
+
+ Parameters
+ ----------
+ row : list
+ The data row to parse for the header name.
+ index_col : int, list
+ The index columns for our data. Assumed to be non-null.
+
+ Returns
+ -------
+ header_name : str
+ The extracted header name.
+ trimmed_row : list
+ The original data row with the header name removed.
+ """
+ # Pop out header name and fill w/blank.
+ i = index_col if not is_list_like(index_col) else max(index_col)
+
+ header_name = row[i]
+ header_name = None if header_name == "" else header_name
+
+ return header_name, row[:i] + [''] + row[i + 1:]
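+
+# Sketch: _pop_header_name(['idx', 'a', 'b'], index_col=0) returns
+# ('idx', ['', 'a', 'b']).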
+
+
+@add_metaclass(abc.ABCMeta)
+class ExcelWriter(object):
+ """
+    Class for writing DataFrame objects into excel sheets. By default xlwt is
+    used for xls and openpyxl for xlsx. See DataFrame.to_excel for typical
+    usage.
+
+ Parameters
+ ----------
+ path : string
+ Path to xls or xlsx file.
+ engine : string (optional)
+ Engine to use for writing. If None, defaults to
+ ``io.excel.<extension>.writer``. NOTE: can only be passed as a keyword
+ argument.
+ date_format : string, default None
+ Format string for dates written into Excel files (e.g. 'YYYY-MM-DD')
+ datetime_format : string, default None
+ Format string for datetime objects written into Excel files
+ (e.g. 'YYYY-MM-DD HH:MM:SS')
+ mode : {'w' or 'a'}, default 'w'
+ File mode to use (write or append).
+
+ .. versionadded:: 0.24.0
+
+ Attributes
+ ----------
+ None
+
+ Methods
+ -------
+ None
+
+ Notes
+ -----
+ None of the methods and properties are considered public.
+
+ For compatibility with CSV writers, ExcelWriter serializes lists
+ and dicts to strings before writing.
+
+ Examples
+ --------
+ Default usage:
+
+ >>> with ExcelWriter('path_to_file.xlsx') as writer:
+ ... df.to_excel(writer)
+
+ To write to separate sheets in a single file:
+
+ >>> with ExcelWriter('path_to_file.xlsx') as writer:
+ ... df1.to_excel(writer, sheet_name='Sheet1')
+ ... df2.to_excel(writer, sheet_name='Sheet2')
+
+ You can set the date format or datetime format:
+
+ >>> with ExcelWriter('path_to_file.xlsx',
+ date_format='YYYY-MM-DD',
+ datetime_format='YYYY-MM-DD HH:MM:SS') as writer:
+ ... df.to_excel(writer)
+
+ You can also append to an existing Excel file:
+
+ >>> with ExcelWriter('path_to_file.xlsx', mode='a') as writer:
+ ... df.to_excel(writer, sheet_name='Sheet3')
+ """
+ # Defining an ExcelWriter implementation (see abstract methods for more...)
+
+ # - Mandatory
+ # - ``write_cells(self, cells, sheet_name=None, startrow=0, startcol=0)``
+ # --> called to write additional DataFrames to disk
+ # - ``supported_extensions`` (tuple of supported extensions), used to
+ # check that engine supports the given extension.
+ # - ``engine`` - string that gives the engine name. Necessary to
+ # instantiate class directly and bypass ``ExcelWriterMeta`` engine
+ # lookup.
+ # - ``save(self)`` --> called to save file to disk
+ # - Mostly mandatory (i.e. should at least exist)
+ # - book, cur_sheet, path
+
+ # - Optional:
+ # - ``__init__(self, path, engine=None, **kwargs)`` --> always called
+ # with path as first argument.
+
+ # You also need to register the class with ``register_writer()``.
+ # Technically, ExcelWriter implementations don't need to subclass
+ # ExcelWriter.
+ def __new__(cls, path, engine=None, **kwargs):
+ # only switch class if generic(ExcelWriter)
+
+ if issubclass(cls, ExcelWriter):
+ if engine is None or (isinstance(engine, string_types) and
+ engine == 'auto'):
+ if isinstance(path, string_types):
+ ext = os.path.splitext(path)[-1][1:]
+ else:
+ ext = 'xlsx'
+
+ try:
+ engine = config.get_option('io.excel.{ext}.writer'
+ .format(ext=ext))
+ if engine == 'auto':
+ engine = _get_default_writer(ext)
+ except KeyError:
+ error = ValueError("No engine for filetype: '{ext}'"
+ .format(ext=ext))
+ raise error
+ cls = get_writer(engine)
+
+ return object.__new__(cls)
+
+ # declare external properties you can count on
+ book = None
+ curr_sheet = None
+ path = None
+
+ @abc.abstractproperty
+ def supported_extensions(self):
+ "extensions that writer engine supports"
+ pass
+
+ @abc.abstractproperty
+ def engine(self):
+ "name of engine"
+ pass
+
+ @abc.abstractmethod
+ def write_cells(self, cells, sheet_name=None, startrow=0, startcol=0,
+ freeze_panes=None):
+ """
+        Write given formatted cells into an Excel sheet
+
+ Parameters
+ ----------
+ cells : generator
+ cell of formatted data to save to Excel sheet
+ sheet_name : string, default None
+ Name of Excel sheet, if None, then use self.cur_sheet
+        startrow : int, default 0
+            Upper left cell row to dump data frame.
+        startcol : int, default 0
+            Upper left cell column to dump data frame.
+        freeze_panes : tuple of int, length 2, optional
+            Contains the bottom-most row and right-most column to freeze.
+ """
+ pass
+
+ @abc.abstractmethod
+ def save(self):
+ """
+ Save workbook to disk.
+ """
+ pass
+
+ def __init__(self, path, engine=None,
+ date_format=None, datetime_format=None, mode='w',
+ **engine_kwargs):
+ # validate that this engine can handle the extension
+ if isinstance(path, string_types):
+ ext = os.path.splitext(path)[-1]
+ else:
+ ext = 'xls' if engine == 'xlwt' else 'xlsx'
+
+ self.check_extension(ext)
+
+ self.path = path
+ self.sheets = {}
+ self.cur_sheet = None
+
+ if date_format is None:
+ self.date_format = 'YYYY-MM-DD'
+ else:
+ self.date_format = date_format
+ if datetime_format is None:
+ self.datetime_format = 'YYYY-MM-DD HH:MM:SS'
+ else:
+ self.datetime_format = datetime_format
+
+ self.mode = mode
+
+ def __fspath__(self):
+ return _stringify_path(self.path)
+
+ def _get_sheet_name(self, sheet_name):
+ if sheet_name is None:
+ sheet_name = self.cur_sheet
+ if sheet_name is None: # pragma: no cover
+ raise ValueError('Must pass explicit sheet_name or set '
+ 'cur_sheet property')
+ return sheet_name
+
+ def _value_with_fmt(self, val):
+ """Convert numpy types to Python types for the Excel writers.
+
+ Parameters
+ ----------
+ val : object
+ Value to be written into cells
+
+ Returns
+ -------
+ Tuple with the first element being the converted value and the second
+ being an optional format
+ """
+ fmt = None
+
+ if is_integer(val):
+ val = int(val)
+ elif is_float(val):
+ val = float(val)
+ elif is_bool(val):
+ val = bool(val)
+ elif isinstance(val, datetime):
+ fmt = self.datetime_format
+ elif isinstance(val, date):
+ fmt = self.date_format
+ elif isinstance(val, timedelta):
+ val = val.total_seconds() / float(86400)
+ fmt = '0'
+ else:
+ val = compat.to_str(val)
+
+ return val, fmt
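+
+        # Sketch of the conversion: a datetime value comes back unchanged
+        # together with self.datetime_format, while a timedelta becomes a
+        # fraction of a day with the '0' format, e.g.
+        # timedelta(hours=12) -> (0.5, '0').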
+
+ @classmethod
+ def check_extension(cls, ext):
+ """checks that path's extension against the Writer's supported
+ extensions. If it isn't supported, raises UnsupportedFiletypeError."""
+ if ext.startswith('.'):
+ ext = ext[1:]
+ if not any(ext in extension for extension in cls.supported_extensions):
+ msg = (u("Invalid extension for engine '{engine}': '{ext}'")
+ .format(engine=pprint_thing(cls.engine),
+ ext=pprint_thing(ext)))
+ raise ValueError(msg)
+ else:
+ return True
+
+ # Allow use as a contextmanager
+ def __enter__(self):
+ return self
+
+ def __exit__(self, exc_type, exc_value, traceback):
+ self.close()
+
+ def close(self):
+ """synonym for save, to make it more file-like"""
+ return self.save()
+
+
+class _OpenpyxlWriter(ExcelWriter):
+ engine = 'openpyxl'
+ supported_extensions = ('.xlsx', '.xlsm')
+
+ def __init__(self, path, engine=None, mode='w', **engine_kwargs):
+ # Use the openpyxl module as the Excel writer.
+ from openpyxl.workbook import Workbook
+
+ super(_OpenpyxlWriter, self).__init__(path, mode=mode, **engine_kwargs)
+
+ if self.mode == 'a': # Load from existing workbook
+ from openpyxl import load_workbook
+ book = load_workbook(self.path)
+ self.book = book
+ else:
+ # Create workbook object with default optimized_write=True.
+ self.book = Workbook()
+
+ if self.book.worksheets:
+ try:
+ self.book.remove(self.book.worksheets[0])
+ except AttributeError:
+
+ # compat - for openpyxl <= 2.4
+ self.book.remove_sheet(self.book.worksheets[0])
+
+ def save(self):
+ """
+ Save workbook to disk.
+ """
+ return self.book.save(self.path)
+
+ @classmethod
+ def _convert_to_style(cls, style_dict):
+ """
+ converts a style_dict to an openpyxl style object
+ Parameters
+ ----------
+ style_dict : style dictionary to convert
+ """
+
+ from openpyxl.style import Style
+ xls_style = Style()
+ for key, value in style_dict.items():
+ for nk, nv in value.items():
+ if key == "borders":
+ (xls_style.borders.__getattribute__(nk)
+ .__setattr__('border_style', nv))
+ else:
+ xls_style.__getattribute__(key).__setattr__(nk, nv)
+
+ return xls_style
+
+ @classmethod
+ def _convert_to_style_kwargs(cls, style_dict):
+ """
+ Convert a style_dict to a set of kwargs suitable for initializing
+ or updating-on-copy an openpyxl v2 style object
+ Parameters
+ ----------
+ style_dict : dict
+ A dict with zero or more of the following keys (or their synonyms).
+ 'font'
+ 'fill'
+ 'border' ('borders')
+ 'alignment'
+ 'number_format'
+ 'protection'
+ Returns
+ -------
+ style_kwargs : dict
+ A dict with the same, normalized keys as ``style_dict`` but each
+ value has been replaced with a native openpyxl style object of the
+ appropriate class.
+ """
+
+ _style_key_map = {
+ 'borders': 'border',
+ }
+
+ style_kwargs = {}
+ for k, v in style_dict.items():
+ if k in _style_key_map:
+ k = _style_key_map[k]
+ _conv_to_x = getattr(cls, '_convert_to_{k}'.format(k=k),
+ lambda x: None)
+ new_v = _conv_to_x(v)
+ if new_v:
+ style_kwargs[k] = new_v
+
+ return style_kwargs
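+
+        # Illustrative input/output (hypothetical style dict):
+        #     {'font': {'bold': True}, 'borders': {'bottom': 'thin'}}
+        #     -> {'font': Font(bold=True),
+        #         'border': Border(bottom=Side(style='thin'))}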
+
+ @classmethod
+ def _convert_to_color(cls, color_spec):
+ """
+ Convert ``color_spec`` to an openpyxl v2 Color object
+ Parameters
+ ----------
+ color_spec : str, dict
+ A 32-bit ARGB hex string, or a dict with zero or more of the
+ following keys.
+ 'rgb'
+ 'indexed'
+ 'auto'
+ 'theme'
+ 'tint'
+ 'index'
+ 'type'
+ Returns
+ -------
+ color : openpyxl.styles.Color
+ """
+
+ from openpyxl.styles import Color
+
+ if isinstance(color_spec, str):
+ return Color(color_spec)
+ else:
+ return Color(**color_spec)
+
+ @classmethod
+ def _convert_to_font(cls, font_dict):
+ """
+ Convert ``font_dict`` to an openpyxl v2 Font object
+ Parameters
+ ----------
+ font_dict : dict
+ A dict with zero or more of the following keys (or their synonyms).
+ 'name'
+ 'size' ('sz')
+ 'bold' ('b')
+ 'italic' ('i')
+ 'underline' ('u')
+ 'strikethrough' ('strike')
+ 'color'
+ 'vertAlign' ('vertalign')
+ 'charset'
+ 'scheme'
+ 'family'
+ 'outline'
+ 'shadow'
+ 'condense'
+ Returns
+ -------
+ font : openpyxl.styles.Font
+ """
+
+ from openpyxl.styles import Font
+
+ _font_key_map = {
+ 'sz': 'size',
+ 'b': 'bold',
+ 'i': 'italic',
+ 'u': 'underline',
+ 'strike': 'strikethrough',
+ 'vertalign': 'vertAlign',
+ }
+
+ font_kwargs = {}
+ for k, v in font_dict.items():
+ if k in _font_key_map:
+ k = _font_key_map[k]
+ if k == 'color':
+ v = cls._convert_to_color(v)
+ font_kwargs[k] = v
+
+ return Font(**font_kwargs)
+
+ @classmethod
+ def _convert_to_stop(cls, stop_seq):
+ """
+ Convert ``stop_seq`` to a list of openpyxl v2 Color objects,
+ suitable for initializing the ``GradientFill`` ``stop`` parameter.
+ Parameters
+ ----------
+ stop_seq : iterable
+ An iterable that yields objects suitable for consumption by
+ ``_convert_to_color``.
+ Returns
+ -------
+ stop : list of openpyxl.styles.Color
+ """
+
+ return map(cls._convert_to_color, stop_seq)
+
+ @classmethod
+ def _convert_to_fill(cls, fill_dict):
+ """
+ Convert ``fill_dict`` to an openpyxl v2 Fill object
+ Parameters
+ ----------
+ fill_dict : dict
+ A dict with one or more of the following keys (or their synonyms),
+ 'fill_type' ('patternType', 'patterntype')
+ 'start_color' ('fgColor', 'fgcolor')
+ 'end_color' ('bgColor', 'bgcolor')
+ or one or more of the following keys (or their synonyms).
+ 'type' ('fill_type')
+ 'degree'
+ 'left'
+ 'right'
+ 'top'
+ 'bottom'
+ 'stop'
+ Returns
+ -------
+ fill : openpyxl.styles.Fill
+ """
+
+ from openpyxl.styles import PatternFill, GradientFill
+
+ _pattern_fill_key_map = {
+ 'patternType': 'fill_type',
+ 'patterntype': 'fill_type',
+ 'fgColor': 'start_color',
+ 'fgcolor': 'start_color',
+ 'bgColor': 'end_color',
+ 'bgcolor': 'end_color',
+ }
+
+ _gradient_fill_key_map = {
+ 'fill_type': 'type',
+ }
+
+ pfill_kwargs = {}
+ gfill_kwargs = {}
+ for k, v in fill_dict.items():
+ pk = gk = None
+ if k in _pattern_fill_key_map:
+ pk = _pattern_fill_key_map[k]
+ if k in _gradient_fill_key_map:
+ gk = _gradient_fill_key_map[k]
+ if pk in ['start_color', 'end_color']:
+ v = cls._convert_to_color(v)
+ if gk == 'stop':
+ v = cls._convert_to_stop(v)
+ if pk:
+ pfill_kwargs[pk] = v
+ elif gk:
+ gfill_kwargs[gk] = v
+ else:
+ pfill_kwargs[k] = v
+ gfill_kwargs[k] = v
+
+ try:
+ return PatternFill(**pfill_kwargs)
+ except TypeError:
+ return GradientFill(**gfill_kwargs)
+
+ @classmethod
+ def _convert_to_side(cls, side_spec):
+ """
+ Convert ``side_spec`` to an openpyxl v2 Side object
+ Parameters
+ ----------
+ side_spec : str, dict
+ A string specifying the border style, or a dict with zero or more
+ of the following keys (or their synonyms).
+ 'style' ('border_style')
+ 'color'
+ Returns
+ -------
+ side : openpyxl.styles.Side
+ """
+
+ from openpyxl.styles import Side
+
+ _side_key_map = {
+ 'border_style': 'style',
+ }
+
+ if isinstance(side_spec, str):
+ return Side(style=side_spec)
+
+ side_kwargs = {}
+ for k, v in side_spec.items():
+ if k in _side_key_map:
+ k = _side_key_map[k]
+ if k == 'color':
+ v = cls._convert_to_color(v)
+ side_kwargs[k] = v
+
+ return Side(**side_kwargs)
+
+ @classmethod
+ def _convert_to_border(cls, border_dict):
+ """
+ Convert ``border_dict`` to an openpyxl v2 Border object
+ Parameters
+ ----------
+ border_dict : dict
+ A dict with zero or more of the following keys (or their synonyms).
+ 'left'
+ 'right'
+ 'top'
+ 'bottom'
+ 'diagonal'
+ 'diagonal_direction'
+ 'vertical'
+ 'horizontal'
+ 'diagonalUp' ('diagonalup')
+ 'diagonalDown' ('diagonaldown')
+ 'outline'
+ Returns
+ -------
+ border : openpyxl.styles.Border
+ """
+
+ from openpyxl.styles import Border
+
+ _border_key_map = {
+ 'diagonalup': 'diagonalUp',
+ 'diagonaldown': 'diagonalDown',
+ }
+
+ border_kwargs = {}
+ for k, v in border_dict.items():
+ if k in _border_key_map:
+ k = _border_key_map[k]
+ if k == 'color':
+ v = cls._convert_to_color(v)
+ if k in ['left', 'right', 'top', 'bottom', 'diagonal']:
+ v = cls._convert_to_side(v)
+ border_kwargs[k] = v
+
+ return Border(**border_kwargs)
+
+ @classmethod
+ def _convert_to_alignment(cls, alignment_dict):
+ """
+ Convert ``alignment_dict`` to an openpyxl v2 Alignment object
+ Parameters
+ ----------
+ alignment_dict : dict
+ A dict with zero or more of the following keys (or their synonyms).
+ 'horizontal'
+ 'vertical'
+ 'text_rotation'
+ 'wrap_text'
+ 'shrink_to_fit'
+ 'indent'
+ Returns
+ -------
+ alignment : openpyxl.styles.Alignment
+ """
+
+ from openpyxl.styles import Alignment
+
+ return Alignment(**alignment_dict)
+
+ @classmethod
+ def _convert_to_number_format(cls, number_format_dict):
+ """
+ Convert ``number_format_dict`` to an openpyxl v2.1.0 number format
+ initializer.
+ Parameters
+ ----------
+ number_format_dict : dict
+ A dict with zero or more of the following keys.
+ 'format_code' : str
+ Returns
+ -------
+ number_format : str
+ """
+ return number_format_dict['format_code']
+
+ @classmethod
+ def _convert_to_protection(cls, protection_dict):
+ """
+ Convert ``protection_dict`` to an openpyxl v2 Protection object.
+ Parameters
+ ----------
+ protection_dict : dict
+ A dict with zero or more of the following keys.
+ 'locked'
+ 'hidden'
+ Returns
+        -------
+        protection : openpyxl.styles.Protection
+ """
+
+ from openpyxl.styles import Protection
+
+ return Protection(**protection_dict)
+
+ def write_cells(self, cells, sheet_name=None, startrow=0, startcol=0,
+ freeze_panes=None):
+ # Write the frame cells using openpyxl.
+ sheet_name = self._get_sheet_name(sheet_name)
+
+ _style_cache = {}
+
+ if sheet_name in self.sheets:
+ wks = self.sheets[sheet_name]
+ else:
+ wks = self.book.create_sheet()
+ wks.title = sheet_name
+ self.sheets[sheet_name] = wks
+
+ if _validate_freeze_panes(freeze_panes):
+ wks.freeze_panes = wks.cell(row=freeze_panes[0] + 1,
+ column=freeze_panes[1] + 1)
+
+ for cell in cells:
+ xcell = wks.cell(
+ row=startrow + cell.row + 1,
+ column=startcol + cell.col + 1
+ )
+ xcell.value, fmt = self._value_with_fmt(cell.val)
+ if fmt:
+ xcell.number_format = fmt
+
+ style_kwargs = {}
+ if cell.style:
+ key = str(cell.style)
+ style_kwargs = _style_cache.get(key)
+ if style_kwargs is None:
+ style_kwargs = self._convert_to_style_kwargs(cell.style)
+ _style_cache[key] = style_kwargs
+
+ if style_kwargs:
+ for k, v in style_kwargs.items():
+ setattr(xcell, k, v)
+
+ if cell.mergestart is not None and cell.mergeend is not None:
+
+ wks.merge_cells(
+ start_row=startrow + cell.row + 1,
+ start_column=startcol + cell.col + 1,
+ end_column=startcol + cell.mergeend + 1,
+ end_row=startrow + cell.mergestart + 1
+ )
+
+ # When cells are merged only the top-left cell is preserved
+ # The behaviour of the other cells in a merged range is
+ # undefined
+ if style_kwargs:
+ first_row = startrow + cell.row + 1
+ last_row = startrow + cell.mergestart + 1
+ first_col = startcol + cell.col + 1
+ last_col = startcol + cell.mergeend + 1
+
+ for row in range(first_row, last_row + 1):
+ for col in range(first_col, last_col + 1):
+ if row == first_row and col == first_col:
+ # Ignore first cell. It is already handled.
+ continue
+ xcell = wks.cell(column=col, row=row)
+ for k, v in style_kwargs.items():
+ setattr(xcell, k, v)
+
+
+register_writer(_OpenpyxlWriter)
+
+
+class _XlwtWriter(ExcelWriter):
+ engine = 'xlwt'
+ supported_extensions = ('.xls',)
+
+ def __init__(self, path, engine=None, encoding=None, mode='w',
+ **engine_kwargs):
+ # Use the xlwt module as the Excel writer.
+ import xlwt
+ engine_kwargs['engine'] = engine
+
+ if mode == 'a':
+ raise ValueError('Append mode is not supported with xlwt!')
+
+ super(_XlwtWriter, self).__init__(path, mode=mode, **engine_kwargs)
+
+ if encoding is None:
+ encoding = 'ascii'
+ self.book = xlwt.Workbook(encoding=encoding)
+ self.fm_datetime = xlwt.easyxf(num_format_str=self.datetime_format)
+ self.fm_date = xlwt.easyxf(num_format_str=self.date_format)
+
+ def save(self):
+ """
+ Save workbook to disk.
+ """
+ return self.book.save(self.path)
+
+ def write_cells(self, cells, sheet_name=None, startrow=0, startcol=0,
+ freeze_panes=None):
+ # Write the frame cells using xlwt.
+
+ sheet_name = self._get_sheet_name(sheet_name)
+
+ if sheet_name in self.sheets:
+ wks = self.sheets[sheet_name]
+ else:
+ wks = self.book.add_sheet(sheet_name)
+ self.sheets[sheet_name] = wks
+
+ if _validate_freeze_panes(freeze_panes):
+ wks.set_panes_frozen(True)
+ wks.set_horz_split_pos(freeze_panes[0])
+ wks.set_vert_split_pos(freeze_panes[1])
+
+ style_dict = {}
+
+ for cell in cells:
+ val, fmt = self._value_with_fmt(cell.val)
+
+ stylekey = json.dumps(cell.style)
+ if fmt:
+ stylekey += fmt
+
+ if stylekey in style_dict:
+ style = style_dict[stylekey]
+ else:
+ style = self._convert_to_style(cell.style, fmt)
+ style_dict[stylekey] = style
+
+ if cell.mergestart is not None and cell.mergeend is not None:
+ wks.write_merge(startrow + cell.row,
+ startrow + cell.mergestart,
+ startcol + cell.col,
+ startcol + cell.mergeend,
+ val, style)
+ else:
+ wks.write(startrow + cell.row,
+ startcol + cell.col,
+ val, style)
+
+ @classmethod
+ def _style_to_xlwt(cls, item, firstlevel=True, field_sep=',',
+ line_sep=';'):
+ """helper which recursively generate an xlwt easy style string
+ for example:
+
+ hstyle = {"font": {"bold": True},
+ "border": {"top": "thin",
+ "right": "thin",
+ "bottom": "thin",
+ "left": "thin"},
+ "align": {"horiz": "center"}}
+ will be converted to
+ font: bold on; \
+ border: top thin, right thin, bottom thin, left thin; \
+ align: horiz center;
+ """
+ if hasattr(item, 'items'):
+ if firstlevel:
+ it = ["{key}: {val}"
+ .format(key=key, val=cls._style_to_xlwt(value, False))
+ for key, value in item.items()]
+ out = "{sep} ".format(sep=(line_sep).join(it))
+ return out
+ else:
+ it = ["{key} {val}"
+ .format(key=key, val=cls._style_to_xlwt(value, False))
+ for key, value in item.items()]
+ out = "{sep} ".format(sep=(field_sep).join(it))
+ return out
+ else:
+ item = "{item}".format(item=item)
+ item = item.replace("True", "on")
+ item = item.replace("False", "off")
+ return item
+
+ @classmethod
+ def _convert_to_style(cls, style_dict, num_format_str=None):
+ """
+ converts a style_dict to an xlwt style object
+ Parameters
+ ----------
+ style_dict : style dictionary to convert
+ num_format_str : optional number format string
+ """
+ import xlwt
+
+ if style_dict:
+ xlwt_stylestr = cls._style_to_xlwt(style_dict)
+ style = xlwt.easyxf(xlwt_stylestr, field_sep=',', line_sep=';')
+ else:
+ style = xlwt.XFStyle()
+ if num_format_str is not None:
+ style.num_format_str = num_format_str
+
+ return style
+
+
+register_writer(_XlwtWriter)
+
+
+class _XlsxStyler(object):
+ # Map from openpyxl-oriented styles to flatter xlsxwriter representation
+ # Ordering necessary for both determinism and because some are keyed by
+ # prefixes of others.
+ STYLE_MAPPING = {
+ 'font': [
+ (('name',), 'font_name'),
+ (('sz',), 'font_size'),
+ (('size',), 'font_size'),
+ (('color', 'rgb',), 'font_color'),
+ (('color',), 'font_color'),
+ (('b',), 'bold'),
+ (('bold',), 'bold'),
+ (('i',), 'italic'),
+ (('italic',), 'italic'),
+ (('u',), 'underline'),
+ (('underline',), 'underline'),
+ (('strike',), 'font_strikeout'),
+ (('vertAlign',), 'font_script'),
+ (('vertalign',), 'font_script'),
+ ],
+ 'number_format': [
+ (('format_code',), 'num_format'),
+ ((), 'num_format',),
+ ],
+ 'protection': [
+ (('locked',), 'locked'),
+ (('hidden',), 'hidden'),
+ ],
+ 'alignment': [
+ (('horizontal',), 'align'),
+ (('vertical',), 'valign'),
+ (('text_rotation',), 'rotation'),
+ (('wrap_text',), 'text_wrap'),
+ (('indent',), 'indent'),
+ (('shrink_to_fit',), 'shrink'),
+ ],
+ 'fill': [
+ (('patternType',), 'pattern'),
+ (('patterntype',), 'pattern'),
+ (('fill_type',), 'pattern'),
+ (('start_color', 'rgb',), 'fg_color'),
+ (('fgColor', 'rgb',), 'fg_color'),
+ (('fgcolor', 'rgb',), 'fg_color'),
+ (('start_color',), 'fg_color'),
+ (('fgColor',), 'fg_color'),
+ (('fgcolor',), 'fg_color'),
+ (('end_color', 'rgb',), 'bg_color'),
+ (('bgColor', 'rgb',), 'bg_color'),
+ (('bgcolor', 'rgb',), 'bg_color'),
+ (('end_color',), 'bg_color'),
+ (('bgColor',), 'bg_color'),
+ (('bgcolor',), 'bg_color'),
+ ],
+ 'border': [
+ (('color', 'rgb',), 'border_color'),
+ (('color',), 'border_color'),
+ (('style',), 'border'),
+ (('top', 'color', 'rgb',), 'top_color'),
+ (('top', 'color',), 'top_color'),
+ (('top', 'style',), 'top'),
+ (('top',), 'top'),
+ (('right', 'color', 'rgb',), 'right_color'),
+ (('right', 'color',), 'right_color'),
+ (('right', 'style',), 'right'),
+ (('right',), 'right'),
+ (('bottom', 'color', 'rgb',), 'bottom_color'),
+ (('bottom', 'color',), 'bottom_color'),
+ (('bottom', 'style',), 'bottom'),
+ (('bottom',), 'bottom'),
+ (('left', 'color', 'rgb',), 'left_color'),
+ (('left', 'color',), 'left_color'),
+ (('left', 'style',), 'left'),
+ (('left',), 'left'),
+ ],
+ }
+
+ @classmethod
+ def convert(cls, style_dict, num_format_str=None):
+ """
+ converts a style_dict to an xlsxwriter format dict
+
+ Parameters
+ ----------
+ style_dict : style dictionary to convert
+ num_format_str : optional number format string
+ """
+
+ # Create a XlsxWriter format object.
+ props = {}
+
+ if num_format_str is not None:
+ props['num_format'] = num_format_str
+
+ if style_dict is None:
+ return props
+
+ if 'borders' in style_dict:
+ style_dict = style_dict.copy()
+ style_dict['border'] = style_dict.pop('borders')
+
+ for style_group_key, style_group in style_dict.items():
+ for src, dst in cls.STYLE_MAPPING.get(style_group_key, []):
+ # src is a sequence of keys into a nested dict
+ # dst is a flat key
+ if dst in props:
+ continue
+ v = style_group
+ for k in src:
+ try:
+ v = v[k]
+ except (KeyError, TypeError):
+ break
+ else:
+ props[dst] = v
+
+ if isinstance(props.get('pattern'), string_types):
+ # TODO: support other fill patterns
+ props['pattern'] = 0 if props['pattern'] == 'none' else 1
+
+ for k in ['border', 'top', 'right', 'bottom', 'left']:
+ if isinstance(props.get(k), string_types):
+ try:
+ props[k] = ['none', 'thin', 'medium', 'dashed', 'dotted',
+ 'thick', 'double', 'hair', 'mediumDashed',
+ 'dashDot', 'mediumDashDot', 'dashDotDot',
+ 'mediumDashDotDot',
+ 'slantDashDot'].index(props[k])
+ except ValueError:
+ props[k] = 2
+
+ if isinstance(props.get('font_script'), string_types):
+ props['font_script'] = ['baseline', 'superscript',
+ 'subscript'].index(props['font_script'])
+
+ if isinstance(props.get('underline'), string_types):
+ props['underline'] = {'none': 0, 'single': 1, 'double': 2,
+ 'singleAccounting': 33,
+ 'doubleAccounting': 34}[props['underline']]
+
+ return props
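+
+        # Illustrative conversion (hypothetical style dict):
+        #
+        #     _XlsxStyler.convert({'font': {'bold': True,
+        #                                   'color': {'rgb': 'FF0000'}}})
+        #     -> {'font_color': 'FF0000', 'bold': True}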
+
+
+class _XlsxWriter(ExcelWriter):
+ engine = 'xlsxwriter'
+ supported_extensions = ('.xlsx',)
+
+ def __init__(self, path, engine=None,
+ date_format=None, datetime_format=None, mode='w',
+ **engine_kwargs):
+ # Use the xlsxwriter module as the Excel writer.
+ import xlsxwriter
+
+ if mode == 'a':
+ raise ValueError('Append mode is not supported with xlsxwriter!')
+
+ super(_XlsxWriter, self).__init__(path, engine=engine,
+ date_format=date_format,
+ datetime_format=datetime_format,
+ mode=mode,
+ **engine_kwargs)
+
+ self.book = xlsxwriter.Workbook(path, **engine_kwargs)
+
+ def save(self):
+ """
+ Save workbook to disk.
+ """
+
+ return self.book.close()
+
+ def write_cells(self, cells, sheet_name=None, startrow=0, startcol=0,
+ freeze_panes=None):
+ # Write the frame cells using xlsxwriter.
+ sheet_name = self._get_sheet_name(sheet_name)
+
+ if sheet_name in self.sheets:
+ wks = self.sheets[sheet_name]
+ else:
+ wks = self.book.add_worksheet(sheet_name)
+ self.sheets[sheet_name] = wks
+
+ style_dict = {'null': None}
+
+ if _validate_freeze_panes(freeze_panes):
+ wks.freeze_panes(*(freeze_panes))
+
+ for cell in cells:
+ val, fmt = self._value_with_fmt(cell.val)
+
+ stylekey = json.dumps(cell.style)
+ if fmt:
+ stylekey += fmt
+
+ if stylekey in style_dict:
+ style = style_dict[stylekey]
+ else:
+ style = self.book.add_format(
+ _XlsxStyler.convert(cell.style, fmt))
+ style_dict[stylekey] = style
+
+ if cell.mergestart is not None and cell.mergeend is not None:
+ wks.merge_range(startrow + cell.row,
+ startcol + cell.col,
+ startrow + cell.mergestart,
+ startcol + cell.mergeend,
+ cell.val, style)
+ else:
+ wks.write(startrow + cell.row,
+ startcol + cell.col,
+ val, style)
+
+
+register_writer(_XlsxWriter)
diff --git a/contrib/python/pandas/py2/pandas/io/feather_format.py b/contrib/python/pandas/py2/pandas/io/feather_format.py
new file mode 100644
index 00000000000..d76e6b75d37
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/io/feather_format.py
@@ -0,0 +1,127 @@
+""" feather-format compat """
+
+from distutils.version import LooseVersion
+
+from pandas.compat import range
+from pandas.util._decorators import deprecate_kwarg
+
+from pandas import DataFrame, Int64Index, RangeIndex
+
+from pandas.io.common import _stringify_path
+
+
+def _try_import():
+ # since pandas is a dependency of pyarrow
+ # we need to import on first use
+ try:
+ import pyarrow
+ from pyarrow import feather
+ except ImportError:
+ # give a nice error message
+ raise ImportError("pyarrow is not installed\n\n"
+ "you can install via conda\n"
+ "conda install pyarrow -c conda-forge\n"
+ "or via pip\n"
+ "pip install -U pyarrow\n")
+
+ if LooseVersion(pyarrow.__version__) < LooseVersion('0.9.0'):
+ raise ImportError("pyarrow >= 0.9.0 required for feather support\n\n"
+ "you can install via conda\n"
+ "conda install pyarrow -c conda-forge"
+ "or via pip\n"
+ "pip install -U pyarrow\n")
+
+ return feather, pyarrow
+
+
+def to_feather(df, path):
+ """
+ Write a DataFrame to the feather-format
+
+ Parameters
+ ----------
+ df : DataFrame
+ path : string file path, or file-like object
+
+ """
+ path = _stringify_path(path)
+ if not isinstance(df, DataFrame):
+ raise ValueError("feather only support IO with DataFrames")
+
+ feather = _try_import()[0]
+ valid_types = {'string', 'unicode'}
+
+ # validate index
+ # --------------
+
+ # validate that we have only a default index
+ # raise on anything else as we don't serialize the index
+
+ if not isinstance(df.index, Int64Index):
+ raise ValueError("feather does not support serializing {} "
+ "for the index; you can .reset_index()"
+ "to make the index into column(s)".format(
+ type(df.index)))
+
+ if not df.index.equals(RangeIndex.from_range(range(len(df)))):
+ raise ValueError("feather does not support serializing a "
+ "non-default index for the index; you "
+ "can .reset_index() to make the index "
+ "into column(s)")
+
+ if df.index.name is not None:
+ raise ValueError("feather does not serialize index meta-data on a "
+ "default index")
+
+ # validate columns
+ # ----------------
+
+ # must have value column names (strings only)
+ if df.columns.inferred_type not in valid_types:
+ raise ValueError("feather must have string column names")
+
+ feather.write_feather(df, path)
+
+
+@deprecate_kwarg(old_arg_name='nthreads', new_arg_name='use_threads')
+def read_feather(path, columns=None, use_threads=True):
+ """
+ Load a feather-format object from the file path
+
+    .. versionadded:: 0.20.0
+
+ Parameters
+ ----------
+ path : string file path, or file-like object
+ columns : sequence, default None
+ If not provided, all columns are read
+
+        .. versionadded:: 0.24.0
+ nthreads : int, default 1
+ Number of CPU threads to use when reading to pandas.DataFrame
+
+        .. versionadded:: 0.21.0
+        .. deprecated:: 0.24.0
+ use_threads : bool, default True
+ Whether to parallelize reading using multiple threads
+
+        .. versionadded:: 0.24.0
+
+ Returns
+ -------
+ type of object stored in file
+
+ """
+
+ feather, pyarrow = _try_import()
+ path = _stringify_path(path)
+
+ if LooseVersion(pyarrow.__version__) < LooseVersion('0.11.0'):
+ int_use_threads = int(use_threads)
+ if int_use_threads < 1:
+ int_use_threads = 1
+ return feather.read_feather(path, columns=columns,
+ nthreads=int_use_threads)
+
+ return feather.read_feather(path, columns=columns,
+ use_threads=bool(use_threads))
diff --git a/contrib/python/pandas/py2/pandas/io/formats/__init__.py b/contrib/python/pandas/py2/pandas/io/formats/__init__.py
new file mode 100644
index 00000000000..e69de29bb2d
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/io/formats/__init__.py
diff --git a/contrib/python/pandas/py2/pandas/io/formats/console.py b/contrib/python/pandas/py2/pandas/io/formats/console.py
new file mode 100644
index 00000000000..d5ef9f61bc1
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/io/formats/console.py
@@ -0,0 +1,159 @@
+"""
+Internal module for console introspection
+"""
+
+import locale
+import sys
+
+from pandas.io.formats.terminal import get_terminal_size
+
+# -----------------------------------------------------------------------------
+# Global formatting options
+_initial_defencoding = None
+
+
+def detect_console_encoding():
+ """
+ Try to find the most capable encoding supported by the console.
+    Slightly modified from the way IPython handles the same issue.
+ """
+ global _initial_defencoding
+
+ encoding = None
+ try:
+ encoding = sys.stdout.encoding or sys.stdin.encoding
+ except (AttributeError, IOError):
+ pass
+
+ # try again for something better
+ if not encoding or 'ascii' in encoding.lower():
+ try:
+ encoding = locale.getpreferredencoding()
+ except Exception:
+ pass
+
+    # When all else fails, this will usually be "ascii".
+ if not encoding or 'ascii' in encoding.lower():
+ encoding = sys.getdefaultencoding()
+
+ # GH3360, save the reported defencoding at import time
+ # MPL backends may change it. Make available for debugging.
+ if not _initial_defencoding:
+ _initial_defencoding = sys.getdefaultencoding()
+
+ return encoding
+
+
+def get_console_size():
+ """Return console size as tuple = (width, height).
+
+ Returns (None,None) in non-interactive session.
+ """
+ from pandas import get_option
+
+ display_width = get_option('display.width')
+ # deprecated.
+ display_height = get_option('display.max_rows')
+
+ # Consider
+ # interactive shell terminal, can detect term size
+    # interactive non-shell terminal (ipnb/ipqtconsole), cannot detect term
+    # size
+    # non-interactive script, should disregard term size
+
+    # In addition, width and height have default values, but setting them to
+    # 'None' signals that auto-detection should be used, though only in an
+    # interactive shell terminal.
+
+ if in_interactive_session():
+ if in_ipython_frontend():
+ # sane defaults for interactive non-shell terminal
+ # match default for width,height in config_init
+ from pandas.core.config import get_default_val
+ terminal_width = get_default_val('display.width')
+ terminal_height = get_default_val('display.max_rows')
+ else:
+ # pure terminal
+ terminal_width, terminal_height = get_terminal_size()
+ else:
+ terminal_width, terminal_height = None, None
+
+ # Note if the User sets width/Height to None (auto-detection)
+ # and we're in a script (non-inter), this will return (None,None)
+ # caller needs to deal.
+ return (display_width or terminal_width, display_height or terminal_height)
+
+
+# ----------------------------------------------------------------------
+# Detect our environment
+
+def in_interactive_session():
+ """ check if we're running in an interactive shell
+
+ returns True if running under python/ipython interactive shell
+ """
+ from pandas import get_option
+
+ def check_main():
+ try:
+ import __main__ as main
+ except ModuleNotFoundError:
+ return get_option('mode.sim_interactive')
+ return (not hasattr(main, '__file__') or
+ get_option('mode.sim_interactive'))
+
+ try:
+ return __IPYTHON__ or check_main() # noqa
+ except NameError:
+ return check_main()
+
+
+def in_qtconsole():
+ """
+ check if we're inside an IPython qtconsole
+
+ .. deprecated:: 0.14.1
+ This is no longer needed, or working, in IPython 3 and above.
+ """
+ try:
+ ip = get_ipython() # noqa
+ front_end = (
+ ip.config.get('KernelApp', {}).get('parent_appname', "") or
+ ip.config.get('IPKernelApp', {}).get('parent_appname', ""))
+ if 'qtconsole' in front_end.lower():
+ return True
+ except NameError:
+ return False
+ return False
+
+
+def in_ipnb():
+ """
+ check if we're inside an IPython Notebook
+
+ .. deprecated:: 0.14.1
+ This is no longer needed, or working, in IPython 3 and above.
+ """
+ try:
+ ip = get_ipython() # noqa
+ front_end = (
+ ip.config.get('KernelApp', {}).get('parent_appname', "") or
+ ip.config.get('IPKernelApp', {}).get('parent_appname', ""))
+ if 'notebook' in front_end.lower():
+ return True
+ except NameError:
+ return False
+ return False
+
+
+def in_ipython_frontend():
+ """
+    check if we're inside an IPython zmq frontend
+ """
+ try:
+ ip = get_ipython() # noqa
+ return 'zmq' in str(type(ip)).lower()
+ except NameError:
+ pass
+
+ return False
diff --git a/contrib/python/pandas/py2/pandas/io/formats/css.py b/contrib/python/pandas/py2/pandas/io/formats/css.py
new file mode 100644
index 00000000000..429c98b579c
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/io/formats/css.py
@@ -0,0 +1,250 @@
+"""Utilities for interpreting CSS from Stylers for formatting non-HTML outputs
+"""
+
+import re
+import warnings
+
+
+class CSSWarning(UserWarning):
+ """This CSS syntax cannot currently be parsed"""
+ pass
+
+
+class CSSResolver(object):
+ """A callable for parsing and resolving CSS to atomic properties
+
+ """
+
+ INITIAL_STYLE = {
+ }
+
+ def __call__(self, declarations_str, inherited=None):
+ """ the given declarations to atomic properties
+
+ Parameters
+ ----------
+        declarations_str : str
+            A string of CSS declarations
+ inherited : dict, optional
+ Atomic properties indicating the inherited style context in which
+ declarations_str is to be resolved. ``inherited`` should already
+ be resolved, i.e. valid output of this method.
+
+ Returns
+ -------
+ props : dict
+ Atomic CSS 2.2 properties
+
+ Examples
+ --------
+ >>> resolve = CSSResolver()
+ >>> inherited = {'font-family': 'serif', 'font-weight': 'bold'}
+ >>> out = resolve('''
+ ... border-color: BLUE RED;
+ ... font-size: 1em;
+ ... font-size: 2em;
+ ... font-weight: normal;
+ ... font-weight: inherit;
+ ... ''', inherited)
+ >>> sorted(out.items()) # doctest: +NORMALIZE_WHITESPACE
+ [('border-bottom-color', 'blue'),
+ ('border-left-color', 'red'),
+ ('border-right-color', 'red'),
+ ('border-top-color', 'blue'),
+ ('font-family', 'serif'),
+ ('font-size', '24pt'),
+ ('font-weight', 'bold')]
+ """
+
+ props = dict(self.atomize(self.parse(declarations_str)))
+ if inherited is None:
+ inherited = {}
+
+ # 1. resolve inherited, initial
+ for prop, val in inherited.items():
+ if prop not in props:
+ props[prop] = val
+
+ for prop, val in list(props.items()):
+ if val == 'inherit':
+ val = inherited.get(prop, 'initial')
+ if val == 'initial':
+ val = self.INITIAL_STYLE.get(prop)
+
+ if val is None:
+ # we do not define a complete initial stylesheet
+ del props[prop]
+ else:
+ props[prop] = val
+
+ # 2. resolve relative font size
+ if props.get('font-size'):
+ if 'font-size' in inherited:
+ em_pt = inherited['font-size']
+ assert em_pt[-2:] == 'pt'
+ em_pt = float(em_pt[:-2])
+ else:
+ em_pt = None
+ props['font-size'] = self.size_to_pt(
+ props['font-size'], em_pt, conversions=self.FONT_SIZE_RATIOS)
+
+ font_size = float(props['font-size'][:-2])
+ else:
+ font_size = None
+
+ # 3. TODO: resolve other font-relative units
+ for side in self.SIDES:
+ prop = 'border-{side}-width'.format(side=side)
+ if prop in props:
+ props[prop] = self.size_to_pt(
+ props[prop], em_pt=font_size,
+ conversions=self.BORDER_WIDTH_RATIOS)
+ for prop in ['margin-{side}'.format(side=side),
+ 'padding-{side}'.format(side=side)]:
+ if prop in props:
+ # TODO: support %
+ props[prop] = self.size_to_pt(
+ props[prop], em_pt=font_size,
+ conversions=self.MARGIN_RATIOS)
+
+ return props
+
+ UNIT_RATIOS = {
+ 'rem': ('pt', 12),
+ 'ex': ('em', .5),
+ # 'ch':
+ 'px': ('pt', .75),
+ 'pc': ('pt', 12),
+ 'in': ('pt', 72),
+ 'cm': ('in', 1 / 2.54),
+ 'mm': ('in', 1 / 25.4),
+ 'q': ('mm', .25),
+ '!!default': ('em', 0),
+ }
+
+ FONT_SIZE_RATIOS = UNIT_RATIOS.copy()
+ FONT_SIZE_RATIOS.update({
+ '%': ('em', .01),
+ 'xx-small': ('rem', .5),
+ 'x-small': ('rem', .625),
+ 'small': ('rem', .8),
+ 'medium': ('rem', 1),
+ 'large': ('rem', 1.125),
+ 'x-large': ('rem', 1.5),
+ 'xx-large': ('rem', 2),
+ 'smaller': ('em', 1 / 1.2),
+ 'larger': ('em', 1.2),
+ '!!default': ('em', 1),
+ })
+
+ MARGIN_RATIOS = UNIT_RATIOS.copy()
+ MARGIN_RATIOS.update({
+ 'none': ('pt', 0),
+ })
+
+ BORDER_WIDTH_RATIOS = UNIT_RATIOS.copy()
+ BORDER_WIDTH_RATIOS.update({
+ 'none': ('pt', 0),
+ 'thick': ('px', 4),
+ 'medium': ('px', 2),
+ 'thin': ('px', 1),
+ # Default: medium only if solid
+ })
+
+ def size_to_pt(self, in_val, em_pt=None, conversions=UNIT_RATIOS):
+ def _error():
+ warnings.warn('Unhandled size: {val!r}'.format(val=in_val),
+ CSSWarning)
+ return self.size_to_pt('1!!default', conversions=conversions)
+
+ try:
+ val, unit = re.match(r'^(\S*?)([a-zA-Z%!].*)', in_val).groups()
+ except AttributeError:
+ return _error()
+ if val == '':
+ # hack for 'large' etc.
+ val = 1
+ else:
+ try:
+ val = float(val)
+ except ValueError:
+ return _error()
+
+ while unit != 'pt':
+ if unit == 'em':
+ if em_pt is None:
+ unit = 'rem'
+ else:
+ val *= em_pt
+ unit = 'pt'
+ continue
+
+ try:
+ unit, mul = conversions[unit]
+ except KeyError:
+ return _error()
+ val *= mul
+
+ val = round(val, 5)
+ if int(val) == val:
+ size_fmt = '{fmt:d}pt'.format(fmt=int(val))
+ else:
+ size_fmt = '{fmt:f}pt'.format(fmt=val)
+ return size_fmt
+
+ def atomize(self, declarations):
+ for prop, value in declarations:
+ attr = 'expand_' + prop.replace('-', '_')
+ try:
+ expand = getattr(self, attr)
+ except AttributeError:
+ yield prop, value
+ else:
+ for prop, value in expand(prop, value):
+ yield prop, value
+
+ SIDE_SHORTHANDS = {
+ 1: [0, 0, 0, 0],
+ 2: [0, 1, 0, 1],
+ 3: [0, 1, 2, 1],
+ 4: [0, 1, 2, 3],
+ }
+ SIDES = ('top', 'right', 'bottom', 'left')
+
+ def _side_expander(prop_fmt):
+ def expand(self, prop, value):
+ tokens = value.split()
+ try:
+ mapping = self.SIDE_SHORTHANDS[len(tokens)]
+ except KeyError:
+ warnings.warn('Could not expand "{prop}: {val}"'
+ .format(prop=prop, val=value), CSSWarning)
+ return
+ for key, idx in zip(self.SIDES, mapping):
+ yield prop_fmt.format(key), tokens[idx]
+
+ return expand
+
+ expand_border_color = _side_expander('border-{:s}-color')
+ expand_border_style = _side_expander('border-{:s}-style')
+ expand_border_width = _side_expander('border-{:s}-width')
+ expand_margin = _side_expander('margin-{:s}')
+ expand_padding = _side_expander('padding-{:s}')
+
+ def parse(self, declarations_str):
+ """Generates (prop, value) pairs from declarations
+
+ In a future version may generate parsed tokens from tinycss/tinycss2
+ """
+ for decl in declarations_str.split(';'):
+ if not decl.strip():
+ continue
+ prop, sep, val = decl.partition(':')
+ prop = prop.strip().lower()
+ # TODO: don't lowercase case sensitive parts of values (strings)
+ val = val.strip().lower()
+ if sep:
+ yield prop, val
+ else:
+ warnings.warn('Ill-formatted attribute: expected a colon '
+ 'in {decl!r}'.format(decl=decl), CSSWarning)
diff --git a/contrib/python/pandas/py2/pandas/io/formats/csvs.py b/contrib/python/pandas/py2/pandas/io/formats/csvs.py
new file mode 100644
index 00000000000..46c843af043
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/io/formats/csvs.py
@@ -0,0 +1,315 @@
+# -*- coding: utf-8 -*-
+"""
+Module for formatting output data into CSV files.
+"""
+
+from __future__ import print_function
+
+import csv as csvlib
+import os
+import warnings
+from zipfile import ZipFile
+
+import numpy as np
+
+from pandas._libs import writers as libwriters
+from pandas.compat import StringIO, range, zip
+
+from pandas.core.dtypes.generic import (
+ ABCDatetimeIndex, ABCIndexClass, ABCMultiIndex, ABCPeriodIndex)
+from pandas.core.dtypes.missing import notna
+
+from pandas import compat
+
+from pandas.io.common import (
+ UnicodeWriter, _get_handle, _infer_compression, get_filepath_or_buffer)
+
+
+class CSVFormatter(object):
+
+ def __init__(self, obj, path_or_buf=None, sep=",", na_rep='',
+ float_format=None, cols=None, header=True, index=True,
+ index_label=None, mode='w', nanRep=None, encoding=None,
+ compression='infer', quoting=None, line_terminator='\n',
+ chunksize=None, tupleize_cols=False, quotechar='"',
+ date_format=None, doublequote=True, escapechar=None,
+ decimal='.'):
+
+ self.obj = obj
+
+ if path_or_buf is None:
+ path_or_buf = StringIO()
+
+ self.path_or_buf, _, _, _ = get_filepath_or_buffer(
+ path_or_buf, encoding=encoding, compression=compression, mode=mode
+ )
+ self.sep = sep
+ self.na_rep = na_rep
+ self.float_format = float_format
+ self.decimal = decimal
+
+ self.header = header
+ self.index = index
+ self.index_label = index_label
+ self.mode = mode
+ if encoding is None:
+ encoding = 'ascii' if compat.PY2 else 'utf-8'
+ self.encoding = encoding
+ self.compression = _infer_compression(self.path_or_buf, compression)
+
+ if quoting is None:
+ quoting = csvlib.QUOTE_MINIMAL
+ self.quoting = quoting
+
+ if quoting == csvlib.QUOTE_NONE:
+ # prevents crash in _csv
+ quotechar = None
+ self.quotechar = quotechar
+
+ self.doublequote = doublequote
+ self.escapechar = escapechar
+
+ self.line_terminator = line_terminator or os.linesep
+
+ self.date_format = date_format
+
+ self.tupleize_cols = tupleize_cols
+ self.has_mi_columns = (isinstance(obj.columns, ABCMultiIndex) and
+ not self.tupleize_cols)
+
+ # validate mi options
+ if self.has_mi_columns:
+ if cols is not None:
+ raise TypeError("cannot specify cols with a MultiIndex on the "
+ "columns")
+
+ if cols is not None:
+ if isinstance(cols, ABCIndexClass):
+ cols = cols.to_native_types(na_rep=na_rep,
+ float_format=float_format,
+ date_format=date_format,
+ quoting=self.quoting)
+ else:
+ cols = list(cols)
+ self.obj = self.obj.loc[:, cols]
+
+ # update columns to include possible multiplicity of dupes
+ # and make sure cols is just a list of labels
+ cols = self.obj.columns
+ if isinstance(cols, ABCIndexClass):
+ cols = cols.to_native_types(na_rep=na_rep,
+ float_format=float_format,
+ date_format=date_format,
+ quoting=self.quoting)
+ else:
+ cols = list(cols)
+
+ # save it
+ self.cols = cols
+
+ # preallocate data 2d list
+ self.blocks = self.obj._data.blocks
+ ncols = sum(b.shape[0] for b in self.blocks)
+ self.data = [None] * ncols
+
+ if chunksize is None:
+ chunksize = (100000 // (len(self.cols) or 1)) or 1
+ self.chunksize = int(chunksize)
+
+ self.data_index = obj.index
+ if (isinstance(self.data_index, (ABCDatetimeIndex, ABCPeriodIndex)) and
+ date_format is not None):
+ from pandas import Index
+ self.data_index = Index([x.strftime(date_format) if notna(x) else
+ '' for x in self.data_index])
+
+ self.nlevels = getattr(self.data_index, 'nlevels', 1)
+ if not index:
+ self.nlevels = 0
+
+ def save(self):
+ """
+ Create the writer & save
+ """
+ # GH21227 internal compression is not used when a file-like object is passed.
+ if self.compression and hasattr(self.path_or_buf, 'write'):
+ msg = ("compression has no effect when passing file-like "
+ "object as input.")
+ warnings.warn(msg, RuntimeWarning, stacklevel=2)
+
+ # determine whether zip compression is being used.
+ is_zip = isinstance(self.path_or_buf, ZipFile) or (
+ not hasattr(self.path_or_buf, 'write')
+ and self.compression == 'zip')
+
+ if is_zip:
+ # zipfile doesn't support writing strings to the archive, so use a
+ # string buffer to receive the csv output and dump it into the
+ # zip-compressed file handle. GH21241, GH21118
+ f = StringIO()
+ close = False
+ elif hasattr(self.path_or_buf, 'write'):
+ f = self.path_or_buf
+ close = False
+ else:
+ f, handles = _get_handle(self.path_or_buf, self.mode,
+ encoding=self.encoding,
+ compression=self.compression)
+ close = True
+
+ try:
+ writer_kwargs = dict(lineterminator=self.line_terminator,
+ delimiter=self.sep, quoting=self.quoting,
+ doublequote=self.doublequote,
+ escapechar=self.escapechar,
+ quotechar=self.quotechar)
+ if self.encoding == 'ascii':
+ self.writer = csvlib.writer(f, **writer_kwargs)
+ else:
+ writer_kwargs['encoding'] = self.encoding
+ self.writer = UnicodeWriter(f, **writer_kwargs)
+
+ self._save()
+
+ finally:
+ if is_zip:
+ # GH17778 handles zip compression separately.
+ buf = f.getvalue()
+ if hasattr(self.path_or_buf, 'write'):
+ self.path_or_buf.write(buf)
+ else:
+ f, handles = _get_handle(self.path_or_buf, self.mode,
+ encoding=self.encoding,
+ compression=self.compression)
+ f.write(buf)
+ close = True
+ if close:
+ f.close()
+ for _fh in handles:
+ _fh.close()
+
+ def _save_header(self):
+
+ writer = self.writer
+ obj = self.obj
+ index_label = self.index_label
+ cols = self.cols
+ has_mi_columns = self.has_mi_columns
+ header = self.header
+ encoded_labels = []
+
+ has_aliases = isinstance(header, (tuple, list, np.ndarray,
+ ABCIndexClass))
+ if not (has_aliases or self.header):
+ return
+ if has_aliases:
+ if len(header) != len(cols):
+ raise ValueError(('Writing {ncols} cols but got {nalias} '
+ 'aliases'.format(ncols=len(cols),
+ nalias=len(header))))
+ else:
+ write_cols = header
+ else:
+ write_cols = cols
+
+ if self.index:
+ # should write something for index label
+ if index_label is not False:
+ if index_label is None:
+ if isinstance(obj.index, ABCMultiIndex):
+ index_label = []
+ for i, name in enumerate(obj.index.names):
+ if name is None:
+ name = ''
+ index_label.append(name)
+ else:
+ index_label = obj.index.name
+ if index_label is None:
+ index_label = ['']
+ else:
+ index_label = [index_label]
+ elif not isinstance(index_label,
+ (list, tuple, np.ndarray, ABCIndexClass)):
+ # given a string for a DF with Index
+ index_label = [index_label]
+
+ encoded_labels = list(index_label)
+ else:
+ encoded_labels = []
+
+ if not has_mi_columns or has_aliases:
+ encoded_labels += list(write_cols)
+ writer.writerow(encoded_labels)
+ else:
+ # write out the mi
+ columns = obj.columns
+
+ # write out the names for each level, then ALL of the values for
+ # each level
+ for i in range(columns.nlevels):
+
+ # we need at least 1 index column to write our col names
+ col_line = []
+ if self.index:
+
+ # name is the first column
+ col_line.append(columns.names[i])
+
+ if isinstance(index_label, list) and len(index_label) > 1:
+ col_line.extend([''] * (len(index_label) - 1))
+
+ col_line.extend(columns._get_level_values(i))
+
+ writer.writerow(col_line)
+
+ # Write out the index line if it's not empty.
+ # Otherwise, we will print out an extraneous
+ # blank line between the mi and the data rows.
+ if encoded_labels and set(encoded_labels) != {''}:
+ encoded_labels.extend([''] * len(columns))
+ writer.writerow(encoded_labels)
+
+ def _save(self):
+
+ self._save_header()
+
+ nrows = len(self.data_index)
+
+ # write in chunksize bites
+ chunksize = self.chunksize
+ chunks = int(nrows / chunksize) + 1
+
+ for i in range(chunks):
+ start_i = i * chunksize
+ end_i = min((i + 1) * chunksize, nrows)
+ if start_i >= end_i:
+ break
+
+ self._save_chunk(start_i, end_i)
+
+ def _save_chunk(self, start_i, end_i):
+
+ data_index = self.data_index
+
+ # create the data for a chunk
+ slicer = slice(start_i, end_i)
+ for i in range(len(self.blocks)):
+ b = self.blocks[i]
+ d = b.to_native_types(slicer=slicer, na_rep=self.na_rep,
+ float_format=self.float_format,
+ decimal=self.decimal,
+ date_format=self.date_format,
+ quoting=self.quoting)
+
+ for col_loc, col in zip(b.mgr_locs, d):
+ # self.data is a preallocated list
+ self.data[col_loc] = col
+
+ ix = data_index.to_native_types(slicer=slicer, na_rep=self.na_rep,
+ float_format=self.float_format,
+ decimal=self.decimal,
+ date_format=self.date_format,
+ quoting=self.quoting)
+
+ libwriters.write_csv_rows(self.data, ix, self.nlevels,
+ self.cols, self.writer)
diff --git a/contrib/python/pandas/py2/pandas/io/formats/excel.py b/contrib/python/pandas/py2/pandas/io/formats/excel.py
new file mode 100644
index 00000000000..d74722996a6
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/io/formats/excel.py
@@ -0,0 +1,664 @@
+"""Utilities for conversion to writer-agnostic Excel representation
+"""
+
+import itertools
+import re
+import warnings
+
+import numpy as np
+
+from pandas.compat import reduce
+
+from pandas.core.dtypes import missing
+from pandas.core.dtypes.common import is_float, is_scalar
+from pandas.core.dtypes.generic import ABCMultiIndex, ABCPeriodIndex
+
+from pandas import Index
+import pandas.core.common as com
+
+from pandas.io.formats.css import CSSResolver, CSSWarning
+from pandas.io.formats.format import get_level_lengths
+from pandas.io.formats.printing import pprint_thing
+
+
+class ExcelCell(object):
+ __fields__ = ('row', 'col', 'val', 'style', 'mergestart', 'mergeend')
+ __slots__ = __fields__
+
+ def __init__(self, row, col, val, style=None, mergestart=None,
+ mergeend=None):
+ self.row = row
+ self.col = col
+ self.val = val
+ self.style = style
+ self.mergestart = mergestart
+ self.mergeend = mergeend
+
+
+class CSSToExcelConverter(object):
+ """A callable for converting CSS declarations to ExcelWriter styles
+
+ Supports parts of CSS 2.2, with minimal CSS 3.0 support (e.g. text-shadow),
+ focusing on font styling, backgrounds, borders and alignment.
+
+ Operates by first computing CSS styles in a fairly generic
+ way (see :meth:`compute_css`) then determining Excel style
+ properties from CSS properties (see :meth:`build_xlstyle`).
+
+ Parameters
+ ----------
+ inherited : str, optional
+ CSS declarations understood to be the containing scope for the
+ CSS processed by :meth:`__call__`.
+ """
+ # NB: Most of the methods here could be classmethods, as only __init__
+ # and __call__ make use of instance attributes. We leave them as
+ # instancemethods so that users can easily experiment with extensions
+ # without monkey-patching.
+
+ def __init__(self, inherited=None):
+ if inherited is not None:
+ inherited = self.compute_css(inherited,
+ self.compute_css.INITIAL_STYLE)
+
+ self.inherited = inherited
+
+ compute_css = CSSResolver()
+
+ def __call__(self, declarations_str):
+ """Convert CSS declarations to ExcelWriter style
+
+ Parameters
+ ----------
+ declarations_str : str
+ List of CSS declarations.
+ e.g. "font-weight: bold; background: blue"
+
+ Returns
+ -------
+ xlstyle : dict
+ A style as interpreted by ExcelWriter when found in
+ ExcelCell.style.
+ """
+ # TODO: memoize?
+ properties = self.compute_css(declarations_str, self.inherited)
+ return self.build_xlstyle(properties)
+
+ def build_xlstyle(self, props):
+ out = {
+ 'alignment': self.build_alignment(props),
+ 'border': self.build_border(props),
+ 'fill': self.build_fill(props),
+ 'font': self.build_font(props),
+ 'number_format': self.build_number_format(props),
+ }
+ # TODO: handle cell width and height: needs support in pandas.io.excel
+
+ def remove_none(d):
+ """Remove key where value is None, through nested dicts"""
+ for k, v in list(d.items()):
+ if v is None:
+ del d[k]
+ elif isinstance(v, dict):
+ remove_none(v)
+ if not v:
+ del d[k]
+
+ remove_none(out)
+ return out
+
+ VERTICAL_MAP = {
+ 'top': 'top',
+ 'text-top': 'top',
+ 'middle': 'center',
+ 'baseline': 'bottom',
+ 'bottom': 'bottom',
+ 'text-bottom': 'bottom',
+ # OpenXML also has 'justify', 'distributed'
+ }
+
+ def build_alignment(self, props):
+ # TODO: text-indent, padding-left -> alignment.indent
+ return {'horizontal': props.get('text-align'),
+ 'vertical': self.VERTICAL_MAP.get(props.get('vertical-align')),
+ 'wrap_text': (None if props.get('white-space') is None else
+ props['white-space'] not in
+ ('nowrap', 'pre', 'pre-line'))
+ }
+
+ def build_border(self, props):
+ return {side: {
+ 'style': self._border_style(props.get('border-{side}-style'
+ .format(side=side)),
+ props.get('border-{side}-width'
+ .format(side=side))),
+ 'color': self.color_to_excel(
+ props.get('border-{side}-color'.format(side=side))),
+ } for side in ['top', 'right', 'bottom', 'left']}
+
+ def _border_style(self, style, width):
+ # convert styles and widths to openxml, one of:
+ # 'dashDot'
+ # 'dashDotDot'
+ # 'dashed'
+ # 'dotted'
+ # 'double'
+ # 'hair'
+ # 'medium'
+ # 'mediumDashDot'
+ # 'mediumDashDotDot'
+ # 'mediumDashed'
+ # 'slantDashDot'
+ # 'thick'
+ # 'thin'
+ if width is None and style is None:
+ return None
+ if style == 'none' or style == 'hidden':
+ return None
+
+ if width is None:
+ width = '2pt'
+ width = float(width[:-2])
+ if width < 1e-5:
+ return None
+ elif width < 1.3:
+ width_name = 'thin'
+ elif width < 2.8:
+ width_name = 'medium'
+ else:
+ width_name = 'thick'
+
+ if style in (None, 'groove', 'ridge', 'inset', 'outset'):
+ # not handled
+ style = 'solid'
+
+ if style == 'double':
+ return 'double'
+ if style == 'solid':
+ return width_name
+ if style == 'dotted':
+ if width_name in ('hair', 'thin'):
+ return 'dotted'
+ return 'mediumDashDotDot'
+ if style == 'dashed':
+ if width_name in ('hair', 'thin'):
+ return 'dashed'
+ return 'mediumDashed'
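+ # Illustrative mapping sketch:
+ #   _border_style('dashed', '1pt') -> 'dashed' (thin + dashed)
+ #   _border_style('solid', '3pt')  -> 'thick'
+ #   _border_style('none', '2pt')   -> None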
+
+ def build_fill(self, props):
+ # TODO: perhaps allow for special properties
+ # -excel-pattern-bgcolor and -excel-pattern-type
+ fill_color = props.get('background-color')
+ if fill_color not in (None, 'transparent', 'none'):
+ return {
+ 'fgColor': self.color_to_excel(fill_color),
+ 'patternType': 'solid',
+ }
+
+ BOLD_MAP = {'bold': True, 'bolder': True, '600': True, '700': True,
+ '800': True, '900': True,
+ 'normal': False, 'lighter': False, '100': False, '200': False,
+ '300': False, '400': False, '500': False}
+ ITALIC_MAP = {'normal': False, 'italic': True, 'oblique': True}
+
+ def build_font(self, props):
+ size = props.get('font-size')
+ if size is not None:
+ assert size.endswith('pt')
+ size = float(size[:-2])
+
+ font_names_tmp = re.findall(r'''(?x)
+ (
+ "(?:[^"]|\\")+"
+ |
+ '(?:[^']|\\')+'
+ |
+ [^'",]+
+ )(?=,|\s*$)
+ ''', props.get('font-family', ''))
+ font_names = []
+ for name in font_names_tmp:
+ if name[:1] == '"':
+ name = name[1:-1].replace('\\"', '"')
+ elif name[:1] == '\'':
+ name = name[1:-1].replace('\\\'', '\'')
+ else:
+ name = name.strip()
+ if name:
+ font_names.append(name)
+
+ family = None
+ for name in font_names:
+ if name == 'serif':
+ family = 1 # roman
+ break
+ elif name == 'sans-serif':
+ family = 2 # swiss
+ break
+ elif name == 'cursive':
+ family = 4 # script
+ break
+ elif name == 'fantasy':
+ family = 5 # decorative
+ break
+
+ decoration = props.get('text-decoration')
+ if decoration is not None:
+ decoration = decoration.split()
+ else:
+ decoration = ()
+
+ return {
+ 'name': font_names[0] if font_names else None,
+ 'family': family,
+ 'size': size,
+ 'bold': self.BOLD_MAP.get(props.get('font-weight')),
+ 'italic': self.ITALIC_MAP.get(props.get('font-style')),
+ 'underline': ('single' if
+ 'underline' in decoration
+ else None),
+ 'strike': ('line-through' in decoration) or None,
+ 'color': self.color_to_excel(props.get('color')),
+ # shadow if nonzero digit before shadow color
+ 'shadow': (bool(re.search('^[^#(]*[1-9]',
+ props['text-shadow']))
+ if 'text-shadow' in props else None),
+ # 'vertAlign':,
+ # 'charset': ,
+ # 'scheme': ,
+ # 'outline': ,
+ # 'condense': ,
+ }
+
+ NAMED_COLORS = {
+ 'maroon': '800000',
+ 'brown': 'A52A2A',
+ 'red': 'FF0000',
+ 'pink': 'FFC0CB',
+ 'orange': 'FFA500',
+ 'yellow': 'FFFF00',
+ 'olive': '808000',
+ 'green': '008000',
+ 'purple': '800080',
+ 'fuchsia': 'FF00FF',
+ 'lime': '00FF00',
+ 'teal': '008080',
+ 'aqua': '00FFFF',
+ 'blue': '0000FF',
+ 'navy': '000080',
+ 'black': '000000',
+ 'gray': '808080',
+ 'grey': '808080',
+ 'silver': 'C0C0C0',
+ 'white': 'FFFFFF',
+ }
+
+ def color_to_excel(self, val):
+ if val is None:
+ return None
+ if val.startswith('#') and len(val) == 7:
+ return val[1:].upper()
+ if val.startswith('#') and len(val) == 4:
+ return (val[1] * 2 + val[2] * 2 + val[3] * 2).upper()
+ try:
+ return self.NAMED_COLORS[val]
+ except KeyError:
+ warnings.warn('Unhandled color format: {val!r}'.format(val=val),
+ CSSWarning)
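+ # Illustrative sketch:
+ #   color_to_excel('red')     -> 'FF0000'
+ #   color_to_excel('#f0e')    -> 'FF00EE'
+ #   color_to_excel('#ff0000') -> 'FF0000'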
+
+ def build_number_format(self, props):
+ return {'format_code': props.get('number-format')}
+
+
+class ExcelFormatter(object):
+ """
+ Class for formatting a DataFrame to a list of ExcelCells.
+
+ Parameters
+ ----------
+ df : DataFrame or Styler
+ na_rep : string, default ''
+ Missing data representation
+ float_format : string, default None
+ Format string for floating point numbers
+ cols : sequence, optional
+ Columns to write
+ header : boolean or list of string, default True
+ Write out column names. If a list of strings is given, it is
+ assumed to be aliases for the column names
+ index : boolean, default True
+ Output row names (index)
+ index_label : string or sequence, default None
+ Column label for index column(s) if desired. If None is given, and
+ `header` and `index` are True, then the index names are used. A
+ sequence should be given if the DataFrame uses MultiIndex.
+ merge_cells : boolean, default False
+ Format MultiIndex and Hierarchical Rows as merged cells.
+ inf_rep : string, default `'inf'`
+ Representation for np.inf values (which aren't representable in
+ Excel). A `'-'` sign will be added in front of -inf.
+ style_converter : callable, optional
+ This translates Styler styles (CSS) into ExcelWriter styles.
+ Defaults to ``CSSToExcelConverter()``.
+ It should take a string of CSS declarations and return an Excel style.
+ This is only called for body cells.
+ """
+
+ def __init__(self, df, na_rep='', float_format=None, cols=None,
+ header=True, index=True, index_label=None, merge_cells=False,
+ inf_rep='inf', style_converter=None):
+ self.rowcounter = 0
+ self.na_rep = na_rep
+ if hasattr(df, 'render'):
+ self.styler = df
+ df = df.data
+ if style_converter is None:
+ style_converter = CSSToExcelConverter()
+ self.style_converter = style_converter
+ else:
+ self.styler = None
+ self.df = df
+ if cols is not None:
+
+ # all missing, raise
+ if not len(Index(cols) & df.columns):
+ raise KeyError(
+ "passes columns are not ALL present dataframe")
+
+ # deprecated in gh-17295
+ # 1 missing is ok (for now)
+ if len(Index(cols) & df.columns) != len(cols):
+ warnings.warn(
+ "Not all names specified in 'columns' are found; "
+ "this will raise a KeyError in the future",
+ FutureWarning)
+
+ self.df = df.reindex(columns=cols)
+ self.columns = self.df.columns
+ self.float_format = float_format
+ self.index = index
+ self.index_label = index_label
+ self.header = header
+ self.merge_cells = merge_cells
+ self.inf_rep = inf_rep
+
+ @property
+ def header_style(self):
+ return {"font": {"bold": True},
+ "borders": {"top": "thin",
+ "right": "thin",
+ "bottom": "thin",
+ "left": "thin"},
+ "alignment": {"horizontal": "center",
+ "vertical": "top"}}
+
+ def _format_value(self, val):
+ if is_scalar(val) and missing.isna(val):
+ val = self.na_rep
+ elif is_float(val):
+ if missing.isposinf_scalar(val):
+ val = self.inf_rep
+ elif missing.isneginf_scalar(val):
+ val = '-{inf}'.format(inf=self.inf_rep)
+ elif self.float_format is not None:
+ val = float(self.float_format % val)
+ return val
+
+ def _format_header_mi(self):
+ if self.columns.nlevels > 1:
+ if not self.index:
+ raise NotImplementedError("Writing to Excel with MultiIndex"
+ " columns and no index "
+ "('index'=False) is not yet "
+ "implemented.")
+
+ has_aliases = isinstance(self.header, (tuple, list, np.ndarray, Index))
+ if not (has_aliases or self.header):
+ return
+
+ columns = self.columns
+ level_strs = columns.format(sparsify=self.merge_cells, adjoin=False,
+ names=False)
+ level_lengths = get_level_lengths(level_strs)
+ coloffset = 0
+ lnum = 0
+
+ if self.index and isinstance(self.df.index, ABCMultiIndex):
+ coloffset = len(self.df.index[0]) - 1
+
+ if self.merge_cells:
+ # Format multi-index as merged cells.
+ for lnum in range(len(level_lengths)):
+ name = columns.names[lnum]
+ yield ExcelCell(lnum, coloffset, name, self.header_style)
+
+ for lnum, (spans, levels, level_codes) in enumerate(zip(
+ level_lengths, columns.levels, columns.codes)):
+ values = levels.take(level_codes)
+ for i in spans:
+ if spans[i] > 1:
+ yield ExcelCell(lnum, coloffset + i + 1, values[i],
+ self.header_style, lnum,
+ coloffset + i + spans[i])
+ else:
+ yield ExcelCell(lnum, coloffset + i + 1, values[i],
+ self.header_style)
+ else:
+ # Format in legacy format with dots to indicate levels.
+ for i, values in enumerate(zip(*level_strs)):
+ v = ".".join(map(pprint_thing, values))
+ yield ExcelCell(lnum, coloffset + i + 1, v, self.header_style)
+
+ self.rowcounter = lnum
+
+ def _format_header_regular(self):
+ has_aliases = isinstance(self.header, (tuple, list, np.ndarray, Index))
+ if has_aliases or self.header:
+ coloffset = 0
+
+ if self.index:
+ coloffset = 1
+ if isinstance(self.df.index, ABCMultiIndex):
+ coloffset = len(self.df.index[0])
+
+ colnames = self.columns
+ if has_aliases:
+ if len(self.header) != len(self.columns):
+ raise ValueError('Writing {cols} cols but got {alias} '
+ 'aliases'.format(cols=len(self.columns),
+ alias=len(self.header)))
+ else:
+ colnames = self.header
+
+ for colindex, colname in enumerate(colnames):
+ yield ExcelCell(self.rowcounter, colindex + coloffset, colname,
+ self.header_style)
+
+ def _format_header(self):
+ if isinstance(self.columns, ABCMultiIndex):
+ gen = self._format_header_mi()
+ else:
+ gen = self._format_header_regular()
+
+ gen2 = ()
+ if self.df.index.names:
+ row = [x if x is not None else ''
+ for x in self.df.index.names] + [''] * len(self.columns)
+ if reduce(lambda x, y: x and y, map(lambda x: x != '', row)):
+ gen2 = (ExcelCell(self.rowcounter, colindex, val,
+ self.header_style)
+ for colindex, val in enumerate(row))
+ self.rowcounter += 1
+ return itertools.chain(gen, gen2)
+
+ def _format_body(self):
+
+ if isinstance(self.df.index, ABCMultiIndex):
+ return self._format_hierarchical_rows()
+ else:
+ return self._format_regular_rows()
+
+ def _format_regular_rows(self):
+ has_aliases = isinstance(self.header, (tuple, list, np.ndarray, Index))
+ if has_aliases or self.header:
+ self.rowcounter += 1
+
+ # output index and index_label?
+ if self.index:
+ # check aliases
+ # if list only take first as this is not a MultiIndex
+ if (self.index_label and
+ isinstance(self.index_label, (list, tuple, np.ndarray,
+ Index))):
+ index_label = self.index_label[0]
+ # if string good to go
+ elif self.index_label and isinstance(self.index_label, str):
+ index_label = self.index_label
+ else:
+ index_label = self.df.index.names[0]
+
+ if isinstance(self.columns, ABCMultiIndex):
+ self.rowcounter += 1
+
+ if index_label and self.header is not False:
+ yield ExcelCell(self.rowcounter - 1, 0, index_label,
+ self.header_style)
+
+ # write index_values
+ index_values = self.df.index
+ if isinstance(self.df.index, ABCPeriodIndex):
+ index_values = self.df.index.to_timestamp()
+
+ for idx, idxval in enumerate(index_values):
+ yield ExcelCell(self.rowcounter + idx, 0, idxval,
+ self.header_style)
+
+ coloffset = 1
+ else:
+ coloffset = 0
+
+ for cell in self._generate_body(coloffset):
+ yield cell
+
+ def _format_hierarchical_rows(self):
+ has_aliases = isinstance(self.header, (tuple, list, np.ndarray, Index))
+ if has_aliases or self.header:
+ self.rowcounter += 1
+
+ gcolidx = 0
+
+ if self.index:
+ index_labels = self.df.index.names
+ # check for aliases
+ if (self.index_label and
+ isinstance(self.index_label, (list, tuple, np.ndarray,
+ Index))):
+ index_labels = self.index_label
+
+ # MultiIndex columns require an extra row
+ # with index names (blank if None) for
+ # unambiguous round-trip, unless not merging,
+ # in which case the names all go on one row Issue #11328
+ if isinstance(self.columns, ABCMultiIndex) and self.merge_cells:
+ self.rowcounter += 1
+
+ # if index labels are not empty go ahead and dump
+ if com._any_not_none(*index_labels) and self.header is not False:
+
+ for cidx, name in enumerate(index_labels):
+ yield ExcelCell(self.rowcounter - 1, cidx, name,
+ self.header_style)
+
+ if self.merge_cells:
+ # Format hierarchical rows as merged cells.
+ level_strs = self.df.index.format(sparsify=True, adjoin=False,
+ names=False)
+ level_lengths = get_level_lengths(level_strs)
+
+ for spans, levels, level_codes in zip(level_lengths,
+ self.df.index.levels,
+ self.df.index.codes):
+
+ values = levels.take(level_codes,
+ allow_fill=levels._can_hold_na,
+ fill_value=True)
+
+ for i in spans:
+ if spans[i] > 1:
+ yield ExcelCell(self.rowcounter + i, gcolidx,
+ values[i], self.header_style,
+ self.rowcounter + i + spans[i] - 1,
+ gcolidx)
+ else:
+ yield ExcelCell(self.rowcounter + i, gcolidx,
+ values[i], self.header_style)
+ gcolidx += 1
+
+ else:
+ # Format hierarchical rows with non-merged values.
+ for indexcolvals in zip(*self.df.index):
+ for idx, indexcolval in enumerate(indexcolvals):
+ yield ExcelCell(self.rowcounter + idx, gcolidx,
+ indexcolval, self.header_style)
+ gcolidx += 1
+
+ for cell in self._generate_body(gcolidx):
+ yield cell
+
+ def _generate_body(self, coloffset):
+ if self.styler is None:
+ styles = None
+ else:
+ styles = self.styler._compute().ctx
+ if not styles:
+ styles = None
+ xlstyle = None
+
+ # Write the body of the frame data series by series.
+ for colidx in range(len(self.columns)):
+ series = self.df.iloc[:, colidx]
+ for i, val in enumerate(series):
+ if styles is not None:
+ xlstyle = self.style_converter(';'.join(styles[i, colidx]))
+ yield ExcelCell(self.rowcounter + i, colidx + coloffset, val,
+ xlstyle)
+
+ def get_formatted_cells(self):
+ for cell in itertools.chain(self._format_header(),
+ self._format_body()):
+ cell.val = self._format_value(cell.val)
+ yield cell
+
+ def write(self, writer, sheet_name='Sheet1', startrow=0,
+ startcol=0, freeze_panes=None, engine=None):
+ """
+ writer : string or ExcelWriter object
+ File path or existing ExcelWriter
+ sheet_name : string, default 'Sheet1'
+ Name of sheet which will contain DataFrame
+ startrow :
+ upper left cell row to dump data frame
+ startcol :
+ upper left cell column to dump data frame
+ freeze_panes : tuple of integer (length 2), default None
+ Specifies the one-based bottommost row and rightmost column that
+ are to be frozen
+ engine : string, default None
+ write engine to use if writer is a path - you can also set this
+ via the options ``io.excel.xlsx.writer``, ``io.excel.xls.writer``,
+ and ``io.excel.xlsm.writer``.
+ """
+ from pandas.io.excel import ExcelWriter
+ from pandas.io.common import _stringify_path
+
+ if isinstance(writer, ExcelWriter):
+ need_save = False
+ else:
+ writer = ExcelWriter(_stringify_path(writer), engine=engine)
+ need_save = True
+
+ formatted_cells = self.get_formatted_cells()
+ writer.write_cells(formatted_cells, sheet_name,
+ startrow=startrow, startcol=startcol,
+ freeze_panes=freeze_panes)
+ if need_save:
+ writer.save()
diff --git a/contrib/python/pandas/py2/pandas/io/formats/format.py b/contrib/python/pandas/py2/pandas/io/formats/format.py
new file mode 100644
index 00000000000..f68ef2cc390
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/io/formats/format.py
@@ -0,0 +1,1626 @@
+# -*- coding: utf-8 -*-
+"""
+Internal module for formatting output data in csv, html,
+and latex files. This module also applies to display formatting.
+"""
+
+from __future__ import print_function
+
+from functools import partial
+
+import numpy as np
+
+from pandas._libs import lib
+from pandas._libs.tslib import format_array_from_datetime
+from pandas._libs.tslibs import NaT, Timedelta, Timestamp, iNaT
+from pandas.compat import StringIO, lzip, map, u, zip
+
+from pandas.core.dtypes.common import (
+ is_categorical_dtype, is_datetime64_dtype, is_datetime64tz_dtype,
+ is_extension_array_dtype, is_float, is_float_dtype, is_integer,
+ is_integer_dtype, is_list_like, is_numeric_dtype, is_scalar,
+ is_timedelta64_dtype)
+from pandas.core.dtypes.generic import (
+ ABCIndexClass, ABCMultiIndex, ABCSeries, ABCSparseArray)
+from pandas.core.dtypes.missing import isna, notna
+
+from pandas import compat
+from pandas.core.base import PandasObject
+import pandas.core.common as com
+from pandas.core.config import get_option, set_option
+from pandas.core.index import Index, ensure_index
+from pandas.core.indexes.datetimes import DatetimeIndex
+
+from pandas.io.common import _expand_user, _stringify_path
+from pandas.io.formats.printing import adjoin, justify, pprint_thing
+from pandas.io.formats.terminal import get_terminal_size
+
+# pylint: disable=W0141
+
+
+common_docstring = """
+ Parameters
+ ----------
+ buf : StringIO-like, optional
+ Buffer to write to.
+ columns : sequence, optional, default None
+ The subset of columns to write. Writes all columns by default.
+ col_space : int, optional
+ The minimum width of each column.
+ header : bool, optional
+ %(header)s.
+ index : bool, optional, default True
+ Whether to print index (row) labels.
+ na_rep : str, optional, default 'NaN'
+ String representation of NAN to use.
+ formatters : list or dict of one-param. functions, optional
+ Formatter functions to apply to columns' elements by position or
+ name.
+ The result of each function must be a unicode string.
+ List must be of length equal to the number of columns.
+ float_format : one-parameter function, optional, default None
+ Formatter function to apply to columns' elements if they are
+ floats. The result of this function must be a unicode string.
+ sparsify : bool, optional, default True
+ Set to False for a DataFrame with a hierarchical index to print
+ every multiindex key at each row.
+ index_names : bool, optional, default True
+ Prints the names of the indexes.
+ justify : str, default None
+ How to justify the column labels. If None uses the option from
+ the print configuration (controlled by set_option), 'right' out
+ of the box. Valid values are
+
+ * left
+ * right
+ * center
+ * justify
+ * justify-all
+ * start
+ * end
+ * inherit
+ * match-parent
+ * initial
+ * unset.
+ max_rows : int, optional
+ Maximum number of rows to display in the console.
+ max_cols : int, optional
+ Maximum number of columns to display in the console.
+ show_dimensions : bool, default False
+ Display DataFrame dimensions (number of rows by number of columns).
+ decimal : str, default '.'
+ Character recognized as decimal separator, e.g. ',' in Europe.
+
+ .. versionadded:: 0.18.0
+ """
+
+_VALID_JUSTIFY_PARAMETERS = ("left", "right", "center", "justify",
+ "justify-all", "start", "end", "inherit",
+ "match-parent", "initial", "unset")
+
+return_docstring = """
+ Returns
+ -------
+ str (or unicode, depending on data and options)
+ String representation of the dataframe.
+ """
+
+
+class CategoricalFormatter(object):
+
+ def __init__(self, categorical, buf=None, length=True, na_rep='NaN',
+ footer=True):
+ self.categorical = categorical
+ self.buf = buf if buf is not None else StringIO(u(""))
+ self.na_rep = na_rep
+ self.length = length
+ self.footer = footer
+
+ def _get_footer(self):
+ footer = ''
+
+ if self.length:
+ if footer:
+ footer += ', '
+ footer += "Length: {length}".format(length=len(self.categorical))
+
+ level_info = self.categorical._repr_categories_info()
+
+ # Levels are added in a newline
+ if footer:
+ footer += '\n'
+ footer += level_info
+
+ return compat.text_type(footer)
+
+ def _get_formatted_values(self):
+ return format_array(self.categorical.get_values(), None,
+ float_format=None, na_rep=self.na_rep)
+
+ def to_string(self):
+ categorical = self.categorical
+
+ if len(categorical) == 0:
+ if self.footer:
+ return self._get_footer()
+ else:
+ return u('')
+
+ fmt_values = self._get_formatted_values()
+
+ result = [u('{i}').format(i=i) for i in fmt_values]
+ result = [i.strip() for i in result]
+ result = u(', ').join(result)
+ result = [u('[') + result + u(']')]
+ if self.footer:
+ footer = self._get_footer()
+ if footer:
+ result.append(footer)
+
+ return compat.text_type(u('\n').join(result))
+
+
+class SeriesFormatter(object):
+
+ def __init__(self, series, buf=None, length=True, header=True, index=True,
+ na_rep='NaN', name=False, float_format=None, dtype=True,
+ max_rows=None):
+ self.series = series
+ self.buf = buf if buf is not None else StringIO()
+ self.name = name
+ self.na_rep = na_rep
+ self.header = header
+ self.length = length
+ self.index = index
+ self.max_rows = max_rows
+
+ if float_format is None:
+ float_format = get_option("display.float_format")
+ self.float_format = float_format
+ self.dtype = dtype
+ self.adj = _get_adjustment()
+
+ self._chk_truncate()
+
+ def _chk_truncate(self):
+ from pandas.core.reshape.concat import concat
+ max_rows = self.max_rows
+ truncate_v = max_rows and (len(self.series) > max_rows)
+ series = self.series
+ if truncate_v:
+ if max_rows == 1:
+ row_num = max_rows
+ series = series.iloc[:max_rows]
+ else:
+ row_num = max_rows // 2
+ series = concat((series.iloc[:row_num],
+ series.iloc[-row_num:]))
+ self.tr_row_num = row_num
+ self.tr_series = series
+ self.truncate_v = truncate_v
+
+ def _get_footer(self):
+ name = self.series.name
+ footer = u('')
+
+ if getattr(self.series.index, 'freq', None) is not None:
+ footer += 'Freq: {freq}'.format(freq=self.series.index.freqstr)
+
+ if self.name is not False and name is not None:
+ if footer:
+ footer += ', '
+
+ series_name = pprint_thing(name,
+ escape_chars=('\t', '\r', '\n'))
+ footer += ((u"Name: {sname}".format(sname=series_name))
+ if name is not None else "")
+
+ if (self.length is True or
+ (self.length == 'truncate' and self.truncate_v)):
+ if footer:
+ footer += ', '
+ footer += 'Length: {length}'.format(length=len(self.series))
+
+ if self.dtype is not False and self.dtype is not None:
+ name = getattr(self.tr_series.dtype, 'name', None)
+ if name:
+ if footer:
+ footer += ', '
+ footer += u'dtype: {typ}'.format(typ=pprint_thing(name))
+
+ # level infos are added to the end and in a new line, like it is done
+ # for Categoricals
+ if is_categorical_dtype(self.tr_series.dtype):
+ level_info = self.tr_series._values._repr_categories_info()
+ if footer:
+ footer += "\n"
+ footer += level_info
+
+ return compat.text_type(footer)
+
+ def _get_formatted_index(self):
+ index = self.tr_series.index
+ is_multi = isinstance(index, ABCMultiIndex)
+
+ if is_multi:
+ have_header = any(name for name in index.names)
+ fmt_index = index.format(names=True)
+ else:
+ have_header = index.name is not None
+ fmt_index = index.format(name=True)
+ return fmt_index, have_header
+
+ def _get_formatted_values(self):
+ values_to_format = self.tr_series._formatting_values()
+ return format_array(values_to_format, None,
+ float_format=self.float_format, na_rep=self.na_rep)
+
+ def to_string(self):
+ series = self.tr_series
+ footer = self._get_footer()
+
+ if len(series) == 0:
+ return 'Series([], ' + footer + ')'
+
+ fmt_index, have_header = self._get_formatted_index()
+ fmt_values = self._get_formatted_values()
+
+ if self.truncate_v:
+ n_header_rows = 0
+ row_num = self.tr_row_num
+ width = self.adj.len(fmt_values[row_num - 1])
+ if width > 3:
+ dot_str = '...'
+ else:
+ dot_str = '..'
+ # Series uses mode=center because it has single value columns
+ # DataFrame uses mode=left
+ dot_str = self.adj.justify([dot_str], width, mode='center')[0]
+ fmt_values.insert(row_num + n_header_rows, dot_str)
+ fmt_index.insert(row_num + 1, '')
+
+ if self.index:
+ result = self.adj.adjoin(3, *[fmt_index[1:], fmt_values])
+ else:
+ result = self.adj.adjoin(3, fmt_values)
+
+ if self.header and have_header:
+ result = fmt_index[0] + '\n' + result
+
+ if footer:
+ result += '\n' + footer
+
+ return compat.text_type(u('').join(result))
+
+
+class TextAdjustment(object):
+
+ def __init__(self):
+ self.encoding = get_option("display.encoding")
+
+ def len(self, text):
+ return compat.strlen(text, encoding=self.encoding)
+
+ def justify(self, texts, max_len, mode='right'):
+ return justify(texts, max_len, mode=mode)
+
+ def adjoin(self, space, *lists, **kwargs):
+ return adjoin(space, *lists, strlen=self.len,
+ justfunc=self.justify, **kwargs)
+
+
+class EastAsianTextAdjustment(TextAdjustment):
+
+ def __init__(self):
+ super(EastAsianTextAdjustment, self).__init__()
+ if get_option("display.unicode.ambiguous_as_wide"):
+ self.ambiguous_width = 2
+ else:
+ self.ambiguous_width = 1
+
+ def len(self, text):
+ return compat.east_asian_len(text, encoding=self.encoding,
+ ambiguous_width=self.ambiguous_width)
+
+ def justify(self, texts, max_len, mode='right'):
+ # re-calculate padding space per str considering East Asian Width
+ def _get_pad(t):
+ return max_len - self.len(t) + len(t)
+
+ if mode == 'left':
+ return [x.ljust(_get_pad(x)) for x in texts]
+ elif mode == 'center':
+ return [x.center(_get_pad(x)) for x in texts]
+ else:
+ return [x.rjust(_get_pad(x)) for x in texts]
+
+
+def _get_adjustment():
+ use_east_asian_width = get_option("display.unicode.east_asian_width")
+ if use_east_asian_width:
+ return EastAsianTextAdjustment()
+ else:
+ return TextAdjustment()
+
+
+class TableFormatter(object):
+
+ is_truncated = False
+ show_dimensions = None
+
+ @property
+ def should_show_dimensions(self):
+ return (self.show_dimensions is True or
+ (self.show_dimensions == 'truncate' and self.is_truncated))
+
+ def _get_formatter(self, i):
+ if isinstance(self.formatters, (list, tuple)):
+ if is_integer(i):
+ return self.formatters[i]
+ else:
+ return None
+ else:
+ if is_integer(i) and i not in self.columns:
+ i = self.columns[i]
+ return self.formatters.get(i, None)
+
+
+class DataFrameFormatter(TableFormatter):
+ """
+ Render a DataFrame
+
+ self.to_string() : console-friendly tabular output
+ self.to_html() : html table
+ self.to_latex() : LaTeX tabular environment table
+
+ """
+
+ __doc__ = __doc__ if __doc__ else ''
+ __doc__ += common_docstring + return_docstring
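+ # Minimal usage sketch, assuming ``df`` is a DataFrame (illustrative):
+ #   fmt = DataFrameFormatter(df, max_rows=10, show_dimensions=True)
+ #   fmt.to_string()                  # renders into fmt.buf
+ #   text = fmt.buf.getvalue()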
+
+ def __init__(self, frame, buf=None, columns=None, col_space=None,
+ header=True, index=True, na_rep='NaN', formatters=None,
+ justify=None, float_format=None, sparsify=None,
+ index_names=True, line_width=None, max_rows=None,
+ max_cols=None, show_dimensions=False, decimal='.',
+ table_id=None, render_links=False, **kwds):
+ self.frame = frame
+ if buf is not None:
+ self.buf = _expand_user(_stringify_path(buf))
+ else:
+ self.buf = StringIO()
+ self.show_index_names = index_names
+
+ if sparsify is None:
+ sparsify = get_option("display.multi_sparse")
+
+ self.sparsify = sparsify
+
+ self.float_format = float_format
+ self.formatters = formatters if formatters is not None else {}
+ self.na_rep = na_rep
+ self.decimal = decimal
+ self.col_space = col_space
+ self.header = header
+ self.index = index
+ self.line_width = line_width
+ self.max_rows = max_rows
+ self.max_cols = max_cols
+ self.max_rows_displayed = min(max_rows or len(self.frame),
+ len(self.frame))
+ self.show_dimensions = show_dimensions
+ self.table_id = table_id
+ self.render_links = render_links
+
+ if justify is None:
+ self.justify = get_option("display.colheader_justify")
+ else:
+ self.justify = justify
+
+ self.kwds = kwds
+
+ if columns is not None:
+ self.columns = ensure_index(columns)
+ self.frame = self.frame[self.columns]
+ else:
+ self.columns = frame.columns
+
+ self._chk_truncate()
+ self.adj = _get_adjustment()
+
+ def _chk_truncate(self):
+ """
+ Checks whether the frame should be truncated. If so, slices
+ the frame up.
+ """
+ from pandas.core.reshape.concat import concat
+
+ # Cut the data to the information actually printed
+ max_cols = self.max_cols
+ max_rows = self.max_rows
+
+ if max_cols == 0 or max_rows == 0: # assume we are in the terminal
+ # (why else would it be 0?)
+ (w, h) = get_terminal_size()
+ self.w = w
+ self.h = h
+ if self.max_rows == 0:
+ dot_row = 1
+ prompt_row = 1
+ if self.show_dimensions:
+ show_dimension_rows = 3
+ n_add_rows = (self.header + dot_row + show_dimension_rows +
+ prompt_row)
+ # rows available to fill with actual data
+ max_rows_adj = self.h - n_add_rows
+ self.max_rows_adj = max_rows_adj
+
+ # Format only rows and columns that could potentially fit the
+ # screen
+ if max_cols == 0 and len(self.frame.columns) > w:
+ max_cols = w
+ if max_rows == 0 and len(self.frame) > h:
+ max_rows = h
+
+ if not hasattr(self, 'max_rows_adj'):
+ self.max_rows_adj = max_rows
+ if not hasattr(self, 'max_cols_adj'):
+ self.max_cols_adj = max_cols
+
+ max_cols_adj = self.max_cols_adj
+ max_rows_adj = self.max_rows_adj
+
+ truncate_h = max_cols_adj and (len(self.columns) > max_cols_adj)
+ truncate_v = max_rows_adj and (len(self.frame) > max_rows_adj)
+
+ frame = self.frame
+ if truncate_h:
+ if max_cols_adj == 0:
+ col_num = len(frame.columns)
+ elif max_cols_adj == 1:
+ frame = frame.iloc[:, :max_cols]
+ col_num = max_cols
+ else:
+ col_num = (max_cols_adj // 2)
+ frame = concat((frame.iloc[:, :col_num],
+ frame.iloc[:, -col_num:]), axis=1)
+ self.tr_col_num = col_num
+ if truncate_v:
+ if max_rows_adj == 1:
+ row_num = max_rows
+ frame = frame.iloc[:max_rows, :]
+ else:
+ row_num = max_rows_adj // 2
+ frame = concat((frame.iloc[:row_num, :],
+ frame.iloc[-row_num:, :]))
+ self.tr_row_num = row_num
+
+ self.tr_frame = frame
+ self.truncate_h = truncate_h
+ self.truncate_v = truncate_v
+ self.is_truncated = self.truncate_h or self.truncate_v
+
+ def _to_str_columns(self):
+ """
+ Render a DataFrame to a list of columns (as lists of strings).
+ """
+ frame = self.tr_frame
+ # may include levels names also
+
+ str_index = self._get_formatted_index(frame)
+
+ if not is_list_like(self.header) and not self.header:
+ stringified = []
+ for i, c in enumerate(frame):
+ fmt_values = self._format_col(i)
+ fmt_values = _make_fixed_width(fmt_values, self.justify,
+ minimum=(self.col_space or 0),
+ adj=self.adj)
+ stringified.append(fmt_values)
+ else:
+ if is_list_like(self.header):
+ if len(self.header) != len(self.columns):
+ raise ValueError(('Writing {ncols} cols but got {nalias} '
+ 'aliases'
+ .format(ncols=len(self.columns),
+ nalias=len(self.header))))
+ str_columns = [[label] for label in self.header]
+ else:
+ str_columns = self._get_formatted_column_labels(frame)
+
+ stringified = []
+ for i, c in enumerate(frame):
+ cheader = str_columns[i]
+ header_colwidth = max(self.col_space or 0,
+ *(self.adj.len(x) for x in cheader))
+ fmt_values = self._format_col(i)
+ fmt_values = _make_fixed_width(fmt_values, self.justify,
+ minimum=header_colwidth,
+ adj=self.adj)
+
+ max_len = max(max(self.adj.len(x) for x in fmt_values),
+ header_colwidth)
+ cheader = self.adj.justify(cheader, max_len, mode=self.justify)
+ stringified.append(cheader + fmt_values)
+
+ strcols = stringified
+ if self.index:
+ strcols.insert(0, str_index)
+
+ # Add ... to signal truncated
+ truncate_h = self.truncate_h
+ truncate_v = self.truncate_v
+
+ if truncate_h:
+ col_num = self.tr_col_num
+ strcols.insert(self.tr_col_num + 1, [' ...'] * (len(str_index)))
+ if truncate_v:
+ n_header_rows = len(str_index) - len(frame)
+ row_num = self.tr_row_num
+ for ix, col in enumerate(strcols):
+ # infer from above row
+ cwidth = self.adj.len(strcols[ix][row_num])
+ is_dot_col = False
+ if truncate_h:
+ is_dot_col = ix == col_num + 1
+ if cwidth > 3 or is_dot_col:
+ my_str = '...'
+ else:
+ my_str = '..'
+
+ if ix == 0:
+ dot_mode = 'left'
+ elif is_dot_col:
+ cwidth = 4
+ dot_mode = 'right'
+ else:
+ dot_mode = 'right'
+ dot_str = self.adj.justify([my_str], cwidth, mode=dot_mode)[0]
+ strcols[ix].insert(row_num + n_header_rows, dot_str)
+ return strcols
+
+ def to_string(self):
+ """
+ Render a DataFrame to a console-friendly tabular output.
+ """
+ from pandas import Series
+
+ frame = self.frame
+
+ if len(frame.columns) == 0 or len(frame.index) == 0:
+ info_line = (u('Empty {name}\nColumns: {col}\nIndex: {idx}')
+ .format(name=type(self.frame).__name__,
+ col=pprint_thing(frame.columns),
+ idx=pprint_thing(frame.index)))
+ text = info_line
+ else:
+
+ strcols = self._to_str_columns()
+ if self.line_width is None: # no need to wrap around just print
+ # the whole frame
+ text = self.adj.adjoin(1, *strcols)
+ elif (not isinstance(self.max_cols, int) or
+ self.max_cols > 0): # need to wrap around
+ text = self._join_multiline(*strcols)
+ else: # max_cols == 0. Try to fit frame to terminal
+ text = self.adj.adjoin(1, *strcols).split('\n')
+ max_len = Series(text).str.len().max()
+ # plus truncate dot col
+ dif = max_len - self.w
+ # '+ 1' to avoid too wide repr (GH PR #17023)
+ adj_dif = dif + 1
+ col_lens = Series([Series(ele).apply(len).max()
+ for ele in strcols])
+ n_cols = len(col_lens)
+ counter = 0
+ while adj_dif > 0 and n_cols > 1:
+ counter += 1
+ mid = int(round(n_cols / 2.))
+ mid_ix = col_lens.index[mid]
+ col_len = col_lens[mid_ix]
+ # adjoin adds one
+ adj_dif -= (col_len + 1)
+ col_lens = col_lens.drop(mid_ix)
+ n_cols = len(col_lens)
+ # subtract index column
+ max_cols_adj = n_cols - self.index
+ # GH-21180. Ensure that we print at least two.
+ max_cols_adj = max(max_cols_adj, 2)
+ self.max_cols_adj = max_cols_adj
+
+ # Call again _chk_truncate to cut frame appropriately
+ # and then generate string representation
+ self._chk_truncate()
+ strcols = self._to_str_columns()
+ text = self.adj.adjoin(1, *strcols)
+ self.buf.writelines(text)
+
+ if self.should_show_dimensions:
+ self.buf.write("\n\n[{nrows} rows x {ncols} columns]"
+ .format(nrows=len(frame), ncols=len(frame.columns)))
+
+ def _join_multiline(self, *strcols):
+ lwidth = self.line_width
+ adjoin_width = 1
+ strcols = list(strcols)
+ if self.index:
+ idx = strcols.pop(0)
+ lwidth -= np.array([self.adj.len(x)
+ for x in idx]).max() + adjoin_width
+
+ col_widths = [np.array([self.adj.len(x) for x in col]).max() if
+ len(col) > 0 else 0 for col in strcols]
+ col_bins = _binify(col_widths, lwidth)
+ nbins = len(col_bins)
+
+ if self.truncate_v:
+ nrows = self.max_rows_adj + 1
+ else:
+ nrows = len(self.frame)
+
+ str_lst = []
+ st = 0
+ for i, ed in enumerate(col_bins):
+ row = strcols[st:ed]
+ if self.index:
+ row.insert(0, idx)
+ if nbins > 1:
+ if ed <= len(strcols) and i < nbins - 1:
+ row.append([' \\'] + [' '] * (nrows - 1))
+ else:
+ row.append([' '] * nrows)
+ str_lst.append(self.adj.adjoin(adjoin_width, *row))
+ st = ed
+ return '\n\n'.join(str_lst)
+
+ def to_latex(self, column_format=None, longtable=False, encoding=None,
+ multicolumn=False, multicolumn_format=None, multirow=False):
+ """
+ Render a DataFrame to a LaTeX tabular/longtable environment output.
+ """
+
+ from pandas.io.formats.latex import LatexFormatter
+ latex_renderer = LatexFormatter(self, column_format=column_format,
+ longtable=longtable,
+ multicolumn=multicolumn,
+ multicolumn_format=multicolumn_format,
+ multirow=multirow)
+
+ if encoding is None:
+ encoding = 'ascii' if compat.PY2 else 'utf-8'
+
+ if hasattr(self.buf, 'write'):
+ latex_renderer.write_result(self.buf)
+ elif isinstance(self.buf, compat.string_types):
+ import codecs
+ with codecs.open(self.buf, 'w', encoding=encoding) as f:
+ latex_renderer.write_result(f)
+ else:
+ raise TypeError('buf is not a file name and it has no write '
+ 'method')
+
+ def _format_col(self, i):
+ frame = self.tr_frame
+ formatter = self._get_formatter(i)
+ values_to_format = frame.iloc[:, i]._formatting_values()
+ return format_array(values_to_format, formatter,
+ float_format=self.float_format, na_rep=self.na_rep,
+ space=self.col_space, decimal=self.decimal)
+
+ def to_html(self, classes=None, notebook=False, border=None):
+ """
+ Render a DataFrame to a html table.
+
+ Parameters
+ ----------
+ classes : str or list-like
+ classes to include in the `class` attribute of the opening
+ ``<table>`` tag, in addition to the default "dataframe".
+ notebook : {True, False}, optional, default False
+ Whether the generated HTML is for IPython Notebook.
+ border : int
+ A ``border=border`` attribute is included in the opening
+ ``<table>`` tag. Default ``pd.options.html.border``.
+
+ .. versionadded:: 0.19.0
+ """
+ from pandas.io.formats.html import HTMLFormatter, NotebookFormatter
+ Klass = NotebookFormatter if notebook else HTMLFormatter
+ html = Klass(self, classes=classes, border=border).render()
+ if hasattr(self.buf, 'write'):
+ buffer_put_lines(self.buf, html)
+ elif isinstance(self.buf, compat.string_types):
+ with open(self.buf, 'w') as f:
+ buffer_put_lines(f, html)
+ else:
+ raise TypeError('buf is not a file name and it has no write '
+ ' method')
+
+ def _get_formatted_column_labels(self, frame):
+ from pandas.core.index import _sparsify
+
+ columns = frame.columns
+
+ if isinstance(columns, ABCMultiIndex):
+ fmt_columns = columns.format(sparsify=False, adjoin=False)
+ fmt_columns = lzip(*fmt_columns)
+ dtypes = self.frame.dtypes._values
+
+ # if we have a Float level, they don't use leading space at all
+ restrict_formatting = any(l.is_floating for l in columns.levels)
+ need_leadsp = dict(zip(fmt_columns, map(is_numeric_dtype, dtypes)))
+
+ def space_format(x, y):
+ if (y not in self.formatters and
+ need_leadsp[x] and not restrict_formatting):
+ return ' ' + y
+ return y
+
+ str_columns = list(zip(*[[space_format(x, y) for y in x]
+ for x in fmt_columns]))
+ if self.sparsify and len(str_columns):
+ str_columns = _sparsify(str_columns)
+
+ str_columns = [list(x) for x in zip(*str_columns)]
+ else:
+ fmt_columns = columns.format()
+ dtypes = self.frame.dtypes
+ need_leadsp = dict(zip(fmt_columns, map(is_numeric_dtype, dtypes)))
+ str_columns = [[' ' + x if not self._get_formatter(i) and
+ need_leadsp[x] else x]
+ for i, (col, x) in enumerate(zip(columns,
+ fmt_columns))]
+
+ if self.show_row_idx_names:
+ for x in str_columns:
+ x.append('')
+
+ # self.str_columns = str_columns
+ return str_columns
+
+ @property
+ def has_index_names(self):
+ return _has_names(self.frame.index)
+
+ @property
+ def has_column_names(self):
+ return _has_names(self.frame.columns)
+
+ @property
+ def show_row_idx_names(self):
+ return all((self.has_index_names,
+ self.index,
+ self.show_index_names))
+
+ @property
+ def show_col_idx_names(self):
+ return all((self.has_column_names,
+ self.show_index_names,
+ self.header))
+
+ def _get_formatted_index(self, frame):
+ # Note: this is only used by to_string() and to_latex(), not by
+ # to_html().
+ index = frame.index
+ columns = frame.columns
+ fmt = self._get_formatter('__index__')
+
+ if isinstance(index, ABCMultiIndex):
+ fmt_index = index.format(
+ sparsify=self.sparsify, adjoin=False,
+ names=self.show_row_idx_names, formatter=fmt)
+ else:
+ fmt_index = [index.format(
+ name=self.show_row_idx_names, formatter=fmt)]
+
+ fmt_index = [tuple(_make_fixed_width(list(x), justify='left',
+ minimum=(self.col_space or 0),
+ adj=self.adj)) for x in fmt_index]
+
+ adjoined = self.adj.adjoin(1, *fmt_index).split('\n')
+
+ # empty space for columns
+ if self.show_col_idx_names:
+ col_header = ['{x}'.format(x=x)
+ for x in self._get_column_name_list()]
+ else:
+ col_header = [''] * columns.nlevels
+
+ if self.header:
+ return col_header + adjoined
+ else:
+ return adjoined
+
+ def _get_column_name_list(self):
+ names = []
+ columns = self.frame.columns
+ if isinstance(columns, ABCMultiIndex):
+ names.extend('' if name is None else name
+ for name in columns.names)
+ else:
+ names.append('' if columns.name is None else columns.name)
+ return names
+
+# ----------------------------------------------------------------------
+# Array formatters
+
+
+def format_array(values, formatter, float_format=None, na_rep='NaN',
+ digits=None, space=None, justify='right', decimal='.',
+ leading_space=None):
+ """
+ Format an array for printing.
+
+ Parameters
+ ----------
+ values
+ formatter
+ float_format
+ na_rep
+ digits
+ space
+ justify
+ decimal
+ leading_space : bool, optional
+ Whether the array should be formatted with a leading space.
+ When the array is a column of a Series or DataFrame, we do want
+ the leading space to pad between columns.
+
+ When formatting an Index subclass
+ (e.g. IntervalIndex._format_native_types), we don't want the
+ leading space since it should be left-aligned.
+
+ Returns
+ -------
+ List[str]
+ """
+
+ if is_datetime64_dtype(values.dtype):
+ fmt_klass = Datetime64Formatter
+ elif is_datetime64tz_dtype(values):
+ fmt_klass = Datetime64TZFormatter
+ elif is_timedelta64_dtype(values.dtype):
+ fmt_klass = Timedelta64Formatter
+ elif is_extension_array_dtype(values.dtype):
+ fmt_klass = ExtensionArrayFormatter
+ elif is_float_dtype(values.dtype):
+ fmt_klass = FloatArrayFormatter
+ elif is_integer_dtype(values.dtype):
+ fmt_klass = IntArrayFormatter
+ else:
+ fmt_klass = GenericArrayFormatter
+
+ if space is None:
+ space = get_option("display.column_space")
+
+ if float_format is None:
+ float_format = get_option("display.float_format")
+
+ if digits is None:
+ digits = get_option("display.precision")
+
+ fmt_obj = fmt_klass(values, digits=digits, na_rep=na_rep,
+ float_format=float_format, formatter=formatter,
+ space=space, justify=justify, decimal=decimal,
+ leading_space=leading_space)
+
+ return fmt_obj.get_result()
+
+
+class GenericArrayFormatter(object):
+
+ def __init__(self, values, digits=7, formatter=None, na_rep='NaN',
+ space=12, float_format=None, justify='right', decimal='.',
+ quoting=None, fixed_width=True, leading_space=None):
+ self.values = values
+ self.digits = digits
+ self.na_rep = na_rep
+ self.space = space
+ self.formatter = formatter
+ self.float_format = float_format
+ self.justify = justify
+ self.decimal = decimal
+ self.quoting = quoting
+ self.fixed_width = fixed_width
+ self.leading_space = leading_space
+
+ def get_result(self):
+ fmt_values = self._format_strings()
+ return _make_fixed_width(fmt_values, self.justify)
+
+ def _format_strings(self):
+ if self.float_format is None:
+ float_format = get_option("display.float_format")
+ if float_format is None:
+ fmt_str = ('{{x: .{prec:d}g}}'
+ .format(prec=get_option("display.precision")))
+ float_format = lambda x: fmt_str.format(x=x)
+ else:
+ float_format = self.float_format
+
+ formatter = (
+ self.formatter if self.formatter is not None else
+ (lambda x: pprint_thing(x, escape_chars=('\t', '\r', '\n'))))
+
+ def _format(x):
+ if self.na_rep is not None and is_scalar(x) and isna(x):
+ if x is None:
+ return 'None'
+ elif x is NaT:
+ return 'NaT'
+ return self.na_rep
+ elif isinstance(x, PandasObject):
+ return u'{x}'.format(x=x)
+ else:
+ # object dtype
+ return u'{x}'.format(x=formatter(x))
+
+ vals = self.values
+ if isinstance(vals, Index):
+ vals = vals._values
+ elif isinstance(vals, ABCSparseArray):
+ vals = vals.values
+
+ is_float_type = lib.map_infer(vals, is_float) & notna(vals)
+ leading_space = self.leading_space
+ if leading_space is None:
+ leading_space = is_float_type.any()
+
+ fmt_values = []
+ for i, v in enumerate(vals):
+ if not is_float_type[i] and leading_space:
+ fmt_values.append(u' {v}'.format(v=_format(v)))
+ elif is_float_type[i]:
+ fmt_values.append(float_format(v))
+ else:
+ if leading_space is False:
+ # False specifically, so that the default is
+ # to include a space if we get here.
+ tpl = u'{v}'
+ else:
+ tpl = u' {v}'
+ fmt_values.append(tpl.format(v=_format(v)))
+
+ return fmt_values
+
+
+class FloatArrayFormatter(GenericArrayFormatter):
+ """
+
+ """
+
+ def __init__(self, *args, **kwargs):
+ GenericArrayFormatter.__init__(self, *args, **kwargs)
+
+ # float_format is expected to be a string
+ # formatter should be used to pass a function
+ if self.float_format is not None and self.formatter is None:
+ # GH21625, GH22270
+ self.fixed_width = False
+ if callable(self.float_format):
+ self.formatter = self.float_format
+ self.float_format = None
+
+ def _value_formatter(self, float_format=None, threshold=None):
+ """Returns a function to be applied on each value to format it
+ """
+
+ # the float_format parameter supersedes self.float_format
+ if float_format is None:
+ float_format = self.float_format
+
+ # we are going to compose different functions, to first convert to
+ # a string, then replace the decimal symbol, and finally chop according
+ # to the threshold
+
+ # when there is no float_format, we use str instead of '%g'
+ # because str(0.0) = '0.0' while '%g' % 0.0 = '0'
+ if float_format:
+ def base_formatter(v):
+ return float_format(value=v) if notna(v) else self.na_rep
+ else:
+ def base_formatter(v):
+ return str(v) if notna(v) else self.na_rep
+
+ if self.decimal != '.':
+ def decimal_formatter(v):
+ return base_formatter(v).replace('.', self.decimal, 1)
+ else:
+ decimal_formatter = base_formatter
+
+ if threshold is None:
+ return decimal_formatter
+
+ def formatter(value):
+ if notna(value):
+ if abs(value) > threshold:
+ return decimal_formatter(value)
+ else:
+ return decimal_formatter(0.0)
+ else:
+ return self.na_rep
+
+ return formatter
+
+ def get_result_as_array(self):
+ """
+ Returns the float values converted into strings using
+ the parameters given at initialisation, as a numpy array
+ """
+
+ if self.formatter is not None:
+ return np.array([self.formatter(x) for x in self.values])
+
+ if self.fixed_width:
+ threshold = get_option("display.chop_threshold")
+ else:
+ threshold = None
+
+ # if we have a fixed_width, we'll need to try different float_format
+ def format_values_with(float_format):
+ formatter = self._value_formatter(float_format, threshold)
+
+ # default formatter leaves a space to the left when formatting
+ # floats, must be consistent for left-justifying NaNs (GH #25061)
+ if self.justify == 'left':
+ na_rep = ' ' + self.na_rep
+ else:
+ na_rep = self.na_rep
+
+ # separate the wheat from the chaff
+ values = self.values
+ mask = isna(values)
+ if hasattr(values, 'to_dense'): # sparse numpy ndarray
+ values = values.to_dense()
+ values = np.array(values, dtype='object')
+ values[mask] = na_rep
+ imask = (~mask).ravel()
+ values.flat[imask] = np.array([formatter(val)
+ for val in values.ravel()[imask]])
+
+ if self.fixed_width:
+ return _trim_zeros(values, na_rep)
+
+ return values
+
+ # There is a special default string when we are fixed-width
+ # The default is otherwise to use str instead of a formatting string
+ if self.float_format is None:
+ if self.fixed_width:
+ float_format = partial('{value: .{digits:d}f}'.format,
+ digits=self.digits)
+ else:
+ float_format = self.float_format
+ else:
+ float_format = lambda value: self.float_format % value
+
+ formatted_values = format_values_with(float_format)
+
+ if not self.fixed_width:
+ return formatted_values
+
+        # we need to convert to engineering format if some values are too small
+ # and would appear as 0, or if some values are too big and take too
+ # much space
+
+ if len(formatted_values) > 0:
+ maxlen = max(len(x) for x in formatted_values)
+ too_long = maxlen > self.digits + 6
+ else:
+ too_long = False
+
+ with np.errstate(invalid='ignore'):
+ abs_vals = np.abs(self.values)
+ # this is pretty arbitrary for now
+            # large values: more than 8 characters including decimal symbol
+ # and first digit, hence > 1e6
+ has_large_values = (abs_vals > 1e6).any()
+ has_small_values = ((abs_vals < 10**(-self.digits)) &
+ (abs_vals > 0)).any()
+
+ if has_small_values or (too_long and has_large_values):
+ float_format = partial('{value: .{digits:d}e}'.format,
+ digits=self.digits)
+ formatted_values = format_values_with(float_format)
+
+ return formatted_values
+
+ def _format_strings(self):
+ # shortcut
+ if self.formatter is not None:
+ return [self.formatter(x) for x in self.values]
+
+ return list(self.get_result_as_array())
+
+
+class IntArrayFormatter(GenericArrayFormatter):
+
+ def _format_strings(self):
+ formatter = self.formatter or (lambda x: '{x: d}'.format(x=x))
+ fmt_values = [formatter(x) for x in self.values]
+ return fmt_values
+
+
+class Datetime64Formatter(GenericArrayFormatter):
+
+ def __init__(self, values, nat_rep='NaT', date_format=None, **kwargs):
+ super(Datetime64Formatter, self).__init__(values, **kwargs)
+ self.nat_rep = nat_rep
+ self.date_format = date_format
+
+ def _format_strings(self):
+ """ we by definition have DO NOT have a TZ """
+
+ values = self.values
+
+ if not isinstance(values, DatetimeIndex):
+ values = DatetimeIndex(values)
+
+ if self.formatter is not None and callable(self.formatter):
+ return [self.formatter(x) for x in values]
+
+ fmt_values = format_array_from_datetime(
+ values.asi8.ravel(),
+ format=_get_format_datetime64_from_values(values,
+ self.date_format),
+ na_rep=self.nat_rep).reshape(values.shape)
+ return fmt_values.tolist()
+
+
+class ExtensionArrayFormatter(GenericArrayFormatter):
+ def _format_strings(self):
+ values = self.values
+ if isinstance(values, (ABCIndexClass, ABCSeries)):
+ values = values._values
+
+ formatter = values._formatter(boxed=True)
+
+ if is_categorical_dtype(values.dtype):
+ # Categorical is special for now, so that we can preserve tzinfo
+ array = values.get_values()
+ else:
+ array = np.asarray(values)
+
+ fmt_values = format_array(array,
+ formatter,
+ float_format=self.float_format,
+ na_rep=self.na_rep, digits=self.digits,
+ space=self.space, justify=self.justify,
+ leading_space=self.leading_space)
+ return fmt_values
+
+
+def format_percentiles(percentiles):
+ """
+ Outputs rounded and formatted percentiles.
+
+ Parameters
+ ----------
+ percentiles : list-like, containing floats from interval [0,1]
+
+ Returns
+ -------
+ formatted : list of strings
+
+ Notes
+ -----
+    Rounding precision is chosen so that: (1) if any two elements of
+    ``percentiles`` differ, they remain different after rounding, and
+    (2) no entry is *rounded* to 0% or 100%.
+ Any non-integer is always rounded to at least 1 decimal place.
+
+ Examples
+ --------
+ Keeps all entries different after rounding:
+
+ >>> format_percentiles([0.01999, 0.02001, 0.5, 0.666666, 0.9999])
+ ['1.999%', '2.001%', '50%', '66.667%', '99.99%']
+
+ No element is rounded to 0% or 100% (unless already equal to it).
+ Duplicates are allowed:
+
+ >>> format_percentiles([0, 0.5, 0.02001, 0.5, 0.666666, 0.9999])
+ ['0%', '50%', '2.0%', '50%', '66.67%', '99.99%']
+ """
+
+ percentiles = np.asarray(percentiles)
+
+ # It checks for np.NaN as well
+ with np.errstate(invalid='ignore'):
+ if not is_numeric_dtype(percentiles) or not np.all(percentiles >= 0) \
+ or not np.all(percentiles <= 1):
+ raise ValueError("percentiles should all be in the interval [0,1]")
+
+ percentiles = 100 * percentiles
+ int_idx = (percentiles.astype(int) == percentiles)
+
+ if np.all(int_idx):
+ out = percentiles.astype(int).astype(str)
+ return [i + '%' for i in out]
+
+ unique_pcts = np.unique(percentiles)
+ to_begin = unique_pcts[0] if unique_pcts[0] > 0 else None
+ to_end = 100 - unique_pcts[-1] if unique_pcts[-1] < 100 else None
+
+ # Least precision that keeps percentiles unique after rounding
+ prec = -np.floor(np.log10(np.min(
+ np.ediff1d(unique_pcts, to_begin=to_begin, to_end=to_end)
+ ))).astype(int)
+ prec = max(1, prec)
+ out = np.empty_like(percentiles, dtype=object)
+ out[int_idx] = percentiles[int_idx].astype(int).astype(str)
+ out[~int_idx] = percentiles[~int_idx].round(prec).astype(str)
+ return [i + '%' for i in out]
+
+
+def _is_dates_only(values):
+    # return True if we have only dates (and no timezone)
+ values = DatetimeIndex(values)
+ if values.tz is not None:
+ return False
+
+ values_int = values.asi8
+ consider_values = values_int != iNaT
+ one_day_nanos = (86400 * 1e9)
+ even_days = np.logical_and(consider_values,
+ values_int % int(one_day_nanos) != 0).sum() == 0
+ if even_days:
+ return True
+ return False
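+
+# Hedged example of the helper above (inputs assumed, using the public pandas
+# API); tz-naive, midnight-only values count as "dates only":
+# >>> import pandas as pd
+# >>> _is_dates_only(pd.to_datetime(['2019-01-01', '2019-01-02']))
+# True
+# >>> _is_dates_only(pd.to_datetime(['2019-01-01 12:30']))
+# False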
+
+
+def _format_datetime64(x, tz=None, nat_rep='NaT'):
+ if x is None or (is_scalar(x) and isna(x)):
+ return nat_rep
+
+ if tz is not None or not isinstance(x, Timestamp):
+ if getattr(x, 'tzinfo', None) is not None:
+ x = Timestamp(x).tz_convert(tz)
+ else:
+ x = Timestamp(x).tz_localize(tz)
+
+ return str(x)
+
+
+def _format_datetime64_dateonly(x, nat_rep='NaT', date_format=None):
+ if x is None or (is_scalar(x) and isna(x)):
+ return nat_rep
+
+ if not isinstance(x, Timestamp):
+ x = Timestamp(x)
+
+ if date_format:
+ return x.strftime(date_format)
+ else:
+ return x._date_repr
+
+
+def _get_format_datetime64(is_dates_only, nat_rep='NaT', date_format=None):
+
+ if is_dates_only:
+ return lambda x, tz=None: _format_datetime64_dateonly(
+ x, nat_rep=nat_rep, date_format=date_format)
+ else:
+ return lambda x, tz=None: _format_datetime64(x, tz=tz, nat_rep=nat_rep)
+
+
+def _get_format_datetime64_from_values(values, date_format):
+ """ given values and a date_format, return a string format """
+ is_dates_only = _is_dates_only(values)
+ if is_dates_only:
+ return date_format or "%Y-%m-%d"
+ return date_format
+
+
+class Datetime64TZFormatter(Datetime64Formatter):
+
+ def _format_strings(self):
+ """ we by definition have a TZ """
+
+ values = self.values.astype(object)
+ is_dates_only = _is_dates_only(values)
+ formatter = (self.formatter or
+ _get_format_datetime64(is_dates_only,
+ date_format=self.date_format))
+ fmt_values = [formatter(x) for x in values]
+
+ return fmt_values
+
+
+class Timedelta64Formatter(GenericArrayFormatter):
+
+ def __init__(self, values, nat_rep='NaT', box=False, **kwargs):
+ super(Timedelta64Formatter, self).__init__(values, **kwargs)
+ self.nat_rep = nat_rep
+ self.box = box
+
+ def _format_strings(self):
+ formatter = (self.formatter or
+ _get_format_timedelta64(self.values, nat_rep=self.nat_rep,
+ box=self.box))
+ fmt_values = np.array([formatter(x) for x in self.values])
+ return fmt_values
+
+
+def _get_format_timedelta64(values, nat_rep='NaT', box=False):
+ """
+ Return a formatter function for a range of timedeltas.
+    These will all have the same format argument.
+
+    If box, then show the return value in quotes.
+ """
+
+ values_int = values.astype(np.int64)
+
+ consider_values = values_int != iNaT
+
+ one_day_nanos = (86400 * 1e9)
+ even_days = np.logical_and(consider_values,
+ values_int % one_day_nanos != 0).sum() == 0
+ all_sub_day = np.logical_and(
+ consider_values, np.abs(values_int) >= one_day_nanos).sum() == 0
+
+ if even_days:
+ format = None
+ elif all_sub_day:
+ format = 'sub_day'
+ else:
+ format = 'long'
+
+ def _formatter(x):
+ if x is None or (is_scalar(x) and isna(x)):
+ return nat_rep
+
+ if not isinstance(x, Timedelta):
+ x = Timedelta(x)
+ result = x._repr_base(format=format)
+ if box:
+ result = "'{res}'".format(res=result)
+ return result
+
+ return _formatter
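+
+# Hedged usage sketch (inputs assumed): the chosen format depends on whether
+# the values are whole days, all sub-day, or mixed, and ``box=True`` merely
+# wraps each result in quotes; the exact text comes from Timedelta._repr_base.
+# >>> import numpy as np
+# >>> vals = np.array([1, 2], dtype='m8[D]').astype('m8[ns]')
+# >>> fmt = _get_format_timedelta64(vals, box=True)
+# >>> fmt(vals[0])   # e.g. "'1 days'"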
+
+
+def _make_fixed_width(strings, justify='right', minimum=None, adj=None):
+
+ if len(strings) == 0 or justify == 'all':
+ return strings
+
+ if adj is None:
+ adj = _get_adjustment()
+
+ max_len = max(adj.len(x) for x in strings)
+
+ if minimum is not None:
+ max_len = max(minimum, max_len)
+
+ conf_max = get_option("display.max_colwidth")
+ if conf_max is not None and max_len > conf_max:
+ max_len = conf_max
+
+ def just(x):
+ if conf_max is not None:
+ if (conf_max > 3) & (adj.len(x) > max_len):
+ x = x[:max_len - 3] + '...'
+ return x
+
+ strings = [just(x) for x in strings]
+ result = adj.justify(strings, max_len, mode=justify)
+ return result
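+
+# Hedged example (assuming the default display.max_colwidth and adjustment):
+# strings are padded to the longest entry and, above the column-width cap,
+# truncated with a trailing '...'.
+# >>> _make_fixed_width(['a', 'abc'])
+# ['  a', 'abc']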
+
+
+def _trim_zeros(str_floats, na_rep='NaN'):
+ """
+    Trims trailing zeros, leaving just one zero after the decimal point if
+    need be.
+ """
+ trimmed = str_floats
+
+ def _is_number(x):
+ return (x != na_rep and not x.endswith('inf'))
+
+ def _cond(values):
+ finite = [x for x in values if _is_number(x)]
+ return (len(finite) > 0 and all(x.endswith('0') for x in finite) and
+ not (any(('e' in x) or ('E' in x) for x in finite)))
+
+ while _cond(trimmed):
+ trimmed = [x[:-1] if _is_number(x) else x for x in trimmed]
+
+    # leave one 0 after the decimal point if need be.
+ return [x + "0" if x.endswith('.') and _is_number(x) else x
+ for x in trimmed]
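+
+# Hedged example (inputs assumed): trailing zeros are stripped in lock-step
+# across all finite entries, while na_rep/inf strings are left untouched.
+# >>> _trim_zeros(['1.500', '2.000', 'NaN'])
+# ['1.5', '2.0', 'NaN']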
+
+
+def _has_names(index):
+ if isinstance(index, ABCMultiIndex):
+ return com._any_not_none(*index.names)
+ else:
+ return index.name is not None
+
+
+class EngFormatter(object):
+ """
+ Formats float values according to engineering format.
+
+ Based on matplotlib.ticker.EngFormatter
+ """
+
+ # The SI engineering prefixes
+ ENG_PREFIXES = {
+ -24: "y",
+ -21: "z",
+ -18: "a",
+ -15: "f",
+ -12: "p",
+ -9: "n",
+ -6: "u",
+ -3: "m",
+ 0: "",
+ 3: "k",
+ 6: "M",
+ 9: "G",
+ 12: "T",
+ 15: "P",
+ 18: "E",
+ 21: "Z",
+ 24: "Y"
+ }
+
+ def __init__(self, accuracy=None, use_eng_prefix=False):
+ self.accuracy = accuracy
+ self.use_eng_prefix = use_eng_prefix
+
+ def __call__(self, num):
+ """ Formats a number in engineering notation, appending a letter
+ representing the power of 1000 of the original number. Some examples:
+
+ >>> format_eng(0) # for self.accuracy = 0
+ ' 0'
+
+ >>> format_eng(1000000) # for self.accuracy = 1,
+ # self.use_eng_prefix = True
+ ' 1.0M'
+
+ >>> format_eng("-1e-6") # for self.accuracy = 2
+ # self.use_eng_prefix = False
+ '-1.00E-06'
+
+ @param num: the value to represent
+ @type num: either a numeric value or a string that can be converted to
+ a numeric value (as per decimal.Decimal constructor)
+
+ @return: engineering formatted string
+ """
+ import decimal
+ import math
+ dnum = decimal.Decimal(str(num))
+
+ if decimal.Decimal.is_nan(dnum):
+ return 'NaN'
+
+ if decimal.Decimal.is_infinite(dnum):
+ return 'inf'
+
+ sign = 1
+
+ if dnum < 0: # pragma: no cover
+ sign = -1
+ dnum = -dnum
+
+ if dnum != 0:
+ pow10 = decimal.Decimal(int(math.floor(dnum.log10() / 3) * 3))
+ else:
+ pow10 = decimal.Decimal(0)
+
+ pow10 = pow10.min(max(self.ENG_PREFIXES.keys()))
+ pow10 = pow10.max(min(self.ENG_PREFIXES.keys()))
+ int_pow10 = int(pow10)
+
+ if self.use_eng_prefix:
+ prefix = self.ENG_PREFIXES[int_pow10]
+ else:
+ if int_pow10 < 0:
+ prefix = 'E-{pow10:02d}'.format(pow10=-int_pow10)
+ else:
+ prefix = 'E+{pow10:02d}'.format(pow10=int_pow10)
+
+ mant = sign * dnum / (10**pow10)
+
+ if self.accuracy is None: # pragma: no cover
+ format_str = u("{mant: g}{prefix}")
+ else:
+ format_str = (u("{{mant: .{acc:d}f}}{{prefix}}")
+ .format(acc=self.accuracy))
+
+ formatted = format_str.format(mant=mant, prefix=prefix)
+
+ return formatted # .strip()
+
+
+def set_eng_float_format(accuracy=3, use_eng_prefix=False):
+ """
+    Alter the default behavior of how floats are formatted in a DataFrame:
+    format them in engineering notation. By accuracy, we mean the number of
+    decimal digits after the decimal point.
+
+ See also EngFormatter.
+ """
+
+ set_option("display.float_format", EngFormatter(accuracy, use_eng_prefix))
+ set_option("display.column_space", max(12, accuracy + 9))
+
+
+def _binify(cols, line_width):
+ adjoin_width = 1
+ bins = []
+ curr_width = 0
+ i_last_column = len(cols) - 1
+ for i, w in enumerate(cols):
+ w_adjoined = w + adjoin_width
+ curr_width += w_adjoined
+ if i_last_column == i:
+ wrap = curr_width + 1 > line_width and i > 0
+ else:
+ wrap = curr_width + 2 > line_width and i > 0
+ if wrap:
+ bins.append(i)
+ curr_width = w_adjoined
+
+ bins.append(len(cols))
+ return bins
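+
+# Hedged example (widths assumed): given per-column widths and a line width,
+# the returned bin edges say where to split the columns into output blocks.
+# >>> _binify([10, 10, 10], line_width=25)
+# [2, 3]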
+
+
+def get_level_lengths(levels, sentinel=''):
+ """For each index in each level the function returns lengths of indexes.
+
+ Parameters
+ ----------
+ levels : list of lists
+ List of values on for level.
+ sentinel : string, optional
+ Value which states that no new index starts on there.
+
+ Returns
+ ----------
+ Returns list of maps. For each level returns map of indexes (key is index
+ in row and value is length of index).
+ """
+ if len(levels) == 0:
+ return []
+
+ control = [True] * len(levels[0])
+
+ result = []
+ for level in levels:
+ last_index = 0
+
+ lengths = {}
+ for i, key in enumerate(level):
+ if control[i] and key == sentinel:
+ pass
+ else:
+ control[i] = False
+ lengths[last_index] = i - last_index
+ last_index = i
+
+ lengths[last_index] = len(level) - last_index
+
+ result.append(lengths)
+
+ return result
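+
+# Hedged example (levels assumed): sparsified entries (the sentinel '') extend
+# the run started by the previous real label.
+# >>> get_level_lengths([['a', '', 'b', ''], [1, 2, 3, 4]])
+# [{0: 2, 2: 2}, {0: 1, 1: 1, 2: 1, 3: 1}]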
+
+
+def buffer_put_lines(buf, lines):
+ """
+ Appends lines to a buffer.
+
+ Parameters
+ ----------
+ buf
+ The buffer to write to
+ lines
+ The lines to append.
+ """
+ if any(isinstance(x, compat.text_type) for x in lines):
+ lines = [compat.text_type(x) for x in lines]
+ buf.write('\n'.join(lines))
diff --git a/contrib/python/pandas/py2/pandas/io/formats/html.py b/contrib/python/pandas/py2/pandas/io/formats/html.py
new file mode 100644
index 00000000000..f41749e0a77
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/io/formats/html.py
@@ -0,0 +1,531 @@
+# -*- coding: utf-8 -*-
+"""
+Module for formatting output data in HTML.
+"""
+
+from __future__ import print_function
+
+from textwrap import dedent
+
+from pandas.compat import OrderedDict, lzip, map, range, u, unichr, zip
+
+from pandas.core.dtypes.generic import ABCMultiIndex
+
+from pandas import compat
+import pandas.core.common as com
+from pandas.core.config import get_option
+
+from pandas.io.common import _is_url
+from pandas.io.formats.format import TableFormatter, get_level_lengths
+from pandas.io.formats.printing import pprint_thing
+
+
+class HTMLFormatter(TableFormatter):
+ """
+ Internal class for formatting output data in html.
+ This class is intended for shared functionality between
+ DataFrame.to_html() and DataFrame._repr_html_().
+ Any logic in common with other output formatting methods
+ should ideally be inherited from classes in format.py
+    and this class should be responsible only for producing HTML markup.
+ """
+
+ indent_delta = 2
+
+ def __init__(self, formatter, classes=None, border=None):
+ self.fmt = formatter
+ self.classes = classes
+
+ self.frame = self.fmt.frame
+ self.columns = self.fmt.tr_frame.columns
+ self.elements = []
+ self.bold_rows = self.fmt.kwds.get('bold_rows', False)
+ self.escape = self.fmt.kwds.get('escape', True)
+ self.show_dimensions = self.fmt.show_dimensions
+ if border is None:
+ border = get_option('display.html.border')
+ self.border = border
+ self.table_id = self.fmt.table_id
+ self.render_links = self.fmt.render_links
+
+ @property
+ def show_row_idx_names(self):
+ return self.fmt.show_row_idx_names
+
+ @property
+ def show_col_idx_names(self):
+ return self.fmt.show_col_idx_names
+
+ @property
+ def row_levels(self):
+ if self.fmt.index:
+ # showing (row) index
+ return self.frame.index.nlevels
+ elif self.show_col_idx_names:
+ # see gh-22579
+ # Column misalignment also occurs for
+ # a standard index when the columns index is named.
+            # If the row index is not displayed, a column of
+            # blank cells needs to be included before the DataFrame values.
+ return 1
+ # not showing (row) index
+ return 0
+
+ @property
+ def is_truncated(self):
+ return self.fmt.is_truncated
+
+ @property
+ def ncols(self):
+ return len(self.fmt.tr_frame.columns)
+
+ def write(self, s, indent=0):
+ rs = pprint_thing(s)
+ self.elements.append(' ' * indent + rs)
+
+ def write_th(self, s, indent=0, tags=None):
+ if self.fmt.col_space is not None and self.fmt.col_space > 0:
+ tags = (tags or "")
+ tags += ('style="min-width: {colspace};"'
+ .format(colspace=self.fmt.col_space))
+
+ return self._write_cell(s, kind='th', indent=indent, tags=tags)
+
+ def write_td(self, s, indent=0, tags=None):
+ return self._write_cell(s, kind='td', indent=indent, tags=tags)
+
+ def _write_cell(self, s, kind='td', indent=0, tags=None):
+ if tags is not None:
+ start_tag = '<{kind} {tags}>'.format(kind=kind, tags=tags)
+ else:
+ start_tag = '<{kind}>'.format(kind=kind)
+
+ if self.escape:
+ # escape & first to prevent double escaping of &
+ esc = OrderedDict([('&', r'&amp;'), ('<', r'&lt;'),
+ ('>', r'&gt;')])
+ else:
+ esc = {}
+
+ rs = pprint_thing(s, escape_chars=esc).strip()
+
+ if self.render_links and _is_url(rs):
+ rs_unescaped = pprint_thing(s, escape_chars={}).strip()
+ start_tag += '<a href="{url}" target="_blank">'.format(
+ url=rs_unescaped)
+ end_a = '</a>'
+ else:
+ end_a = ''
+
+ self.write(u'{start}{rs}{end_a}</{kind}>'.format(
+ start=start_tag, rs=rs, end_a=end_a, kind=kind), indent)
+
+ def write_tr(self, line, indent=0, indent_delta=0, header=False,
+ align=None, tags=None, nindex_levels=0):
+ if tags is None:
+ tags = {}
+
+ if align is None:
+ self.write('<tr>', indent)
+ else:
+ self.write('<tr style="text-align: {align};">'
+ .format(align=align), indent)
+ indent += indent_delta
+
+ for i, s in enumerate(line):
+ val_tag = tags.get(i, None)
+ if header or (self.bold_rows and i < nindex_levels):
+ self.write_th(s, indent, tags=val_tag)
+ else:
+ self.write_td(s, indent, tags=val_tag)
+
+ indent -= indent_delta
+ self.write('</tr>', indent)
+
+ def render(self):
+ self._write_table()
+
+ if self.should_show_dimensions:
+ by = chr(215) if compat.PY3 else unichr(215) # ×
+ self.write(u('<p>{rows} rows {by} {cols} columns</p>')
+ .format(rows=len(self.frame),
+ by=by,
+ cols=len(self.frame.columns)))
+
+ return self.elements
+
+ def _write_table(self, indent=0):
+ _classes = ['dataframe'] # Default class.
+ use_mathjax = get_option("display.html.use_mathjax")
+ if not use_mathjax:
+ _classes.append('tex2jax_ignore')
+ if self.classes is not None:
+ if isinstance(self.classes, str):
+ self.classes = self.classes.split()
+ if not isinstance(self.classes, (list, tuple)):
+ raise AssertionError('classes must be list or tuple, not {typ}'
+ .format(typ=type(self.classes)))
+ _classes.extend(self.classes)
+
+ if self.table_id is None:
+ id_section = ""
+ else:
+ id_section = ' id="{table_id}"'.format(table_id=self.table_id)
+
+ self.write('<table border="{border}" class="{cls}"{id_section}>'
+ .format(border=self.border, cls=' '.join(_classes),
+ id_section=id_section), indent)
+
+ if self.fmt.header or self.show_row_idx_names:
+ self._write_header(indent + self.indent_delta)
+
+ self._write_body(indent + self.indent_delta)
+
+ self.write('</table>', indent)
+
+ def _write_col_header(self, indent):
+ truncate_h = self.fmt.truncate_h
+ if isinstance(self.columns, ABCMultiIndex):
+ template = 'colspan="{span:d}" halign="left"'
+
+ if self.fmt.sparsify:
+ # GH3547
+ sentinel = com.sentinel_factory()
+ else:
+ sentinel = False
+ levels = self.columns.format(sparsify=sentinel, adjoin=False,
+ names=False)
+ level_lengths = get_level_lengths(levels, sentinel)
+ inner_lvl = len(level_lengths) - 1
+ for lnum, (records, values) in enumerate(zip(level_lengths,
+ levels)):
+ if truncate_h:
+ # modify the header lines
+ ins_col = self.fmt.tr_col_num
+ if self.fmt.sparsify:
+ recs_new = {}
+ # Increment tags after ... col.
+ for tag, span in list(records.items()):
+ if tag >= ins_col:
+ recs_new[tag + 1] = span
+ elif tag + span > ins_col:
+ recs_new[tag] = span + 1
+ if lnum == inner_lvl:
+ values = (values[:ins_col] + (u('...'),) +
+ values[ins_col:])
+ else:
+ # sparse col headers do not receive a ...
+ values = (values[:ins_col] +
+ (values[ins_col - 1], ) +
+ values[ins_col:])
+ else:
+ recs_new[tag] = span
+ # if ins_col lies between tags, all col headers
+ # get ...
+ if tag + span == ins_col:
+ recs_new[ins_col] = 1
+ values = (values[:ins_col] + (u('...'),) +
+ values[ins_col:])
+ records = recs_new
+ inner_lvl = len(level_lengths) - 1
+ if lnum == inner_lvl:
+ records[ins_col] = 1
+ else:
+ recs_new = {}
+ for tag, span in list(records.items()):
+ if tag >= ins_col:
+ recs_new[tag + 1] = span
+ else:
+ recs_new[tag] = span
+ recs_new[ins_col] = 1
+ records = recs_new
+ values = (values[:ins_col] + [u('...')] +
+ values[ins_col:])
+
+ # see gh-22579
+ # Column Offset Bug with to_html(index=False) with
+ # MultiIndex Columns and Index.
+ # Initially fill row with blank cells before column names.
+ # TODO: Refactor to remove code duplication with code
+ # block below for standard columns index.
+ row = [''] * (self.row_levels - 1)
+ if self.fmt.index or self.show_col_idx_names:
+ # see gh-22747
+ # If to_html(index_names=False) do not show columns
+ # index names.
+ # TODO: Refactor to use _get_column_name_list from
+ # DataFrameFormatter class and create a
+ # _get_formatted_column_labels function for code
+ # parity with DataFrameFormatter class.
+ if self.fmt.show_index_names:
+ name = self.columns.names[lnum]
+ row.append(pprint_thing(name or ''))
+ else:
+ row.append('')
+
+ tags = {}
+ j = len(row)
+ for i, v in enumerate(values):
+ if i in records:
+ if records[i] > 1:
+ tags[j] = template.format(span=records[i])
+ else:
+ continue
+ j += 1
+ row.append(v)
+ self.write_tr(row, indent, self.indent_delta, tags=tags,
+ header=True)
+ else:
+ # see gh-22579
+ # Column misalignment also occurs for
+ # a standard index when the columns index is named.
+ # Initially fill row with blank cells before column names.
+ # TODO: Refactor to remove code duplication with code block
+ # above for columns MultiIndex.
+ row = [''] * (self.row_levels - 1)
+ if self.fmt.index or self.show_col_idx_names:
+ # see gh-22747
+ # If to_html(index_names=False) do not show columns
+ # index names.
+ # TODO: Refactor to use _get_column_name_list from
+ # DataFrameFormatter class.
+ if self.fmt.show_index_names:
+ row.append(self.columns.name or '')
+ else:
+ row.append('')
+ row.extend(self.columns)
+ align = self.fmt.justify
+
+ if truncate_h:
+ ins_col = self.row_levels + self.fmt.tr_col_num
+ row.insert(ins_col, '...')
+
+ self.write_tr(row, indent, self.indent_delta, header=True,
+ align=align)
+
+ def _write_row_header(self, indent):
+ truncate_h = self.fmt.truncate_h
+ row = ([x if x is not None else '' for x in self.frame.index.names]
+ + [''] * (self.ncols + (1 if truncate_h else 0)))
+ self.write_tr(row, indent, self.indent_delta, header=True)
+
+ def _write_header(self, indent):
+ self.write('<thead>', indent)
+
+ if self.fmt.header:
+ self._write_col_header(indent + self.indent_delta)
+
+ if self.show_row_idx_names:
+ self._write_row_header(indent + self.indent_delta)
+
+ self.write('</thead>', indent)
+
+ def _write_body(self, indent):
+ self.write('<tbody>', indent)
+ fmt_values = {i: self.fmt._format_col(i) for i in range(self.ncols)}
+
+ # write values
+ if self.fmt.index and isinstance(self.frame.index, ABCMultiIndex):
+ self._write_hierarchical_rows(
+ fmt_values, indent + self.indent_delta)
+ else:
+ self._write_regular_rows(
+ fmt_values, indent + self.indent_delta)
+
+ self.write('</tbody>', indent)
+
+ def _write_regular_rows(self, fmt_values, indent):
+ truncate_h = self.fmt.truncate_h
+ truncate_v = self.fmt.truncate_v
+
+ nrows = len(self.fmt.tr_frame)
+
+ if self.fmt.index:
+ fmt = self.fmt._get_formatter('__index__')
+ if fmt is not None:
+ index_values = self.fmt.tr_frame.index.map(fmt)
+ else:
+ index_values = self.fmt.tr_frame.index.format()
+
+ row = []
+ for i in range(nrows):
+
+ if truncate_v and i == (self.fmt.tr_row_num):
+ str_sep_row = ['...'] * len(row)
+ self.write_tr(str_sep_row, indent, self.indent_delta,
+ tags=None, nindex_levels=self.row_levels)
+
+ row = []
+ if self.fmt.index:
+ row.append(index_values[i])
+ # see gh-22579
+ # Column misalignment also occurs for
+ # a standard index when the columns index is named.
+ # Add blank cell before data cells.
+ elif self.show_col_idx_names:
+ row.append('')
+ row.extend(fmt_values[j][i] for j in range(self.ncols))
+
+ if truncate_h:
+ dot_col_ix = self.fmt.tr_col_num + self.row_levels
+ row.insert(dot_col_ix, '...')
+ self.write_tr(row, indent, self.indent_delta, tags=None,
+ nindex_levels=self.row_levels)
+
+ def _write_hierarchical_rows(self, fmt_values, indent):
+ template = 'rowspan="{span}" valign="top"'
+
+ truncate_h = self.fmt.truncate_h
+ truncate_v = self.fmt.truncate_v
+ frame = self.fmt.tr_frame
+ nrows = len(frame)
+
+ idx_values = frame.index.format(sparsify=False, adjoin=False,
+ names=False)
+ idx_values = lzip(*idx_values)
+
+ if self.fmt.sparsify:
+ # GH3547
+ sentinel = com.sentinel_factory()
+ levels = frame.index.format(sparsify=sentinel, adjoin=False,
+ names=False)
+
+ level_lengths = get_level_lengths(levels, sentinel)
+ inner_lvl = len(level_lengths) - 1
+ if truncate_v:
+ # Insert ... row and adjust idx_values and
+ # level_lengths to take this into account.
+ ins_row = self.fmt.tr_row_num
+ inserted = False
+ for lnum, records in enumerate(level_lengths):
+ rec_new = {}
+ for tag, span in list(records.items()):
+ if tag >= ins_row:
+ rec_new[tag + 1] = span
+ elif tag + span > ins_row:
+ rec_new[tag] = span + 1
+
+ # GH 14882 - Make sure insertion done once
+ if not inserted:
+ dot_row = list(idx_values[ins_row - 1])
+ dot_row[-1] = u('...')
+ idx_values.insert(ins_row, tuple(dot_row))
+ inserted = True
+ else:
+ dot_row = list(idx_values[ins_row])
+ dot_row[inner_lvl - lnum] = u('...')
+ idx_values[ins_row] = tuple(dot_row)
+ else:
+ rec_new[tag] = span
+                        # If ins_row lies between tags, all index columns
+                        # receive ...
+ if tag + span == ins_row:
+ rec_new[ins_row] = 1
+ if lnum == 0:
+ idx_values.insert(ins_row, tuple(
+ [u('...')] * len(level_lengths)))
+
+ # GH 14882 - Place ... in correct level
+ elif inserted:
+ dot_row = list(idx_values[ins_row])
+ dot_row[inner_lvl - lnum] = u('...')
+ idx_values[ins_row] = tuple(dot_row)
+ level_lengths[lnum] = rec_new
+
+ level_lengths[inner_lvl][ins_row] = 1
+ for ix_col in range(len(fmt_values)):
+ fmt_values[ix_col].insert(ins_row, '...')
+ nrows += 1
+
+ for i in range(nrows):
+ row = []
+ tags = {}
+
+ sparse_offset = 0
+ j = 0
+ for records, v in zip(level_lengths, idx_values[i]):
+ if i in records:
+ if records[i] > 1:
+ tags[j] = template.format(span=records[i])
+ else:
+ sparse_offset += 1
+ continue
+
+ j += 1
+ row.append(v)
+
+ row.extend(fmt_values[j][i] for j in range(self.ncols))
+ if truncate_h:
+ row.insert(self.row_levels - sparse_offset +
+ self.fmt.tr_col_num, '...')
+ self.write_tr(row, indent, self.indent_delta, tags=tags,
+ nindex_levels=len(levels) - sparse_offset)
+ else:
+ row = []
+ for i in range(len(frame)):
+ if truncate_v and i == (self.fmt.tr_row_num):
+ str_sep_row = ['...'] * len(row)
+ self.write_tr(str_sep_row, indent, self.indent_delta,
+ tags=None, nindex_levels=self.row_levels)
+
+ idx_values = list(zip(*frame.index.format(
+ sparsify=False, adjoin=False, names=False)))
+ row = []
+ row.extend(idx_values[i])
+ row.extend(fmt_values[j][i] for j in range(self.ncols))
+ if truncate_h:
+ row.insert(self.row_levels + self.fmt.tr_col_num, '...')
+ self.write_tr(row, indent, self.indent_delta, tags=None,
+ nindex_levels=frame.index.nlevels)
+
+
+class NotebookFormatter(HTMLFormatter):
+ """
+ Internal class for formatting output data in html for display in Jupyter
+ Notebooks. This class is intended for functionality specific to
+ DataFrame._repr_html_() and DataFrame.to_html(notebook=True)
+ """
+
+ def write_style(self):
+ # We use the "scoped" attribute here so that the desired
+ # style properties for the data frame are not then applied
+ # throughout the entire notebook.
+ template_first = """\
+ <style scoped>"""
+ template_last = """\
+ </style>"""
+ template_select = """\
+ .dataframe %s {
+ %s: %s;
+ }"""
+ element_props = [('tbody tr th:only-of-type',
+ 'vertical-align',
+ 'middle'),
+ ('tbody tr th',
+ 'vertical-align',
+ 'top')]
+ if isinstance(self.columns, ABCMultiIndex):
+ element_props.append(('thead tr th',
+ 'text-align',
+ 'left'))
+ if self.show_row_idx_names:
+ element_props.append(('thead tr:last-of-type th',
+ 'text-align',
+ 'right'))
+ else:
+ element_props.append(('thead th',
+ 'text-align',
+ 'right'))
+ template_mid = '\n\n'.join(map(lambda t: template_select % t,
+ element_props))
+ template = dedent('\n'.join((template_first,
+ template_mid,
+ template_last)))
+ self.write(template)
+
+ def render(self):
+ self.write('<div>')
+ self.write_style()
+ super(NotebookFormatter, self).render()
+ self.write('</div>')
+ return self.elements
diff --git a/contrib/python/pandas/py2/pandas/io/formats/latex.py b/contrib/python/pandas/py2/pandas/io/formats/latex.py
new file mode 100644
index 00000000000..90be3364932
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/io/formats/latex.py
@@ -0,0 +1,246 @@
+# -*- coding: utf-8 -*-
+"""
+Module for formatting output data in Latex.
+"""
+from __future__ import print_function
+
+import numpy as np
+
+from pandas.compat import map, range, u, zip
+
+from pandas.core.dtypes.generic import ABCMultiIndex
+
+from pandas import compat
+
+from pandas.io.formats.format import TableFormatter
+
+
+class LatexFormatter(TableFormatter):
+ """ Used to render a DataFrame to a LaTeX tabular/longtable environment
+ output.
+
+ Parameters
+ ----------
+ formatter : `DataFrameFormatter`
+ column_format : str, default None
+ The columns format as specified in `LaTeX table format
+ <https://en.wikibooks.org/wiki/LaTeX/Tables>`__ e.g 'rcl' for 3 columns
+ longtable : boolean, default False
+ Use a longtable environment instead of tabular.
+
+ See Also
+ --------
+ HTMLFormatter
+ """
+
+ def __init__(self, formatter, column_format=None, longtable=False,
+ multicolumn=False, multicolumn_format=None, multirow=False):
+ self.fmt = formatter
+ self.frame = self.fmt.frame
+ self.bold_rows = self.fmt.kwds.get('bold_rows', False)
+ self.column_format = column_format
+ self.longtable = longtable
+ self.multicolumn = multicolumn
+ self.multicolumn_format = multicolumn_format
+ self.multirow = multirow
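+
+    # In normal use this class is driven by DataFrame.to_latex(); a minimal,
+    # hedged sketch (``df`` is an assumed DataFrame, not defined here):
+    # >>> print(df.to_latex(column_format='lrr', bold_rows=True))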
+
+ def write_result(self, buf):
+ """
+ Render a DataFrame to a LaTeX tabular/longtable environment output.
+ """
+
+ # string representation of the columns
+ if len(self.frame.columns) == 0 or len(self.frame.index) == 0:
+ info_line = (u('Empty {name}\nColumns: {col}\nIndex: {idx}')
+ .format(name=type(self.frame).__name__,
+ col=self.frame.columns,
+ idx=self.frame.index))
+ strcols = [[info_line]]
+ else:
+ strcols = self.fmt._to_str_columns()
+
+ def get_col_type(dtype):
+ if issubclass(dtype.type, np.number):
+ return 'r'
+ else:
+ return 'l'
+
+        # reestablish the MultiIndex that has been joined by _to_str_columns
+ if self.fmt.index and isinstance(self.frame.index, ABCMultiIndex):
+ out = self.frame.index.format(
+ adjoin=False, sparsify=self.fmt.sparsify,
+ names=self.fmt.has_index_names, na_rep=self.fmt.na_rep
+ )
+
+ # index.format will sparsify repeated entries with empty strings
+ # so pad these with some empty space
+ def pad_empties(x):
+ for pad in reversed(x):
+ if pad:
+ break
+ return [x[0]] + [i if i else ' ' * len(pad) for i in x[1:]]
+ out = (pad_empties(i) for i in out)
+
+ # Add empty spaces for each column level
+ clevels = self.frame.columns.nlevels
+ out = [[' ' * len(i[-1])] * clevels + i for i in out]
+
+ # Add the column names to the last index column
+ cnames = self.frame.columns.names
+ if any(cnames):
+ new_names = [i if i else '{}' for i in cnames]
+ out[self.frame.index.nlevels - 1][:clevels] = new_names
+
+ # Get rid of old multiindex column and add new ones
+ strcols = out + strcols[1:]
+
+ column_format = self.column_format
+ if column_format is None:
+ dtypes = self.frame.dtypes._values
+ column_format = ''.join(map(get_col_type, dtypes))
+ if self.fmt.index:
+ index_format = 'l' * self.frame.index.nlevels
+ column_format = index_format + column_format
+ elif not isinstance(column_format,
+ compat.string_types): # pragma: no cover
+ raise AssertionError('column_format must be str or unicode, '
+ 'not {typ}'.format(typ=type(column_format)))
+
+ if not self.longtable:
+ buf.write('\\begin{{tabular}}{{{fmt}}}\n'
+ .format(fmt=column_format))
+ buf.write('\\toprule\n')
+ else:
+ buf.write('\\begin{{longtable}}{{{fmt}}}\n'
+ .format(fmt=column_format))
+ buf.write('\\toprule\n')
+
+ ilevels = self.frame.index.nlevels
+ clevels = self.frame.columns.nlevels
+ nlevels = clevels
+ if self.fmt.has_index_names and self.fmt.show_index_names:
+ nlevels += 1
+ strrows = list(zip(*strcols))
+ self.clinebuf = []
+
+ for i, row in enumerate(strrows):
+ if i == nlevels and self.fmt.header:
+ buf.write('\\midrule\n') # End of header
+ if self.longtable:
+ buf.write('\\endhead\n')
+ buf.write('\\midrule\n')
+ buf.write('\\multicolumn{{{n}}}{{r}}{{{{Continued on next '
+ 'page}}}} \\\\\n'.format(n=len(row)))
+ buf.write('\\midrule\n')
+ buf.write('\\endfoot\n\n')
+ buf.write('\\bottomrule\n')
+ buf.write('\\endlastfoot\n')
+ if self.fmt.kwds.get('escape', True):
+ # escape backslashes first
+ crow = [(x.replace('\\', '\\textbackslash ')
+ .replace('_', '\\_')
+ .replace('%', '\\%').replace('$', '\\$')
+ .replace('#', '\\#').replace('{', '\\{')
+ .replace('}', '\\}').replace('~', '\\textasciitilde ')
+ .replace('^', '\\textasciicircum ')
+ .replace('&', '\\&')
+ if (x and x != '{}') else '{}') for x in row]
+ else:
+ crow = [x if x else '{}' for x in row]
+ if self.bold_rows and self.fmt.index:
+ # bold row labels
+ crow = ['\\textbf{{{x}}}'.format(x=x)
+ if j < ilevels and x.strip() not in ['', '{}'] else x
+ for j, x in enumerate(crow)]
+ if i < clevels and self.fmt.header and self.multicolumn:
+ # sum up columns to multicolumns
+ crow = self._format_multicolumn(crow, ilevels)
+ if (i >= nlevels and self.fmt.index and self.multirow and
+ ilevels > 1):
+ # sum up rows to multirows
+ crow = self._format_multirow(crow, ilevels, i, strrows)
+ buf.write(' & '.join(crow))
+ buf.write(' \\\\\n')
+ if self.multirow and i < len(strrows) - 1:
+ self._print_cline(buf, i, len(strcols))
+
+ if not self.longtable:
+ buf.write('\\bottomrule\n')
+ buf.write('\\end{tabular}\n')
+ else:
+ buf.write('\\end{longtable}\n')
+
+ def _format_multicolumn(self, row, ilevels):
+ r"""
+ Combine columns belonging to a group to a single multicolumn entry
+ according to self.multicolumn_format
+
+ e.g.:
+ a & & & b & c &
+ will become
+ \multicolumn{3}{l}{a} & b & \multicolumn{2}{l}{c}
+ """
+ row2 = list(row[:ilevels])
+ ncol = 1
+ coltext = ''
+
+ def append_col():
+ # write multicolumn if needed
+ if ncol > 1:
+ row2.append('\\multicolumn{{{ncol:d}}}{{{fmt:s}}}{{{txt:s}}}'
+ .format(ncol=ncol, fmt=self.multicolumn_format,
+ txt=coltext.strip()))
+ # don't modify where not needed
+ else:
+ row2.append(coltext)
+ for c in row[ilevels:]:
+ # if next col has text, write the previous
+ if c.strip():
+ if coltext:
+ append_col()
+ coltext = c
+ ncol = 1
+ # if not, add it to the previous multicolumn
+ else:
+ ncol += 1
+ # write last column name
+ if coltext:
+ append_col()
+ return row2
+
+ def _format_multirow(self, row, ilevels, i, rows):
+ r"""
+        Check the following rows to see whether the row should be a multirow
+
+        e.g.:       becomes:
+        a & 0 &     \multirow{2}{*}{a} & 0 &
+          & 1 &     & 1 &
+        b & 0 &     \cline{1-2}
+                    b & 0 &
+ """
+ for j in range(ilevels):
+ if row[j].strip():
+ nrow = 1
+ for r in rows[i + 1:]:
+ if not r[j].strip():
+ nrow += 1
+ else:
+ break
+ if nrow > 1:
+ # overwrite non-multirow entry
+ row[j] = '\\multirow{{{nrow:d}}}{{*}}{{{row:s}}}'.format(
+ nrow=nrow, row=row[j].strip())
+ # save when to end the current block with \cline
+ self.clinebuf.append([i + nrow - 1, j + 1])
+ return row
+
+ def _print_cline(self, buf, i, icol):
+ """
+ Print clines after multirow-blocks are finished
+ """
+ for cl in self.clinebuf:
+ if cl[0] == i:
+ buf.write('\\cline{{{cl:d}-{icol:d}}}\n'
+ .format(cl=cl[1], icol=icol))
+ # remove entries that have been written to buffer
+ self.clinebuf = [x for x in self.clinebuf if x[0] != i]
diff --git a/contrib/python/pandas/py2/pandas/io/formats/printing.py b/contrib/python/pandas/py2/pandas/io/formats/printing.py
new file mode 100644
index 00000000000..6d45d1e5dfc
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/io/formats/printing.py
@@ -0,0 +1,435 @@
+"""
+printing tools
+"""
+
+import sys
+
+from pandas.compat import u
+
+from pandas.core.dtypes.inference import is_sequence
+
+from pandas import compat
+from pandas.core.config import get_option
+
+
+def adjoin(space, *lists, **kwargs):
+ """
+    Glues together lists of strings using the amount of space requested.
+    The idea is to prettify.
+
+    Parameters
+    ----------
+    space : int
+        number of spaces for padding
+    lists : list of str
+        the lists of strings to be joined
+ strlen : callable
+ function used to calculate the length of each str. Needed for unicode
+ handling.
+ justfunc : callable
+ function used to justify str. Needed for unicode handling.
+ """
+ strlen = kwargs.pop('strlen', len)
+ justfunc = kwargs.pop('justfunc', justify)
+
+ out_lines = []
+ newLists = []
+ lengths = [max(map(strlen, x)) + space for x in lists[:-1]]
+ # not the last one
+ lengths.append(max(map(len, lists[-1])))
+ maxLen = max(map(len, lists))
+ for i, lst in enumerate(lists):
+ nl = justfunc(lst, lengths[i], mode='left')
+ nl.extend([' ' * lengths[i]] * (maxLen - len(lst)))
+ newLists.append(nl)
+ toJoin = zip(*newLists)
+ for lines in toJoin:
+ out_lines.append(_join_unicode(lines))
+ return _join_unicode(out_lines, sep='\n')
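+
+# Hedged example (inputs assumed): each list becomes a left-justified column,
+# separated by ``space`` blanks (trailing padding included):
+# >>> adjoin(2, ['a', 'bb'], ['ccc', 'd'])
+# 'a   ccc\nbb  d  '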
+
+
+def justify(texts, max_len, mode='right'):
+ """
+ Perform ljust, center, rjust against string or list-like
+ """
+ if mode == 'left':
+ return [x.ljust(max_len) for x in texts]
+ elif mode == 'center':
+ return [x.center(max_len) for x in texts]
+ else:
+ return [x.rjust(max_len) for x in texts]
+
+
+def _join_unicode(lines, sep=''):
+ try:
+ return sep.join(lines)
+ except UnicodeDecodeError:
+ sep = compat.text_type(sep)
+ return sep.join([x.decode('utf-8') if isinstance(x, str) else x
+ for x in lines])
+
+
+# Unicode consolidation
+# ---------------------
+#
+# pprinting utility functions for generating Unicode text or
+# bytes(3.x)/str(2.x) representations of objects.
+# Try to use these as much as possible rather than rolling your own.
+#
+# When to use
+# -----------
+#
+# 1) If you're writing code internal to pandas (no I/O directly involved),
+# use pprint_thing().
+#
+#    It will always return unicode text which can be handled by other
+# parts of the package without breakage.
+#
+# 2) if you need to write something out to file, use
+# pprint_thing_encoded(encoding).
+#
+# If no encoding is specified, it defaults to utf-8. Since encoding pure
+# ascii with utf-8 is a no-op you can safely use the default utf-8 if you're
+# working with straight ascii.
+
+
+def _pprint_seq(seq, _nest_lvl=0, max_seq_items=None, **kwds):
+ """
+ internal. pprinter for iterables. you should probably use pprint_thing()
+    rather than calling this directly.
+
+ bounds length of printed sequence, depending on options
+ """
+ if isinstance(seq, set):
+ fmt = u("{{{body}}}")
+ else:
+ fmt = u("[{body}]") if hasattr(seq, '__setitem__') else u("({body})")
+
+ if max_seq_items is False:
+ nitems = len(seq)
+ else:
+ nitems = max_seq_items or get_option("max_seq_items") or len(seq)
+
+ s = iter(seq)
+ # handle sets, no slicing
+ r = [pprint_thing(next(s),
+ _nest_lvl + 1, max_seq_items=max_seq_items, **kwds)
+ for i in range(min(nitems, len(seq)))]
+ body = ", ".join(r)
+
+ if nitems < len(seq):
+ body += ", ..."
+ elif isinstance(seq, tuple) and len(seq) == 1:
+ body += ','
+
+ return fmt.format(body=body)
+
+
+def _pprint_dict(seq, _nest_lvl=0, max_seq_items=None, **kwds):
+ """
+ internal. pprinter for iterables. you should probably use pprint_thing()
+    rather than calling this directly.
+ """
+ fmt = u("{{{things}}}")
+ pairs = []
+
+ pfmt = u("{key}: {val}")
+
+ if max_seq_items is False:
+ nitems = len(seq)
+ else:
+ nitems = max_seq_items or get_option("max_seq_items") or len(seq)
+
+ for k, v in list(seq.items())[:nitems]:
+ pairs.append(
+ pfmt.format(
+ key=pprint_thing(k, _nest_lvl + 1,
+ max_seq_items=max_seq_items, **kwds),
+ val=pprint_thing(v, _nest_lvl + 1,
+ max_seq_items=max_seq_items, **kwds)))
+
+ if nitems < len(seq):
+ return fmt.format(things=", ".join(pairs) + ", ...")
+ else:
+ return fmt.format(things=", ".join(pairs))
+
+
+def pprint_thing(thing, _nest_lvl=0, escape_chars=None, default_escapes=False,
+ quote_strings=False, max_seq_items=None):
+ """
+ This function is the sanctioned way of converting objects
+ to a unicode representation.
+
+ properly handles nested sequences containing unicode strings
+ (unicode(object) does not)
+
+ Parameters
+ ----------
+ thing : anything to be formatted
+    _nest_lvl : internal use only. pprint_thing() is mutually recursive
+        with _pprint_seq; this argument is used to keep track of the
+        current nesting level, and limit it.
+ escape_chars : list or dict, optional
+ Characters to escape. If a dict is passed the values are the
+ replacements
+ default_escapes : bool, default False
+        Whether the input escape characters replace or add to the defaults
+    max_seq_items : False, int, default None
+        Pass through to other pretty printers to limit sequence printing
+
+ Returns
+ -------
+ result - unicode object on py2, str on py3. Always Unicode.
+
+ """
+
+ def as_escaped_unicode(thing, escape_chars=escape_chars):
+        # Unicode is fine; otherwise we try to decode using utf-8 and
+        # 'replace'. If that's not it either, we have no way of knowing,
+        # and the user should deal with it themselves.
+
+ try:
+ result = compat.text_type(thing) # we should try this first
+ except UnicodeDecodeError:
+ # either utf-8 or we replace errors
+ result = str(thing).decode('utf-8', "replace")
+
+ translate = {'\t': r'\t', '\n': r'\n', '\r': r'\r', }
+ if isinstance(escape_chars, dict):
+ if default_escapes:
+ translate.update(escape_chars)
+ else:
+ translate = escape_chars
+ escape_chars = list(escape_chars.keys())
+ else:
+ escape_chars = escape_chars or tuple()
+ for c in escape_chars:
+ result = result.replace(c, translate[c])
+
+ return compat.text_type(result)
+
+ if (compat.PY3 and hasattr(thing, '__next__')) or hasattr(thing, 'next'):
+ return compat.text_type(thing)
+ elif (isinstance(thing, dict) and
+ _nest_lvl < get_option("display.pprint_nest_depth")):
+ result = _pprint_dict(thing, _nest_lvl, quote_strings=True,
+ max_seq_items=max_seq_items)
+ elif (is_sequence(thing) and
+ _nest_lvl < get_option("display.pprint_nest_depth")):
+ result = _pprint_seq(thing, _nest_lvl, escape_chars=escape_chars,
+ quote_strings=quote_strings,
+ max_seq_items=max_seq_items)
+ elif isinstance(thing, compat.string_types) and quote_strings:
+ if compat.PY3:
+ fmt = u("'{thing}'")
+ else:
+ fmt = u("u'{thing}'")
+ result = fmt.format(thing=as_escaped_unicode(thing))
+ else:
+ result = as_escaped_unicode(thing)
+
+ return compat.text_type(result) # always unicode
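+
+# Hedged examples (inputs assumed): strings are escaped per escape_chars and
+# long sequences are bounded by max_seq_items.
+# >>> pprint_thing('a\tb', escape_chars=('\t',))
+# 'a\\tb'
+# >>> pprint_thing([1, 2, 3], max_seq_items=2)
+# '[1, 2, ...]'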
+
+
+def pprint_thing_encoded(object, encoding='utf-8', errors='replace', **kwds):
+ value = pprint_thing(object) # get unicode representation of object
+ return value.encode(encoding, errors, **kwds)
+
+
+def _enable_data_resource_formatter(enable):
+ if 'IPython' not in sys.modules:
+ # definitely not in IPython
+ return
+ from IPython import get_ipython
+ ip = get_ipython()
+ if ip is None:
+ # still not in IPython
+ return
+
+ formatters = ip.display_formatter.formatters
+ mimetype = "application/vnd.dataresource+json"
+
+ if enable:
+ if mimetype not in formatters:
+ # define tableschema formatter
+ from IPython.core.formatters import BaseFormatter
+
+ class TableSchemaFormatter(BaseFormatter):
+ print_method = '_repr_data_resource_'
+ _return_type = (dict,)
+ # register it:
+ formatters[mimetype] = TableSchemaFormatter()
+ # enable it if it's been disabled:
+ formatters[mimetype].enabled = True
+ else:
+ # unregister tableschema mime-type
+ if mimetype in formatters:
+ formatters[mimetype].enabled = False
+
+
+default_pprint = lambda x, max_seq_items=None: \
+ pprint_thing(x, escape_chars=('\t', '\r', '\n'), quote_strings=True,
+ max_seq_items=max_seq_items)
+
+
+def format_object_summary(obj, formatter, is_justify=True, name=None,
+ indent_for_name=True):
+ """
+ Return the formatted obj as a unicode string
+
+ Parameters
+ ----------
+ obj : object
+ must be iterable and support __getitem__
+ formatter : callable
+ string formatter for an element
+ is_justify : boolean
+ should justify the display
+ name : name, optional
+ defaults to the class name of the obj
+ indent_for_name : bool, default True
+        Whether subsequent lines should be indented to
+ align with the name.
+
+ Returns
+ -------
+ summary string
+
+ """
+ from pandas.io.formats.console import get_console_size
+ from pandas.io.formats.format import _get_adjustment
+
+ display_width, _ = get_console_size()
+ if display_width is None:
+ display_width = get_option('display.width') or 80
+ if name is None:
+ name = obj.__class__.__name__
+
+ if indent_for_name:
+ name_len = len(name)
+ space1 = "\n%s" % (' ' * (name_len + 1))
+ space2 = "\n%s" % (' ' * (name_len + 2))
+ else:
+ space1 = "\n"
+ space2 = "\n " # space for the opening '['
+
+ n = len(obj)
+ sep = ','
+ max_seq_items = get_option('display.max_seq_items') or n
+
+ # are we a truncated display
+ is_truncated = n > max_seq_items
+
+    # adj can optionally handle unicode East Asian width
+ adj = _get_adjustment()
+
+ def _extend_line(s, line, value, display_width, next_line_prefix):
+
+ if (adj.len(line.rstrip()) + adj.len(value.rstrip()) >=
+ display_width):
+ s += line.rstrip()
+ line = next_line_prefix
+ line += value
+ return s, line
+
+ def best_len(values):
+ if values:
+ return max(adj.len(x) for x in values)
+ else:
+ return 0
+
+ close = u', '
+
+ if n == 0:
+ summary = u'[]{}'.format(close)
+ elif n == 1:
+ first = formatter(obj[0])
+ summary = u'[{}]{}'.format(first, close)
+ elif n == 2:
+ first = formatter(obj[0])
+ last = formatter(obj[-1])
+ summary = u'[{}, {}]{}'.format(first, last, close)
+ else:
+
+ if n > max_seq_items:
+ n = min(max_seq_items // 2, 10)
+ head = [formatter(x) for x in obj[:n]]
+ tail = [formatter(x) for x in obj[-n:]]
+ else:
+ head = []
+ tail = [formatter(x) for x in obj]
+
+ # adjust all values to max length if needed
+ if is_justify:
+
+ # however, if we are not truncated and we are only a single
+ # line, then don't justify
+ if (is_truncated or
+ not (len(', '.join(head)) < display_width and
+ len(', '.join(tail)) < display_width)):
+ max_len = max(best_len(head), best_len(tail))
+ head = [x.rjust(max_len) for x in head]
+ tail = [x.rjust(max_len) for x in tail]
+
+ summary = ""
+ line = space2
+
+ for i in range(len(head)):
+ word = head[i] + sep + ' '
+ summary, line = _extend_line(summary, line, word,
+ display_width, space2)
+
+ if is_truncated:
+ # remove trailing space of last line
+ summary += line.rstrip() + space2 + '...'
+ line = space2
+
+ for i in range(len(tail) - 1):
+ word = tail[i] + sep + ' '
+ summary, line = _extend_line(summary, line, word,
+ display_width, space2)
+
+ # last value: no sep added + 1 space of width used for trailing ','
+ summary, line = _extend_line(summary, line, tail[-1],
+ display_width - 2, space2)
+ summary += line
+
+ # right now close is either '' or ', '
+ # Now we want to include the ']', but not the maybe space.
+ close = ']' + close.rstrip(' ')
+ summary += close
+
+ if len(summary) > (display_width):
+ summary += space1
+ else: # one row
+ summary += ' '
+
+ # remove initial space
+ summary = '[' + summary[len(space2):]
+
+ return summary
+
+
+def format_object_attrs(obj):
+ """
+ Return a list of tuples of the (attr, formatted_value)
+ for common attrs, including dtype, name, length
+
+ Parameters
+ ----------
+ obj : object
+ must be iterable
+
+ Returns
+ -------
+ list
+
+ """
+ attrs = []
+ if hasattr(obj, 'dtype'):
+ attrs.append(('dtype', "'{}'".format(obj.dtype)))
+ if getattr(obj, 'name', None) is not None:
+ attrs.append(('name', default_pprint(obj.name)))
+ max_seq_items = get_option('display.max_seq_items') or len(obj)
+ if len(obj) > max_seq_items:
+ attrs.append(('length', len(obj)))
+ return attrs
diff --git a/contrib/python/pandas/py2/pandas/io/formats/style.py b/contrib/python/pandas/py2/pandas/io/formats/style.py
new file mode 100644
index 00000000000..598453eb92d
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/io/formats/style.py
@@ -0,0 +1,1367 @@
+"""
+Module for applying conditional formatting to
+DataFrames and Series.
+"""
+
+from collections import defaultdict
+from contextlib import contextmanager
+import copy
+from functools import partial
+from itertools import product
+from uuid import uuid1
+
+import numpy as np
+
+from pandas.compat import range
+from pandas.util._decorators import Appender
+
+from pandas.core.dtypes.common import is_float, is_string_like
+from pandas.core.dtypes.generic import ABCSeries
+
+import pandas as pd
+from pandas.api.types import is_dict_like, is_list_like
+import pandas.core.common as com
+from pandas.core.config import get_option
+from pandas.core.generic import _shared_docs
+from pandas.core.indexing import _maybe_numeric_slice, _non_reducing_slice
+
+try:
+ from jinja2 import (
+ PackageLoader, Environment, ChoiceLoader, FileSystemLoader
+ )
+except ImportError:
+ raise ImportError("pandas.Styler requires jinja2. "
+ "Please install with `conda install Jinja2`\n"
+ "or `pip install Jinja2`")
+
+
+try:
+ import matplotlib.pyplot as plt
+ from matplotlib import colors
+ has_mpl = True
+except ImportError:
+ has_mpl = False
+ no_mpl_message = "{0} requires matplotlib."
+
+
+@contextmanager
+def _mpl(func):
+ if has_mpl:
+ yield plt, colors
+ else:
+ raise ImportError(no_mpl_message.format(func.__name__))
+
+
+class Styler(object):
+ """
+ Helps style a DataFrame or Series according to the data with HTML and CSS.
+
+ Parameters
+ ----------
+ data : Series or DataFrame
+ precision : int
+ precision to round floats to, defaults to pd.options.display.precision
+ table_styles : list-like, default None
+ list of {selector: (attr, value)} dicts; see Notes
+ uuid : str, default None
+ a unique identifier to avoid CSS collisions; generated automatically
+ caption : str, default None
+ caption to attach to the table
+ cell_ids : bool, default True
+ If True, each cell will have an ``id`` attribute in their HTML tag.
+ The ``id`` takes the form ``T_<uuid>_row<num_row>_col<num_col>``
+ where ``<uuid>`` is the unique identifier, ``<num_row>`` is the row
+ number and ``<num_col>`` is the column number.
+
+ Attributes
+ ----------
+ env : Jinja2 Environment
+ template : Jinja2 Template
+ loader : Jinja2 Loader
+
+ See Also
+ --------
+ pandas.DataFrame.style
+
+ Notes
+ -----
+ Most styling will be done by passing style functions into
+ ``Styler.apply`` or ``Styler.applymap``. Style functions should
+ return values with strings containing CSS ``'attr: value'`` that will
+ be applied to the indicated cells.
+
+ If using in the Jupyter notebook, Styler has defined a ``_repr_html_``
+ to automatically render itself. Otherwise call Styler.render to get
+ the generated HTML.
+
+ CSS classes are attached to the generated HTML
+
+ * Index and Column names include ``index_name`` and ``level<k>``
+ where `k` is its level in a MultiIndex
+ * Index label cells include
+
+ * ``row_heading``
+ * ``row<n>`` where `n` is the numeric position of the row
+ * ``level<k>`` where `k` is the level in a MultiIndex
+
+    * Column label cells include
+
+      * ``col_heading``
+      * ``col<n>`` where `n` is the numeric position of the column
+      * ``level<k>`` where `k` is the level in a MultiIndex
+
+ * Blank cells include ``blank``
+ * Data cells include ``data``
+ """
+ loader = PackageLoader("pandas", "io/formats/templates")
+ env = Environment(
+ loader=loader,
+ trim_blocks=True,
+ )
+ template = env.get_template("html.tpl")
+
+ def __init__(self, data, precision=None, table_styles=None, uuid=None,
+ caption=None, table_attributes=None, cell_ids=True):
+ self.ctx = defaultdict(list)
+ self._todo = []
+
+ if not isinstance(data, (pd.Series, pd.DataFrame)):
+ raise TypeError("``data`` must be a Series or DataFrame")
+ if data.ndim == 1:
+ data = data.to_frame()
+ if not data.index.is_unique or not data.columns.is_unique:
+ raise ValueError("style is not supported for non-unique indices.")
+
+ self.data = data
+ self.index = data.index
+ self.columns = data.columns
+
+ self.uuid = uuid
+ self.table_styles = table_styles
+ self.caption = caption
+ if precision is None:
+ precision = get_option('display.precision')
+ self.precision = precision
+ self.table_attributes = table_attributes
+ self.hidden_index = False
+ self.hidden_columns = []
+ self.cell_ids = cell_ids
+
+ # display_funcs maps (row, col) -> formatting function
+
+ def default_display_func(x):
+ if is_float(x):
+ return '{:>.{precision}g}'.format(x, precision=self.precision)
+ else:
+ return x
+
+ self._display_funcs = defaultdict(lambda: default_display_func)
+
+ def _repr_html_(self):
+ """
+ Hooks into Jupyter notebook rich display system.
+ """
+ return self.render()
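+
+    # A minimal, hedged usage sketch (``df`` is an assumed DataFrame; styling
+    # works by passing functions that return CSS 'attr: value' strings, as
+    # the class docstring describes):
+    # >>> s = Styler(df).applymap(lambda v: 'color: red' if v < 0 else '')
+    # >>> html = s.render()   # or rely on _repr_html_ in a notebook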
+
+ @Appender(_shared_docs['to_excel'] % dict(
+ axes='index, columns', klass='Styler',
+ axes_single_arg="{0 or 'index', 1 or 'columns'}",
+ optional_by="""
+ by : str or list of str
+ Name or list of names which refer to the axis items.""",
+ versionadded_to_excel='\n .. versionadded:: 0.20'))
+ def to_excel(self, excel_writer, sheet_name='Sheet1', na_rep='',
+ float_format=None, columns=None, header=True, index=True,
+ index_label=None, startrow=0, startcol=0, engine=None,
+ merge_cells=True, encoding=None, inf_rep='inf', verbose=True,
+ freeze_panes=None):
+
+ from pandas.io.formats.excel import ExcelFormatter
+ formatter = ExcelFormatter(self, na_rep=na_rep, cols=columns,
+ header=header,
+ float_format=float_format, index=index,
+ index_label=index_label,
+ merge_cells=merge_cells,
+ inf_rep=inf_rep)
+ formatter.write(excel_writer, sheet_name=sheet_name, startrow=startrow,
+ startcol=startcol, freeze_panes=freeze_panes,
+ engine=engine)
+
+ def _translate(self):
+ """
+ Convert the DataFrame in `self.data` and the attrs from `_build_styles`
+ into a dictionary of {head, body, uuid, cellstyle}.
+ """
+ table_styles = self.table_styles or []
+ caption = self.caption
+ ctx = self.ctx
+ precision = self.precision
+ hidden_index = self.hidden_index
+ hidden_columns = self.hidden_columns
+ uuid = self.uuid or str(uuid1()).replace("-", "_")
+ ROW_HEADING_CLASS = "row_heading"
+ COL_HEADING_CLASS = "col_heading"
+ INDEX_NAME_CLASS = "index_name"
+
+ DATA_CLASS = "data"
+ BLANK_CLASS = "blank"
+ BLANK_VALUE = ""
+
+ def format_attr(pair):
+ return "{key}={value}".format(**pair)
+
+ # for sparsifying a MultiIndex
+ idx_lengths = _get_level_lengths(self.index)
+ col_lengths = _get_level_lengths(self.columns, hidden_columns)
+
+ cell_context = dict()
+
+ n_rlvls = self.data.index.nlevels
+ n_clvls = self.data.columns.nlevels
+ rlabels = self.data.index.tolist()
+ clabels = self.data.columns.tolist()
+
+ if n_rlvls == 1:
+ rlabels = [[x] for x in rlabels]
+ if n_clvls == 1:
+ clabels = [[x] for x in clabels]
+ clabels = list(zip(*clabels))
+
+ cellstyle = []
+ head = []
+
+ for r in range(n_clvls):
+ # Blank for Index columns...
+ row_es = [{"type": "th",
+ "value": BLANK_VALUE,
+ "display_value": BLANK_VALUE,
+ "is_visible": not hidden_index,
+ "class": " ".join([BLANK_CLASS])}] * (n_rlvls - 1)
+
+ # ... except maybe the last for columns.names
+ name = self.data.columns.names[r]
+ cs = [BLANK_CLASS if name is None else INDEX_NAME_CLASS,
+ "level{lvl}".format(lvl=r)]
+ name = BLANK_VALUE if name is None else name
+ row_es.append({"type": "th",
+ "value": name,
+ "display_value": name,
+ "class": " ".join(cs),
+ "is_visible": not hidden_index})
+
+ if clabels:
+ for c, value in enumerate(clabels[r]):
+ cs = [COL_HEADING_CLASS, "level{lvl}".format(lvl=r),
+ "col{col}".format(col=c)]
+ cs.extend(cell_context.get(
+ "col_headings", {}).get(r, {}).get(c, []))
+ es = {
+ "type": "th",
+ "value": value,
+ "display_value": value,
+ "class": " ".join(cs),
+ "is_visible": _is_visible(c, r, col_lengths),
+ }
+ colspan = col_lengths.get((r, c), 0)
+ if colspan > 1:
+ es["attributes"] = [
+ format_attr({"key": "colspan", "value": colspan})
+ ]
+ row_es.append(es)
+ head.append(row_es)
+
+ if (self.data.index.names and
+ com._any_not_none(*self.data.index.names) and
+ not hidden_index):
+ index_header_row = []
+
+ for c, name in enumerate(self.data.index.names):
+ cs = [INDEX_NAME_CLASS,
+ "level{lvl}".format(lvl=c)]
+ name = '' if name is None else name
+ index_header_row.append({"type": "th", "value": name,
+ "class": " ".join(cs)})
+
+ index_header_row.extend(
+ [{"type": "th",
+ "value": BLANK_VALUE,
+ "class": " ".join([BLANK_CLASS])
+ }] * (len(clabels[0]) - len(hidden_columns)))
+
+ head.append(index_header_row)
+
+ body = []
+ for r, idx in enumerate(self.data.index):
+ row_es = []
+ for c, value in enumerate(rlabels[r]):
+ rid = [ROW_HEADING_CLASS, "level{lvl}".format(lvl=c),
+ "row{row}".format(row=r)]
+ es = {
+ "type": "th",
+ "is_visible": (_is_visible(r, c, idx_lengths) and
+ not hidden_index),
+ "value": value,
+ "display_value": value,
+ "id": "_".join(rid[1:]),
+ "class": " ".join(rid)
+ }
+ rowspan = idx_lengths.get((c, r), 0)
+ if rowspan > 1:
+ es["attributes"] = [
+ format_attr({"key": "rowspan", "value": rowspan})
+ ]
+ row_es.append(es)
+
+ for c, col in enumerate(self.data.columns):
+ cs = [DATA_CLASS, "row{row}".format(row=r),
+ "col{col}".format(col=c)]
+ cs.extend(cell_context.get("data", {}).get(r, {}).get(c, []))
+ formatter = self._display_funcs[(r, c)]
+ value = self.data.iloc[r, c]
+ row_dict = {"type": "td",
+ "value": value,
+ "class": " ".join(cs),
+ "display_value": formatter(value),
+ "is_visible": (c not in hidden_columns)}
+ # only add an id if the cell has a style
+ if (self.cell_ids or
+ not (len(ctx[r, c]) == 1 and ctx[r, c][0] == '')):
+ row_dict["id"] = "_".join(cs[1:])
+ row_es.append(row_dict)
+ props = []
+ for x in ctx[r, c]:
+ # have to handle empty styles like ['']
+ if x.count(":"):
+ props.append(x.split(":"))
+ else:
+ props.append(['', ''])
+ cellstyle.append({'props': props,
+ 'selector': "row{row}_col{col}"
+ .format(row=r, col=c)})
+ body.append(row_es)
+
+ table_attr = self.table_attributes
+ use_mathjax = get_option("display.html.use_mathjax")
+ if not use_mathjax:
+ table_attr = table_attr or ''
+ if 'class="' in table_attr:
+ table_attr = table_attr.replace('class="',
+ 'class="tex2jax_ignore ')
+ else:
+ table_attr += ' class="tex2jax_ignore"'
+
+ return dict(head=head, cellstyle=cellstyle, body=body, uuid=uuid,
+ precision=precision, table_styles=table_styles,
+ caption=caption, table_attributes=table_attr)
+
+ def format(self, formatter, subset=None):
+ """
+ Format the text display value of cells.
+
+ .. versionadded:: 0.18.0
+
+ Parameters
+ ----------
+ formatter : str, callable, or dict
+ subset : IndexSlice
+ An argument to ``DataFrame.loc`` that restricts which elements
+ ``formatter`` is applied to.
+
+ Returns
+ -------
+ self : Styler
+
+ Notes
+ -----
+
+ ``formatter`` is either a single formatter ``a`` or a dict ``{column name: a}`` where
+ ``a`` is one of
+
+ - str: this will be wrapped in: ``a.format(x)``
+ - callable: called with the value of an individual cell
+
+ The default display value for numeric values is the "general" (``g``)
+ format with ``pd.options.display.precision`` precision.
+
+ Examples
+ --------
+
+ >>> df = pd.DataFrame(np.random.randn(4, 2), columns=['a', 'b'])
+ >>> df.style.format("{:.2%}")
+ >>> df['c'] = ['a', 'b', 'c', 'd']
+ >>> df.style.format({'c': str.upper})
+ """
+ if subset is None:
+ row_locs = range(len(self.data))
+ col_locs = range(len(self.data.columns))
+ else:
+ subset = _non_reducing_slice(subset)
+ if len(subset) == 1:
+ subset = subset, self.data.columns
+
+ sub_df = self.data.loc[subset]
+ row_locs = self.data.index.get_indexer_for(sub_df.index)
+ col_locs = self.data.columns.get_indexer_for(sub_df.columns)
+
+ if is_dict_like(formatter):
+ for col, col_formatter in formatter.items():
+ # formatter must be callable, so '{}' are converted to lambdas
+ col_formatter = _maybe_wrap_formatter(col_formatter)
+ col_num = self.data.columns.get_indexer_for([col])[0]
+
+ for row_num in row_locs:
+ self._display_funcs[(row_num, col_num)] = col_formatter
+ else:
+ # single scalar to format all cells with
+ locs = product(*(row_locs, col_locs))
+ for i, j in locs:
+ formatter = _maybe_wrap_formatter(formatter)
+ self._display_funcs[(i, j)] = formatter
+ return self
+
+ def render(self, **kwargs):
+ """
+ Render the built up styles to HTML.
+
+ Parameters
+ ----------
+ `**kwargs` : Any additional keyword arguments are passed through
+ to ``self.template.render``. This is useful when you need to provide
+ additional variables for a custom template.
+
+ .. versionadded:: 0.20
+
+ Returns
+ -------
+ rendered : str
+ the rendered HTML
+
+ Notes
+ -----
+ ``Styler`` objects have defined the ``_repr_html_`` method
+ which automatically calls ``self.render()`` when it's the
+ last item in a Notebook cell. When calling ``Styler.render()``
+ directly, wrap the result in ``IPython.display.HTML`` to view
+ the rendered HTML in the notebook.
+
+ Pandas uses the following keys in render. Arguments passed
+ in ``**kwargs`` take precedence, so think carefully if you want
+ to override them:
+
+ * head
+ * cellstyle
+ * body
+ * uuid
+ * precision
+ * table_styles
+ * caption
+ * table_attributes
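+
+ Examples
+ --------
+ A minimal sketch; the frame below is arbitrary, and wrapping in
+ ``HTML`` is only needed when the Styler is not the last expression
+ in a notebook cell:
+
+ >>> from IPython.display import HTML
+ >>> df = pd.DataFrame(np.random.randn(3, 2))
+ >>> html = df.style.render()
+ >>> HTML(html)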
+ """
+ self._compute()
+ # TODO: namespace all the pandas keys
+ d = self._translate()
+ # filter out empty styles, every cell will have a class
+ # but the list of props may just be [['', '']].
+ # so we have the nested any() calls below
+ trimmed = [x for x in d['cellstyle']
+ if any(any(y) for y in x['props'])]
+ d['cellstyle'] = trimmed
+ d.update(kwargs)
+ return self.template.render(**d)
+
+ def _update_ctx(self, attrs):
+ """
+ Update the state of the Styler.
+
+ Collects a mapping of {index_label: ['<property>: <value>']}.
+
+ attrs : Series or DataFrame
+ should contain strings of '<property>: <value>;<prop2>: <val2>'
+ Neither extra whitespace nor a final trailing ';' matters.
+ """
+ for row_label, v in attrs.iterrows():
+ for col_label, col in v.iteritems():
+ i = self.index.get_indexer([row_label])[0]
+ j = self.columns.get_indexer([col_label])[0]
+ for pair in col.rstrip(";").split(";"):
+ self.ctx[(i, j)].append(pair)
+
+ def _copy(self, deepcopy=False):
+ styler = Styler(self.data, precision=self.precision,
+ caption=self.caption, uuid=self.uuid,
+ table_styles=self.table_styles)
+ if deepcopy:
+ styler.ctx = copy.deepcopy(self.ctx)
+ styler._todo = copy.deepcopy(self._todo)
+ else:
+ styler.ctx = self.ctx
+ styler._todo = self._todo
+ return styler
+
+ def __copy__(self):
+ """
+ Deep copy by default.
+ """
+ return self._copy(deepcopy=False)
+
+ def __deepcopy__(self, memo):
+ return self._copy(deepcopy=True)
+
+ def clear(self):
+ """
+ Reset the styler, removing any previously applied styles.
+ Returns None.
+ """
+ self.ctx.clear()
+ self._todo = []
+
+ def _compute(self):
+ """
+ Execute the style functions built up in `self._todo`.
+
+ Relies on the conventions that all style functions go through
+ .apply or .applymap. They append styles to apply as tuples of
+
+ (application method, *args, **kwargs)
+ """
+ r = self
+ for func, args, kwargs in self._todo:
+ r = func(self)(*args, **kwargs)
+ return r
+
+ def _apply(self, func, axis=0, subset=None, **kwargs):
+ subset = slice(None) if subset is None else subset
+ subset = _non_reducing_slice(subset)
+ data = self.data.loc[subset]
+ if axis is not None:
+ result = data.apply(func, axis=axis,
+ result_type='expand', **kwargs)
+ result.columns = data.columns
+ else:
+ result = func(data, **kwargs)
+ if not isinstance(result, pd.DataFrame):
+ raise TypeError(
+ "Function {func!r} must return a DataFrame when "
+ "passed to `Styler.apply` with axis=None"
+ .format(func=func))
+ if not (result.index.equals(data.index) and
+ result.columns.equals(data.columns)):
+ msg = ('Result of {func!r} must have identical index and '
+ 'columns as the input'.format(func=func))
+ raise ValueError(msg)
+
+ result_shape = result.shape
+ expected_shape = self.data.loc[subset].shape
+ if result_shape != expected_shape:
+ msg = ("Function {func!r} returned the wrong shape.\n"
+ "Result has shape: {res}\n"
+ "Expected shape: {expect}".format(func=func,
+ res=result.shape,
+ expect=expected_shape))
+ raise ValueError(msg)
+ self._update_ctx(result)
+ return self
+
+ def apply(self, func, axis=0, subset=None, **kwargs):
+ """
+ Apply a function column-wise, row-wise, or table-wise,
+ updating the HTML representation with the result.
+
+ Parameters
+ ----------
+ func : function
+ ``func`` should take a Series or DataFrame (depending
+ on ``axis``), and return an object with the same shape.
+ Must return a DataFrame with identical index and
+ column labels when ``axis=None``
+ axis : int, str or None
+ apply to each column (``axis=0`` or ``'index'``)
+ or to each row (``axis=1`` or ``'columns'``) or
+ to the entire DataFrame at once with ``axis=None``
+ subset : IndexSlice
+ a valid indexer to limit ``data`` to *before* applying the
+ function. Consider using a pandas.IndexSlice
+ kwargs : dict
+ pass along to ``func``
+
+ Returns
+ -------
+ self : Styler
+
+ Notes
+ -----
+ The output shape of ``func`` should match the input, i.e. if
+ ``x`` is the input row, column, or table (depending on ``axis``),
+ then ``func(x).shape == x.shape`` should be true.
+
+ This is similar to ``DataFrame.apply``, except that ``axis=None``
+ applies the function to the entire DataFrame at once,
+ rather than column-wise or row-wise.
+
+ Examples
+ --------
+ >>> def highlight_max(x):
+ ... return ['background-color: yellow' if v == x.max() else ''
+ ... for v in x]
+ ...
+ >>> df = pd.DataFrame(np.random.randn(5, 2))
+ >>> df.style.apply(highlight_max)
+ """
+ self._todo.append((lambda instance: getattr(instance, '_apply'),
+ (func, axis, subset), kwargs))
+ return self
+
+ def _applymap(self, func, subset=None, **kwargs):
+ func = partial(func, **kwargs) # applymap doesn't take kwargs?
+ if subset is None:
+ subset = pd.IndexSlice[:]
+ subset = _non_reducing_slice(subset)
+ result = self.data.loc[subset].applymap(func)
+ self._update_ctx(result)
+ return self
+
+ def applymap(self, func, subset=None, **kwargs):
+ """
+ Apply a function elementwise, updating the HTML
+ representation with the result.
+
+ Parameters
+ ----------
+ func : function
+ ``func`` should take a scalar and return a scalar
+ subset : IndexSlice
+ a valid indexer to limit ``data`` to *before* applying the
+ function. Consider using a pandas.IndexSlice
+ kwargs : dict
+ pass along to ``func``
+
+ Returns
+ -------
+ self : Styler
+
+ See Also
+ --------
+ Styler.where
+ """
+ self._todo.append((lambda instance: getattr(instance, '_applymap'),
+ (func, subset), kwargs))
+ return self
+
+ def where(self, cond, value, other=None, subset=None, **kwargs):
+ """
+ Apply a function elementwise, updating the HTML
+ representation with a style which is selected in
+ accordance with the return value of a function.
+
+ .. versionadded:: 0.21.0
+
+ Parameters
+ ----------
+ cond : callable
+ ``cond`` should take a scalar and return a boolean
+ value : str
+ applied when ``cond`` returns true
+ other : str
+ applied when ``cond`` returns false
+ subset : IndexSlice
+ a valid indexer to limit ``data`` to *before* applying the
+ function. Consider using a pandas.IndexSlice
+ kwargs : dict
+ pass along to ``cond``
+
+ Returns
+ -------
+ self : Styler
+
+ See Also
+ --------
+ Styler.applymap
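+
+ Examples
+ --------
+ A minimal sketch; the cutoff of ``0`` and the two CSS strings are
+ illustrative only:
+
+ >>> df = pd.DataFrame(np.random.randn(5, 2))
+ >>> df.style.where(lambda v: v > 0, 'color: green', 'color: red')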
+ """
+
+ if other is None:
+ other = ''
+
+ return self.applymap(lambda val: value if cond(val) else other,
+ subset=subset, **kwargs)
+
+ def set_precision(self, precision):
+ """
+ Set the precision used to render.
+
+ Parameters
+ ----------
+ precision : int
+
+ Returns
+ -------
+ self : Styler
+ """
+ self.precision = precision
+ return self
+
+ def set_table_attributes(self, attributes):
+ """
+ Set the table attributes.
+
+ These are the items that show up in the opening ``<table>`` tag
+ in addition to the automatic (by default) id.
+
+ Parameters
+ ----------
+ attributes : string
+
+ Returns
+ -------
+ self : Styler
+
+ Examples
+ --------
+ >>> df = pd.DataFrame(np.random.randn(10, 4))
+ >>> df.style.set_table_attributes('class="pure-table"')
+ # ... <table class="pure-table"> ...
+ """
+ self.table_attributes = attributes
+ return self
+
+ def export(self):
+ """
+ Export the styles applied to the current Styler.
+
+ Can be applied to a second style with ``Styler.use``.
+
+ Returns
+ -------
+ styles : list
+
+ See Also
+ --------
+ Styler.use
+ """
+ return self._todo
+
+ def use(self, styles):
+ """
+ Set the styles on the current Styler, possibly using styles
+ from ``Styler.export``.
+
+ Parameters
+ ----------
+ styles : list
+ list of style functions
+
+ Returns
+ -------
+ self : Styler
+
+ See Also
+ --------
+ Styler.export
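+
+ Examples
+ --------
+ A minimal sketch; ``df1`` and ``df2`` are hypothetical frames with the
+ same column labels, so the exported styles of one can be re-applied to
+ the other:
+
+ >>> styles = df1.style.applymap(lambda v: 'color: red').export()
+ >>> df2.style.use(styles)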
+ """
+ self._todo.extend(styles)
+ return self
+
+ def set_uuid(self, uuid):
+ """
+ Set the uuid for a Styler.
+
+ Parameters
+ ----------
+ uuid : str
+
+ Returns
+ -------
+ self : Styler
+ """
+ self.uuid = uuid
+ return self
+
+ def set_caption(self, caption):
+ """
+ Set the caption on a Styler
+
+ Parameters
+ ----------
+ caption : str
+
+ Returns
+ -------
+ self : Styler
+ """
+ self.caption = caption
+ return self
+
+ def set_table_styles(self, table_styles):
+ """
+ Set the table styles on a Styler.
+
+ These are placed in a ``<style>`` tag before the generated HTML table.
+
+ Parameters
+ ----------
+ table_styles : list
+ Each individual table_style should be a dictionary with
+ ``selector`` and ``props`` keys. ``selector`` should be a CSS
+ selector that the style will be applied to (automatically
+ prefixed by the table's UUID) and ``props`` should be a list of
+ tuples with ``(attribute, value)``.
+
+ Returns
+ -------
+ self : Styler
+
+ Examples
+ --------
+ >>> df = pd.DataFrame(np.random.randn(10, 4))
+ >>> df.style.set_table_styles(
+ ... [{'selector': 'tr:hover',
+ ... 'props': [('background-color', 'yellow')]}]
+ ... )
+ """
+ self.table_styles = table_styles
+ return self
+
+ def hide_index(self):
+ """
+ Hide any indices from rendering.
+
+ .. versionadded:: 0.23.0
+
+ Returns
+ -------
+ self : Styler
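+
+ Examples
+ --------
+ A minimal sketch with an arbitrary frame:
+
+ >>> df = pd.DataFrame(np.random.randn(5, 2))
+ >>> df.style.hide_index()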
+ """
+ self.hidden_index = True
+ return self
+
+ def hide_columns(self, subset):
+ """
+ Hide columns from rendering.
+
+ .. versionadded:: 0.23.0
+
+ Parameters
+ ----------
+ subset : IndexSlice
+ An argument to ``DataFrame.loc`` that identifies which columns
+ are hidden.
+
+ Returns
+ -------
+ self : Styler
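+
+ Examples
+ --------
+ A minimal sketch; the column labels are arbitrary:
+
+ >>> df = pd.DataFrame(np.random.randn(5, 2), columns=['a', 'b'])
+ >>> df.style.hide_columns(['b'])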
+ """
+ subset = _non_reducing_slice(subset)
+ hidden_df = self.data.loc[subset]
+ self.hidden_columns = self.columns.get_indexer_for(hidden_df.columns)
+ return self
+
+ # -----------------------------------------------------------------------
+ # A collection of "builtin" styles
+ # -----------------------------------------------------------------------
+
+ @staticmethod
+ def _highlight_null(v, null_color):
+ return ('background-color: {color}'.format(color=null_color)
+ if pd.isna(v) else '')
+
+ def highlight_null(self, null_color='red'):
+ """
+ Shade the background ``null_color`` for missing values.
+
+ Parameters
+ ----------
+ null_color : str
+
+ Returns
+ -------
+ self : Styler
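+
+ Examples
+ --------
+ A minimal sketch; the missing value and the color are illustrative only:
+
+ >>> df = pd.DataFrame({'a': [1.0, np.nan, 3.0]})
+ >>> df.style.highlight_null(null_color='lightgrey')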
+ """
+ self.applymap(self._highlight_null, null_color=null_color)
+ return self
+
+ def background_gradient(self, cmap='PuBu', low=0, high=0, axis=0,
+ subset=None, text_color_threshold=0.408):
+ """
+ Color the background in a gradient according to
+ the data in each column (optionally row).
+
+ Requires matplotlib.
+
+ Parameters
+ ----------
+ cmap : str or colormap
+ matplotlib colormap
+ low, high : float
+ compress the range by these values.
+ axis : int or str
+ 1 or 'columns' for columnwise, 0 or 'index' for rowwise
+ subset : IndexSlice
+ a valid slice for ``data`` to limit the style application to
+ text_color_threshold : float or int
+ luminance threshold for determining text color. Facilitates text
+ visibility across varying background colors. From 0 to 1.
+ 0 = all text is dark colored, 1 = all text is light colored.
+
+ .. versionadded:: 0.24.0
+
+ Returns
+ -------
+ self : Styler
+
+ Raises
+ ------
+ ValueError
+ If ``text_color_threshold`` is not a value from 0 to 1.
+
+ Notes
+ -----
+ Set ``text_color_threshold`` or tune ``low`` and ``high`` to keep the
+ text legible by not using the entire range of the color map. The range
+ of the data is extended by ``low * (x.max() - x.min())`` and ``high *
+ (x.max() - x.min())`` before normalizing.
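+
+ Examples
+ --------
+ A minimal sketch; any valid matplotlib colormap name can be passed for
+ ``cmap`` (``'viridis'`` below is just one choice):
+
+ >>> df = pd.DataFrame(np.random.randn(10, 4))
+ >>> df.style.background_gradient(cmap='viridis', low=0.5)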
+ """
+ subset = _maybe_numeric_slice(self.data, subset)
+ subset = _non_reducing_slice(subset)
+ self.apply(self._background_gradient, cmap=cmap, subset=subset,
+ axis=axis, low=low, high=high,
+ text_color_threshold=text_color_threshold)
+ return self
+
+ @staticmethod
+ def _background_gradient(s, cmap='PuBu', low=0, high=0,
+ text_color_threshold=0.408):
+ """
+ Color background in a range according to the data.
+ """
+ if (not isinstance(text_color_threshold, (float, int)) or
+ not 0 <= text_color_threshold <= 1):
+ msg = "`text_color_threshold` must be a value from 0 to 1."
+ raise ValueError(msg)
+
+ with _mpl(Styler.background_gradient) as (plt, colors):
+ smin = s.values.min()
+ smax = s.values.max()
+ rng = smax - smin
+ # extend lower / upper bounds, compresses color range
+ norm = colors.Normalize(smin - (rng * low), smax + (rng * high))
+ # matplotlib colors.Normalize modifies inplace?
+ # https://github.com/matplotlib/matplotlib/issues/5427
+ rgbas = plt.cm.get_cmap(cmap)(norm(s.values))
+
+ def relative_luminance(rgba):
+ """
+ Calculate relative luminance of a color.
+
+ The calculation adheres to the W3C standards
+ (https://www.w3.org/WAI/GL/wiki/Relative_luminance)
+
+ Parameters
+ ----------
+ color : rgb or rgba tuple
+
+ Returns
+ -------
+ float
+ The relative luminance as a value from 0 to 1
+ """
+ r, g, b = (
+ x / 12.92 if x <= 0.03928 else ((x + 0.055) / 1.055) ** 2.4
+ for x in rgba[:3]
+ )
+ return 0.2126 * r + 0.7152 * g + 0.0722 * b
+
+ def css(rgba):
+ dark = relative_luminance(rgba) < text_color_threshold
+ text_color = '#f1f1f1' if dark else '#000000'
+ return 'background-color: {b};color: {c};'.format(
+ b=colors.rgb2hex(rgba), c=text_color
+ )
+
+ if s.ndim == 1:
+ return [css(rgba) for rgba in rgbas]
+ else:
+ return pd.DataFrame(
+ [[css(rgba) for rgba in row] for row in rgbas],
+ index=s.index, columns=s.columns
+ )
+
+ def set_properties(self, subset=None, **kwargs):
+ """
+ Convenience method for setting one or more non-data dependent
+ properties on each cell.
+
+ Parameters
+ ----------
+ subset : IndexSlice
+ a valid slice for ``data`` to limit the style application to
+ kwargs : dict
+ property: value pairs to be set for each cell
+
+ Returns
+ -------
+ self : Styler
+
+ Examples
+ --------
+ >>> df = pd.DataFrame(np.random.randn(10, 4))
+ >>> df.style.set_properties(color="white", align="right")
+ >>> df.style.set_properties(**{'background-color': 'yellow'})
+ """
+ values = ';'.join('{p}: {v}'.format(p=p, v=v)
+ for p, v in kwargs.items())
+ f = lambda x: values
+ return self.applymap(f, subset=subset)
+
+ @staticmethod
+ def _bar(s, align, colors, width=100, vmin=None, vmax=None):
+ """
+ Draw bar chart in dataframe cells.
+ """
+ # Get input value range.
+ smin = s.min() if vmin is None else vmin
+ if isinstance(smin, ABCSeries):
+ smin = smin.min()
+ smax = s.max() if vmax is None else vmax
+ if isinstance(smax, ABCSeries):
+ smax = smax.max()
+ if align == 'mid':
+ smin = min(0, smin)
+ smax = max(0, smax)
+ elif align == 'zero':
+ # For "zero" mode, we want the range to be symmetrical around zero.
+ smax = max(abs(smin), abs(smax))
+ smin = -smax
+ # Transform to percent-range of linear-gradient
+ normed = width * (s.values - smin) / (smax - smin + 1e-12)
+ zero = -width * smin / (smax - smin + 1e-12)
+
+ def css_bar(start, end, color):
+ """
+ Generate CSS code to draw a bar from start to end.
+ """
+ css = 'width: 10em; height: 80%;'
+ if end > start:
+ css += 'background: linear-gradient(90deg,'
+ if start > 0:
+ css += ' transparent {s:.1f}%, {c} {s:.1f}%, '.format(
+ s=start, c=color
+ )
+ css += '{c} {e:.1f}%, transparent {e:.1f}%)'.format(
+ e=min(end, width), c=color,
+ )
+ return css
+
+ def css(x):
+ if pd.isna(x):
+ return ''
+
+ # avoid deprecated indexing `colors[x > zero]`
+ color = colors[1] if x > zero else colors[0]
+
+ if align == 'left':
+ return css_bar(0, x, color)
+ else:
+ return css_bar(min(x, zero), max(x, zero), color)
+
+ if s.ndim == 1:
+ return [css(x) for x in normed]
+ else:
+ return pd.DataFrame(
+ [[css(x) for x in row] for row in normed],
+ index=s.index, columns=s.columns
+ )
+
+ def bar(self, subset=None, axis=0, color='#d65f5f', width=100,
+ align='left', vmin=None, vmax=None):
+ """
+ Draw bar chart in the cell backgrounds.
+
+ Parameters
+ ----------
+ subset : IndexSlice, optional
+ A valid slice for `data` to limit the style application to.
+ axis : int, str or None, default 0
+ Apply to each column (`axis=0` or `'index'`)
+ or to each row (`axis=1` or `'columns'`) or
+ to the entire DataFrame at once with `axis=None`.
+ color : str or 2-tuple/list
+ If a str is passed, the color is the same for both
+ negative and positive numbers. If 2-tuple/list is used, the
+ first element is the color_negative and the second is the
+ color_positive (eg: ['#d65f5f', '#5fba7d']).
+ width : float, default 100
+ A number between 0 and 100. The largest value will cover `width`
+ percent of the cell's width.
+ align : {'left', 'zero', 'mid'}, default 'left'
+ How to align the bars with the cells.
+
+ - 'left' : the min value starts at the left of the cell.
+ - 'zero' : a value of zero is located at the center of the cell.
+ - 'mid' : the center of the cell is at (max-min)/2, or
+ if values are all negative (positive) the zero is aligned
+ at the right (left) of the cell.
+
+ .. versionadded:: 0.20.0
+
+ vmin : float, optional
+ Minimum bar value, defining the left hand limit
+ of the bar drawing range, lower values are clipped to `vmin`.
+ When None (default): the minimum value of the data will be used.
+
+ .. versionadded:: 0.24.0
+
+ vmax : float, optional
+ Maximum bar value, defining the right hand limit
+ of the bar drawing range, higher values are clipped to `vmax`.
+ When None (default): the maximum value of the data will be used.
+
+ .. versionadded:: 0.24.0
+
+ Returns
+ -------
+ self : Styler
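+
+ Examples
+ --------
+ A minimal sketch; the two hex colors (negative, positive) are arbitrary:
+
+ >>> df = pd.DataFrame(np.random.randn(10, 2))
+ >>> df.style.bar(align='mid', color=['#d65f5f', '#5fba7d'])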
+ """
+ if align not in ('left', 'zero', 'mid'):
+ raise ValueError("`align` must be one of {'left', 'zero', 'mid'}")
+
+ if not (is_list_like(color)):
+ color = [color, color]
+ elif len(color) == 1:
+ color = [color[0], color[0]]
+ elif len(color) > 2:
+ raise ValueError("`color` must be string or a list-like"
+ " of length 2: [`color_neg`, `color_pos`]"
+ " (eg: color=['#d65f5f', '#5fba7d'])")
+
+ subset = _maybe_numeric_slice(self.data, subset)
+ subset = _non_reducing_slice(subset)
+ self.apply(self._bar, subset=subset, axis=axis,
+ align=align, colors=color, width=width,
+ vmin=vmin, vmax=vmax)
+
+ return self
+
+ def highlight_max(self, subset=None, color='yellow', axis=0):
+ """
+ Highlight the maximum by shading the background.
+
+ Parameters
+ ----------
+ subset : IndexSlice, default None
+ a valid slice for ``data`` to limit the style application to
+ color : str, default 'yellow'
+ axis : int, str, or None; default 0
+ 0 or 'index' for columnwise (default), 1 or 'columns' for rowwise,
+ or ``None`` for tablewise
+
+ Returns
+ -------
+ self : Styler
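+
+ Examples
+ --------
+ A minimal sketch; ``axis=1`` highlights the maximum of each row rather
+ than of each column:
+
+ >>> df = pd.DataFrame(np.random.randn(5, 3))
+ >>> df.style.highlight_max(axis=1, color='lightgreen')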
+ """
+ return self._highlight_handler(subset=subset, color=color, axis=axis,
+ max_=True)
+
+ def highlight_min(self, subset=None, color='yellow', axis=0):
+ """
+ Highlight the minimum by shading the background.
+
+ Parameters
+ ----------
+ subset : IndexSlice, default None
+ a valid slice for ``data`` to limit the style application to
+ color : str, default 'yellow'
+ axis : int, str, or None; default 0
+ 0 or 'index' for columnwise (default), 1 or 'columns' for rowwise,
+ or ``None`` for tablewise
+
+ Returns
+ -------
+ self : Styler
+ """
+ return self._highlight_handler(subset=subset, color=color, axis=axis,
+ max_=False)
+
+ def _highlight_handler(self, subset=None, color='yellow', axis=None,
+ max_=True):
+ subset = _non_reducing_slice(_maybe_numeric_slice(self.data, subset))
+ self.apply(self._highlight_extrema, color=color, axis=axis,
+ subset=subset, max_=max_)
+ return self
+
+ @staticmethod
+ def _highlight_extrema(data, color='yellow', max_=True):
+ """
+ Highlight the min or max in a Series or DataFrame.
+ """
+ attr = 'background-color: {0}'.format(color)
+ if data.ndim == 1: # Series from .apply
+ if max_:
+ extrema = data == data.max()
+ else:
+ extrema = data == data.min()
+ return [attr if v else '' for v in extrema]
+ else: # DataFrame from .apply(axis=None)
+ if max_:
+ extrema = data == data.max().max()
+ else:
+ extrema = data == data.min().min()
+ return pd.DataFrame(np.where(extrema, attr, ''),
+ index=data.index, columns=data.columns)
+
+ @classmethod
+ def from_custom_template(cls, searchpath, name):
+ """
+ Factory function for creating a subclass of ``Styler``
+ with a custom template and Jinja environment.
+
+ Parameters
+ ----------
+ searchpath : str or list
+ Path or paths of directories containing the templates
+ name : str
+ Name of your custom template to use for rendering
+
+ Returns
+ -------
+ MyStyler : subclass of Styler
+ has the correct ``env`` and ``template`` class attributes set.
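+
+ Examples
+ --------
+ A minimal sketch; ``"templates"`` and ``"myhtml.tpl"`` are hypothetical
+ paths supplied by the caller, and ``df`` is an arbitrary DataFrame:
+
+ >>> EasyStyler = Styler.from_custom_template("templates", "myhtml.tpl")
+ >>> EasyStyler(df)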
+ """
+ loader = ChoiceLoader([
+ FileSystemLoader(searchpath),
+ cls.loader,
+ ])
+
+ class MyStyler(cls):
+ env = Environment(loader=loader)
+ template = env.get_template(name)
+
+ return MyStyler
+
+ def pipe(self, func, *args, **kwargs):
+ """
+ Apply ``func(self, *args, **kwargs)``, and return the result.
+
+ .. versionadded:: 0.24.0
+
+ Parameters
+ ----------
+ func : function
+ Function to apply to the Styler. Alternatively, a
+ ``(callable, keyword)`` tuple where ``keyword`` is a string
+ indicating the keyword of ``callable`` that expects the Styler.
+ *args, **kwargs :
+ Arguments passed to `func`.
+
+ Returns
+ -------
+ object :
+ The value returned by ``func``.
+
+ See Also
+ --------
+ DataFrame.pipe : Analogous method for DataFrame.
+ Styler.apply : Apply a function row-wise, column-wise, or table-wise to
+ modify the dataframe's styling.
+
+ Notes
+ -----
+ Like :meth:`DataFrame.pipe`, this method can simplify the
+ application of several user-defined functions to a styler. Instead
+ of writing:
+
+ .. code-block:: python
+
+ f(g(df.style.set_precision(3), arg1=a), arg2=b, arg3=c)
+
+ users can write:
+
+ .. code-block:: python
+
+ (df.style.set_precision(3)
+ .pipe(g, arg1=a)
+ .pipe(f, arg2=b, arg3=c))
+
+ In particular, this allows users to define functions that take a
+ styler object, along with other parameters, and return the styler after
+ making styling changes (such as calling :meth:`Styler.apply` or
+ :meth:`Styler.set_properties`). Using ``.pipe``, these user-defined
+ style "transformations" can be interleaved with calls to the built-in
+ Styler interface.
+
+ Examples
+ --------
+ >>> def format_conversion(styler):
+ ... return (styler.set_properties(**{'text-align': 'right'})
+ ... .format({'conversion': '{:.1%}'}))
+
+ The user-defined ``format_conversion`` function above can be called
+ within a sequence of other style modifications:
+
+ >>> df = pd.DataFrame({'trial': list(range(5)),
+ ... 'conversion': [0.75, 0.85, np.nan, 0.7, 0.72]})
+ >>> (df.style
+ ... .highlight_min(subset=['conversion'], color='yellow')
+ ... .pipe(format_conversion)
+ ... .set_caption("Results with minimum conversion highlighted."))
+ """
+ return com._pipe(self, func, *args, **kwargs)
+
+
+def _is_visible(idx_row, idx_col, lengths):
+ """
+ Return True when the cell at (idx_row, idx_col) starts a span in
+ ``lengths`` and therefore should be rendered.
+ """
+ return (idx_col, idx_row) in lengths
+
+
+def _get_level_lengths(index, hidden_elements=None):
+ """
+ Given an index, find the level length for each element.
+
+ Optional argument is a list of index positions which
+ should not be visible.
+
+ Result is a dictionary of (level, initial_position): span
+ """
+ sentinel = com.sentinel_factory()
+ levels = index.format(sparsify=sentinel, adjoin=False, names=False)
+
+ if hidden_elements is None:
+ hidden_elements = []
+
+ lengths = {}
+ if index.nlevels == 1:
+ for i, value in enumerate(levels):
+ if i not in hidden_elements:
+ lengths[(0, i)] = 1
+ return lengths
+
+ for i, lvl in enumerate(levels):
+ for j, row in enumerate(lvl):
+ if not get_option('display.multi_sparse'):
+ lengths[(i, j)] = 1
+ elif (row != sentinel) and (j not in hidden_elements):
+ last_label = j
+ lengths[(i, last_label)] = 1
+ elif (row != sentinel):
+ # even if its hidden, keep track of it in case
+ # length >1 and later elements are visible
+ last_label = j
+ lengths[(i, last_label)] = 0
+ elif j not in hidden_elements:
+ lengths[(i, last_label)] += 1
+
+ non_zero_lengths = {
+ element: length for element, length in lengths.items() if length >= 1}
+
+ return non_zero_lengths
+
+
+def _maybe_wrap_formatter(formatter):
+ if is_string_like(formatter):
+ return lambda x: formatter.format(x)
+ elif callable(formatter):
+ return formatter
+ else:
+ msg = ("Expected a template string or callable, got {formatter} "
+ "instead".format(formatter=formatter))
+ raise TypeError(msg)
diff --git a/contrib/python/pandas/py2/pandas/io/formats/templates/html.tpl b/contrib/python/pandas/py2/pandas/io/formats/templates/html.tpl
new file mode 100644
index 00000000000..15feafcea68
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/io/formats/templates/html.tpl
@@ -0,0 +1,70 @@
+{# Update the template_structure.html document too #}
+{%- block before_style -%}{%- endblock before_style -%}
+{% block style %}
+<style type="text/css" >
+{% block table_styles %}
+{% for s in table_styles %}
+ #T_{{uuid}} {{s.selector}} {
+ {% for p,val in s.props %}
+ {{p}}: {{val}};
+ {% endfor -%}
+ }
+{%- endfor -%}
+{% endblock table_styles %}
+{% block before_cellstyle %}{% endblock before_cellstyle %}
+{% block cellstyle %}
+{%- for s in cellstyle %}
+ #T_{{uuid}}{{s.selector}} {
+ {% for p,val in s.props %}
+ {{p}}: {{val}};
+ {% endfor %}
+ }
+{%- endfor -%}
+{%- endblock cellstyle %}
+</style>
+{%- endblock style %}
+{%- block before_table %}{% endblock before_table %}
+{%- block table %}
+<table id="T_{{uuid}}" {% if table_attributes %}{{ table_attributes }}{% endif %}>
+{%- block caption %}
+{%- if caption -%}
+ <caption>{{caption}}</caption>
+{%- endif -%}
+{%- endblock caption %}
+{%- block thead %}
+<thead>
+ {%- block before_head_rows %}{% endblock %}
+ {%- for r in head %}
+ {%- block head_tr scoped %}
+ <tr>
+ {%- for c in r %}
+ {%- if c.is_visible != False %}
+ <{{ c.type }} class="{{c.class}}" {{ c.attributes|join(" ") }}>{{c.value}}</{{ c.type }}>
+ {%- endif %}
+ {%- endfor %}
+ </tr>
+ {%- endblock head_tr %}
+ {%- endfor %}
+ {%- block after_head_rows %}{% endblock %}
+</thead>
+{%- endblock thead %}
+{%- block tbody %}
+<tbody>
+ {% block before_rows %}{% endblock before_rows %}
+ {% for r in body %}
+ {% block tr scoped %}
+ <tr>
+ {% for c in r %}
+ {% if c.is_visible != False %}
+ <{{ c.type }} {% if c.id is defined -%} id="T_{{ uuid }}{{ c.id }}" {%- endif %} class="{{ c.class }}" {{ c.attributes|join(" ") }}>{{ c.display_value }}</{{ c.type }}>
+ {% endif %}
+ {%- endfor %}
+ </tr>
+ {% endblock tr %}
+ {%- endfor %}
+ {%- block after_rows %}{%- endblock after_rows %}
+</tbody>
+{%- endblock tbody %}
+</table>
+{%- endblock table %}
+{%- block after_table %}{% endblock after_table %}
diff --git a/contrib/python/pandas/py2/pandas/io/formats/terminal.py b/contrib/python/pandas/py2/pandas/io/formats/terminal.py
new file mode 100644
index 00000000000..cf2383955d5
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/io/formats/terminal.py
@@ -0,0 +1,152 @@
+"""
+get_terminal_size() -- return width and height of terminal as a tuple
+
+code from:
+http://stackoverflow.com/questions/566746/how-to-get-console-window-width-in-python
+
+written by
+Harco Kuppens (http://stackoverflow.com/users/825214/harco-kuppens)
+
+It is mentioned in the stackoverflow response that this code works
+on linux, os x, windows and cygwin (windows).
+"""
+from __future__ import print_function
+
+import os
+import shutil
+import subprocess
+
+from pandas.compat import PY3
+
+__all__ = ['get_terminal_size', 'is_terminal']
+
+
+def get_terminal_size():
+ """
+ Detect terminal size and return tuple = (width, height).
+
+ Only to be used when running in a terminal. Note that the IPython notebook,
+ IPython zmq frontends, or IDLE do not run in a terminal.
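+
+ Examples
+ --------
+ The output depends on the terminal; (80, 25) is only the fallback default:
+
+ >>> get_terminal_size()  # doctest: +SKIP
+ (80, 25)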
+ """
+ import platform
+
+ if PY3:
+ return shutil.get_terminal_size()
+
+ current_os = platform.system()
+ tuple_xy = None
+ if current_os == 'Windows':
+ tuple_xy = _get_terminal_size_windows()
+ if tuple_xy is None:
+ tuple_xy = _get_terminal_size_tput()
+ # needed for Windows' Python in Cygwin's xterm!
+ if (current_os == 'Linux' or current_os == 'Darwin' or
+ current_os.startswith('CYGWIN')):
+ tuple_xy = _get_terminal_size_linux()
+ if tuple_xy is None:
+ tuple_xy = (80, 25) # default value
+ return tuple_xy
+
+
+def is_terminal():
+ """
+ Detect if Python is running in a terminal.
+
+ Returns True if Python is running in a terminal or False if not.
+ """
+ try:
+ ip = get_ipython()
+ except NameError: # assume standard Python interpreter in a terminal
+ return True
+ else:
+ if hasattr(ip, 'kernel'): # IPython as a Jupyter kernel
+ return False
+ else: # IPython in a terminal
+ return True
+
+
+def _get_terminal_size_windows():
+
+ try:
+ from ctypes import windll, create_string_buffer
+
+ # stdin handle is -10
+ # stdout handle is -11
+ # stderr handle is -12
+
+ h = windll.kernel32.GetStdHandle(-12)
+ csbi = create_string_buffer(22)
+ res = windll.kernel32.GetConsoleScreenBufferInfo(h, csbi)
+ except (AttributeError, ValueError):
+ return None
+ if res:
+ import struct
+ (bufx, bufy, curx, cury, wattr, left, top, right, bottom, maxx,
+ maxy) = struct.unpack("hhhhHhhhhhh", csbi.raw)
+ sizex = right - left + 1
+ sizey = bottom - top + 1
+ return sizex, sizey
+ else:
+ return None
+
+
+def _get_terminal_size_tput():
+ # get terminal width
+ # src: http://stackoverflow.com/questions/263890/how-do-i-find-the-width
+ # -height-of-a-terminal-window
+
+ try:
+ proc = subprocess.Popen(["tput", "cols"],
+ stdin=subprocess.PIPE,
+ stdout=subprocess.PIPE)
+ output_cols = proc.communicate(input=None)
+ proc = subprocess.Popen(["tput", "lines"],
+ stdin=subprocess.PIPE,
+ stdout=subprocess.PIPE)
+ output_rows = proc.communicate(input=None)
+ except OSError:
+ return None
+
+ try:
+ # Some terminals (e.g. spyder) may report a terminal size of '',
+ # making the `int` fail.
+
+ cols = int(output_cols[0])
+ rows = int(output_rows[0])
+ return cols, rows
+ except (ValueError, IndexError):
+ return None
+
+
+def _get_terminal_size_linux():
+ def ioctl_GWINSZ(fd):
+ try:
+ import fcntl
+ import termios
+ import struct
+ cr = struct.unpack(
+ 'hh', fcntl.ioctl(fd, termios.TIOCGWINSZ, '1234'))
+ except (struct.error, IOError):
+ return None
+ return cr
+ cr = ioctl_GWINSZ(0) or ioctl_GWINSZ(1) or ioctl_GWINSZ(2)
+ if not cr:
+ try:
+ fd = os.open(os.ctermid(), os.O_RDONLY)
+ cr = ioctl_GWINSZ(fd)
+ os.close(fd)
+ except OSError:
+ pass
+ if not cr or cr == (0, 0):
+ try:
+ from os import environ as env
+ cr = (env['LINES'], env['COLUMNS'])
+ except (ValueError, KeyError):
+ return None
+ return int(cr[1]), int(cr[0])
+
+
+if __name__ == "__main__":
+ sizex, sizey = get_terminal_size()
+ print('width = {w} height = {h}'.format(w=sizex, h=sizey))
diff --git a/contrib/python/pandas/py2/pandas/io/gbq.py b/contrib/python/pandas/py2/pandas/io/gbq.py
new file mode 100644
index 00000000000..639b68d433a
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/io/gbq.py
@@ -0,0 +1,162 @@
+""" Google BigQuery support """
+
+import warnings
+
+
+def _try_import():
+ # since pandas is a dependency of pandas-gbq
+ # we need to import on first use
+ try:
+ import pandas_gbq
+ except ImportError:
+
+ # give a nice error message
+ raise ImportError("Load data from Google BigQuery\n"
+ "\n"
+ "the pandas-gbq package is not installed\n"
+ "see the docs: https://pandas-gbq.readthedocs.io\n"
+ "\n"
+ "you can install via pip or conda:\n"
+ "pip install pandas-gbq\n"
+ "conda install pandas-gbq -c conda-forge\n")
+
+ return pandas_gbq
+
+
+def read_gbq(query, project_id=None, index_col=None, col_order=None,
+ reauth=False, auth_local_webserver=False, dialect=None,
+ location=None, configuration=None, credentials=None,
+ private_key=None, verbose=None):
+ """
+ Load data from Google BigQuery.
+
+ This function requires the `pandas-gbq package
+ <https://pandas-gbq.readthedocs.io>`__.
+
+ See the `How to authenticate with Google BigQuery
+ <https://pandas-gbq.readthedocs.io/en/latest/howto/authentication.html>`__
+ guide for authentication instructions.
+
+ Parameters
+ ----------
+ query : str
+ SQL-Like Query to return data values.
+ project_id : str, optional
+ Google BigQuery Account project ID. Optional when available from
+ the environment.
+ index_col : str, optional
+ Name of result column to use for index in results DataFrame.
+ col_order : list(str), optional
+ List of BigQuery column names in the desired order for results
+ DataFrame.
+ reauth : boolean, default False
+ Force Google BigQuery to re-authenticate the user. This is useful
+ if multiple accounts are used.
+ auth_local_webserver : boolean, default False
+ Use the `local webserver flow`_ instead of the `console flow`_
+ when getting user credentials.
+
+ .. _local webserver flow:
+ http://google-auth-oauthlib.readthedocs.io/en/latest/reference/google_auth_oauthlib.flow.html#google_auth_oauthlib.flow.InstalledAppFlow.run_local_server
+ .. _console flow:
+ http://google-auth-oauthlib.readthedocs.io/en/latest/reference/google_auth_oauthlib.flow.html#google_auth_oauthlib.flow.InstalledAppFlow.run_console
+
+ *New in version 0.2.0 of pandas-gbq*.
+ dialect : str, default 'legacy'
+ Note: The default value is changing to 'standard' in a future version.
+
+ SQL syntax dialect to use. Value can be one of:
+
+ ``'legacy'``
+ Use BigQuery's legacy SQL dialect. For more information see
+ `BigQuery Legacy SQL Reference
+ <https://cloud.google.com/bigquery/docs/reference/legacy-sql>`__.
+ ``'standard'``
+ Use BigQuery's standard SQL, which is
+ compliant with the SQL 2011 standard. For more information
+ see `BigQuery Standard SQL Reference
+ <https://cloud.google.com/bigquery/docs/reference/standard-sql/>`__.
+
+ .. versionchanged:: 0.24.0
+ location : str, optional
+ Location where the query job should run. See the `BigQuery locations
+ documentation
+ <https://cloud.google.com/bigquery/docs/dataset-locations>`__ for a
+ list of available locations. The location must match that of any
+ datasets used in the query.
+
+ *New in version 0.5.0 of pandas-gbq*.
+ configuration : dict, optional
+ Query config parameters for job processing.
+ For example:
+
+ configuration = {'query': {'useQueryCache': False}}
+
+ For more information see `BigQuery REST API Reference
+ <https://cloud.google.com/bigquery/docs/reference/rest/v2/jobs#configuration.query>`__.
+ credentials : google.auth.credentials.Credentials, optional
+ Credentials for accessing Google APIs. Use this parameter to override
+ default credentials, such as to use Compute Engine
+ :class:`google.auth.compute_engine.Credentials` or Service Account
+ :class:`google.oauth2.service_account.Credentials` directly.
+
+ *New in version 0.8.0 of pandas-gbq*.
+
+ .. versionadded:: 0.24.0
+ private_key : str, deprecated
+ Deprecated in pandas-gbq version 0.8.0. Use the ``credentials``
+ parameter and
+ :func:`google.oauth2.service_account.Credentials.from_service_account_info`
+ or
+ :func:`google.oauth2.service_account.Credentials.from_service_account_file`
+ instead.
+
+ Service account private key in JSON format. Can be file path
+ or string contents. This is useful for remote server
+ authentication (eg. Jupyter/IPython notebook on remote host).
+ verbose : None, deprecated
+ Deprecated in pandas-gbq version 0.4.0. Use the `logging module to
+ adjust verbosity instead
+ <https://pandas-gbq.readthedocs.io/en/latest/intro.html#logging>`__.
+
+ Returns
+ -------
+ df: DataFrame
+ DataFrame representing results of query.
+
+ See Also
+ --------
+ pandas_gbq.read_gbq : This function in the pandas-gbq library.
+ pandas.DataFrame.to_gbq : Write a DataFrame to Google BigQuery.
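+
+ Examples
+ --------
+ A minimal sketch; the query, project id, and table names below are
+ placeholders to be replaced by the caller:
+
+ >>> sql = "SELECT my_col FROM my_dataset.my_table LIMIT 10"
+ >>> df = pd.read_gbq(sql, project_id="my-project", dialect="standard")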
+ """
+ pandas_gbq = _try_import()
+
+ if dialect is None:
+ dialect = "legacy"
+ warnings.warn(
+ 'The default value for dialect is changing to "standard" in a '
+ 'future version of pandas-gbq. Pass in dialect="legacy" to '
+ "disable this warning.",
+ FutureWarning,
+ stacklevel=2,
+ )
+
+ return pandas_gbq.read_gbq(
+ query, project_id=project_id, index_col=index_col,
+ col_order=col_order, reauth=reauth,
+ auth_local_webserver=auth_local_webserver, dialect=dialect,
+ location=location, configuration=configuration,
+ credentials=credentials, verbose=verbose, private_key=private_key)
+
+
+def to_gbq(dataframe, destination_table, project_id=None, chunksize=None,
+ reauth=False, if_exists='fail', auth_local_webserver=False,
+ table_schema=None, location=None, progress_bar=True,
+ credentials=None, verbose=None, private_key=None):
+ pandas_gbq = _try_import()
+ return pandas_gbq.to_gbq(
+ dataframe, destination_table, project_id=project_id,
+ chunksize=chunksize, reauth=reauth, if_exists=if_exists,
+ auth_local_webserver=auth_local_webserver, table_schema=table_schema,
+ location=location, progress_bar=progress_bar,
+ credentials=credentials, verbose=verbose, private_key=private_key)
diff --git a/contrib/python/pandas/py2/pandas/io/gcs.py b/contrib/python/pandas/py2/pandas/io/gcs.py
new file mode 100644
index 00000000000..aa1cb648f05
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/io/gcs.py
@@ -0,0 +1,16 @@
+""" GCS support for remote file interactivity """
+try:
+ import gcsfs
+except ImportError:
+ raise ImportError("The gcsfs library is required to handle GCS files")
+
+
+def get_filepath_or_buffer(filepath_or_buffer, encoding=None,
+ compression=None, mode=None):
+
+ if mode is None:
+ mode = 'rb'
+
+ fs = gcsfs.GCSFileSystem()
+ filepath_or_buffer = fs.open(filepath_or_buffer, mode)
+ return filepath_or_buffer, None, compression, True
diff --git a/contrib/python/pandas/py2/pandas/io/html.py b/contrib/python/pandas/py2/pandas/io/html.py
new file mode 100644
index 00000000000..74934740a69
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/io/html.py
@@ -0,0 +1,1094 @@
+""":mod:`pandas.io.html` is a module containing functionality for dealing with
+HTML IO.
+
+"""
+
+from distutils.version import LooseVersion
+import numbers
+import os
+import re
+
+import pandas.compat as compat
+from pandas.compat import (
+ binary_type, iteritems, lmap, lrange, raise_with_traceback, string_types,
+ u)
+from pandas.errors import AbstractMethodError, EmptyDataError
+
+from pandas.core.dtypes.common import is_list_like
+
+from pandas import Series
+
+from pandas.io.common import _is_url, _validate_header_arg, urlopen
+from pandas.io.formats.printing import pprint_thing
+from pandas.io.parsers import TextParser
+
+_IMPORTS = False
+_HAS_BS4 = False
+_HAS_LXML = False
+_HAS_HTML5LIB = False
+
+
+def _importers():
+ # import things we need
+ # but make this done on a first use basis
+
+ global _IMPORTS
+ if _IMPORTS:
+ return
+
+ global _HAS_BS4, _HAS_LXML, _HAS_HTML5LIB
+
+ try:
+ import bs4 # noqa
+ _HAS_BS4 = True
+ except ImportError:
+ pass
+
+ try:
+ import lxml # noqa
+ _HAS_LXML = True
+ except ImportError:
+ pass
+
+ try:
+ import html5lib # noqa
+ _HAS_HTML5LIB = True
+ except ImportError:
+ pass
+
+ _IMPORTS = True
+
+
+#############
+# READ HTML #
+#############
+_RE_WHITESPACE = re.compile(r'[\r\n]+|\s{2,}')
+
+
+char_types = string_types + (binary_type,)
+
+
+def _remove_whitespace(s, regex=_RE_WHITESPACE):
+ """Replace extra whitespace inside of a string with a single space.
+
+ Parameters
+ ----------
+ s : str or unicode
+ The string from which to remove extra whitespace.
+
+ regex : regex
+ The regular expression to use to remove extra whitespace.
+
+ Returns
+ -------
+ subd : str or unicode
+ `s` with all extra whitespace replaced with a single space.
+ """
+ return regex.sub(' ', s.strip())
+
+
+def _get_skiprows(skiprows):
+ """Get an iterator given an integer, slice or container.
+
+ Parameters
+ ----------
+ skiprows : int, slice, container
+ The iterator to use to skip rows; can also be a slice.
+
+ Raises
+ ------
+ TypeError
+ * If `skiprows` is not a slice, integer, or Container
+
+ Returns
+ -------
+ it : iterable
+ A proper iterator to use to skip rows of a DataFrame.
+ """
+ if isinstance(skiprows, slice):
+ return lrange(skiprows.start or 0, skiprows.stop, skiprows.step or 1)
+ elif isinstance(skiprows, numbers.Integral) or is_list_like(skiprows):
+ return skiprows
+ elif skiprows is None:
+ return 0
+ raise TypeError('%r is not a valid type for skipping rows' %
+ type(skiprows).__name__)
+
+
+def _read(obj):
+ """Try to read from a url, file or string.
+
+ Parameters
+ ----------
+ obj : str, unicode, or file-like
+
+ Returns
+ -------
+ raw_text : str
+ """
+ if _is_url(obj):
+ with urlopen(obj) as url:
+ text = url.read()
+ elif hasattr(obj, 'read'):
+ text = obj.read()
+ elif isinstance(obj, char_types):
+ text = obj
+ try:
+ if os.path.isfile(text):
+ with open(text, 'rb') as f:
+ return f.read()
+ except (TypeError, ValueError):
+ pass
+ else:
+ raise TypeError("Cannot read object of type %r" % type(obj).__name__)
+ return text
+
+
+class _HtmlFrameParser(object):
+ """Base class for parsers that parse HTML into DataFrames.
+
+ Parameters
+ ----------
+ io : str or file-like
+ This can be either a string of raw HTML, a valid URL using the HTTP,
+ FTP, or FILE protocols or a file-like object.
+
+ match : str or regex
+ The text to match in the document.
+
+ attrs : dict
+ List of HTML <table> element attributes to match.
+
+ encoding : str
+ Encoding to be used by parser
+
+ displayed_only : bool
+ Whether or not items with "display:none" should be ignored
+
+ .. versionadded:: 0.23.0
+
+ Attributes
+ ----------
+ io : str or file-like
+ raw HTML, URL, or file-like object
+
+ match : regex
+ The text to match in the raw HTML
+
+ attrs : dict-like
+ A dictionary of valid table attributes to use to search for table
+ elements.
+
+ encoding : str
+ Encoding to be used by parser
+
+ displayed_only : bool
+ Whether or not items with "display:none" should be ignored
+
+ .. versionadded:: 0.23.0
+
+ Notes
+ -----
+ To subclass this class effectively you must override the following methods:
+ * :func:`_build_doc`
+ * :func:`_attr_getter`
+ * :func:`_text_getter`
+ * :func:`_parse_td`
+ * :func:`_parse_thead_tr`
+ * :func:`_parse_tbody_tr`
+ * :func:`_parse_tfoot_tr`
+ * :func:`_parse_tables`
+ * :func:`_equals_tag`
+ See each method's respective documentation for details on their
+ functionality.
+ """
+
+ def __init__(self, io, match, attrs, encoding, displayed_only):
+ self.io = io
+ self.match = match
+ self.attrs = attrs
+ self.encoding = encoding
+ self.displayed_only = displayed_only
+
+ def parse_tables(self):
+ """
+ Parse and return all tables from the DOM.
+
+ Returns
+ -------
+ list of parsed (header, body, footer) tuples from tables.
+ """
+ tables = self._parse_tables(self._build_doc(), self.match, self.attrs)
+ return (self._parse_thead_tbody_tfoot(table) for table in tables)
+
+ def _attr_getter(self, obj, attr):
+ """
+ Return the attribute value of an individual DOM node.
+
+ Parameters
+ ----------
+ obj : node-like
+ A DOM node.
+
+ attr : str or unicode
+ The attribute, such as "colspan"
+
+ Returns
+ -------
+ str or unicode
+ The attribute value.
+ """
+ # Both lxml and BeautifulSoup have the same implementation:
+ return obj.get(attr)
+
+ def _text_getter(self, obj):
+ """
+ Return the text of an individual DOM node.
+
+ Parameters
+ ----------
+ obj : node-like
+ A DOM node.
+
+ Returns
+ -------
+ text : str or unicode
+ The text from an individual DOM node.
+ """
+ raise AbstractMethodError(self)
+
+ def _parse_td(self, obj):
+ """Return the td elements from a row element.
+
+ Parameters
+ ----------
+ obj : node-like
+ A DOM <tr> node.
+
+ Returns
+ -------
+ list of node-like
+ These are the elements of each row, i.e., the columns.
+ """
+ raise AbstractMethodError(self)
+
+ def _parse_thead_tr(self, table):
+ """
+ Return the list of thead row elements from the parsed table element.
+
+ Parameters
+ ----------
+ table : a table element that contains zero or more thead elements.
+
+ Returns
+ -------
+ list of node-like
+ These are the <tr> row elements of a table.
+ """
+ raise AbstractMethodError(self)
+
+ def _parse_tbody_tr(self, table):
+ """
+ Return the list of tbody row elements from the parsed table element.
+
+ HTML5 table bodies consist of either 0 or more <tbody> elements (which
+ only contain <tr> elements) or 0 or more <tr> elements. This method
+ checks for both structures.
+
+ Parameters
+ ----------
+ table : a table element that contains row elements.
+
+ Returns
+ -------
+ list of node-like
+ These are the <tr> row elements of a table.
+ """
+ raise AbstractMethodError(self)
+
+ def _parse_tfoot_tr(self, table):
+ """
+ Return the list of tfoot row elements from the parsed table element.
+
+ Parameters
+ ----------
+ table : a table element that contains row elements.
+
+ Returns
+ -------
+ list of node-like
+ These are the <tr> row elements of a table.
+ """
+ raise AbstractMethodError(self)
+
+ def _parse_tables(self, doc, match, attrs):
+ """
+ Return all tables from the parsed DOM.
+
+ Parameters
+ ----------
+ doc : the DOM from which to parse the table element.
+
+ match : str or regular expression
+ The text to search for in the DOM tree.
+
+ attrs : dict
+ A dictionary of table attributes that can be used to disambiguate
+ multiple tables on a page.
+
+ Raises
+ ------
+ ValueError : `match` does not match any text in the document.
+
+ Returns
+ -------
+ list of node-like
+ HTML <table> elements to be parsed into raw data.
+ """
+ raise AbstractMethodError(self)
+
+ def _equals_tag(self, obj, tag):
+ """
+ Return whether an individual DOM node matches a tag
+
+ Parameters
+ ----------
+ obj : node-like
+ A DOM node.
+
+ tag : str
+ Tag name to be checked for equality.
+
+ Returns
+ -------
+ boolean
+ Whether `obj`'s tag name is `tag`
+ """
+ raise AbstractMethodError(self)
+
+ def _build_doc(self):
+ """
+ Return a tree-like object that can be used to iterate over the DOM.
+
+ Returns
+ -------
+ node-like
+ The DOM from which to parse the table element.
+ """
+ raise AbstractMethodError(self)
+
+ def _parse_thead_tbody_tfoot(self, table_html):
+ """
+ Given a table, return parsed header, body, and foot.
+
+ Parameters
+ ----------
+ table_html : node-like
+
+ Returns
+ -------
+ tuple of (header, body, footer), each a list of list-of-text rows.
+
+ Notes
+ -----
+ Header and body are lists-of-lists. Top level list is a list of
+ rows. Each row is a list of str text.
+
+ Logic: Use <thead>, <tbody>, <tfoot> elements to identify
+ header, body, and footer, otherwise:
+ - Put all rows into body
+ - Move rows from top of body to header only if
+ all elements inside row are <th>
+ - Move rows from bottom of body to footer only if
+ all elements inside row are <th>
+ """
+
+ header_rows = self._parse_thead_tr(table_html)
+ body_rows = self._parse_tbody_tr(table_html)
+ footer_rows = self._parse_tfoot_tr(table_html)
+
+ def row_is_all_th(row):
+ return all(self._equals_tag(t, 'th') for t in
+ self._parse_td(row))
+
+ if not header_rows:
+ # The table has no <thead>. Move the top all-<th> rows from
+ # body_rows to header_rows. (This is a common case because many
+ # tables in the wild have no <thead> or <tfoot>.)
+ while body_rows and row_is_all_th(body_rows[0]):
+ header_rows.append(body_rows.pop(0))
+
+ header = self._expand_colspan_rowspan(header_rows)
+ body = self._expand_colspan_rowspan(body_rows)
+ footer = self._expand_colspan_rowspan(footer_rows)
+
+ return header, body, footer
+
+ def _expand_colspan_rowspan(self, rows):
+ """
+ Given a list of <tr>s, return a list of text rows.
+
+ Parameters
+ ----------
+ rows : list of node-like
+ List of <tr>s
+
+ Returns
+ -------
+ list of list
+ Each returned row is a list of str text.
+
+ Notes
+ -----
+ Any cell with ``rowspan`` or ``colspan`` will have its contents copied
+ to subsequent cells.
+ """
+
+ all_texts = [] # list of rows, each a list of str
+ remainder = [] # list of (index, text, nrows)
+
+ for tr in rows:
+ texts = [] # the output for this row
+ next_remainder = []
+
+ index = 0
+ tds = self._parse_td(tr)
+ for td in tds:
+ # Append texts from previous rows with rowspan>1 that come
+ # before this <td>
+ while remainder and remainder[0][0] <= index:
+ prev_i, prev_text, prev_rowspan = remainder.pop(0)
+ texts.append(prev_text)
+ if prev_rowspan > 1:
+ next_remainder.append((prev_i, prev_text,
+ prev_rowspan - 1))
+ index += 1
+
+ # Append the text from this <td>, colspan times
+ text = _remove_whitespace(self._text_getter(td))
+ rowspan = int(self._attr_getter(td, 'rowspan') or 1)
+ colspan = int(self._attr_getter(td, 'colspan') or 1)
+
+ for _ in range(colspan):
+ texts.append(text)
+ if rowspan > 1:
+ next_remainder.append((index, text, rowspan - 1))
+ index += 1
+
+ # Append texts from previous rows at the final position
+ for prev_i, prev_text, prev_rowspan in remainder:
+ texts.append(prev_text)
+ if prev_rowspan > 1:
+ next_remainder.append((prev_i, prev_text,
+ prev_rowspan - 1))
+
+ all_texts.append(texts)
+ remainder = next_remainder
+
+ # Append rows that only appear because the previous row had non-1
+ # rowspan
+ while remainder:
+ next_remainder = []
+ texts = []
+ for prev_i, prev_text, prev_rowspan in remainder:
+ texts.append(prev_text)
+ if prev_rowspan > 1:
+ next_remainder.append((prev_i, prev_text,
+ prev_rowspan - 1))
+ all_texts.append(texts)
+ remainder = next_remainder
+
+ return all_texts
+
+ def _handle_hidden_tables(self, tbl_list, attr_name):
+ """
+ Return list of tables, potentially removing hidden elements
+
+ Parameters
+ ----------
+ tbl_list : list of node-like
+ Type of list elements will vary depending upon parser used
+ attr_name : str
+ Name of the accessor for retrieving HTML attributes
+
+ Returns
+ -------
+ list of node-like
+ Return type matches `tbl_list`
+ """
+ if not self.displayed_only:
+ return tbl_list
+
+ return [x for x in tbl_list if "display:none" not in
+ getattr(x, attr_name).get('style', '').replace(" ", "")]
+
+
+class _BeautifulSoupHtml5LibFrameParser(_HtmlFrameParser):
+ """HTML to DataFrame parser that uses BeautifulSoup under the hood.
+
+ See Also
+ --------
+ pandas.io.html._HtmlFrameParser
+ pandas.io.html._LxmlFrameParser
+
+ Notes
+ -----
+ Documentation strings for this class are in the base class
+ :class:`pandas.io.html._HtmlFrameParser`.
+ """
+
+ def __init__(self, *args, **kwargs):
+ super(_BeautifulSoupHtml5LibFrameParser, self).__init__(*args,
+ **kwargs)
+ from bs4 import SoupStrainer
+ self._strainer = SoupStrainer('table')
+
+ def _parse_tables(self, doc, match, attrs):
+ element_name = self._strainer.name
+ tables = doc.find_all(element_name, attrs=attrs)
+
+ if not tables:
+ raise ValueError('No tables found')
+
+ result = []
+ unique_tables = set()
+ tables = self._handle_hidden_tables(tables, "attrs")
+
+ for table in tables:
+ if self.displayed_only:
+ for elem in table.find_all(
+ style=re.compile(r"display:\s*none")):
+ elem.decompose()
+
+ if (table not in unique_tables and
+ table.find(text=match) is not None):
+ result.append(table)
+ unique_tables.add(table)
+
+ if not result:
+ raise ValueError("No tables found matching pattern {patt!r}"
+ .format(patt=match.pattern))
+ return result
+
+ def _text_getter(self, obj):
+ return obj.text
+
+ def _equals_tag(self, obj, tag):
+ return obj.name == tag
+
+ def _parse_td(self, row):
+ return row.find_all(('td', 'th'), recursive=False)
+
+ def _parse_thead_tr(self, table):
+ return table.select('thead tr')
+
+ def _parse_tbody_tr(self, table):
+ from_tbody = table.select('tbody tr')
+ from_root = table.find_all('tr', recursive=False)
+ # HTML spec: at most one of these lists has content
+ return from_tbody + from_root
+
+ def _parse_tfoot_tr(self, table):
+ return table.select('tfoot tr')
+
+ def _setup_build_doc(self):
+ raw_text = _read(self.io)
+ if not raw_text:
+ raise ValueError('No text parsed from document: {doc}'
+ .format(doc=self.io))
+ return raw_text
+
+ def _build_doc(self):
+ from bs4 import BeautifulSoup
+ return BeautifulSoup(self._setup_build_doc(), features='html5lib',
+ from_encoding=self.encoding)
+
+
+def _build_xpath_expr(attrs):
+ """Build an xpath expression to simulate bs4's ability to pass in kwargs to
+ search for attributes when using the lxml parser.
+
+ Parameters
+ ----------
+ attrs : dict
+ A dict of HTML attributes. These are NOT checked for validity.
+
+ Returns
+ -------
+ expr : unicode
+ An XPath expression that checks for the given HTML attributes.
+ """
+ # give class attribute as class_ because class is a python keyword
+ if 'class_' in attrs:
+ attrs['class'] = attrs.pop('class_')
+
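+ # e.g. {'id': 'main_table'} -> "[@id='main_table']" (illustrative)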
+ s = [u("@{key}={val!r}").format(key=k, val=v) for k, v in iteritems(attrs)]
+ return u('[{expr}]').format(expr=' and '.join(s))
+
+
+_re_namespace = {'re': 'http://exslt.org/regular-expressions'}
+_valid_schemes = 'http', 'file', 'ftp'
+
+
+class _LxmlFrameParser(_HtmlFrameParser):
+ """HTML to DataFrame parser that uses lxml under the hood.
+
+ Warning
+ -------
+ This parser can only handle HTTP, FTP, and FILE urls.
+
+ See Also
+ --------
+ _HtmlFrameParser
+ _BeautifulSoupHtml5LibFrameParser
+
+ Notes
+ -----
+ Documentation strings for this class are in the base class
+ :class:`_HtmlFrameParser`.
+ """
+
+ def __init__(self, *args, **kwargs):
+ super(_LxmlFrameParser, self).__init__(*args, **kwargs)
+
+ def _text_getter(self, obj):
+ return obj.text_content()
+
+ def _parse_td(self, row):
+ # Look for direct children only: the "row" element here may be a
+ # <thead> or <tfoot> (see _parse_thead_tr).
+ return row.xpath('./td|./th')
+
+ def _parse_tables(self, doc, match, kwargs):
+ pattern = match.pattern
+
+ # 1. check all descendants for the given pattern and only search tables
+ # 2. go up the tree until we find a table
+ query = '//table//*[re:test(text(), {patt!r})]/ancestor::table'
+ xpath_expr = u(query).format(patt=pattern)
+
+ # if any table attributes were given build an xpath expression to
+ # search for them
+ if kwargs:
+ xpath_expr += _build_xpath_expr(kwargs)
+
+ tables = doc.xpath(xpath_expr, namespaces=_re_namespace)
+
+ tables = self._handle_hidden_tables(tables, "attrib")
+ if self.displayed_only:
+ for table in tables:
+ # lxml utilizes XPATH 1.0 which does not have regex
+ # support. As a result, we find all elements with a style
+ # attribute and iterate them to check for display:none
+ for elem in table.xpath('.//*[@style]'):
+ if "display:none" in elem.attrib.get(
+ "style", "").replace(" ", ""):
+ elem.getparent().remove(elem)
+
+ if not tables:
+ raise ValueError("No tables found matching regex {patt!r}"
+ .format(patt=pattern))
+ return tables
+
+ def _equals_tag(self, obj, tag):
+ return obj.tag == tag
+
+ def _build_doc(self):
+ """
+ Raises
+ ------
+ ValueError
+ * If a URL that lxml cannot parse is passed.
+
+ Exception
+ * Any other ``Exception`` thrown. For example, trying to parse a
+ URL that is syntactically correct on a machine with no internet
+ connection will fail.
+
+ See Also
+ --------
+ pandas.io.html._HtmlFrameParser._build_doc
+ """
+ from lxml.html import parse, fromstring, HTMLParser
+ from lxml.etree import XMLSyntaxError
+ parser = HTMLParser(recover=True, encoding=self.encoding)
+
+ try:
+ if _is_url(self.io):
+ with urlopen(self.io) as f:
+ r = parse(f, parser=parser)
+ else:
+ # try to parse the input in the simplest way
+ r = parse(self.io, parser=parser)
+ try:
+ r = r.getroot()
+ except AttributeError:
+ pass
+ except (UnicodeDecodeError, IOError) as e:
+ # if the input is a blob of html goop
+ if not _is_url(self.io):
+ r = fromstring(self.io, parser=parser)
+
+ try:
+ r = r.getroot()
+ except AttributeError:
+ pass
+ else:
+ raise e
+ else:
+ if not hasattr(r, 'text_content'):
+ raise XMLSyntaxError("no text parsed from document", 0, 0, 0)
+ return r
+
+ def _parse_thead_tr(self, table):
+ rows = []
+
+ for thead in table.xpath('.//thead'):
+ rows.extend(thead.xpath('./tr'))
+
+ # HACK: lxml does not clean up the clearly-erroneous
+ # <thead><th>foo</th><th>bar</th></thead>. (Missing <tr>). Add
+ # the <thead> and _pretend_ it's a <tr>; _parse_td() will find its
+ # children as though it's a <tr>.
+ #
+ # Better solution would be to use html5lib.
+ elements_at_root = thead.xpath('./td|./th')
+ if elements_at_root:
+ rows.append(thead)
+
+ return rows
+
+ def _parse_tbody_tr(self, table):
+ from_tbody = table.xpath('.//tbody//tr')
+ from_root = table.xpath('./tr')
+ # HTML spec: at most one of these lists has content
+ return from_tbody + from_root
+
+ def _parse_tfoot_tr(self, table):
+ return table.xpath('.//tfoot//tr')
+
+
+def _expand_elements(body):
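+ # Pad "ragged" rows with empty strings so every row has the same length,
+ # e.g. [['a', 'b', 'c'], ['d']] -> [['a', 'b', 'c'], ['d', '', '']].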
+ lens = Series(lmap(len, body))
+ lens_max = lens.max()
+ not_max = lens[lens != lens_max]
+
+ empty = ['']
+ for ind, length in iteritems(not_max):
+ body[ind] += empty * (lens_max - length)
+
+
+def _data_to_frame(**kwargs):
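+ # `data` is one (header rows, body rows, footer rows) triple as produced by
+ # the parser's parse_tables() for a single <table>.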
+ head, body, foot = kwargs.pop('data')
+ header = kwargs.pop('header')
+ kwargs['skiprows'] = _get_skiprows(kwargs['skiprows'])
+ if head:
+ body = head + body
+
+ # Infer header when there is a <thead> or top <th>-only rows
+ if header is None:
+ if len(head) == 1:
+ header = 0
+ else:
+ # ignore all-empty-text rows
+ header = [i for i, row in enumerate(head)
+ if any(text for text in row)]
+
+ if foot:
+ body += foot
+
+ # fill out elements of body that are "ragged"
+ _expand_elements(body)
+ tp = TextParser(body, header=header, **kwargs)
+ df = tp.read()
+ return df
+
+
+_valid_parsers = {'lxml': _LxmlFrameParser, None: _LxmlFrameParser,
+ 'html5lib': _BeautifulSoupHtml5LibFrameParser,
+ 'bs4': _BeautifulSoupHtml5LibFrameParser}
+
+
+def _parser_dispatch(flavor):
+ """Choose the parser based on the input flavor.
+
+ Parameters
+ ----------
+ flavor : str
+ The type of parser to use. This must be a valid backend.
+
+ Returns
+ -------
+ cls : _HtmlFrameParser subclass
+ The parser class based on the requested input flavor.
+
+ Raises
+ ------
+ ValueError
+ * If `flavor` is not a valid backend.
+ ImportError
+ * If you do not have the requested `flavor`
+ """
+ valid_parsers = list(_valid_parsers.keys())
+ if flavor not in valid_parsers:
+ raise ValueError('{invalid!r} is not a valid flavor, valid flavors '
+ 'are {valid}'
+ .format(invalid=flavor, valid=valid_parsers))
+
+ if flavor in ('bs4', 'html5lib'):
+ if not _HAS_HTML5LIB:
+ raise ImportError("html5lib not found, please install it")
+ if not _HAS_BS4:
+ raise ImportError(
+ "BeautifulSoup4 (bs4) not found, please install it")
+ import bs4
+ if LooseVersion(bs4.__version__) <= LooseVersion('4.2.0'):
+ raise ValueError("A minimum version of BeautifulSoup 4.2.1 "
+ "is required")
+
+ else:
+ if not _HAS_LXML:
+ raise ImportError("lxml not found, please install it")
+ return _valid_parsers[flavor]
+
+
+def _print_as_set(s):
+ return ('{' + '{arg}'.format(arg=', '.join(
+ pprint_thing(el) for el in s)) + '}')
+
+
+def _validate_flavor(flavor):
+ if flavor is None:
+ flavor = 'lxml', 'bs4'
+ elif isinstance(flavor, string_types):
+ flavor = flavor,
+ elif isinstance(flavor, compat.Iterable):
+ if not all(isinstance(flav, string_types) for flav in flavor):
+ raise TypeError('Object of type {typ!r} is not an iterable of '
+ 'strings'
+ .format(typ=type(flavor).__name__))
+ else:
+ fmt = '{flavor!r}' if isinstance(flavor, string_types) else '{flavor}'
+ fmt += ' is not a valid flavor'
+ raise ValueError(fmt.format(flavor=flavor))
+
+ flavor = tuple(flavor)
+ valid_flavors = set(_valid_parsers)
+ flavor_set = set(flavor)
+
+ if not flavor_set & valid_flavors:
+ raise ValueError('{invalid} is not a valid set of flavors, valid '
+ 'flavors are {valid}'
+ .format(invalid=_print_as_set(flavor_set),
+ valid=_print_as_set(valid_flavors)))
+ return flavor
+
+
+def _parse(flavor, io, match, attrs, encoding, displayed_only, **kwargs):
+ flavor = _validate_flavor(flavor)
+ compiled_match = re.compile(match) # you can pass a compiled regex here
+
+ # hack around python 3 deleting the exception variable
+ retained = None
+ for flav in flavor:
+ parser = _parser_dispatch(flav)
+ p = parser(io, compiled_match, attrs, encoding, displayed_only)
+
+ try:
+ tables = p.parse_tables()
+ except Exception as caught:
+ # if `io` is an io-like object, check if it's seekable
+ # and try to rewind it before trying the next parser
+ if hasattr(io, 'seekable') and io.seekable():
+ io.seek(0)
+ elif hasattr(io, 'seekable') and not io.seekable():
+ # if we couldn't rewind it, let the user know
+ raise ValueError('The flavor {} failed to parse your input. '
+ 'Since you passed a non-rewindable file '
+ 'object, we can\'t rewind it to try '
+ 'another parser. Try read_html() with a '
+ 'different flavor.'.format(flav))
+
+ retained = caught
+ else:
+ break
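+ # for/else: the else branch runs only if no flavor succeeded (the loop
+ # never hit `break`), in which case the last captured error is re-raised.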
+ else:
+ raise_with_traceback(retained)
+
+ ret = []
+ for table in tables:
+ try:
+ ret.append(_data_to_frame(data=table, **kwargs))
+ except EmptyDataError: # empty table
+ continue
+ return ret
+
+
+def read_html(io, match='.+', flavor=None, header=None, index_col=None,
+ skiprows=None, attrs=None, parse_dates=False,
+ tupleize_cols=None, thousands=',', encoding=None,
+ decimal='.', converters=None, na_values=None,
+ keep_default_na=True, displayed_only=True):
+ r"""Read HTML tables into a ``list`` of ``DataFrame`` objects.
+
+ Parameters
+ ----------
+ io : str or file-like
+ A URL, a file-like object, or a raw string containing HTML. Note that
+ lxml only accepts the http, ftp and file url protocols. If you have a
+ URL that starts with ``'https'`` you might try removing the ``'s'``.
+
+ match : str or compiled regular expression, optional
+ The set of tables containing text matching this regex or string will be
+ returned. Unless the HTML is extremely simple you will probably need to
+ pass a non-empty string here. Defaults to '.+' (match any non-empty
+ string). The default value will return all tables contained on a page.
+ This value is converted to a regular expression so that there is
+ consistent behavior between Beautiful Soup and lxml.
+
+ flavor : str or None, container of strings
+ The parsing engine to use. 'bs4' and 'html5lib' are synonymous with
+ each other, they are both there for backwards compatibility. The
+ default of ``None`` tries to use ``lxml`` to parse and if that fails it
+ falls back on ``bs4`` + ``html5lib``.
+
+ header : int or list-like or None, optional
+ The row (or list of rows for a :class:`~pandas.MultiIndex`) to use to
+ make the columns headers.
+
+ index_col : int or list-like or None, optional
+ The column (or list of columns) to use to create the index.
+
+ skiprows : int or list-like or slice or None, optional
+ 0-based. Number of rows to skip after parsing the column names. If a
+ sequence of integers or a slice is given, will skip the rows indexed by
+ that sequence. Note that a single element sequence means 'skip the nth
+ row' whereas an integer means 'skip n rows'.
+
+ attrs : dict or None, optional
+ This is a dictionary of attributes that you can pass to use to identify
+ the table in the HTML. These are not checked for validity before being
+ passed to lxml or Beautiful Soup. However, these attributes must be
+ valid HTML table attributes to work correctly. For example, ::
+
+ attrs = {'id': 'table'}
+
+ is a valid attribute dictionary because the 'id' HTML tag attribute is
+ a valid HTML attribute for *any* HTML tag as per `this document
+ <http://www.w3.org/TR/html-markup/global-attributes.html>`__. ::
+
+ attrs = {'asdf': 'table'}
+
+ is *not* a valid attribute dictionary because 'asdf' is not a valid
+ HTML attribute even if it is a valid XML attribute. Valid HTML 4.01
+ table attributes can be found `here
+ <http://www.w3.org/TR/REC-html40/struct/tables.html#h-11.2>`__. A
+ working draft of the HTML 5 spec can be found `here
+ <http://www.w3.org/TR/html-markup/table.html>`__. It contains the
+ latest information on table attributes for the modern web.
+
+ parse_dates : bool, optional
+ See :func:`~pandas.read_csv` for more details.
+
+ tupleize_cols : bool, optional
+ If ``False`` try to parse multiple header rows into a
+ :class:`~pandas.MultiIndex`, otherwise return raw tuples. Defaults to
+ ``False``.
+
+ .. deprecated:: 0.21.0
+ This argument will be removed and will always convert to MultiIndex
+
+ thousands : str, optional
+ Separator to use to parse thousands. Defaults to ``','``.
+
+ encoding : str or None, optional
+ The encoding used to decode the web page. Defaults to ``None``. ``None``
+ preserves the previous encoding behavior, which depends on the
+ underlying parser library (e.g., the parser library will try to use
+ the encoding provided by the document).
+
+ decimal : str, default '.'
+ Character to recognize as decimal point (e.g. use ',' for European
+ data).
+
+ .. versionadded:: 0.19.0
+
+ converters : dict, default None
+ Dict of functions for converting values in certain columns. Keys can
+ either be integers or column labels, values are functions that take one
+ input argument, the cell (not column) content, and return the
+ transformed content.
+
+ .. versionadded:: 0.19.0
+
+ na_values : iterable, default None
+ Custom NA values
+
+ .. versionadded:: 0.19.0
+
+ keep_default_na : bool, default True
+ If na_values are specified and keep_default_na is False the default NaN
+ values are overridden, otherwise the specified values are appended to
+ the defaults.
+
+ .. versionadded:: 0.19.0
+
+ displayed_only : bool, default True
+ Whether elements with "display: none" should be parsed
+
+ .. versionadded:: 0.23.0
+
+ Returns
+ -------
+ dfs : list of DataFrames
+
+ See Also
+ --------
+ pandas.read_csv
+
+ Notes
+ -----
+ Before using this function you should read the :ref:`gotchas about the
+ HTML parsing libraries <io.html.gotchas>`.
+
+ Expect to do some cleanup after you call this function. For example, you
+ might need to manually assign column names if the column names are
+ converted to NaN when you pass the `header=0` argument. We try to assume as
+ little as possible about the structure of the table and push the
+ idiosyncrasies of the HTML contained in the table to the user.
+
+ This function searches for ``<table>`` elements and only for ``<tr>``
+ and ``<th>`` rows and ``<td>`` elements within each ``<tr>`` or ``<th>``
+ element in the table. ``<td>`` stands for "table data". This function
+ attempts to properly handle ``colspan`` and ``rowspan`` attributes.
+ If the table has a ``<thead>``, it is used to construct
+ the header, otherwise the function attempts to find the header within
+ the body (by putting rows with only ``<th>`` elements into the header).
+
+ .. versionadded:: 0.21.0
+
+ Similar to :func:`~pandas.read_csv` the `header` argument is applied
+ **after** `skiprows` is applied.
+
+ This function will *always* return a list of :class:`DataFrame` *or*
+ it will fail, e.g., it will *not* return an empty list.
+
+ Examples
+ --------
+ See the :ref:`read_html documentation in the IO section of the docs
+ <io.read_html>` for some examples of reading in HTML tables.
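+
+ A minimal illustrative call (placeholder URL and match pattern)::
+
+ dfs = pd.read_html('http://example.com/tables.html', match='Revenue')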
+ """
+ _importers()
+
+ # Type check here. We don't want to parse only to fail because of an
+ # invalid value of an integer skiprows.
+ if isinstance(skiprows, numbers.Integral) and skiprows < 0:
+ raise ValueError('cannot skip rows starting from the end of the '
+ 'data (you passed a negative value)')
+ _validate_header_arg(header)
+ return _parse(flavor=flavor, io=io, match=match, header=header,
+ index_col=index_col, skiprows=skiprows,
+ parse_dates=parse_dates, tupleize_cols=tupleize_cols,
+ thousands=thousands, attrs=attrs, encoding=encoding,
+ decimal=decimal, converters=converters, na_values=na_values,
+ keep_default_na=keep_default_na,
+ displayed_only=displayed_only)
diff --git a/contrib/python/pandas/py2/pandas/io/json/__init__.py b/contrib/python/pandas/py2/pandas/io/json/__init__.py
new file mode 100644
index 00000000000..32d110b3404
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/io/json/__init__.py
@@ -0,0 +1,5 @@
+from .json import to_json, read_json, loads, dumps # noqa
+from .normalize import json_normalize # noqa
+from .table_schema import build_table_schema # noqa
+
+del json, normalize, table_schema # noqa
diff --git a/contrib/python/pandas/py2/pandas/io/json/json.py b/contrib/python/pandas/py2/pandas/io/json/json.py
new file mode 100644
index 00000000000..4bbccc8339d
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/io/json/json.py
@@ -0,0 +1,951 @@
+# pylint: disable-msg=E1101,W0613,W0603
+from itertools import islice
+import os
+
+import numpy as np
+
+import pandas._libs.json as json
+from pandas._libs.tslibs import iNaT
+from pandas.compat import StringIO, long, to_str, u
+from pandas.errors import AbstractMethodError
+
+from pandas.core.dtypes.common import is_period_dtype
+
+from pandas import DataFrame, MultiIndex, Series, compat, isna, to_datetime
+from pandas.core.reshape.concat import concat
+
+from pandas.io.common import (
+ BaseIterator, _get_handle, _infer_compression, _stringify_path,
+ get_filepath_or_buffer)
+from pandas.io.formats.printing import pprint_thing
+from pandas.io.parsers import _validate_integer
+
+from .normalize import _convert_to_line_delimits
+from .table_schema import build_table_schema, parse_table_schema
+
+loads = json.loads
+dumps = json.dumps
+
+TABLE_SCHEMA_VERSION = '0.20.0'
+
+
+# interface to/from
+def to_json(path_or_buf, obj, orient=None, date_format='epoch',
+ double_precision=10, force_ascii=True, date_unit='ms',
+ default_handler=None, lines=False, compression='infer',
+ index=True):
+
+ if not index and orient not in ['split', 'table']:
+ raise ValueError("'index=False' is only valid when 'orient' is "
+ "'split' or 'table'")
+
+ path_or_buf = _stringify_path(path_or_buf)
+ if lines and orient != 'records':
+ raise ValueError(
+ "'lines' keyword only valid when 'orient' is records")
+
+ if orient == 'table' and isinstance(obj, Series):
+ obj = obj.to_frame(name=obj.name or 'values')
+ if orient == 'table' and isinstance(obj, DataFrame):
+ writer = JSONTableWriter
+ elif isinstance(obj, Series):
+ writer = SeriesWriter
+ elif isinstance(obj, DataFrame):
+ writer = FrameWriter
+ else:
+ raise NotImplementedError("'obj' should be a Series or a DataFrame")
+
+ s = writer(
+ obj, orient=orient, date_format=date_format,
+ double_precision=double_precision, ensure_ascii=force_ascii,
+ date_unit=date_unit, default_handler=default_handler,
+ index=index).write()
+
+ if lines:
+ s = _convert_to_line_delimits(s)
+
+ if isinstance(path_or_buf, compat.string_types):
+ fh, handles = _get_handle(path_or_buf, 'w', compression=compression)
+ try:
+ fh.write(s)
+ finally:
+ fh.close()
+ elif path_or_buf is None:
+ return s
+ else:
+ path_or_buf.write(s)
+
+
+class Writer(object):
+ def __init__(self, obj, orient, date_format, double_precision,
+ ensure_ascii, date_unit, index, default_handler=None):
+ self.obj = obj
+
+ if orient is None:
+ orient = self._default_orient
+
+ self.orient = orient
+ self.date_format = date_format
+ self.double_precision = double_precision
+ self.ensure_ascii = ensure_ascii
+ self.date_unit = date_unit
+ self.default_handler = default_handler
+ self.index = index
+
+ self.is_copy = None
+ self._format_axes()
+
+ def _format_axes(self):
+ raise AbstractMethodError(self)
+
+ def write(self):
+ return self._write(self.obj, self.orient, self.double_precision,
+ self.ensure_ascii, self.date_unit,
+ self.date_format == 'iso', self.default_handler)
+
+ def _write(self, obj, orient, double_precision, ensure_ascii,
+ date_unit, iso_dates, default_handler):
+ return dumps(
+ obj,
+ orient=orient,
+ double_precision=double_precision,
+ ensure_ascii=ensure_ascii,
+ date_unit=date_unit,
+ iso_dates=iso_dates,
+ default_handler=default_handler
+ )
+
+
+class SeriesWriter(Writer):
+ _default_orient = 'index'
+
+ def _format_axes(self):
+ if not self.obj.index.is_unique and self.orient == 'index':
+ raise ValueError("Series index must be unique for orient="
+ "'{orient}'".format(orient=self.orient))
+
+ def _write(self, obj, orient, double_precision, ensure_ascii,
+ date_unit, iso_dates, default_handler):
+ if not self.index and orient == 'split':
+ obj = {"name": obj.name, "data": obj.values}
+ return super(SeriesWriter, self)._write(obj, orient,
+ double_precision,
+ ensure_ascii, date_unit,
+ iso_dates, default_handler)
+
+
+class FrameWriter(Writer):
+ _default_orient = 'columns'
+
+ def _format_axes(self):
+ """
+ Validate that the axes are unique for the requested orient.
+ """
+ if not self.obj.index.is_unique and self.orient in (
+ 'index', 'columns'):
+ raise ValueError("DataFrame index must be unique for orient="
+ "'{orient}'.".format(orient=self.orient))
+ if not self.obj.columns.is_unique and self.orient in (
+ 'index', 'columns', 'records'):
+ raise ValueError("DataFrame columns must be unique for orient="
+ "'{orient}'.".format(orient=self.orient))
+
+ def _write(self, obj, orient, double_precision, ensure_ascii,
+ date_unit, iso_dates, default_handler):
+ if not self.index and orient == 'split':
+ obj = obj.to_dict(orient='split')
+ del obj["index"]
+ return super(FrameWriter, self)._write(obj, orient,
+ double_precision,
+ ensure_ascii, date_unit,
+ iso_dates, default_handler)
+
+
+class JSONTableWriter(FrameWriter):
+ _default_orient = 'records'
+
+ def __init__(self, obj, orient, date_format, double_precision,
+ ensure_ascii, date_unit, index, default_handler=None):
+ """
+ Adds a `schema` attribute with the Table Schema, resets
+ the index (can't do in caller, because the schema inference needs
+ to know what the index is), forces orient to 'records', and forces
+ date_format to 'iso'.
+ """
+ super(JSONTableWriter, self).__init__(
+ obj, orient, date_format, double_precision, ensure_ascii,
+ date_unit, index, default_handler=default_handler)
+
+ if date_format != 'iso':
+ msg = ("Trying to write with `orient='table'` and "
+ "`date_format='{fmt}'`. Table Schema requires dates "
+ "to be formatted with `date_format='iso'`"
+ .format(fmt=date_format))
+ raise ValueError(msg)
+
+ self.schema = build_table_schema(obj, index=self.index)
+
+ # Not implemented on a column MultiIndex
+ if obj.ndim == 2 and isinstance(obj.columns, MultiIndex):
+ raise NotImplementedError(
+ "orient='table' is not supported for MultiIndex")
+
+ # TODO: Do this timedelta properly in objToJSON.c See GH #15137
+ if ((obj.ndim == 1) and (obj.name in set(obj.index.names)) or
+ len(obj.columns & obj.index.names)):
+ msg = "Overlapping names between the index and columns"
+ raise ValueError(msg)
+
+ obj = obj.copy()
+ timedeltas = obj.select_dtypes(include=['timedelta']).columns
+ if len(timedeltas):
+ obj[timedeltas] = obj[timedeltas].applymap(
+ lambda x: x.isoformat())
+ # Convert PeriodIndex to datetimes before serializing
+ if is_period_dtype(obj.index):
+ obj.index = obj.index.to_timestamp()
+
+ # exclude index from obj if index=False
+ if not self.index:
+ self.obj = obj.reset_index(drop=True)
+ else:
+ self.obj = obj.reset_index(drop=False)
+ self.date_format = 'iso'
+ self.orient = 'records'
+ self.index = index
+
+ def _write(self, obj, orient, double_precision, ensure_ascii,
+ date_unit, iso_dates, default_handler):
+ data = super(JSONTableWriter, self)._write(obj, orient,
+ double_precision,
+ ensure_ascii, date_unit,
+ iso_dates,
+ default_handler)
+ serialized = '{{"schema": {schema}, "data": {data}}}'.format(
+ schema=dumps(self.schema), data=data)
+ return serialized
+
+
+def read_json(path_or_buf=None, orient=None, typ='frame', dtype=True,
+ convert_axes=True, convert_dates=True, keep_default_dates=True,
+ numpy=False, precise_float=False, date_unit=None, encoding=None,
+ lines=False, chunksize=None, compression='infer'):
+ """
+ Convert a JSON string to pandas object.
+
+ Parameters
+ ----------
+ path_or_buf : a valid JSON string or file-like, default: None
+ The string could be a URL. Valid URL schemes include http, ftp, s3,
+ gcs, and file. For file URLs, a host is expected. For instance, a local
+ file could be ``file://localhost/path/to/table.json``
+
+ orient : string
+ Indication of expected JSON string format.
+ Compatible JSON strings can be produced by ``to_json()`` with a
+ corresponding orient value.
+ The set of possible orients is:
+
+ - ``'split'`` : dict like
+ ``{index -> [index], columns -> [columns], data -> [values]}``
+ - ``'records'`` : list like
+ ``[{column -> value}, ... , {column -> value}]``
+ - ``'index'`` : dict like ``{index -> {column -> value}}``
+ - ``'columns'`` : dict like ``{column -> {index -> value}}``
+ - ``'values'`` : just the values array
+
+ The allowed and default values depend on the value
+ of the `typ` parameter.
+
+ * when ``typ == 'series'``,
+
+ - allowed orients are ``{'split','records','index'}``
+ - default is ``'index'``
+ - The Series index must be unique for orient ``'index'``.
+
+ * when ``typ == 'frame'``,
+
+ - allowed orients are ``{'split','records','index',
+ 'columns','values', 'table'}``
+ - default is ``'columns'``
+ - The DataFrame index must be unique for orients ``'index'`` and
+ ``'columns'``.
+ - The DataFrame columns must be unique for orients ``'index'``,
+ ``'columns'``, and ``'records'``.
+
+ .. versionadded:: 0.23.0
+ 'table' as an allowed value for the ``orient`` argument
+
+ typ : type of object to recover (series or frame), default 'frame'
+ dtype : boolean or dict, default True
+ If True, infer dtypes; if a dict of column to dtype, then use those;
+ if False, then don't infer dtypes at all. Applies only to the data.
+ convert_axes : boolean, default True
+ Try to convert the axes to the proper dtypes.
+ convert_dates : boolean, default True
+ If a list, the columns to parse for dates; if True (the default), try
+ to parse datelike columns. A column label is datelike if
+
+ * it ends with ``'_at'``,
+
+ * it ends with ``'_time'``,
+
+ * it begins with ``'timestamp'``,
+
+ * it is ``'modified'``, or
+
+ * it is ``'date'``
+
+ keep_default_dates : boolean, default True
+ If parsing dates, then parse the default datelike columns
+ numpy : boolean, default False
+ Direct decoding to numpy arrays. Supports numeric data only, but
+ non-numeric column and index labels are supported. Note also that the
+ JSON ordering MUST be the same for each term if numpy=True.
+ precise_float : boolean, default False
+ Set to enable usage of higher precision (strtod) function when
+ decoding string to double values. Default (False) is to use fast but
+ less precise builtin functionality
+ date_unit : string, default None
+ The timestamp unit to detect if converting dates. The default behaviour
+ is to try and detect the correct precision, but if this is not desired
+ then pass one of 's', 'ms', 'us' or 'ns' to force parsing only seconds,
+ milliseconds, microseconds or nanoseconds respectively.
+ encoding : str, default is 'utf-8'
+ The encoding to use to decode py3 bytes.
+
+ .. versionadded:: 0.19.0
+
+ lines : boolean, default False
+ Read the file as a json object per line.
+
+ .. versionadded:: 0.19.0
+
+ chunksize : integer, default None
+ Return JsonReader object for iteration.
+ See the `line-delimited json docs
+ <http://pandas.pydata.org/pandas-docs/stable/io.html#io-jsonl>`_
+ for more information on ``chunksize``.
+ This can only be passed if `lines=True`.
+ If this is None, the file will be read into memory all at once.
+
+ .. versionadded:: 0.21.0
+
+ compression : {'infer', 'gzip', 'bz2', 'zip', 'xz', None}, default 'infer'
+ For on-the-fly decompression of on-disk data. If 'infer', then use
+ gzip, bz2, zip or xz if path_or_buf is a string ending in
+ '.gz', '.bz2', '.zip', or '.xz', respectively, and no decompression
+ otherwise. If using 'zip', the ZIP file must contain only one data
+ file to be read in. Set to None for no decompression.
+
+ .. versionadded:: 0.21.0
+
+ Returns
+ -------
+ result : Series or DataFrame, depending on the value of `typ`.
+
+ See Also
+ --------
+ DataFrame.to_json
+
+ Notes
+ -----
+ Specific to ``orient='table'``, if a :class:`DataFrame` with a literal
+ :class:`Index` name of `index` gets written with :func:`to_json`, the
+ subsequent read operation will incorrectly set the :class:`Index` name to
+ ``None``. This is because `index` is also used by :func:`DataFrame.to_json`
+ to denote a missing :class:`Index` name, and the subsequent
+ :func:`read_json` operation cannot distinguish between the two. The same
+ limitation is encountered with a :class:`MultiIndex` and any names
+ beginning with ``'level_'``.
+
+ Examples
+ --------
+
+ >>> df = pd.DataFrame([['a', 'b'], ['c', 'd']],
+ ... index=['row 1', 'row 2'],
+ ... columns=['col 1', 'col 2'])
+
+ Encoding/decoding a Dataframe using ``'split'`` formatted JSON:
+
+ >>> df.to_json(orient='split')
+ '{"columns":["col 1","col 2"],
+ "index":["row 1","row 2"],
+ "data":[["a","b"],["c","d"]]}'
+ >>> pd.read_json(_, orient='split')
+ col 1 col 2
+ row 1 a b
+ row 2 c d
+
+ Encoding/decoding a Dataframe using ``'index'`` formatted JSON:
+
+ >>> df.to_json(orient='index')
+ '{"row 1":{"col 1":"a","col 2":"b"},"row 2":{"col 1":"c","col 2":"d"}}'
+ >>> pd.read_json(_, orient='index')
+ col 1 col 2
+ row 1 a b
+ row 2 c d
+
+ Encoding/decoding a Dataframe using ``'records'`` formatted JSON.
+ Note that index labels are not preserved with this encoding.
+
+ >>> df.to_json(orient='records')
+ '[{"col 1":"a","col 2":"b"},{"col 1":"c","col 2":"d"}]'
+ >>> pd.read_json(_, orient='records')
+ col 1 col 2
+ 0 a b
+ 1 c d
+
+ Encoding with Table Schema
+
+ >>> df.to_json(orient='table')
+ '{"schema": {"fields": [{"name": "index", "type": "string"},
+ {"name": "col 1", "type": "string"},
+ {"name": "col 2", "type": "string"}],
+ "primaryKey": "index",
+ "pandas_version": "0.20.0"},
+ "data": [{"index": "row 1", "col 1": "a", "col 2": "b"},
+ {"index": "row 2", "col 1": "c", "col 2": "d"}]}'
+ """
+
+ compression = _infer_compression(path_or_buf, compression)
+ filepath_or_buffer, _, compression, should_close = get_filepath_or_buffer(
+ path_or_buf, encoding=encoding, compression=compression,
+ )
+
+ json_reader = JsonReader(
+ filepath_or_buffer, orient=orient, typ=typ, dtype=dtype,
+ convert_axes=convert_axes, convert_dates=convert_dates,
+ keep_default_dates=keep_default_dates, numpy=numpy,
+ precise_float=precise_float, date_unit=date_unit, encoding=encoding,
+ lines=lines, chunksize=chunksize, compression=compression,
+ )
+
+ if chunksize:
+ return json_reader
+
+ result = json_reader.read()
+ if should_close:
+ try:
+ filepath_or_buffer.close()
+ except: # noqa: flake8
+ pass
+ return result
+
+
+class JsonReader(BaseIterator):
+ """
+ JsonReader provides an interface for reading in a JSON file.
+
+ If initialized with ``lines=True`` and ``chunksize``, can be iterated over
+ ``chunksize`` lines at a time. Otherwise, calling ``read`` reads in the
+ whole document.
+ """
+ def __init__(self, filepath_or_buffer, orient, typ, dtype, convert_axes,
+ convert_dates, keep_default_dates, numpy, precise_float,
+ date_unit, encoding, lines, chunksize, compression):
+
+ self.path_or_buf = filepath_or_buffer
+ self.orient = orient
+ self.typ = typ
+ self.dtype = dtype
+ self.convert_axes = convert_axes
+ self.convert_dates = convert_dates
+ self.keep_default_dates = keep_default_dates
+ self.numpy = numpy
+ self.precise_float = precise_float
+ self.date_unit = date_unit
+ self.encoding = encoding
+ self.compression = compression
+ self.lines = lines
+ self.chunksize = chunksize
+ self.nrows_seen = 0
+ self.should_close = False
+
+ if self.chunksize is not None:
+ self.chunksize = _validate_integer("chunksize", self.chunksize, 1)
+ if not self.lines:
+ raise ValueError("chunksize can only be passed if lines=True")
+
+ data = self._get_data_from_filepath(filepath_or_buffer)
+ self.data = self._preprocess_data(data)
+
+ def _preprocess_data(self, data):
+ """
+ At this point, the data either has a `read` attribute (e.g. a file
+ object or a StringIO) or is a string that is a JSON document.
+
+ If self.chunksize, we prepare the data for the `__next__` method.
+ Otherwise, we read it into memory for the `read` method.
+ """
+ if hasattr(data, 'read') and not self.chunksize:
+ data = data.read()
+ if not hasattr(data, 'read') and self.chunksize:
+ data = StringIO(data)
+
+ return data
+
+ def _get_data_from_filepath(self, filepath_or_buffer):
+ """
+ The function read_json accepts three input types:
+ 1. filepath (string-like)
+ 2. file-like object (e.g. open file object, StringIO)
+ 3. JSON string
+
+ This method turns (1) into (2) to simplify the rest of the processing.
+ It returns input types (2) and (3) unchanged.
+ """
+ data = filepath_or_buffer
+
+ exists = False
+ if isinstance(data, compat.string_types):
+ try:
+ exists = os.path.exists(filepath_or_buffer)
+ # gh-5874: if the filepath is too long will raise here
+ except (TypeError, ValueError):
+ pass
+
+ if exists or self.compression is not None:
+ data, _ = _get_handle(filepath_or_buffer, 'r',
+ encoding=self.encoding,
+ compression=self.compression)
+ self.should_close = True
+ self.open_stream = data
+
+ return data
+
+ def _combine_lines(self, lines):
+ """
+ Combine a list of JSON objects into a single JSON array string.
+ """
+ lines = filter(None, map(lambda x: x.strip(), lines))
+ return '[' + ','.join(lines) + ']'
+
+ def read(self):
+ """
+ Read the whole JSON input into a pandas object.
+ """
+ if self.lines and self.chunksize:
+ obj = concat(self)
+ elif self.lines:
+
+ data = to_str(self.data)
+ obj = self._get_object_parser(
+ self._combine_lines(data.split('\n'))
+ )
+ else:
+ obj = self._get_object_parser(self.data)
+ self.close()
+ return obj
+
+ def _get_object_parser(self, json):
+ """
+ Parses a json document into a pandas object.
+ """
+ typ = self.typ
+ dtype = self.dtype
+ kwargs = {
+ "orient": self.orient, "dtype": self.dtype,
+ "convert_axes": self.convert_axes,
+ "convert_dates": self.convert_dates,
+ "keep_default_dates": self.keep_default_dates, "numpy": self.numpy,
+ "precise_float": self.precise_float, "date_unit": self.date_unit
+ }
+ obj = None
+ if typ == 'frame':
+ obj = FrameParser(json, **kwargs).parse()
+
+ if typ == 'series' or obj is None:
+ if not isinstance(dtype, bool):
+ kwargs['dtype'] = dtype
+ obj = SeriesParser(json, **kwargs).parse()
+
+ return obj
+
+ def close(self):
+ """
+ If we opened a stream earlier, in _get_data_from_filepath, we should
+ close it.
+
+ If an open stream or file was passed, we leave it open.
+ """
+ if self.should_close:
+ try:
+ self.open_stream.close()
+ except (IOError, AttributeError):
+ pass
+
+ def __next__(self):
+ lines = list(islice(self.data, self.chunksize))
+ if lines:
+ lines_json = self._combine_lines(lines)
+ obj = self._get_object_parser(lines_json)
+
+ # Make sure that the returned objects have the right index.
+ obj.index = range(self.nrows_seen, self.nrows_seen + len(obj))
+ self.nrows_seen += len(obj)
+
+ return obj
+
+ self.close()
+ raise StopIteration
+
+
+class Parser(object):
+
+ _STAMP_UNITS = ('s', 'ms', 'us', 'ns')
+ _MIN_STAMPS = {
+ 's': long(31536000),
+ 'ms': long(31536000000),
+ 'us': long(31536000000000),
+ 'ns': long(31536000000000000)}
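+ # each threshold is one year past the epoch (1971-01-01) in the given unit;
+ # smaller numeric values are treated as out of range for date conversion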
+
+ def __init__(self, json, orient, dtype=True, convert_axes=True,
+ convert_dates=True, keep_default_dates=False, numpy=False,
+ precise_float=False, date_unit=None):
+ self.json = json
+
+ if orient is None:
+ orient = self._default_orient
+
+ self.orient = orient
+ self.dtype = dtype
+
+ if orient == "split":
+ numpy = False
+
+ if date_unit is not None:
+ date_unit = date_unit.lower()
+ if date_unit not in self._STAMP_UNITS:
+ raise ValueError('date_unit must be one of {units}'
+ .format(units=self._STAMP_UNITS))
+ self.min_stamp = self._MIN_STAMPS[date_unit]
+ else:
+ self.min_stamp = self._MIN_STAMPS['s']
+
+ self.numpy = numpy
+ self.precise_float = precise_float
+ self.convert_axes = convert_axes
+ self.convert_dates = convert_dates
+ self.date_unit = date_unit
+ self.keep_default_dates = keep_default_dates
+ self.obj = None
+
+ def check_keys_split(self, decoded):
+ """
+ Checks that dict has only the appropriate keys for orient='split'.
+ """
+ bad_keys = set(decoded.keys()).difference(set(self._split_keys))
+ if bad_keys:
+ bad_keys = ", ".join(bad_keys)
+ raise ValueError(u("JSON data had unexpected key(s): {bad_keys}")
+ .format(bad_keys=pprint_thing(bad_keys)))
+
+ def parse(self):
+
+ # try numpy
+ numpy = self.numpy
+ if numpy:
+ self._parse_numpy()
+
+ else:
+ self._parse_no_numpy()
+
+ if self.obj is None:
+ return None
+ if self.convert_axes:
+ self._convert_axes()
+ self._try_convert_types()
+ return self.obj
+
+ def _convert_axes(self):
+ """
+ Try to convert axes.
+ """
+ for axis in self.obj._AXIS_NUMBERS.keys():
+ new_axis, result = self._try_convert_data(
+ axis, self.obj._get_axis(axis), use_dtypes=False,
+ convert_dates=True)
+ if result:
+ setattr(self.obj, axis, new_axis)
+
+ def _try_convert_types(self):
+ raise AbstractMethodError(self)
+
+ def _try_convert_data(self, name, data, use_dtypes=True,
+ convert_dates=True):
+ """
+ Try to parse an ndarray-like into a column by inferring its dtype.
+ """
+
+ # don't try to coerce, unless a force conversion
+ if use_dtypes:
+ if self.dtype is False:
+ return data, False
+ elif self.dtype is True:
+ pass
+ else:
+ # dtype to force
+ dtype = (self.dtype.get(name)
+ if isinstance(self.dtype, dict) else self.dtype)
+ if dtype is not None:
+ try:
+ dtype = np.dtype(dtype)
+ return data.astype(dtype), True
+ except (TypeError, ValueError):
+ return data, False
+
+ if convert_dates:
+ new_data, result = self._try_convert_to_date(data)
+ if result:
+ return new_data, True
+
+ result = False
+
+ if data.dtype == 'object':
+
+ # try float
+ try:
+ data = data.astype('float64')
+ result = True
+ except (TypeError, ValueError):
+ pass
+
+ if data.dtype.kind == 'f':
+
+ if data.dtype != 'float64':
+
+ # coerce floats to 64
+ try:
+ data = data.astype('float64')
+ result = True
+ except (TypeError, ValueError):
+ pass
+
+ # don't coerce 0-len data
+ if len(data) and (data.dtype == 'float' or data.dtype == 'object'):
+
+ # coerce ints if we can
+ try:
+ new_data = data.astype('int64')
+ if (new_data == data).all():
+ data = new_data
+ result = True
+ except (TypeError, ValueError):
+ pass
+
+ # coerce ints to 64
+ if data.dtype == 'int':
+
+ # coerce any int dtype to int64
+ try:
+ data = data.astype('int64')
+ result = True
+ except (TypeError, ValueError):
+ pass
+
+ return data, result
+
+ def _try_convert_to_date(self, data):
+ """
+ Try to parse an ndarray-like into a date column.
+
+ Try to coerce objects in epoch/ISO formats and integers/floats in epoch
+ formats. Return the data and a boolean indicating whether parsing succeeded.
+ """
+
+ # no conversion on empty
+ if not len(data):
+ return data, False
+
+ new_data = data
+ if new_data.dtype == 'object':
+ try:
+ new_data = data.astype('int64')
+ except (TypeError, ValueError, OverflowError):
+ pass
+
+ # ignore numbers that are out of range
+ if issubclass(new_data.dtype.type, np.number):
+ in_range = (isna(new_data.values) | (new_data > self.min_stamp) |
+ (new_data.values == iNaT))
+ if not in_range.all():
+ return data, False
+
+ date_units = (self.date_unit,) if self.date_unit else self._STAMP_UNITS
+ for date_unit in date_units:
+ try:
+ new_data = to_datetime(new_data, errors='raise',
+ unit=date_unit)
+ except ValueError:
+ continue
+ except Exception:
+ break
+ return new_data, True
+ return data, False
+
+ def _try_convert_dates(self):
+ raise AbstractMethodError(self)
+
+
+class SeriesParser(Parser):
+ _default_orient = 'index'
+ _split_keys = ('name', 'index', 'data')
+
+ def _parse_no_numpy(self):
+
+ json = self.json
+ orient = self.orient
+ if orient == "split":
+ decoded = {str(k): v for k, v in compat.iteritems(
+ loads(json, precise_float=self.precise_float))}
+ self.check_keys_split(decoded)
+ self.obj = Series(dtype=None, **decoded)
+ else:
+ self.obj = Series(
+ loads(json, precise_float=self.precise_float), dtype=None)
+
+ def _parse_numpy(self):
+
+ json = self.json
+ orient = self.orient
+ if orient == "split":
+ decoded = loads(json, dtype=None, numpy=True,
+ precise_float=self.precise_float)
+ decoded = {str(k): v for k, v in compat.iteritems(decoded)}
+ self.check_keys_split(decoded)
+ self.obj = Series(**decoded)
+ elif orient == "columns" or orient == "index":
+ self.obj = Series(*loads(json, dtype=None, numpy=True,
+ labelled=True,
+ precise_float=self.precise_float))
+ else:
+ self.obj = Series(loads(json, dtype=None, numpy=True,
+ precise_float=self.precise_float))
+
+ def _try_convert_types(self):
+ if self.obj is None:
+ return
+ obj, result = self._try_convert_data(
+ 'data', self.obj, convert_dates=self.convert_dates)
+ if result:
+ self.obj = obj
+
+
+class FrameParser(Parser):
+ _default_orient = 'columns'
+ _split_keys = ('columns', 'index', 'data')
+
+ def _parse_numpy(self):
+
+ json = self.json
+ orient = self.orient
+
+ if orient == "columns":
+ args = loads(json, dtype=None, numpy=True, labelled=True,
+ precise_float=self.precise_float)
+ if len(args):
+ args = (args[0].T, args[2], args[1])
+ self.obj = DataFrame(*args)
+ elif orient == "split":
+ decoded = loads(json, dtype=None, numpy=True,
+ precise_float=self.precise_float)
+ decoded = {str(k): v for k, v in compat.iteritems(decoded)}
+ self.check_keys_split(decoded)
+ self.obj = DataFrame(**decoded)
+ elif orient == "values":
+ self.obj = DataFrame(loads(json, dtype=None, numpy=True,
+ precise_float=self.precise_float))
+ else:
+ self.obj = DataFrame(*loads(json, dtype=None, numpy=True,
+ labelled=True,
+ precise_float=self.precise_float))
+
+ def _parse_no_numpy(self):
+
+ json = self.json
+ orient = self.orient
+
+ if orient == "columns":
+ self.obj = DataFrame(
+ loads(json, precise_float=self.precise_float), dtype=None)
+ elif orient == "split":
+ decoded = {str(k): v for k, v in compat.iteritems(
+ loads(json, precise_float=self.precise_float))}
+ self.check_keys_split(decoded)
+ self.obj = DataFrame(dtype=None, **decoded)
+ elif orient == "index":
+ self.obj = DataFrame(
+ loads(json, precise_float=self.precise_float), dtype=None).T
+ elif orient == 'table':
+ self.obj = parse_table_schema(json,
+ precise_float=self.precise_float)
+ else:
+ self.obj = DataFrame(
+ loads(json, precise_float=self.precise_float), dtype=None)
+
+ def _process_converter(self, f, filt=None):
+ """
+ Take a conversion function and possibly recreate the frame.
+ """
+
+ if filt is None:
+ filt = lambda col, c: True
+
+ needs_new_obj = False
+ new_obj = dict()
+ for i, (col, c) in enumerate(self.obj.iteritems()):
+ if filt(col, c):
+ new_data, result = f(col, c)
+ if result:
+ c = new_data
+ needs_new_obj = True
+ new_obj[i] = c
+
+ if needs_new_obj:
+
+ # possibly handle dup columns
+ new_obj = DataFrame(new_obj, index=self.obj.index)
+ new_obj.columns = self.obj.columns
+ self.obj = new_obj
+
+ def _try_convert_types(self):
+ if self.obj is None:
+ return
+ if self.convert_dates:
+ self._try_convert_dates()
+
+ self._process_converter(
+ lambda col, c: self._try_convert_data(col, c, convert_dates=False))
+
+ def _try_convert_dates(self):
+ if self.obj is None:
+ return
+
+ # our columns to parse
+ convert_dates = self.convert_dates
+ if convert_dates is True:
+ convert_dates = []
+ convert_dates = set(convert_dates)
+
+ def is_ok(col):
+ """
+ Return if this col is ok to try for a date parse.
+ """
+ if not isinstance(col, compat.string_types):
+ return False
+
+ col_lower = col.lower()
+ if (col_lower.endswith('_at') or
+ col_lower.endswith('_time') or
+ col_lower == 'modified' or
+ col_lower == 'date' or
+ col_lower == 'datetime' or
+ col_lower.startswith('timestamp')):
+ return True
+ return False
+
+ self._process_converter(
+ lambda col, c: self._try_convert_to_date(c),
+ lambda col, c: ((self.keep_default_dates and is_ok(col)) or
+ col in convert_dates))
diff --git a/contrib/python/pandas/py2/pandas/io/json/normalize.py b/contrib/python/pandas/py2/pandas/io/json/normalize.py
new file mode 100644
index 00000000000..279630ccd10
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/io/json/normalize.py
@@ -0,0 +1,286 @@
+# ---------------------------------------------------------------------
+# JSON normalization routines
+
+from collections import defaultdict
+import copy
+
+import numpy as np
+
+from pandas._libs.writers import convert_json_to_lines
+
+from pandas import DataFrame, compat
+
+
+def _convert_to_line_delimits(s):
+ """
+ Helper function that converts JSON lists to line delimited JSON.
+ """
+
+ # Determine whether we have a JSON list to turn into lines; otherwise just
+ # return the JSON object unchanged (only lists can be line-delimited).
+ if not (s[0] == '[' and s[-1] == ']'):
+ return s
+ s = s[1:-1]
+
+ return convert_json_to_lines(s)
+
+
+def nested_to_record(ds, prefix="", sep=".", level=0):
+ """
+ A simplified json_normalize.
+
+ Converts a nested dict into a flat dict ("record"); unlike json_normalize,
+ it does not attempt to extract a subset of the data.
+
+ Parameters
+ ----------
+ ds : dict or list of dicts
+ prefix: the prefix, optional, default: ""
+ sep : string, default '.'
+ Nested records will generate names separated by sep,
+ e.g., for sep='.', { 'foo' : { 'bar' : 0 } } -> foo.bar
+
+ .. versionadded:: 0.20.0
+
+ level: the number of levels in the JSON string, optional, default: 0
+
+ Returns
+ -------
+ d - dict or list of dicts, matching `ds`
+
+ Examples
+ --------
+
+ IN[52]: nested_to_record(dict(flat1=1,dict1=dict(c=1,d=2),
+ nested=dict(e=dict(c=1,d=2),d=2)))
+ Out[52]:
+ {'dict1.c': 1,
+ 'dict1.d': 2,
+ 'flat1': 1,
+ 'nested.d': 2,
+ 'nested.e.c': 1,
+ 'nested.e.d': 2}
+ """
+ singleton = False
+ if isinstance(ds, dict):
+ ds = [ds]
+ singleton = True
+
+ new_ds = []
+ for d in ds:
+
+ new_d = copy.deepcopy(d)
+ for k, v in d.items():
+ # each key gets renamed with prefix
+ if not isinstance(k, compat.string_types):
+ k = str(k)
+ if level == 0:
+ newkey = k
+ else:
+ newkey = prefix + sep + k
+
+ # only dicts get recursively flattened
+ # only at level>1 do we rename the rest of the keys
+ if not isinstance(v, dict):
+ if level != 0: # so we skip copying for top level, common case
+ v = new_d.pop(k)
+ new_d[newkey] = v
+ continue
+ else:
+ v = new_d.pop(k)
+ new_d.update(nested_to_record(v, newkey, sep, level + 1))
+ new_ds.append(new_d)
+
+ if singleton:
+ return new_ds[0]
+ return new_ds
+
+
+def json_normalize(data, record_path=None, meta=None,
+ meta_prefix=None,
+ record_prefix=None,
+ errors='raise',
+ sep='.'):
+ """
+ Normalize semi-structured JSON data into a flat table.
+
+ Parameters
+ ----------
+ data : dict or list of dicts
+ Unserialized JSON objects
+ record_path : string or list of strings, default None
+ Path in each object to list of records. If not passed, data will be
+ assumed to be an array of records
+ meta : list of paths (string or list of strings), default None
+ Fields to use as metadata for each record in resulting table
+ meta_prefix : string, default None
+ If not None, prefix to add to the metadata column names.
+ record_prefix : string, default None
+ If not None, prefix record column names with this string, e.g. pass
+ ``'foo.bar.'`` when the path to records is ``['foo', 'bar']`` to get
+ columns like ``foo.bar.field``.
+ errors : {'raise', 'ignore'}, default 'raise'
+
+ * 'ignore' : will ignore KeyError if keys listed in meta are not
+ always present
+ * 'raise' : will raise KeyError if keys listed in meta are not
+ always present
+
+ .. versionadded:: 0.20.0
+
+ sep : string, default '.'
+ Nested records will generate names separated by sep,
+ e.g., for sep='.', { 'foo' : { 'bar' : 0 } } -> foo.bar
+
+ .. versionadded:: 0.20.0
+
+ Returns
+ -------
+ frame : DataFrame
+
+ Examples
+ --------
+
+ >>> from pandas.io.json import json_normalize
+ >>> data = [{'id': 1, 'name': {'first': 'Coleen', 'last': 'Volk'}},
+ ... {'name': {'given': 'Mose', 'family': 'Regner'}},
+ ... {'id': 2, 'name': 'Faye Raker'}]
+ >>> json_normalize(data)
+ id name name.family name.first name.given name.last
+ 0 1.0 NaN NaN Coleen NaN Volk
+ 1 NaN NaN Regner NaN Mose NaN
+ 2 2.0 Faye Raker NaN NaN NaN NaN
+
+ >>> data = [{'state': 'Florida',
+ ... 'shortname': 'FL',
+ ... 'info': {
+ ... 'governor': 'Rick Scott'
+ ... },
+ ... 'counties': [{'name': 'Dade', 'population': 12345},
+ ... {'name': 'Broward', 'population': 40000},
+ ... {'name': 'Palm Beach', 'population': 60000}]},
+ ... {'state': 'Ohio',
+ ... 'shortname': 'OH',
+ ... 'info': {
+ ... 'governor': 'John Kasich'
+ ... },
+ ... 'counties': [{'name': 'Summit', 'population': 1234},
+ ... {'name': 'Cuyahoga', 'population': 1337}]}]
+ >>> result = json_normalize(data, 'counties', ['state', 'shortname',
+ ... ['info', 'governor']])
+ >>> result
+ name population info.governor state shortname
+ 0 Dade 12345 Rick Scott Florida FL
+ 1 Broward 40000 Rick Scott Florida FL
+ 2 Palm Beach 60000 Rick Scott Florida FL
+ 3 Summit 1234 John Kasich Ohio OH
+ 4 Cuyahoga 1337 John Kasich Ohio OH
+
+ >>> data = {'A': [1, 2]}
+ >>> json_normalize(data, 'A', record_prefix='Prefix.')
+ Prefix.0
+ 0 1
+ 1 2
+ """
+ def _pull_field(js, spec):
+ result = js
+ if isinstance(spec, list):
+ for field in spec:
+ result = result[field]
+ else:
+ result = result[spec]
+
+ return result
+
+ if isinstance(data, list) and not data:
+ return DataFrame()
+
+ # A bit of a hackjob
+ if isinstance(data, dict):
+ data = [data]
+
+ if record_path is None:
+ if any([isinstance(x, dict)
+ for x in compat.itervalues(y)] for y in data):
+ # naive normalization, this is idempotent for flat records
+ # and potentially will inflate the data considerably for
+ # deeply nested structures:
+ # {VeryLong: {b: 1, c: 2}} -> {VeryLong.b: 1, VeryLong.c: 2}
+ #
+ # TODO: handle record value which are lists, at least error
+ # reasonably
+ data = nested_to_record(data, sep=sep)
+ return DataFrame(data)
+ elif not isinstance(record_path, list):
+ record_path = [record_path]
+
+ if meta is None:
+ meta = []
+ elif not isinstance(meta, list):
+ meta = [meta]
+
+ meta = [m if isinstance(m, list) else [m] for m in meta]
+
+ # Disastrously inefficient for now
+ records = []
+ lengths = []
+
+ meta_vals = defaultdict(list)
+ if not isinstance(sep, compat.string_types):
+ sep = str(sep)
+ meta_keys = [sep.join(val) for val in meta]
+
+ def _recursive_extract(data, path, seen_meta, level=0):
+ if isinstance(data, dict):
+ data = [data]
+ if len(path) > 1:
+ for obj in data:
+ for val, key in zip(meta, meta_keys):
+ if level + 1 == len(val):
+ seen_meta[key] = _pull_field(obj, val[-1])
+
+ _recursive_extract(obj[path[0]], path[1:],
+ seen_meta, level=level + 1)
+ else:
+ for obj in data:
+ recs = _pull_field(obj, path[0])
+
+ # For repeating the metadata later
+ lengths.append(len(recs))
+
+ for val, key in zip(meta, meta_keys):
+ if level + 1 > len(val):
+ meta_val = seen_meta[key]
+ else:
+ try:
+ meta_val = _pull_field(obj, val[level:])
+ except KeyError as e:
+ if errors == 'ignore':
+ meta_val = np.nan
+ else:
+ raise KeyError("Try running with "
+ "errors='ignore' as key "
+ "{err} is not always present"
+ .format(err=e))
+ meta_vals[key].append(meta_val)
+
+ records.extend(recs)
+
+ _recursive_extract(data, record_path, {}, level=0)
+
+ result = DataFrame(records)
+
+ if record_prefix is not None:
+ result = result.rename(
+ columns=lambda x: "{p}{c}".format(p=record_prefix, c=x))
+
+ # Attach the metadata columns, repeating each value to line up with its records
+ for k, v in compat.iteritems(meta_vals):
+ if meta_prefix is not None:
+ k = meta_prefix + k
+
+ if k in result:
+ raise ValueError('Conflicting metadata name {name}, '
+ 'need distinguishing prefix '.format(name=k))
+
+ result[k] = np.array(v).repeat(lengths)
+
+ return result
diff --git a/contrib/python/pandas/py2/pandas/io/json/table_schema.py b/contrib/python/pandas/py2/pandas/io/json/table_schema.py
new file mode 100644
index 00000000000..971386c9194
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/io/json/table_schema.py
@@ -0,0 +1,326 @@
+"""
+Table Schema builders
+
+http://specs.frictionlessdata.io/json-table-schema/
+"""
+import warnings
+
+import pandas._libs.json as json
+
+from pandas.core.dtypes.common import (
+ is_bool_dtype, is_categorical_dtype, is_datetime64_dtype,
+ is_datetime64tz_dtype, is_integer_dtype, is_numeric_dtype, is_period_dtype,
+ is_string_dtype, is_timedelta64_dtype)
+
+from pandas import DataFrame
+from pandas.api.types import CategoricalDtype
+import pandas.core.common as com
+
+loads = json.loads
+
+
+def as_json_table_type(x):
+ """
+ Convert a NumPy / pandas type to its corresponding json_table.
+
+ Parameters
+ ----------
+ x : array or dtype
+
+ Returns
+ -------
+ t : str
+ the Table Schema data types
+
+ Notes
+ -----
+ This table shows the relationship between NumPy / pandas dtypes,
+ and Table Schema dtypes.
+
+ =============== =================
+ Pandas type Table Schema type
+ =============== =================
+ int64 integer
+ float64 number
+ bool boolean
+ datetime64[ns] datetime
+ timedelta64[ns] duration
+ object string
+ categorical any
+ =============== =================
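+
+ Examples
+ --------
+ >>> import numpy as np
+ >>> as_json_table_type(np.dtype('int64'))
+ 'integer'
+ >>> as_json_table_type(np.dtype('datetime64[ns]'))
+ 'datetime'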
+ """
+ if is_integer_dtype(x):
+ return 'integer'
+ elif is_bool_dtype(x):
+ return 'boolean'
+ elif is_numeric_dtype(x):
+ return 'number'
+ elif (is_datetime64_dtype(x) or is_datetime64tz_dtype(x) or
+ is_period_dtype(x)):
+ return 'datetime'
+ elif is_timedelta64_dtype(x):
+ return 'duration'
+ elif is_categorical_dtype(x):
+ return 'any'
+ elif is_string_dtype(x):
+ return 'string'
+ else:
+ return 'any'
+
+
+def set_default_names(data):
+ """Sets index names to 'index' for regular, or 'level_x' for Multi"""
+ if com._all_not_none(*data.index.names):
+ nms = data.index.names
+ if len(nms) == 1 and data.index.name == 'index':
+ warnings.warn("Index name of 'index' is not round-trippable")
+ elif len(nms) > 1 and any(x.startswith('level_') for x in nms):
+ warnings.warn("Index names beginning with 'level_' are not "
+ "round-trippable")
+ return data
+
+ data = data.copy()
+ if data.index.nlevels > 1:
+ names = [name if name is not None else 'level_{}'.format(i)
+ for i, name in enumerate(data.index.names)]
+ data.index.names = names
+ else:
+ data.index.name = data.index.name or 'index'
+ return data
+
+
+def convert_pandas_type_to_json_field(arr, dtype=None):
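+ # Build a single Table Schema field descriptor ({'name': ..., 'type': ...})
+ # for a Series or Index, adding categorical / period / timezone metadata
+ # when the corresponding dtype is detected.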
+ dtype = dtype or arr.dtype
+ if arr.name is None:
+ name = 'values'
+ else:
+ name = arr.name
+ field = {'name': name,
+ 'type': as_json_table_type(dtype)}
+
+ if is_categorical_dtype(arr):
+ if hasattr(arr, 'categories'):
+ cats = arr.categories
+ ordered = arr.ordered
+ else:
+ cats = arr.cat.categories
+ ordered = arr.cat.ordered
+ field['constraints'] = {"enum": list(cats)}
+ field['ordered'] = ordered
+ elif is_period_dtype(arr):
+ field['freq'] = arr.freqstr
+ elif is_datetime64tz_dtype(arr):
+ if hasattr(arr, 'dt'):
+ field['tz'] = arr.dt.tz.zone
+ else:
+ field['tz'] = arr.tz.zone
+ return field
+
+
+def convert_json_field_to_pandas_type(field):
+ """
+ Converts a JSON field descriptor into its corresponding NumPy / pandas type
+
+ Parameters
+ ----------
+ field
+ A JSON field descriptor
+
+ Returns
+ -------
+ dtype
+
+ Raises
+ ------
+ ValueError
+ If the type of the provided field is unknown or currently unsupported
+
+ Examples
+ --------
+ >>> convert_json_field_to_pandas_type({'name': 'an_int',
+ 'type': 'integer'})
+ 'int64'
+ >>> convert_json_field_to_pandas_type({'name': 'a_categorical',
+ 'type': 'any',
+ 'constraints': {'enum': [
+ 'a', 'b', 'c']},
+ 'ordered': True})
+ 'CategoricalDtype(categories=['a', 'b', 'c'], ordered=True)'
+ >>> convert_json_field_to_pandas_type({'name': 'a_datetime',
+ 'type': 'datetime'})
+ 'datetime64[ns]'
+ >>> convert_json_field_to_pandas_type({'name': 'a_datetime_with_tz',
+ 'type': 'datetime',
+ 'tz': 'US/Central'})
+ 'datetime64[ns, US/Central]'
+ """
+ typ = field['type']
+ if typ == 'string':
+ return 'object'
+ elif typ == 'integer':
+ return 'int64'
+ elif typ == 'number':
+ return 'float64'
+ elif typ == 'boolean':
+ return 'bool'
+ elif typ == 'duration':
+ return 'timedelta64'
+ elif typ == 'datetime':
+ if field.get('tz'):
+ return 'datetime64[ns, {tz}]'.format(tz=field['tz'])
+ else:
+ return 'datetime64[ns]'
+ elif typ == 'any':
+ if 'constraints' in field and 'ordered' in field:
+ return CategoricalDtype(categories=field['constraints']['enum'],
+ ordered=field['ordered'])
+ else:
+ return 'object'
+
+ raise ValueError("Unsupported or invalid field type: {}".format(typ))
+
+
+def build_table_schema(data, index=True, primary_key=None, version=True):
+ """
+ Create a Table schema from ``data``.
+
+ Parameters
+ ----------
+ data : Series, DataFrame
+ index : bool, default True
+ Whether to include ``data.index`` in the schema.
+    primary_key : bool or None, default None
+        Column names to designate as the primary key.
+        The default `None` will set `'primaryKey'` to the index
+        level or levels if the index is unique.
+ version : bool, default True
+ Whether to include a field `pandas_version` with the version
+ of pandas that generated the schema.
+
+ Returns
+ -------
+ schema : dict
+
+ Notes
+ -----
+    See `as_json_table_type` for conversion types.
+    Timedeltas are converted to ISO 8601 duration format with
+ 9 decimal places after the seconds field for nanosecond precision.
+
+ Categoricals are converted to the `any` dtype, and use the `enum` field
+ constraint to list the allowed values. The `ordered` attribute is included
+ in an `ordered` field.
+
+ Examples
+ --------
+ >>> df = pd.DataFrame(
+ ... {'A': [1, 2, 3],
+ ... 'B': ['a', 'b', 'c'],
+ ... 'C': pd.date_range('2016-01-01', freq='d', periods=3),
+ ... }, index=pd.Index(range(3), name='idx'))
+ >>> build_table_schema(df)
+ {'fields': [{'name': 'idx', 'type': 'integer'},
+ {'name': 'A', 'type': 'integer'},
+ {'name': 'B', 'type': 'string'},
+ {'name': 'C', 'type': 'datetime'}],
+ 'pandas_version': '0.20.0',
+ 'primaryKey': ['idx']}
+ """
+ if index is True:
+ data = set_default_names(data)
+
+ schema = {}
+ fields = []
+
+ if index:
+ if data.index.nlevels > 1:
+ for level in data.index.levels:
+ fields.append(convert_pandas_type_to_json_field(level))
+ else:
+ fields.append(convert_pandas_type_to_json_field(data.index))
+
+ if data.ndim > 1:
+ for column, s in data.iteritems():
+ fields.append(convert_pandas_type_to_json_field(s))
+ else:
+ fields.append(convert_pandas_type_to_json_field(data))
+
+ schema['fields'] = fields
+ if index and data.index.is_unique and primary_key is None:
+ if data.index.nlevels == 1:
+ schema['primaryKey'] = [data.index.name]
+ else:
+ schema['primaryKey'] = data.index.names
+ elif primary_key is not None:
+ schema['primaryKey'] = primary_key
+
+ if version:
+ schema['pandas_version'] = '0.20.0'
+ return schema
+
+
+def parse_table_schema(json, precise_float):
+ """
+ Builds a DataFrame from a given schema
+
+ Parameters
+ ----------
+ json :
+ A JSON table schema
+ precise_float : boolean
+ Flag controlling precision when decoding string to double values, as
+ dictated by ``read_json``
+
+ Returns
+ -------
+ df : DataFrame
+
+ Raises
+ ------
+ NotImplementedError
+ If the JSON table schema contains either timezone or timedelta data
+
+ Notes
+ -----
+ Because :func:`DataFrame.to_json` uses the string 'index' to denote a
+ name-less :class:`Index`, this function sets the name of the returned
+ :class:`DataFrame` to ``None`` when said string is encountered with a
+ normal :class:`Index`. For a :class:`MultiIndex`, the same limitation
+ applies to any strings beginning with 'level_'. Therefore, an
+ :class:`Index` name of 'index' and :class:`MultiIndex` names starting
+ with 'level_' are not supported.
+
+ See Also
+ --------
+ build_table_schema : Inverse function.
+ pandas.read_json
+ """
+ table = loads(json, precise_float=precise_float)
+ col_order = [field['name'] for field in table['schema']['fields']]
+ df = DataFrame(table['data'], columns=col_order)[col_order]
+
+ dtypes = {field['name']: convert_json_field_to_pandas_type(field)
+ for field in table['schema']['fields']}
+
+    # Cannot directly use astype with timezone data on object; raise for now
+ if any(str(x).startswith('datetime64[ns, ') for x in dtypes.values()):
+ raise NotImplementedError('table="orient" can not yet read timezone '
+ 'data')
+
+ # No ISO constructor for Timedelta as of yet, so need to raise
+ if 'timedelta64' in dtypes.values():
+ raise NotImplementedError('table="orient" can not yet read '
+ 'ISO-formatted Timedelta data')
+
+ df = df.astype(dtypes)
+
+ if 'primaryKey' in table['schema']:
+ df = df.set_index(table['schema']['primaryKey'])
+ if len(df.index.names) == 1:
+ if df.index.name == 'index':
+ df.index.name = None
+ else:
+ df.index.names = [None if x.startswith('level_') else x for x in
+ df.index.names]
+
+ return df
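+
+# A rough round-trip sketch (assumes a frame serialized with
+# ``DataFrame.to_json(orient='table')``; timezone-aware and timedelta
+# columns would raise NotImplementedError as noted above):
+#
+# >>> df = DataFrame({'a': [1, 2]})
+# >>> parse_table_schema(df.to_json(orient='table'),
+# ...                    precise_float=False).index.name is None
+# True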
diff --git a/contrib/python/pandas/py2/pandas/io/msgpack/__init__.py b/contrib/python/pandas/py2/pandas/io/msgpack/__init__.py
new file mode 100644
index 00000000000..984e90ee03e
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/io/msgpack/__init__.py
@@ -0,0 +1,50 @@
+# coding: utf-8
+
+from collections import namedtuple
+
+from pandas.io.msgpack.exceptions import * # noqa
+from pandas.io.msgpack._version import version # noqa
+
+
+class ExtType(namedtuple('ExtType', 'code data')):
+ """ExtType represents ext type in msgpack."""
+ def __new__(cls, code, data):
+ if not isinstance(code, int):
+ raise TypeError("code must be int")
+ if not isinstance(data, bytes):
+ raise TypeError("data must be bytes")
+ if not 0 <= code <= 127:
+ raise ValueError("code must be 0~127")
+ return super(ExtType, cls).__new__(cls, code, data)
+
+import os # noqa
+
+from pandas.io.msgpack._packer import Packer # noqa
+from pandas.io.msgpack._unpacker import unpack, unpackb, Unpacker # noqa
+
+
+def pack(o, stream, **kwargs):
+ """
+ Pack object `o` and write it to `stream`
+
+ See :class:`Packer` for options.
+ """
+ packer = Packer(**kwargs)
+ stream.write(packer.pack(o))
+
+
+def packb(o, **kwargs):
+ """
+ Pack object `o` and return packed bytes
+
+ See :class:`Packer` for options.
+ """
+ return Packer(**kwargs).pack(o)
+
+
+# alias for compatibility to simplejson/marshal/pickle.
+load = unpack
+loads = unpackb
+
+dump = pack
+dumps = packb
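+
+# A rough round-trip sketch of the aliases above (mirrors the
+# json/pickle-style API; exact byte layout depends on packer options):
+#
+# >>> payload = dumps([1, 2, 3])     # alias for packb
+# >>> loads(payload)                 # alias for unpackb; arrays become lists
+# [1, 2, 3]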
diff --git a/contrib/python/pandas/py2/pandas/io/msgpack/_packer.pyx b/contrib/python/pandas/py2/pandas/io/msgpack/_packer.pyx
new file mode 100644
index 00000000000..d67c632188e
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/io/msgpack/_packer.pyx
@@ -0,0 +1,308 @@
+# coding: utf-8
+# cython: embedsignature=True
+
+from cpython cimport (
+ PyFloat_Check, PyLong_Check, PyInt_Check,
+ PyDict_CheckExact, PyDict_Check,
+ PyTuple_Check, PyList_Check,
+ PyCallable_Check,
+ PyUnicode_Check, PyBytes_Check,
+ PyBytes_AsString,
+ PyBytes_FromStringAndSize,
+ PyUnicode_AsEncodedString)
+from libc.stdlib cimport free, malloc
+
+from pandas.io.msgpack.exceptions import PackValueError
+from pandas.io.msgpack import ExtType
+import numpy as np
+
+
+cdef extern from "../../src/msgpack/pack.h":
+ struct msgpack_packer:
+ char* buf
+ size_t length
+ size_t buf_size
+ bint use_bin_type
+
+ int msgpack_pack_int(msgpack_packer* pk, int d)
+ int msgpack_pack_nil(msgpack_packer* pk)
+ int msgpack_pack_true(msgpack_packer* pk)
+ int msgpack_pack_false(msgpack_packer* pk)
+ int msgpack_pack_long(msgpack_packer* pk, long d)
+ int msgpack_pack_long_long(msgpack_packer* pk, long long d)
+ int msgpack_pack_unsigned_long_long(msgpack_packer* pk,
+ unsigned long long d)
+ int msgpack_pack_float(msgpack_packer* pk, float d)
+ int msgpack_pack_double(msgpack_packer* pk, double d)
+ int msgpack_pack_array(msgpack_packer* pk, size_t l)
+ int msgpack_pack_map(msgpack_packer* pk, size_t l)
+ int msgpack_pack_raw(msgpack_packer* pk, size_t l)
+ int msgpack_pack_bin(msgpack_packer* pk, size_t l)
+ int msgpack_pack_raw_body(msgpack_packer* pk, char* body, size_t l)
+ int msgpack_pack_ext(msgpack_packer* pk, char typecode, size_t l)
+
+cdef int DEFAULT_RECURSE_LIMIT=511
+
+
+cdef class Packer(object):
+ """
+ MessagePack Packer
+
+ usage::
+
+ packer = Packer()
+ astream.write(packer.pack(a))
+ astream.write(packer.pack(b))
+
+ Packer's constructor has some keyword arguments:
+
+ :param callable default:
+        Convert a user type to a builtin type that Packer supports.
+        See also simplejson's documentation.
+ :param str encoding:
+ Convert unicode to bytes with this encoding. (default: 'utf-8')
+ :param str unicode_errors:
+ Error handler for encoding unicode. (default: 'strict')
+ :param bool use_single_float:
+ Use single precision float type for float. (default: False)
+ :param bool autoreset:
+        Reset the buffer after each pack and return its
+        content as `bytes`. (default: True).
+        If set to False, use `bytes()` to get the
+        content and `.reset()` to clear the buffer.
+ :param bool use_bin_type:
+        Use the bin type introduced in msgpack spec 2.0 for bytes.
+        It also enables the str8 type for unicode.
+ """
+ cdef msgpack_packer pk
+ cdef object _default
+ cdef object _bencoding
+ cdef object _berrors
+ cdef char *encoding
+ cdef char *unicode_errors
+ cdef bint use_float
+ cdef bint autoreset
+
+ def __cinit__(self):
+ cdef int buf_size = 1024 * 1024
+ self.pk.buf = <char*> malloc(buf_size)
+ if self.pk.buf == NULL:
+ raise MemoryError("Unable to allocate internal buffer.")
+ self.pk.buf_size = buf_size
+ self.pk.length = 0
+
+ def __init__(self, default=None, encoding='utf-8',
+ unicode_errors='strict', use_single_float=False,
+ bint autoreset=1, bint use_bin_type=0):
+ """
+ """
+ self.use_float = use_single_float
+ self.autoreset = autoreset
+ self.pk.use_bin_type = use_bin_type
+ if default is not None:
+ if not PyCallable_Check(default):
+ raise TypeError("default must be a callable.")
+ self._default = default
+ if encoding is None:
+ self.encoding = NULL
+ self.unicode_errors = NULL
+ else:
+ if isinstance(encoding, unicode):
+ self._bencoding = encoding.encode('ascii')
+ else:
+ self._bencoding = encoding
+ self.encoding = PyBytes_AsString(self._bencoding)
+ if isinstance(unicode_errors, unicode):
+ self._berrors = unicode_errors.encode('ascii')
+ else:
+ self._berrors = unicode_errors
+ self.unicode_errors = PyBytes_AsString(self._berrors)
+
+ def __dealloc__(self):
+ free(self.pk.buf);
+
+ cdef int _pack(self, object o,
+ int nest_limit=DEFAULT_RECURSE_LIMIT) except -1:
+ cdef long long llval
+ cdef unsigned long long ullval
+ cdef long longval
+ cdef float fval
+ cdef double dval
+ cdef char* rawval
+ cdef int ret
+ cdef dict d
+ cdef size_t L
+ cdef int default_used = 0
+
+ if nest_limit < 0:
+ raise PackValueError("recursion limit exceeded.")
+
+ while True:
+ if o is None:
+ ret = msgpack_pack_nil(&self.pk)
+ elif isinstance(o, (bool, np.bool_)):
+ if o:
+ ret = msgpack_pack_true(&self.pk)
+ else:
+ ret = msgpack_pack_false(&self.pk)
+ elif PyLong_Check(o):
+ # PyInt_Check(long) is True for Python 3.
+                # So we should test long before int.
+ if o > 0:
+ ullval = o
+ ret = msgpack_pack_unsigned_long_long(&self.pk, ullval)
+ else:
+ llval = o
+ ret = msgpack_pack_long_long(&self.pk, llval)
+ elif PyInt_Check(o):
+ longval = o
+ ret = msgpack_pack_long(&self.pk, longval)
+ elif PyFloat_Check(o):
+ if self.use_float:
+ fval = o
+ ret = msgpack_pack_float(&self.pk, fval)
+ else:
+ dval = o
+ ret = msgpack_pack_double(&self.pk, dval)
+ elif PyBytes_Check(o):
+ L = len(o)
+ if L > (2**32) - 1:
+ raise ValueError("bytes is too large")
+ rawval = o
+ ret = msgpack_pack_bin(&self.pk, L)
+ if ret == 0:
+ ret = msgpack_pack_raw_body(&self.pk, rawval, L)
+ elif PyUnicode_Check(o):
+ if not self.encoding:
+ raise TypeError("Can't encode unicode string: "
+ "no encoding is specified")
+ o = PyUnicode_AsEncodedString(o, self.encoding,
+ self.unicode_errors)
+ L = len(o)
+ if L > (2**32) - 1:
+                    raise ValueError("unicode string is too large")
+ rawval = o
+ ret = msgpack_pack_raw(&self.pk, len(o))
+ if ret == 0:
+ ret = msgpack_pack_raw_body(&self.pk, rawval, len(o))
+ elif PyDict_CheckExact(o):
+ d = <dict>o
+ L = len(d)
+ if L > (2**32) - 1:
+ raise ValueError("dict is too large")
+ ret = msgpack_pack_map(&self.pk, L)
+ if ret == 0:
+ for k, v in d.iteritems():
+ ret = self._pack(k, nest_limit - 1)
+ if ret != 0: break
+ ret = self._pack(v, nest_limit - 1)
+ if ret != 0: break
+ elif PyDict_Check(o):
+ L = len(o)
+ if L > (2**32) - 1:
+ raise ValueError("dict is too large")
+ ret = msgpack_pack_map(&self.pk, L)
+ if ret == 0:
+ for k, v in o.items():
+ ret = self._pack(k, nest_limit - 1)
+ if ret != 0: break
+ ret = self._pack(v, nest_limit - 1)
+ if ret != 0: break
+ elif isinstance(o, ExtType):
+ # This should be before Tuple because ExtType is namedtuple.
+ longval = o.code
+ rawval = o.data
+ L = len(o.data)
+ if L > (2**32) - 1:
+ raise ValueError("EXT data is too large")
+ ret = msgpack_pack_ext(&self.pk, longval, L)
+ ret = msgpack_pack_raw_body(&self.pk, rawval, L)
+ elif PyTuple_Check(o) or PyList_Check(o):
+ L = len(o)
+ if L > (2**32) - 1:
+ raise ValueError("list is too large")
+ ret = msgpack_pack_array(&self.pk, L)
+ if ret == 0:
+ for v in o:
+ ret = self._pack(v, nest_limit - 1)
+ if ret != 0: break
+ elif not default_used and self._default:
+ o = self._default(o)
+ default_used = 1
+ continue
+ else:
+ raise TypeError("can't serialize {thing!r}".format(thing=o))
+ return ret
+
+ cpdef pack(self, object obj):
+ cdef int ret
+ ret = self._pack(obj, DEFAULT_RECURSE_LIMIT)
+ if ret == -1:
+ raise MemoryError
+ elif ret: # should not happen.
+ raise TypeError
+ if self.autoreset:
+ buf = PyBytes_FromStringAndSize(self.pk.buf, self.pk.length)
+ self.pk.length = 0
+ return buf
+
+ def pack_ext_type(self, typecode, data):
+ msgpack_pack_ext(&self.pk, typecode, len(data))
+ msgpack_pack_raw_body(&self.pk, data, len(data))
+
+ def pack_array_header(self, size_t size):
+ if size > (2**32) - 1:
+ raise ValueError
+ cdef int ret = msgpack_pack_array(&self.pk, size)
+ if ret == -1:
+ raise MemoryError
+ elif ret: # should not happen
+ raise TypeError
+ if self.autoreset:
+ buf = PyBytes_FromStringAndSize(self.pk.buf, self.pk.length)
+ self.pk.length = 0
+ return buf
+
+ def pack_map_header(self, size_t size):
+ if size > (2**32) - 1:
+ raise ValueError
+ cdef int ret = msgpack_pack_map(&self.pk, size)
+ if ret == -1:
+ raise MemoryError
+ elif ret: # should not happen
+ raise TypeError
+ if self.autoreset:
+ buf = PyBytes_FromStringAndSize(self.pk.buf, self.pk.length)
+ self.pk.length = 0
+ return buf
+
+ def pack_map_pairs(self, object pairs):
+ """
+ Pack *pairs* as msgpack map type.
+
+        *pairs* should be a sequence of pairs.
+ (`len(pairs)` and `for k, v in pairs:` should be supported.)
+ """
+ cdef int ret = msgpack_pack_map(&self.pk, len(pairs))
+ if ret == 0:
+ for k, v in pairs:
+ ret = self._pack(k)
+ if ret != 0: break
+ ret = self._pack(v)
+ if ret != 0: break
+ if ret == -1:
+ raise MemoryError
+ elif ret: # should not happen
+ raise TypeError
+ if self.autoreset:
+ buf = PyBytes_FromStringAndSize(self.pk.buf, self.pk.length)
+ self.pk.length = 0
+ return buf
+
+ def reset(self):
+ """Clear internal buffer."""
+ self.pk.length = 0
+
+ def bytes(self):
+ """Return buffer content."""
+ return PyBytes_FromStringAndSize(self.pk.buf, self.pk.length)
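+
+# A rough sketch of the ``autoreset=False`` workflow documented in the
+# class docstring (the buffer accumulates until ``bytes()``/``reset()``):
+#
+# >>> packer = Packer(autoreset=False)
+# >>> packer.pack(1)                  # returns None, data stays buffered
+# >>> packer.pack([2, 3])
+# >>> payload = packer.bytes()        # packed form of 1 followed by [2, 3]
+# >>> packer.reset()                  # clear the internal buffer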
diff --git a/contrib/python/pandas/py2/pandas/io/msgpack/_unpacker.pyx b/contrib/python/pandas/py2/pandas/io/msgpack/_unpacker.pyx
new file mode 100644
index 00000000000..0c50aa5e681
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/io/msgpack/_unpacker.pyx
@@ -0,0 +1,486 @@
+# coding: utf-8
+# cython: embedsignature=True
+
+from cython cimport Py_ssize_t
+
+from cpython cimport (
+ PyCallable_Check,
+ PyBUF_SIMPLE, PyObject_GetBuffer, PyBuffer_Release,
+ PyBytes_Size,
+ PyBytes_FromStringAndSize,
+ PyBytes_AsString)
+
+cdef extern from "Python.h":
+ ctypedef struct PyObject
+ cdef int PyObject_AsReadBuffer(object o, const void** buff,
+ Py_ssize_t* buf_len) except -1
+
+from libc.stdlib cimport free, malloc
+from libc.string cimport memcpy, memmove
+from libc.limits cimport INT_MAX
+
+from pandas.io.msgpack.exceptions import (BufferFull, OutOfData,
+ UnpackValueError, ExtraData)
+from pandas.io.msgpack import ExtType
+
+
+cdef extern from "../../src/msgpack/unpack.h":
+ ctypedef struct msgpack_user:
+ bint use_list
+ PyObject* object_hook
+ bint has_pairs_hook # call object_hook with k-v pairs
+ PyObject* list_hook
+ PyObject* ext_hook
+ char *encoding
+ char *unicode_errors
+ Py_ssize_t max_str_len
+ Py_ssize_t max_bin_len
+ Py_ssize_t max_array_len
+ Py_ssize_t max_map_len
+ Py_ssize_t max_ext_len
+
+ ctypedef struct unpack_context:
+ msgpack_user user
+ PyObject* obj
+ size_t count
+
+ ctypedef int (*execute_fn)(unpack_context* ctx, const char* data,
+ size_t len, size_t* off) except? -1
+ execute_fn unpack_construct
+ execute_fn unpack_skip
+ execute_fn read_array_header
+ execute_fn read_map_header
+ void unpack_init(unpack_context* ctx)
+ object unpack_data(unpack_context* ctx)
+
+cdef inline init_ctx(unpack_context *ctx,
+ object object_hook, object object_pairs_hook,
+ object list_hook, object ext_hook,
+ bint use_list, char* encoding, char* unicode_errors,
+ Py_ssize_t max_str_len, Py_ssize_t max_bin_len,
+ Py_ssize_t max_array_len, Py_ssize_t max_map_len,
+ Py_ssize_t max_ext_len):
+ unpack_init(ctx)
+ ctx.user.use_list = use_list
+ ctx.user.object_hook = ctx.user.list_hook = <PyObject*>NULL
+ ctx.user.max_str_len = max_str_len
+ ctx.user.max_bin_len = max_bin_len
+ ctx.user.max_array_len = max_array_len
+ ctx.user.max_map_len = max_map_len
+ ctx.user.max_ext_len = max_ext_len
+
+ if object_hook is not None and object_pairs_hook is not None:
+ raise TypeError("object_pairs_hook and object_hook "
+ "are mutually exclusive.")
+
+ if object_hook is not None:
+ if not PyCallable_Check(object_hook):
+ raise TypeError("object_hook must be a callable.")
+ ctx.user.object_hook = <PyObject*>object_hook
+
+ if object_pairs_hook is None:
+ ctx.user.has_pairs_hook = False
+ else:
+ if not PyCallable_Check(object_pairs_hook):
+ raise TypeError("object_pairs_hook must be a callable.")
+ ctx.user.object_hook = <PyObject*>object_pairs_hook
+ ctx.user.has_pairs_hook = True
+
+ if list_hook is not None:
+ if not PyCallable_Check(list_hook):
+ raise TypeError("list_hook must be a callable.")
+ ctx.user.list_hook = <PyObject*>list_hook
+
+ if ext_hook is not None:
+ if not PyCallable_Check(ext_hook):
+ raise TypeError("ext_hook must be a callable.")
+ ctx.user.ext_hook = <PyObject*>ext_hook
+
+ ctx.user.encoding = encoding
+ ctx.user.unicode_errors = unicode_errors
+
+
+def default_read_extended_type(typecode, data):
+ raise NotImplementedError("Cannot decode extended type "
+ "with typecode={code}".format(code=typecode))
+
+
+def unpackb(object packed, object object_hook=None, object list_hook=None,
+ bint use_list=1, encoding=None, unicode_errors="strict",
+ object_pairs_hook=None, ext_hook=ExtType,
+            Py_ssize_t max_str_len=2147483647, # 2**31-1
+ Py_ssize_t max_bin_len=2147483647,
+ Py_ssize_t max_array_len=2147483647,
+ Py_ssize_t max_map_len=2147483647,
+ Py_ssize_t max_ext_len=2147483647):
+ """
+    Unpack `packed` bytes to an object. Returns the unpacked object.
+
+ Raises `ValueError` when `packed` contains extra bytes.
+
+ See :class:`Unpacker` for options.
+ """
+ cdef unpack_context ctx
+ cdef size_t off = 0
+ cdef int ret
+
+ cdef char* buf
+ cdef Py_ssize_t buf_len
+ cdef char* cenc = NULL
+ cdef char* cerr = NULL
+
+ PyObject_AsReadBuffer(packed, <const void**>&buf, &buf_len)
+
+ if encoding is not None:
+ if isinstance(encoding, unicode):
+ encoding = encoding.encode('ascii')
+ cenc = PyBytes_AsString(encoding)
+
+ if unicode_errors is not None:
+ if isinstance(unicode_errors, unicode):
+ unicode_errors = unicode_errors.encode('ascii')
+ cerr = PyBytes_AsString(unicode_errors)
+
+ init_ctx(&ctx, object_hook, object_pairs_hook, list_hook, ext_hook,
+ use_list, cenc, cerr,
+ max_str_len, max_bin_len, max_array_len, max_map_len, max_ext_len)
+ ret = unpack_construct(&ctx, buf, buf_len, &off)
+ if ret == 1:
+ obj = unpack_data(&ctx)
+ if <Py_ssize_t> off < buf_len:
+ raise ExtraData(obj, PyBytes_FromStringAndSize(
+ buf + off, buf_len - off))
+ return obj
+ else:
+ raise UnpackValueError("Unpack failed: error = {ret}".format(ret=ret))
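+
+# A rough sketch of the extra-bytes behaviour (assumes ``packb`` from
+# pandas.io.msgpack; ExtraData is the ValueError subclass raised above):
+#
+# >>> unpackb(packb(1) + packb(2))
+# Traceback (most recent call last):
+#     ...
+# ExtraData: unpack(b) received extra data.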
+
+
+def unpack(object stream, object object_hook=None, object list_hook=None,
+ bint use_list=1, encoding=None, unicode_errors="strict",
+ object_pairs_hook=None,
+ ):
+ """
+ Unpack an object from `stream`.
+
+ Raises `ValueError` when `stream` has extra bytes.
+
+ See :class:`Unpacker` for options.
+ """
+ return unpackb(stream.read(), use_list=use_list,
+ object_hook=object_hook,
+ object_pairs_hook=object_pairs_hook, list_hook=list_hook,
+ encoding=encoding, unicode_errors=unicode_errors)
+
+
+cdef class Unpacker(object):
+ """Streaming unpacker.
+
+ arguments:
+
+ :param file_like:
+ File-like object having `.read(n)` method.
+ If specified, unpacker reads serialized data from it and
+ :meth:`feed()` is not usable.
+
+ :param int read_size:
+ Used as `file_like.read(read_size)`. (default:
+ `min(1024**2, max_buffer_size)`)
+
+ :param bool use_list:
+ If true, unpack msgpack array to Python list.
+ Otherwise, unpack to Python tuple. (default: True)
+
+ :param callable object_hook:
+ When specified, it should be callable.
+ Unpacker calls it with a dict argument after unpacking msgpack map.
+ (See also simplejson)
+
+ :param callable object_pairs_hook:
+ When specified, it should be callable. Unpacker calls it with a list
+ of key-value pairs after unpacking msgpack map. (See also simplejson)
+
+ :param str encoding:
+ Encoding used for decoding msgpack raw.
+ If it is None (default), msgpack raw is deserialized to Python bytes.
+
+ :param str unicode_errors:
+ Used for decoding msgpack raw with *encoding*.
+ (default: `'strict'`)
+
+ :param int max_buffer_size:
+        Limits the size of data waiting to be unpacked. 0 means the
+        system's INT_MAX (default). Raises a `BufferFull` exception when
+        it is insufficient. You should set this parameter when unpacking
+        data from an untrusted source.
+
+ :param int max_str_len:
+ Limits max length of str. (default: 2**31-1)
+
+ :param int max_bin_len:
+ Limits max length of bin. (default: 2**31-1)
+
+ :param int max_array_len:
+ Limits max length of array. (default: 2**31-1)
+
+ :param int max_map_len:
+ Limits max length of map. (default: 2**31-1)
+
+
+    Example of streaming deserialization from a file-like object::
+
+ unpacker = Unpacker(file_like)
+ for o in unpacker:
+ process(o)
+
+    Example of streaming deserialization from a socket::
+
+ unpacker = Unpacker()
+ while True:
+ buf = sock.recv(1024**2)
+ if not buf:
+ break
+ unpacker.feed(buf)
+ for o in unpacker:
+ process(o)
+ """
+ cdef unpack_context ctx
+ cdef char* buf
+ cdef size_t buf_size, buf_head, buf_tail
+ cdef object file_like
+ cdef object file_like_read
+ cdef Py_ssize_t read_size
+ # To maintain refcnt.
+ cdef object object_hook, object_pairs_hook, list_hook, ext_hook
+ cdef object encoding, unicode_errors
+ cdef size_t max_buffer_size
+
+ def __cinit__(self):
+ self.buf = NULL
+
+ def __dealloc__(self):
+ free(self.buf)
+ self.buf = NULL
+
+ def __init__(self, file_like=None, Py_ssize_t read_size=0, bint use_list=1,
+ object object_hook=None, object object_pairs_hook=None,
+ object list_hook=None, encoding=None, unicode_errors='strict',
+ int max_buffer_size=0, object ext_hook=ExtType,
+                 Py_ssize_t max_str_len=2147483647, # 2**31-1
+ Py_ssize_t max_bin_len=2147483647,
+ Py_ssize_t max_array_len=2147483647,
+ Py_ssize_t max_map_len=2147483647,
+ Py_ssize_t max_ext_len=2147483647):
+ cdef char *cenc=NULL,
+ cdef char *cerr=NULL
+
+ self.object_hook = object_hook
+ self.object_pairs_hook = object_pairs_hook
+ self.list_hook = list_hook
+ self.ext_hook = ext_hook
+
+ self.file_like = file_like
+ if file_like:
+ self.file_like_read = file_like.read
+ if not PyCallable_Check(self.file_like_read):
+ raise TypeError("`file_like.read` must be a callable.")
+ if not max_buffer_size:
+ max_buffer_size = INT_MAX
+ if read_size > max_buffer_size:
+            raise ValueError("read_size should be less than or "
+                             "equal to max_buffer_size")
+ if not read_size:
+ read_size = min(max_buffer_size, 1024**2)
+ self.max_buffer_size = max_buffer_size
+ self.read_size = read_size
+ self.buf = <char*>malloc(read_size)
+ if self.buf == NULL:
+ raise MemoryError("Unable to allocate internal buffer.")
+ self.buf_size = read_size
+ self.buf_head = 0
+ self.buf_tail = 0
+
+ if encoding is not None:
+ if isinstance(encoding, unicode):
+ self.encoding = encoding.encode('ascii')
+ elif isinstance(encoding, bytes):
+ self.encoding = encoding
+ else:
+ raise TypeError("encoding should be bytes or unicode")
+ cenc = PyBytes_AsString(self.encoding)
+
+ if unicode_errors is not None:
+ if isinstance(unicode_errors, unicode):
+ self.unicode_errors = unicode_errors.encode('ascii')
+ elif isinstance(unicode_errors, bytes):
+ self.unicode_errors = unicode_errors
+ else:
+ raise TypeError("unicode_errors should be bytes or unicode")
+ cerr = PyBytes_AsString(self.unicode_errors)
+
+ init_ctx(&self.ctx, object_hook, object_pairs_hook, list_hook,
+ ext_hook, use_list, cenc, cerr,
+ max_str_len, max_bin_len, max_array_len,
+ max_map_len, max_ext_len)
+
+ def feed(self, object next_bytes):
+ """Append `next_bytes` to internal buffer."""
+ cdef Py_buffer pybuff
+ if self.file_like is not None:
+            raise AssertionError("unpacker.feed() cannot be used "
+                                 "with `file_like`.")
+ PyObject_GetBuffer(next_bytes, &pybuff, PyBUF_SIMPLE)
+ try:
+ self.append_buffer(<char*>pybuff.buf, pybuff.len)
+ finally:
+ PyBuffer_Release(&pybuff)
+
+ cdef append_buffer(self, void* _buf, Py_ssize_t _buf_len):
+ cdef:
+ char* buf = self.buf
+ char* new_buf
+ size_t head = self.buf_head
+ size_t tail = self.buf_tail
+ size_t buf_size = self.buf_size
+ size_t new_size
+
+ if tail + _buf_len > buf_size:
+ if ((tail - head) + _buf_len) <= buf_size:
+ # move to front.
+ memmove(buf, buf + head, tail - head)
+ tail -= head
+ head = 0
+ else:
+ # expand buffer.
+ new_size = (tail - head) + _buf_len
+ if new_size > self.max_buffer_size:
+ raise BufferFull
+ new_size = min(new_size * 2, self.max_buffer_size)
+ new_buf = <char*>malloc(new_size)
+ if new_buf == NULL:
+ # self.buf still holds old buffer and will be freed during
+ # obj destruction
+ raise MemoryError("Unable to enlarge internal buffer.")
+ memcpy(new_buf, buf + head, tail - head)
+ free(buf)
+
+ buf = new_buf
+ buf_size = new_size
+ tail -= head
+ head = 0
+
+ memcpy(buf + tail, <char*>(_buf), _buf_len)
+ self.buf = buf
+ self.buf_head = head
+ self.buf_size = buf_size
+ self.buf_tail = tail + _buf_len
+
+ cdef read_from_file(self):
+ # Assume self.max_buffer_size - (self.buf_tail - self.buf_head) >= 0
+ next_bytes = self.file_like_read(
+ min(self.read_size,
+ <Py_ssize_t>(self.max_buffer_size -
+ (self.buf_tail - self.buf_head))))
+ if next_bytes:
+ self.append_buffer(PyBytes_AsString(next_bytes),
+ PyBytes_Size(next_bytes))
+ else:
+ self.file_like = None
+
+ cdef object _unpack(self, execute_fn execute,
+ object write_bytes, bint iter=0):
+ cdef int ret
+ cdef object obj
+ cdef size_t prev_head
+
+ if self.buf_head >= self.buf_tail and self.file_like is not None:
+ self.read_from_file()
+
+ while 1:
+ prev_head = self.buf_head
+ if prev_head >= self.buf_tail:
+ if iter:
+ raise StopIteration("No more data to unpack.")
+ else:
+ raise OutOfData("No more data to unpack.")
+
+ ret = execute(&self.ctx, self.buf, self.buf_tail, &self.buf_head)
+ if write_bytes is not None:
+ write_bytes(PyBytes_FromStringAndSize(
+ self.buf + prev_head, self.buf_head - prev_head))
+
+ if ret == 1:
+ obj = unpack_data(&self.ctx)
+ unpack_init(&self.ctx)
+ return obj
+ elif ret == 0:
+ if self.file_like is not None:
+ self.read_from_file()
+ continue
+ if iter:
+ raise StopIteration("No more data to unpack.")
+ else:
+ raise OutOfData("No more data to unpack.")
+ else:
+ raise ValueError("Unpack failed: error = {ret}"
+ .format(ret=ret))
+
+ def read_bytes(self, Py_ssize_t nbytes):
+ """Read a specified number of raw bytes from the stream"""
+ cdef size_t nread
+
+ # Assume that self.buf_tail - self.buf_head >= 0
+ nread = min(<Py_ssize_t>(self.buf_tail - self.buf_head), nbytes)
+ ret = PyBytes_FromStringAndSize(self.buf + self.buf_head, nread)
+ self.buf_head += nread
+ if len(ret) < nbytes and self.file_like is not None:
+ ret += self.file_like.read(nbytes - len(ret))
+ return ret
+
+ def unpack(self, object write_bytes=None):
+ """Unpack one object
+
+ If write_bytes is not None, it will be called with parts of the raw
+ message as it is unpacked.
+
+ Raises `OutOfData` when there are no more bytes to unpack.
+ """
+ return self._unpack(unpack_construct, write_bytes)
+
+ def skip(self, object write_bytes=None):
+ """Read and ignore one object, returning None
+
+ If write_bytes is not None, it will be called with parts of the raw
+ message as it is unpacked.
+
+ Raises `OutOfData` when there are no more bytes to unpack.
+ """
+ return self._unpack(unpack_skip, write_bytes)
+
+ def read_array_header(self, object write_bytes=None):
+ """assuming the next object is an array, return its size n, such that
+ the next n unpack() calls will iterate over its contents.
+
+ Raises `OutOfData` when there are no more bytes to unpack.
+ """
+ return self._unpack(read_array_header, write_bytes)
+
+ def read_map_header(self, object write_bytes=None):
+ """assuming the next object is a map, return its size n, such that the
+ next n * 2 unpack() calls will iterate over its key-value pairs.
+
+ Raises `OutOfData` when there are no more bytes to unpack.
+ """
+ return self._unpack(read_map_header, write_bytes)
+
+ def __iter__(self):
+ return self
+
+ def __next__(self):
+ return self._unpack(unpack_construct, None, 1)
+
+ # for debug.
+ # def _buf(self):
+ # return PyString_FromStringAndSize(self.buf, self.buf_tail)
+
+ # def _off(self):
+ # return self.buf_head
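+
+# A rough sketch of the header-reading protocol (assumes ``packb`` from
+# pandas.io.msgpack to build the payload fed into the streaming unpacker):
+#
+# >>> unpacker = Unpacker(use_list=True)
+# >>> unpacker.feed(packb([10, 20, 30]))
+# >>> unpacker.read_array_header()
+# 3
+# >>> [unpacker.unpack() for _ in range(3)]
+# [10, 20, 30]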
diff --git a/contrib/python/pandas/py2/pandas/io/msgpack/_version.py b/contrib/python/pandas/py2/pandas/io/msgpack/_version.py
new file mode 100644
index 00000000000..2c1c96c0759
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/io/msgpack/_version.py
@@ -0,0 +1 @@
+version = (0, 4, 6)
diff --git a/contrib/python/pandas/py2/pandas/io/msgpack/exceptions.py b/contrib/python/pandas/py2/pandas/io/msgpack/exceptions.py
new file mode 100644
index 00000000000..ae0f74a6700
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/io/msgpack/exceptions.py
@@ -0,0 +1,32 @@
+class UnpackException(Exception):
+ pass
+
+
+class BufferFull(UnpackException):
+ pass
+
+
+class OutOfData(UnpackException):
+ pass
+
+
+class UnpackValueError(UnpackException, ValueError):
+ pass
+
+
+class ExtraData(ValueError):
+
+ def __init__(self, unpacked, extra):
+ self.unpacked = unpacked
+ self.extra = extra
+
+ def __str__(self):
+ return "unpack(b) received extra data."
+
+
+class PackException(Exception):
+ pass
+
+
+class PackValueError(PackException, ValueError):
+ pass
diff --git a/contrib/python/pandas/py2/pandas/io/packers.py b/contrib/python/pandas/py2/pandas/io/packers.py
new file mode 100644
index 00000000000..efe4e3a91c6
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/io/packers.py
@@ -0,0 +1,830 @@
+"""
+Msgpack serializer support for reading and writing pandas data structures
+to disk
+
+Portions of the msgpack_numpy package, by Lev Givon, were incorporated
+into this module (and tests_packers.py).
+
+License
+=======
+
+Copyright (c) 2013, Lev Givon.
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+* Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+* Redistributions in binary form must reproduce the above
+ copyright notice, this list of conditions and the following
+ disclaimer in the documentation and/or other materials provided
+ with the distribution.
+* Neither the name of Lev Givon nor the names of any
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+"""
+
+from datetime import date, datetime, timedelta
+import os
+from textwrap import dedent
+import warnings
+
+from dateutil.parser import parse
+import numpy as np
+
+import pandas.compat as compat
+from pandas.compat import u, u_safe
+from pandas.errors import PerformanceWarning
+from pandas.util._move import (
+ BadMove as _BadMove, move_into_mutable_buffer as _move_into_mutable_buffer)
+
+from pandas.core.dtypes.common import (
+ is_categorical_dtype, is_datetime64tz_dtype, is_object_dtype,
+ needs_i8_conversion, pandas_dtype)
+
+from pandas import ( # noqa:F401
+ Categorical, CategoricalIndex, DataFrame, DatetimeIndex, Float64Index,
+ Index, Int64Index, Interval, IntervalIndex, MultiIndex, NaT, Panel, Period,
+ PeriodIndex, RangeIndex, Series, TimedeltaIndex, Timestamp)
+from pandas.core import internals
+from pandas.core.arrays import DatetimeArray, IntervalArray, PeriodArray
+from pandas.core.arrays.sparse import BlockIndex, IntIndex
+from pandas.core.generic import NDFrame
+from pandas.core.internals import BlockManager, _safe_reshape, make_block
+from pandas.core.sparse.api import SparseDataFrame, SparseSeries
+
+from pandas.io.common import _stringify_path, get_filepath_or_buffer
+from pandas.io.msgpack import ExtType, Packer as _Packer, Unpacker as _Unpacker
+
+# check which compression libs we have installed
+try:
+ import zlib
+
+ def _check_zlib():
+ pass
+except ImportError:
+ def _check_zlib():
+ raise ImportError('zlib is not installed')
+
+_check_zlib.__doc__ = dedent(
+ """\
+ Check if zlib is installed.
+
+ Raises
+ ------
+ ImportError
+ Raised when zlib is not installed.
+ """,
+)
+
+try:
+ import blosc
+
+ def _check_blosc():
+ pass
+except ImportError:
+ def _check_blosc():
+ raise ImportError('blosc is not installed')
+
+_check_blosc.__doc__ = dedent(
+ """\
+ Check if blosc is installed.
+
+ Raises
+ ------
+ ImportError
+ Raised when blosc is not installed.
+ """,
+)
+
+# until we can pass this into our conversion functions,
+# this is pretty hacky
+compressor = None
+
+
+def to_msgpack(path_or_buf, *args, **kwargs):
+ """
+ msgpack (serialize) object to input file path
+
+ THIS IS AN EXPERIMENTAL LIBRARY and the storage format
+ may not be stable until a future release.
+
+ Parameters
+ ----------
+ path_or_buf : string File path, buffer-like, or None
+ if None, return generated string
+ args : an object or objects to serialize
+ encoding : encoding for unicode objects
+ append : boolean whether to append to an existing msgpack
+ (default is False)
+ compress : type of compressor (zlib or blosc), default to None (no
+ compression)
+ """
+ global compressor
+ compressor = kwargs.pop('compress', None)
+ if compressor:
+ compressor = u(compressor)
+ append = kwargs.pop('append', None)
+ if append:
+ mode = 'a+b'
+ else:
+ mode = 'wb'
+
+ def writer(fh):
+ for a in args:
+ fh.write(pack(a, **kwargs))
+
+ path_or_buf = _stringify_path(path_or_buf)
+ if isinstance(path_or_buf, compat.string_types):
+ with open(path_or_buf, mode) as fh:
+ writer(fh)
+ elif path_or_buf is None:
+ buf = compat.BytesIO()
+ writer(buf)
+ return buf.getvalue()
+ else:
+ writer(path_or_buf)
+
+
+def read_msgpack(path_or_buf, encoding='utf-8', iterator=False, **kwargs):
+ """
+ Load msgpack pandas object from the specified
+ file path
+
+ THIS IS AN EXPERIMENTAL LIBRARY and the storage format
+ may not be stable until a future release.
+
+ Parameters
+ ----------
+    path_or_buf : string File path, BytesIO-like object, or packed bytes
+ encoding : Encoding for decoding msgpack str type
+ iterator : boolean, if True, return an iterator to the unpacker
+ (default is False)
+
+ Returns
+ -------
+ obj : same type as object stored in file
+ """
+ path_or_buf, _, _, should_close = get_filepath_or_buffer(path_or_buf)
+ if iterator:
+ return Iterator(path_or_buf)
+
+ def read(fh):
+ unpacked_obj = list(unpack(fh, encoding=encoding, **kwargs))
+ if len(unpacked_obj) == 1:
+ return unpacked_obj[0]
+
+ if should_close:
+ try:
+ path_or_buf.close()
+ except IOError:
+ pass
+ return unpacked_obj
+
+ # see if we have an actual file
+ if isinstance(path_or_buf, compat.string_types):
+ try:
+ exists = os.path.exists(path_or_buf)
+ except (TypeError, ValueError):
+ exists = False
+
+ if exists:
+ with open(path_or_buf, 'rb') as fh:
+ return read(fh)
+
+ if isinstance(path_or_buf, compat.binary_type):
+ # treat as a binary-like
+ fh = None
+ try:
+ # We can't distinguish between a path and a buffer of bytes in
+ # Python 2 so instead assume the first byte of a valid path is
+ # less than 0x80.
+ if compat.PY3 or ord(path_or_buf[0]) >= 0x80:
+ fh = compat.BytesIO(path_or_buf)
+ return read(fh)
+ finally:
+ if fh is not None:
+ fh.close()
+ elif hasattr(path_or_buf, 'read') and compat.callable(path_or_buf.read):
+ # treat as a buffer like
+ return read(path_or_buf)
+
+ raise ValueError('path_or_buf needs to be a string file path or file-like')
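+
+# A rough in-memory round-trip sketch (``path_or_buf=None`` makes
+# to_msgpack return the packed bytes; ``compress='zlib'`` relies on the
+# optional zlib check above):
+#
+# >>> df = DataFrame({'a': [1, 2, 3]})
+# >>> payload = to_msgpack(None, df, compress='zlib')
+# >>> read_msgpack(payload).equals(df)
+# True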
+
+
+dtype_dict = {21: np.dtype('M8[ns]'),
+ u('datetime64[ns]'): np.dtype('M8[ns]'),
+ u('datetime64[us]'): np.dtype('M8[us]'),
+ 22: np.dtype('m8[ns]'),
+ u('timedelta64[ns]'): np.dtype('m8[ns]'),
+ u('timedelta64[us]'): np.dtype('m8[us]'),
+
+ # this is platform int, which we need to remap to np.int64
+ # for compat on windows platforms
+ 7: np.dtype('int64'),
+ 'category': 'category'
+ }
+
+
+def dtype_for(t):
+ """ return my dtype mapping, whether number or name """
+ if t in dtype_dict:
+ return dtype_dict[t]
+ return np.typeDict.get(t, t)
+
+
+c2f_dict = {'complex': np.float64,
+ 'complex128': np.float64,
+ 'complex64': np.float32}
+
+# windows (32 bit) compat
+if hasattr(np, 'float128'):
+ c2f_dict['complex256'] = np.float128
+
+
+def c2f(r, i, ctype_name):
+ """
+ Convert strings to complex number instance with specified numpy type.
+ """
+
+ ftype = c2f_dict[ctype_name]
+ return np.typeDict[ctype_name](ftype(r) + 1j * ftype(i))
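+
+# A rough sketch (real/imaginary parts arrive as strings, the way
+# ``encode`` stores them):
+#
+# >>> c2f('1.0', '-2.5', 'complex128')
+# (1-2.5j)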
+
+
+def convert(values):
+ """ convert the numpy values to a list """
+
+ dtype = values.dtype
+
+ if is_categorical_dtype(values):
+ return values
+
+ elif is_object_dtype(dtype):
+ return values.ravel().tolist()
+
+ if needs_i8_conversion(dtype):
+ values = values.view('i8')
+ v = values.ravel()
+
+ if compressor == 'zlib':
+ _check_zlib()
+
+ # return string arrays like they are
+ if dtype == np.object_:
+ return v.tolist()
+
+ # convert to a bytes array
+ v = v.tostring()
+ return ExtType(0, zlib.compress(v))
+
+ elif compressor == 'blosc':
+ _check_blosc()
+
+ # return string arrays like they are
+ if dtype == np.object_:
+ return v.tolist()
+
+ # convert to a bytes array
+ v = v.tostring()
+ return ExtType(0, blosc.compress(v, typesize=dtype.itemsize))
+
+ # ndarray (on original dtype)
+ return ExtType(0, v.tostring())
+
+
+def unconvert(values, dtype, compress=None):
+
+ as_is_ext = isinstance(values, ExtType) and values.code == 0
+
+ if as_is_ext:
+ values = values.data
+
+ if is_categorical_dtype(dtype):
+ return values
+
+ elif is_object_dtype(dtype):
+ return np.array(values, dtype=object)
+
+ dtype = pandas_dtype(dtype).base
+
+ if not as_is_ext:
+ values = values.encode('latin1')
+
+ if compress:
+ if compress == u'zlib':
+ _check_zlib()
+ decompress = zlib.decompress
+ elif compress == u'blosc':
+ _check_blosc()
+ decompress = blosc.decompress
+ else:
+ raise ValueError("compress must be one of 'zlib' or 'blosc'")
+
+ try:
+ return np.frombuffer(
+ _move_into_mutable_buffer(decompress(values)),
+ dtype=dtype,
+ )
+ except _BadMove as e:
+ # Pull the decompressed data off of the `_BadMove` exception.
+ # We don't just store this in the locals because we want to
+ # minimize the risk of giving users access to a `bytes` object
+ # whose data is also given to a mutable buffer.
+ values = e.args[0]
+ if len(values) > 1:
+ # The empty string and single characters are memoized in many
+ # string creating functions in the capi. This case should not
+ # warn even though we need to make a copy because we are only
+ # copying at most 1 byte.
+ warnings.warn(
+ 'copying data after decompressing; this may mean that'
+ ' decompress is caching its result',
+ PerformanceWarning,
+ )
+ # fall through to copying `np.fromstring`
+
+ # Copy the bytes into a numpy array.
+ buf = np.frombuffer(values, dtype=dtype)
+ buf = buf.copy() # required to not mutate the original data
+ buf.flags.writeable = True
+ return buf
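+
+# A rough round-trip sketch with the module-level ``compressor`` left as
+# None (values travel as an ExtType(0, ...) payload of raw bytes):
+#
+# >>> arr = np.arange(3, dtype='int64')
+# >>> packed = convert(arr)                # ExtType(code=0, data=...)
+# >>> unconvert(packed, 'int64')
+# array([0, 1, 2])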
+
+
+def encode(obj):
+ """
+ Data encoder
+ """
+ tobj = type(obj)
+ if isinstance(obj, Index):
+ if isinstance(obj, RangeIndex):
+ return {u'typ': u'range_index',
+ u'klass': u(obj.__class__.__name__),
+ u'name': getattr(obj, 'name', None),
+ u'start': getattr(obj, '_start', None),
+ u'stop': getattr(obj, '_stop', None),
+ u'step': getattr(obj, '_step', None)}
+ elif isinstance(obj, PeriodIndex):
+ return {u'typ': u'period_index',
+ u'klass': u(obj.__class__.__name__),
+ u'name': getattr(obj, 'name', None),
+ u'freq': u_safe(getattr(obj, 'freqstr', None)),
+ u'dtype': u(obj.dtype.name),
+ u'data': convert(obj.asi8),
+ u'compress': compressor}
+ elif isinstance(obj, DatetimeIndex):
+ tz = getattr(obj, 'tz', None)
+
+ # store tz info and data as UTC
+ if tz is not None:
+ tz = u(tz.zone)
+ obj = obj.tz_convert('UTC')
+ return {u'typ': u'datetime_index',
+ u'klass': u(obj.__class__.__name__),
+ u'name': getattr(obj, 'name', None),
+ u'dtype': u(obj.dtype.name),
+ u'data': convert(obj.asi8),
+ u'freq': u_safe(getattr(obj, 'freqstr', None)),
+ u'tz': tz,
+ u'compress': compressor}
+ elif isinstance(obj, (IntervalIndex, IntervalArray)):
+ if isinstance(obj, IntervalIndex):
+ typ = u'interval_index'
+ else:
+ typ = u'interval_array'
+ return {u'typ': typ,
+ u'klass': u(obj.__class__.__name__),
+ u'name': getattr(obj, 'name', None),
+ u'left': getattr(obj, 'left', None),
+ u'right': getattr(obj, 'right', None),
+ u'closed': getattr(obj, 'closed', None)}
+ elif isinstance(obj, MultiIndex):
+ return {u'typ': u'multi_index',
+ u'klass': u(obj.__class__.__name__),
+ u'names': getattr(obj, 'names', None),
+ u'dtype': u(obj.dtype.name),
+ u'data': convert(obj.values),
+ u'compress': compressor}
+ else:
+ return {u'typ': u'index',
+ u'klass': u(obj.__class__.__name__),
+ u'name': getattr(obj, 'name', None),
+ u'dtype': u(obj.dtype.name),
+ u'data': convert(obj.values),
+ u'compress': compressor}
+
+ elif isinstance(obj, Categorical):
+ return {u'typ': u'category',
+ u'klass': u(obj.__class__.__name__),
+ u'name': getattr(obj, 'name', None),
+ u'codes': obj.codes,
+ u'categories': obj.categories,
+ u'ordered': obj.ordered,
+ u'compress': compressor}
+
+ elif isinstance(obj, Series):
+ if isinstance(obj, SparseSeries):
+ raise NotImplementedError(
+ 'msgpack sparse series is not implemented'
+ )
+ # d = {'typ': 'sparse_series',
+ # 'klass': obj.__class__.__name__,
+ # 'dtype': obj.dtype.name,
+ # 'index': obj.index,
+ # 'sp_index': obj.sp_index,
+ # 'sp_values': convert(obj.sp_values),
+ # 'compress': compressor}
+ # for f in ['name', 'fill_value', 'kind']:
+ # d[f] = getattr(obj, f, None)
+ # return d
+ else:
+ return {u'typ': u'series',
+ u'klass': u(obj.__class__.__name__),
+ u'name': getattr(obj, 'name', None),
+ u'index': obj.index,
+ u'dtype': u(obj.dtype.name),
+ u'data': convert(obj.values),
+ u'compress': compressor}
+ elif issubclass(tobj, NDFrame):
+ if isinstance(obj, SparseDataFrame):
+ raise NotImplementedError(
+ 'msgpack sparse frame is not implemented'
+ )
+ # d = {'typ': 'sparse_dataframe',
+ # 'klass': obj.__class__.__name__,
+ # 'columns': obj.columns}
+ # for f in ['default_fill_value', 'default_kind']:
+ # d[f] = getattr(obj, f, None)
+ # d['data'] = dict([(name, ss)
+ # for name, ss in compat.iteritems(obj)])
+ # return d
+ else:
+
+ data = obj._data
+ if not data.is_consolidated():
+ data = data.consolidate()
+
+ # the block manager
+ return {u'typ': u'block_manager',
+ u'klass': u(obj.__class__.__name__),
+ u'axes': data.axes,
+ u'blocks': [{u'locs': b.mgr_locs.as_array,
+ u'values': convert(b.values),
+ u'shape': b.values.shape,
+ u'dtype': u(b.dtype.name),
+ u'klass': u(b.__class__.__name__),
+ u'compress': compressor} for b in data.blocks]
+ }
+
+ elif isinstance(obj, (datetime, date, np.datetime64, timedelta,
+ np.timedelta64)) or obj is NaT:
+ if isinstance(obj, Timestamp):
+ tz = obj.tzinfo
+ if tz is not None:
+ tz = u(tz.zone)
+ freq = obj.freq
+ if freq is not None:
+ freq = u(freq.freqstr)
+ return {u'typ': u'timestamp',
+ u'value': obj.value,
+ u'freq': freq,
+ u'tz': tz}
+ if obj is NaT:
+ return {u'typ': u'nat'}
+ elif isinstance(obj, np.timedelta64):
+ return {u'typ': u'timedelta64',
+ u'data': obj.view('i8')}
+ elif isinstance(obj, timedelta):
+ return {u'typ': u'timedelta',
+ u'data': (obj.days, obj.seconds, obj.microseconds)}
+ elif isinstance(obj, np.datetime64):
+ return {u'typ': u'datetime64',
+ u'data': u(str(obj))}
+ elif isinstance(obj, datetime):
+ return {u'typ': u'datetime',
+ u'data': u(obj.isoformat())}
+ elif isinstance(obj, date):
+ return {u'typ': u'date',
+ u'data': u(obj.isoformat())}
+ raise Exception(
+ "cannot encode this datetimelike object: {obj}".format(obj=obj))
+ elif isinstance(obj, Period):
+ return {u'typ': u'period',
+ u'ordinal': obj.ordinal,
+ u'freq': u_safe(obj.freqstr)}
+ elif isinstance(obj, Interval):
+ return {u'typ': u'interval',
+ u'left': obj.left,
+ u'right': obj.right,
+ u'closed': obj.closed}
+ elif isinstance(obj, BlockIndex):
+ return {u'typ': u'block_index',
+ u'klass': u(obj.__class__.__name__),
+ u'blocs': obj.blocs,
+ u'blengths': obj.blengths,
+ u'length': obj.length}
+ elif isinstance(obj, IntIndex):
+ return {u'typ': u'int_index',
+ u'klass': u(obj.__class__.__name__),
+ u'indices': obj.indices,
+ u'length': obj.length}
+ elif isinstance(obj, np.ndarray):
+ return {u'typ': u'ndarray',
+ u'shape': obj.shape,
+ u'ndim': obj.ndim,
+ u'dtype': u(obj.dtype.name),
+ u'data': convert(obj),
+ u'compress': compressor}
+ elif isinstance(obj, np.number):
+ if np.iscomplexobj(obj):
+ return {u'typ': u'np_scalar',
+ u'sub_typ': u'np_complex',
+ u'dtype': u(obj.dtype.name),
+ u'real': u(obj.real.__repr__()),
+ u'imag': u(obj.imag.__repr__())}
+ else:
+ return {u'typ': u'np_scalar',
+ u'dtype': u(obj.dtype.name),
+ u'data': u(obj.__repr__())}
+ elif isinstance(obj, complex):
+ return {u'typ': u'np_complex',
+ u'real': u(obj.real.__repr__()),
+ u'imag': u(obj.imag.__repr__())}
+
+ return obj
+
+
+def decode(obj):
+ """
+ Decoder for deserializing numpy data types.
+ """
+
+ typ = obj.get(u'typ')
+ if typ is None:
+ return obj
+ elif typ == u'timestamp':
+ freq = obj[u'freq'] if 'freq' in obj else obj[u'offset']
+ return Timestamp(obj[u'value'], tz=obj[u'tz'], freq=freq)
+ elif typ == u'nat':
+ return NaT
+ elif typ == u'period':
+ return Period(ordinal=obj[u'ordinal'], freq=obj[u'freq'])
+ elif typ == u'index':
+ dtype = dtype_for(obj[u'dtype'])
+ data = unconvert(obj[u'data'], dtype,
+ obj.get(u'compress'))
+ return Index(data, dtype=dtype, name=obj[u'name'])
+ elif typ == u'range_index':
+ return RangeIndex(obj[u'start'],
+ obj[u'stop'],
+ obj[u'step'],
+ name=obj[u'name'])
+ elif typ == u'multi_index':
+ dtype = dtype_for(obj[u'dtype'])
+ data = unconvert(obj[u'data'], dtype,
+ obj.get(u'compress'))
+ data = [tuple(x) for x in data]
+ return MultiIndex.from_tuples(data, names=obj[u'names'])
+ elif typ == u'period_index':
+ data = unconvert(obj[u'data'], np.int64, obj.get(u'compress'))
+ d = dict(name=obj[u'name'], freq=obj[u'freq'])
+ freq = d.pop('freq', None)
+ return PeriodIndex(PeriodArray(data, freq), **d)
+
+ elif typ == u'datetime_index':
+ data = unconvert(obj[u'data'], np.int64, obj.get(u'compress'))
+ d = dict(name=obj[u'name'], freq=obj[u'freq'])
+ result = DatetimeIndex(data, **d)
+ tz = obj[u'tz']
+
+ # reverse tz conversion
+ if tz is not None:
+ result = result.tz_localize('UTC').tz_convert(tz)
+ return result
+
+ elif typ in (u'interval_index', 'interval_array'):
+ return globals()[obj[u'klass']].from_arrays(obj[u'left'],
+ obj[u'right'],
+ obj[u'closed'],
+ name=obj[u'name'])
+ elif typ == u'category':
+ from_codes = globals()[obj[u'klass']].from_codes
+ return from_codes(codes=obj[u'codes'],
+ categories=obj[u'categories'],
+ ordered=obj[u'ordered'])
+
+ elif typ == u'interval':
+ return Interval(obj[u'left'], obj[u'right'], obj[u'closed'])
+ elif typ == u'series':
+ dtype = dtype_for(obj[u'dtype'])
+ pd_dtype = pandas_dtype(dtype)
+
+ index = obj[u'index']
+ result = Series(unconvert(obj[u'data'], dtype, obj[u'compress']),
+ index=index,
+ dtype=pd_dtype,
+ name=obj[u'name'])
+ return result
+
+ elif typ == u'block_manager':
+ axes = obj[u'axes']
+
+ def create_block(b):
+ values = _safe_reshape(unconvert(
+ b[u'values'], dtype_for(b[u'dtype']),
+ b[u'compress']), b[u'shape'])
+
+ # locs handles duplicate column names, and should be used instead
+ # of items; see GH 9618
+ if u'locs' in b:
+ placement = b[u'locs']
+ else:
+ placement = axes[0].get_indexer(b[u'items'])
+
+ if is_datetime64tz_dtype(b[u'dtype']):
+ assert isinstance(values, np.ndarray), type(values)
+ assert values.dtype == 'M8[ns]', values.dtype
+ values = DatetimeArray(values, dtype=b[u'dtype'])
+
+ return make_block(values=values,
+ klass=getattr(internals, b[u'klass']),
+ placement=placement,
+ dtype=b[u'dtype'])
+
+ blocks = [create_block(b) for b in obj[u'blocks']]
+ return globals()[obj[u'klass']](BlockManager(blocks, axes))
+ elif typ == u'datetime':
+ return parse(obj[u'data'])
+ elif typ == u'datetime64':
+ return np.datetime64(parse(obj[u'data']))
+ elif typ == u'date':
+ return parse(obj[u'data']).date()
+ elif typ == u'timedelta':
+ return timedelta(*obj[u'data'])
+ elif typ == u'timedelta64':
+ return np.timedelta64(int(obj[u'data']))
+ # elif typ == 'sparse_series':
+ # dtype = dtype_for(obj['dtype'])
+ # return SparseSeries(
+ # unconvert(obj['sp_values'], dtype, obj['compress']),
+ # sparse_index=obj['sp_index'], index=obj['index'],
+ # fill_value=obj['fill_value'], kind=obj['kind'], name=obj['name'])
+ # elif typ == 'sparse_dataframe':
+ # return SparseDataFrame(
+ # obj['data'], columns=obj['columns'],
+ # default_fill_value=obj['default_fill_value'],
+ # default_kind=obj['default_kind']
+ # )
+ # elif typ == 'sparse_panel':
+ # return SparsePanel(
+ # obj['data'], items=obj['items'],
+ # default_fill_value=obj['default_fill_value'],
+ # default_kind=obj['default_kind'])
+ elif typ == u'block_index':
+ return globals()[obj[u'klass']](obj[u'length'], obj[u'blocs'],
+ obj[u'blengths'])
+ elif typ == u'int_index':
+ return globals()[obj[u'klass']](obj[u'length'], obj[u'indices'])
+ elif typ == u'ndarray':
+ return unconvert(obj[u'data'], np.typeDict[obj[u'dtype']],
+ obj.get(u'compress')).reshape(obj[u'shape'])
+ elif typ == u'np_scalar':
+ if obj.get(u'sub_typ') == u'np_complex':
+ return c2f(obj[u'real'], obj[u'imag'], obj[u'dtype'])
+ else:
+ dtype = dtype_for(obj[u'dtype'])
+ try:
+ return dtype(obj[u'data'])
+ except (ValueError, TypeError):
+ return dtype.type(obj[u'data'])
+ elif typ == u'np_complex':
+ return complex(obj[u'real'] + u'+' + obj[u'imag'] + u'j')
+ elif isinstance(obj, (dict, list, set)):
+ return obj
+ else:
+ return obj
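+
+# A rough sketch of the encode/decode pairing wired up as msgpack hooks
+# below (``default=encode`` when packing, ``object_hook=decode`` when
+# unpacking):
+#
+# >>> decode(encode(Timestamp('2016-01-01', tz='UTC')))
+# Timestamp('2016-01-01 00:00:00+0000', tz='UTC')
+# >>> decode(encode(NaT)) is NaT
+# True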
+
+
+def pack(o, default=encode,
+ encoding='utf-8', unicode_errors='strict', use_single_float=False,
+ autoreset=1, use_bin_type=1):
+ """
+ Pack an object and return the packed bytes.
+ """
+
+ return Packer(default=default, encoding=encoding,
+ unicode_errors=unicode_errors,
+ use_single_float=use_single_float,
+ autoreset=autoreset,
+ use_bin_type=use_bin_type).pack(o)
+
+
+def unpack(packed, object_hook=decode,
+ list_hook=None, use_list=False, encoding='utf-8',
+ unicode_errors='strict', object_pairs_hook=None,
+ max_buffer_size=0, ext_hook=ExtType):
+ """
+ Unpack a packed object, return an iterator
+ Note: packed lists will be returned as tuples
+ """
+
+ return Unpacker(packed, object_hook=object_hook,
+ list_hook=list_hook,
+ use_list=use_list, encoding=encoding,
+ unicode_errors=unicode_errors,
+ object_pairs_hook=object_pairs_hook,
+ max_buffer_size=max_buffer_size,
+ ext_hook=ext_hook)
+
+
+class Packer(_Packer):
+
+ def __init__(self, default=encode,
+ encoding='utf-8',
+ unicode_errors='strict',
+ use_single_float=False,
+ autoreset=1,
+ use_bin_type=1):
+ super(Packer, self).__init__(default=default,
+ encoding=encoding,
+ unicode_errors=unicode_errors,
+ use_single_float=use_single_float,
+ autoreset=autoreset,
+ use_bin_type=use_bin_type)
+
+
+class Unpacker(_Unpacker):
+
+ def __init__(self, file_like=None, read_size=0, use_list=False,
+ object_hook=decode,
+ object_pairs_hook=None, list_hook=None, encoding='utf-8',
+ unicode_errors='strict', max_buffer_size=0, ext_hook=ExtType):
+ super(Unpacker, self).__init__(file_like=file_like,
+ read_size=read_size,
+ use_list=use_list,
+ object_hook=object_hook,
+ object_pairs_hook=object_pairs_hook,
+ list_hook=list_hook,
+ encoding=encoding,
+ unicode_errors=unicode_errors,
+ max_buffer_size=max_buffer_size,
+ ext_hook=ext_hook)
+
+
+class Iterator(object):
+
+ """ manage the unpacking iteration,
+ close the file on completion """
+
+ def __init__(self, path, **kwargs):
+ self.path = path
+ self.kwargs = kwargs
+
+ def __iter__(self):
+
+ needs_closing = True
+ try:
+
+ # see if we have an actual file
+ if isinstance(self.path, compat.string_types):
+
+ try:
+ path_exists = os.path.exists(self.path)
+ except TypeError:
+ path_exists = False
+
+ if path_exists:
+ fh = open(self.path, 'rb')
+ else:
+ fh = compat.BytesIO(self.path)
+
+ else:
+
+ if not hasattr(self.path, 'read'):
+ fh = compat.BytesIO(self.path)
+
+ else:
+
+ # a file-like
+ needs_closing = False
+ fh = self.path
+
+ unpacker = unpack(fh)
+ for o in unpacker:
+ yield o
+ finally:
+ if needs_closing:
+ fh.close()
diff --git a/contrib/python/pandas/py2/pandas/io/parquet.py b/contrib/python/pandas/py2/pandas/io/parquet.py
new file mode 100644
index 00000000000..dada9000d90
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/io/parquet.py
@@ -0,0 +1,282 @@
+""" parquet compat """
+
+from distutils.version import LooseVersion
+from warnings import catch_warnings
+
+from pandas.compat import string_types
+from pandas.errors import AbstractMethodError
+
+from pandas import DataFrame, get_option
+
+from pandas.io.common import get_filepath_or_buffer, is_s3_url
+
+
+def get_engine(engine):
+ """ return our implementation """
+
+ if engine == 'auto':
+ engine = get_option('io.parquet.engine')
+
+ if engine == 'auto':
+ # try engines in this order
+ try:
+ return PyArrowImpl()
+ except ImportError:
+ pass
+
+ try:
+ return FastParquetImpl()
+ except ImportError:
+ pass
+
+ raise ImportError("Unable to find a usable engine; "
+ "tried using: 'pyarrow', 'fastparquet'.\n"
+ "pyarrow or fastparquet is required for parquet "
+ "support")
+
+ if engine not in ['pyarrow', 'fastparquet']:
+ raise ValueError("engine must be one of 'pyarrow', 'fastparquet'")
+
+ if engine == 'pyarrow':
+ return PyArrowImpl()
+ elif engine == 'fastparquet':
+ return FastParquetImpl()
+
+
+class BaseImpl(object):
+
+ api = None # module
+
+ @staticmethod
+ def validate_dataframe(df):
+
+ if not isinstance(df, DataFrame):
+ raise ValueError("to_parquet only supports IO with DataFrames")
+
+ # must have value column names (strings only)
+ if df.columns.inferred_type not in {'string', 'unicode'}:
+ raise ValueError("parquet must have string column names")
+
+ # index level names must be strings
+ valid_names = all(
+ isinstance(name, string_types)
+ for name in df.index.names
+ if name is not None
+ )
+ if not valid_names:
+ raise ValueError("Index level names must be strings")
+
+ def write(self, df, path, compression, **kwargs):
+ raise AbstractMethodError(self)
+
+ def read(self, path, columns=None, **kwargs):
+ raise AbstractMethodError(self)
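+
+# A rough sketch of the validation rules above (column names must be
+# strings; the check is shared by both engine implementations below):
+#
+# >>> BaseImpl.validate_dataframe(DataFrame({0: [1, 2]}))
+# Traceback (most recent call last):
+#     ...
+# ValueError: parquet must have string column names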
+
+
+class PyArrowImpl(BaseImpl):
+
+ def __init__(self):
+ # since pandas is a dependency of pyarrow
+ # we need to import on first use
+ try:
+ import pyarrow
+ import pyarrow.parquet
+ except ImportError:
+ raise ImportError(
+ "pyarrow is required for parquet support\n\n"
+ "you can install via conda\n"
+ "conda install pyarrow -c conda-forge\n"
+ "\nor via pip\n"
+ "pip install -U pyarrow\n"
+ )
+ if LooseVersion(pyarrow.__version__) < '0.9.0':
+ raise ImportError(
+ "pyarrow >= 0.9.0 is required for parquet support\n\n"
+ "you can install via conda\n"
+ "conda install pyarrow -c conda-forge\n"
+ "\nor via pip\n"
+ "pip install -U pyarrow\n"
+ )
+
+ self.api = pyarrow
+
+ def write(self, df, path, compression='snappy',
+ coerce_timestamps='ms', index=None, partition_cols=None,
+ **kwargs):
+ self.validate_dataframe(df)
+ path, _, _, _ = get_filepath_or_buffer(path, mode='wb')
+
+ if index is None:
+ from_pandas_kwargs = {}
+ else:
+ from_pandas_kwargs = {'preserve_index': index}
+ table = self.api.Table.from_pandas(df, **from_pandas_kwargs)
+ if partition_cols is not None:
+ self.api.parquet.write_to_dataset(
+ table, path, compression=compression,
+ coerce_timestamps=coerce_timestamps,
+ partition_cols=partition_cols, **kwargs)
+ else:
+ self.api.parquet.write_table(
+ table, path, compression=compression,
+ coerce_timestamps=coerce_timestamps, **kwargs)
+
+ def read(self, path, columns=None, **kwargs):
+ path, _, _, should_close = get_filepath_or_buffer(path)
+
+ kwargs['use_pandas_metadata'] = True
+ result = self.api.parquet.read_table(path, columns=columns,
+ **kwargs).to_pandas()
+ if should_close:
+ try:
+ path.close()
+ except: # noqa: flake8
+ pass
+
+ return result
+
+
+class FastParquetImpl(BaseImpl):
+
+ def __init__(self):
+ # since pandas is a dependency of fastparquet
+ # we need to import on first use
+ try:
+ import fastparquet
+ except ImportError:
+ raise ImportError(
+ "fastparquet is required for parquet support\n\n"
+ "you can install via conda\n"
+ "conda install fastparquet -c conda-forge\n"
+ "\nor via pip\n"
+ "pip install -U fastparquet"
+ )
+ if LooseVersion(fastparquet.__version__) < '0.2.1':
+ raise ImportError(
+ "fastparquet >= 0.2.1 is required for parquet "
+ "support\n\n"
+ "you can install via conda\n"
+ "conda install fastparquet -c conda-forge\n"
+ "\nor via pip\n"
+ "pip install -U fastparquet"
+ )
+ self.api = fastparquet
+
+ def write(self, df, path, compression='snappy', index=None,
+ partition_cols=None, **kwargs):
+ self.validate_dataframe(df)
+ # thriftpy/protocol/compact.py:339:
+ # DeprecationWarning: tostring() is deprecated.
+ # Use tobytes() instead.
+
+ if 'partition_on' in kwargs and partition_cols is not None:
+ raise ValueError("Cannot use both partition_on and "
+ "partition_cols. Use partition_cols for "
+ "partitioning data")
+ elif 'partition_on' in kwargs:
+ partition_cols = kwargs.pop('partition_on')
+
+ if partition_cols is not None:
+ kwargs['file_scheme'] = 'hive'
+
+ if is_s3_url(path):
+ # path is s3:// so we need to open the s3file in 'wb' mode.
+ # TODO: Support 'ab'
+
+ path, _, _, _ = get_filepath_or_buffer(path, mode='wb')
+ # And pass the opened s3file to the fastparquet internal impl.
+ kwargs['open_with'] = lambda path, _: path
+ else:
+ path, _, _, _ = get_filepath_or_buffer(path)
+
+ with catch_warnings(record=True):
+ self.api.write(path, df, compression=compression,
+ write_index=index, partition_on=partition_cols,
+ **kwargs)
+
+ def read(self, path, columns=None, **kwargs):
+ if is_s3_url(path):
+ # When path is s3://, an S3File is returned.
+ # We need to retain the original path (str) while also
+ # passing the S3File's open function to the fastparquet impl.
+ s3, _, _, should_close = get_filepath_or_buffer(path)
+ try:
+ parquet_file = self.api.ParquetFile(path, open_with=s3.s3.open)
+ finally:
+ s3.close()
+ else:
+ path, _, _, _ = get_filepath_or_buffer(path)
+ parquet_file = self.api.ParquetFile(path)
+
+ return parquet_file.to_pandas(columns=columns, **kwargs)
+
+
+def to_parquet(df, path, engine='auto', compression='snappy', index=None,
+ partition_cols=None, **kwargs):
+ """
+ Write a DataFrame to the parquet format.
+
+ Parameters
+ ----------
+ path : str
+ File path or Root Directory path. Will be used as Root Directory path
+ while writing a partitioned dataset.
+
+ .. versionchanged:: 0.24.0
+
+ engine : {'auto', 'pyarrow', 'fastparquet'}, default 'auto'
+ Parquet library to use. If 'auto', then the option
+ ``io.parquet.engine`` is used. The default ``io.parquet.engine``
+ behavior is to try 'pyarrow', falling back to 'fastparquet' if
+ 'pyarrow' is unavailable.
+ compression : {'snappy', 'gzip', 'brotli', None}, default 'snappy'
+ Name of the compression to use. Use ``None`` for no compression.
+ index : bool, default None
+ If ``True``, include the dataframe's index(es) in the file output. If
+ ``False``, they will not be written to the file. If ``None``, the
+ engine's default behavior will be used.
+
+ .. versionadded:: 0.24.0
+
+ partition_cols : list, optional, default None
+ Column names by which to partition the dataset.
+ Columns are partitioned in the order they are given.
+
+ .. versionadded:: 0.24.0
+
+ kwargs
+ Additional keyword arguments passed to the engine
+ """
+ impl = get_engine(engine)
+ return impl.write(df, path, compression=compression, index=index,
+ partition_cols=partition_cols, **kwargs)
+
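+# A usage sketch for the writer above (illustrative; the DataFrame and the
+# local paths 'out.parquet' / 'out_dir' are assumptions, not part of pandas):
+#
+#   import pandas as pd
+#   df = pd.DataFrame({'key': ['a', 'a', 'b'], 'val': [1, 2, 3]})
+#   to_parquet(df, 'out.parquet', engine='auto', compression='snappy')
+#   # partitioned write: 'out_dir' becomes the root directory, with one
+#   # sub-directory per distinct value of 'key'
+#   to_parquet(df, 'out_dir', partition_cols=['key'])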
+
+def read_parquet(path, engine='auto', columns=None, **kwargs):
+ """
+ Load a parquet object from the file path, returning a DataFrame.
+
+ .. versionadded:: 0.21.0
+
+ Parameters
+ ----------
+ path : string
+ File path
+ columns : list, default=None
+ If not None, only these columns will be read from the file.
+
+ .. versionadded:: 0.21.1
+ engine : {'auto', 'pyarrow', 'fastparquet'}, default 'auto'
+ Parquet library to use. If 'auto', then the option
+ ``io.parquet.engine`` is used. The default ``io.parquet.engine``
+ behavior is to try 'pyarrow', falling back to 'fastparquet' if
+ 'pyarrow' is unavailable.
+ kwargs are passed to the engine
+
+ Returns
+ -------
+ DataFrame
+ """
+
+ impl = get_engine(engine)
+ return impl.read(path, columns=columns, **kwargs)
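+
+# Round-trip sketch for the reader above (illustrative; it assumes
+# 'out.parquet' was written by the to_parquet example earlier):
+#
+#   result = read_parquet('out.parquet', engine='auto', columns=['val'])
+#   # 'result' is a DataFrame holding only the requested column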
diff --git a/contrib/python/pandas/py2/pandas/io/parsers.py b/contrib/python/pandas/py2/pandas/io/parsers.py
new file mode 100755
index 00000000000..4163a571df8
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/io/parsers.py
@@ -0,0 +1,3610 @@
+"""
+Module contains tools for processing files into DataFrames or other objects
+"""
+
+from __future__ import print_function
+
+from collections import defaultdict
+import csv
+import datetime
+import re
+import sys
+from textwrap import fill
+import warnings
+
+import numpy as np
+
+import pandas._libs.lib as lib
+import pandas._libs.ops as libops
+import pandas._libs.parsers as parsers
+from pandas._libs.tslibs import parsing
+import pandas.compat as compat
+from pandas.compat import (
+ PY3, StringIO, lrange, lzip, map, range, string_types, u, zip)
+from pandas.errors import (
+ AbstractMethodError, EmptyDataError, ParserError, ParserWarning)
+from pandas.util._decorators import Appender
+
+from pandas.core.dtypes.cast import astype_nansafe
+from pandas.core.dtypes.common import (
+ ensure_object, is_bool_dtype, is_categorical_dtype, is_dtype_equal,
+ is_extension_array_dtype, is_float, is_integer, is_integer_dtype,
+ is_list_like, is_object_dtype, is_scalar, is_string_dtype, pandas_dtype)
+from pandas.core.dtypes.dtypes import CategoricalDtype
+from pandas.core.dtypes.missing import isna
+
+from pandas.core import algorithms
+from pandas.core.arrays import Categorical
+from pandas.core.frame import DataFrame
+from pandas.core.index import (
+ Index, MultiIndex, RangeIndex, ensure_index_from_sequences)
+from pandas.core.series import Series
+from pandas.core.tools import datetimes as tools
+
+from pandas.io.common import (
+ _NA_VALUES, BaseIterator, UnicodeReader, UTF8Recoder, _get_handle,
+ _infer_compression, _validate_header_arg, get_filepath_or_buffer,
+ is_file_like)
+from pandas.io.date_converters import generic_parser
+
+# BOM character (byte order mark)
+# This exists at the beginning of a file to indicate endianness
+# of a file (stream). Unfortunately, this marker screws up parsing,
+# so we need to remove it if we see it.
+_BOM = u('\ufeff')
+
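+# Sketch of the clean-up this constant enables (illustrative; the raw first
+# header field below is an assumption):
+#
+#   first_field = _BOM + u('col_a')
+#   if first_field.startswith(_BOM):
+#       first_field = first_field[len(_BOM):]   # -> u('col_a')
+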
+_doc_read_csv_and_table = r"""
+{summary}
+
+Also supports optionally iterating or breaking of the file
+into chunks.
+
+Additional help can be found in the online docs for
+`IO Tools <http://pandas.pydata.org/pandas-docs/stable/io.html>`_.
+
+Parameters
+----------
+filepath_or_buffer : str, path object, or file-like object
+ Any valid string path is acceptable. The string could be a URL. Valid
+ URL schemes include http, ftp, s3, and file. For file URLs, a host is
+ expected. A local file could be: file://localhost/path/to/table.csv.
+
+ If you want to pass in a path object, pandas accepts either
+ ``pathlib.Path`` or ``py._path.local.LocalPath``.
+
+ By file-like object, we refer to objects with a ``read()`` method, such as
+ a file handler (e.g. via builtin ``open`` function) or ``StringIO``.
+sep : str, default {_default_sep}
+ Delimiter to use. If sep is None, the C engine cannot automatically detect
+ the separator, but the Python parsing engine can, meaning the latter will
+ be used and automatically detect the separator by Python's builtin sniffer
+ tool, ``csv.Sniffer``. In addition, separators longer than 1 character and
+ different from ``'\s+'`` will be interpreted as regular expressions and
+ will also force the use of the Python parsing engine. Note that regex
+ delimiters are prone to ignoring quoted data. Regex example: ``'\r\t'``.
+delimiter : str, default ``None``
+ Alias for sep.
+header : int, list of int, default 'infer'
+ Row number(s) to use as the column names, and the start of the
+ data. Default behavior is to infer the column names: if no names
+ are passed the behavior is identical to ``header=0`` and column
+ names are inferred from the first line of the file, if column
+ names are passed explicitly then the behavior is identical to
+ ``header=None``. Explicitly pass ``header=0`` to be able to
+ replace existing names. The header can be a list of integers that
+ specify row locations for a multi-index on the columns
+ e.g. [0,1,3]. Intervening rows that are not specified will be
+ skipped (e.g. 2 in this example is skipped). Note that this
+ parameter ignores commented lines and empty lines if
+ ``skip_blank_lines=True``, so ``header=0`` denotes the first line of
+ data rather than the first line of the file.
+names : array-like, optional
+ List of column names to use. If file contains no header row, then you
+ should explicitly pass ``header=None``. Duplicates in this list will cause
+ a ``UserWarning`` to be issued.
+index_col : int, sequence or bool, optional
+ Column to use as the row labels of the DataFrame. If a sequence is given, a
+ MultiIndex is used. If you have a malformed file with delimiters at the end
+ of each line, you might consider ``index_col=False`` to force pandas to
+ not use the first column as the index (row names).
+usecols : list-like or callable, optional
+ Return a subset of the columns. If list-like, all elements must either
+ be positional (i.e. integer indices into the document columns) or strings
+ that correspond to column names provided either by the user in `names` or
+ inferred from the document header row(s). For example, a valid list-like
+ `usecols` parameter would be ``[0, 1, 2]`` or ``['foo', 'bar', 'baz']``.
+ Element order is ignored, so ``usecols=[0, 1]`` is the same as ``[1, 0]``.
+ To instantiate a DataFrame from ``data`` with element order preserved use
+ ``pd.read_csv(data, usecols=['foo', 'bar'])[['foo', 'bar']]`` for columns
+ in ``['foo', 'bar']`` order or
+ ``pd.read_csv(data, usecols=['foo', 'bar'])[['bar', 'foo']]``
+ for ``['bar', 'foo']`` order.
+
+ If callable, the callable function will be evaluated against the column
+ names, returning names where the callable function evaluates to True. An
+ example of a valid callable argument would be ``lambda x: x.upper() in
+ ['AAA', 'BBB', 'DDD']``. Using this parameter results in much faster
+ parsing time and lower memory usage.
+squeeze : bool, default False
+ If the parsed data only contains one column then return a Series.
+prefix : str, optional
+ Prefix to add to column numbers when no header, e.g. 'X' for X0, X1, ...
+mangle_dupe_cols : bool, default True
+ Duplicate columns will be specified as 'X', 'X.1', ...'X.N', rather than
+ 'X'...'X'. Passing in False will cause data to be overwritten if there
+ are duplicate names in the columns.
+dtype : Type name or dict of column -> type, optional
+ Data type for data or columns. E.g. {{'a': np.float64, 'b': np.int32,
+ 'c': 'Int64'}}
+ Use `str` or `object` together with suitable `na_values` settings
+ to preserve and not interpret dtype.
+ If converters are specified, they will be applied INSTEAD
+ of dtype conversion.
+engine : {{'c', 'python'}}, optional
+ Parser engine to use. The C engine is faster while the python engine is
+ currently more feature-complete.
+converters : dict, optional
+ Dict of functions for converting values in certain columns. Keys can either
+ be integers or column labels.
+true_values : list, optional
+ Values to consider as True.
+false_values : list, optional
+ Values to consider as False.
+skipinitialspace : bool, default False
+ Skip spaces after delimiter.
+skiprows : list-like, int or callable, optional
+ Line numbers to skip (0-indexed) or number of lines to skip (int)
+ at the start of the file.
+
+ If callable, the callable function will be evaluated against the row
+ indices, returning True if the row should be skipped and False otherwise.
+ An example of a valid callable argument would be ``lambda x: x in [0, 2]``.
+skipfooter : int, default 0
+ Number of lines at bottom of file to skip (Unsupported with engine='c').
+nrows : int, optional
+ Number of rows of file to read. Useful for reading pieces of large files.
+na_values : scalar, str, list-like, or dict, optional
+ Additional strings to recognize as NA/NaN. If dict passed, specific
+ per-column NA values. By default the following values are interpreted as
+ NaN: '""" + fill("', '".join(sorted(_NA_VALUES)),
+ 70, subsequent_indent=" ") + """'.
+keep_default_na : bool, default True
+ Whether or not to include the default NaN values when parsing the data.
+ Depending on whether `na_values` is passed in, the behavior is as follows:
+
+ * If `keep_default_na` is True, and `na_values` are specified, `na_values`
+ is appended to the default NaN values used for parsing.
+ * If `keep_default_na` is True, and `na_values` are not specified, only
+ the default NaN values are used for parsing.
+ * If `keep_default_na` is False, and `na_values` are specified, only
+ the NaN values specified `na_values` are used for parsing.
+ * If `keep_default_na` is False, and `na_values` are not specified, no
+ strings will be parsed as NaN.
+
+ Note that if `na_filter` is passed in as False, the `keep_default_na` and
+ `na_values` parameters will be ignored.
+na_filter : bool, default True
+ Detect missing value markers (empty strings and the value of na_values). In
+ data without any NAs, passing na_filter=False can improve the performance
+ of reading a large file.
+verbose : bool, default False
+ Indicate number of NA values placed in non-numeric columns.
+skip_blank_lines : bool, default True
+ If True, skip over blank lines rather than interpreting as NaN values.
+parse_dates : bool or list of int or names or list of lists or dict, \
+default False
+ The behavior is as follows:
+
+ * boolean. If True -> try parsing the index.
+ * list of int or names. e.g. If [1, 2, 3] -> try parsing columns 1, 2, 3
+ each as a separate date column.
+ * list of lists. e.g. If [[1, 3]] -> combine columns 1 and 3 and parse as
+ a single date column.
+ * dict, e.g. {{'foo' : [1, 3]}} -> parse columns 1, 3 as date and call
+ result 'foo'
+
+ If a column or index cannot be represented as an array of datetimes,
+ say because of an unparseable value or a mixture of timezones, the column
+ or index will be returned unaltered as an object data type. For
+ non-standard datetime parsing, use ``pd.to_datetime`` after
+ ``pd.read_csv``. To parse an index or column with a mixture of timezones,
+ specify ``date_parser`` to be a partially-applied
+ :func:`pandas.to_datetime` with ``utc=True``. See
+ :ref:`io.csv.mixed_timezones` for more.
+
+ Note: A fast-path exists for iso8601-formatted dates.
+infer_datetime_format : bool, default False
+ If True and `parse_dates` is enabled, pandas will attempt to infer the
+ format of the datetime strings in the columns, and if it can be inferred,
+ switch to a faster method of parsing them. In some cases this can increase
+ the parsing speed by 5-10x.
+keep_date_col : bool, default False
+ If True and `parse_dates` specifies combining multiple columns then
+ keep the original columns.
+date_parser : function, optional
+ Function to use for converting a sequence of string columns to an array of
+ datetime instances. The default uses ``dateutil.parser.parser`` to do the
+ conversion. Pandas will try to call `date_parser` in three different ways,
+ advancing to the next if an exception occurs: 1) Pass one or more arrays
+ (as defined by `parse_dates`) as arguments; 2) concatenate (row-wise) the
+ string values from the columns defined by `parse_dates` into a single array
+ and pass that; and 3) call `date_parser` once for each row using one or
+ more strings (corresponding to the columns defined by `parse_dates`) as
+ arguments.
+dayfirst : bool, default False
+ DD/MM format dates, international and European format.
+iterator : bool, default False
+ Return TextFileReader object for iteration or getting chunks with
+ ``get_chunk()``.
+chunksize : int, optional
+ Return TextFileReader object for iteration.
+ See the `IO Tools docs
+ <http://pandas.pydata.org/pandas-docs/stable/io.html#io-chunking>`_
+ for more information on ``iterator`` and ``chunksize``.
+compression : {{'infer', 'gzip', 'bz2', 'zip', 'xz', None}}, default 'infer'
+ For on-the-fly decompression of on-disk data. If 'infer' and
+ `filepath_or_buffer` is path-like, then detect compression from the
+ following extensions: '.gz', '.bz2', '.zip', or '.xz' (otherwise no
+ decompression). If using 'zip', the ZIP file must contain only one data
+ file to be read in. Set to None for no decompression.
+
+ .. versionadded:: 0.18.1 support for 'zip' and 'xz' compression.
+
+thousands : str, optional
+ Thousands separator.
+decimal : str, default '.'
+ Character to recognize as decimal point (e.g. use ',' for European data).
+lineterminator : str (length 1), optional
+ Character to break file into lines. Only valid with C parser.
+quotechar : str (length 1), optional
+ The character used to denote the start and end of a quoted item. Quoted
+ items can include the delimiter and it will be ignored.
+quoting : int or csv.QUOTE_* instance, default 0
+ Control field quoting behavior per ``csv.QUOTE_*`` constants. Use one of
+ QUOTE_MINIMAL (0), QUOTE_ALL (1), QUOTE_NONNUMERIC (2) or QUOTE_NONE (3).
+doublequote : bool, default ``True``
+ When quotechar is specified and quoting is not ``QUOTE_NONE``, indicate
+ whether or not to interpret two consecutive quotechar elements INSIDE a
+ field as a single ``quotechar`` element.
+escapechar : str (length 1), optional
+ One-character string used to escape other characters.
+comment : str, optional
+ Indicates remainder of line should not be parsed. If found at the beginning
+ of a line, the line will be ignored altogether. This parameter must be a
+ single character. Like empty lines (as long as ``skip_blank_lines=True``),
+ fully commented lines are ignored by the parameter `header` but not by
+ `skiprows`. For example, if ``comment='#'``, parsing
+ ``#empty\\na,b,c\\n1,2,3`` with ``header=0`` will result in 'a,b,c' being
+ treated as the header.
+encoding : str, optional
+ Encoding to use for UTF when reading/writing (ex. 'utf-8'). `List of Python
+ standard encodings
+ <https://docs.python.org/3/library/codecs.html#standard-encodings>`_ .
+dialect : str or csv.Dialect, optional
+ If provided, this parameter will override values (default or not) for the
+ following parameters: `delimiter`, `doublequote`, `escapechar`,
+ `skipinitialspace`, `quotechar`, and `quoting`. If it is necessary to
+ override values, a ParserWarning will be issued. See csv.Dialect
+ documentation for more details.
+tupleize_cols : bool, default False
+ Leave a list of tuples on columns as is (default is to convert to
+ a MultiIndex on the columns).
+
+ .. deprecated:: 0.21.0
+ This argument will be removed and will always convert to MultiIndex
+
+error_bad_lines : bool, default True
+ Lines with too many fields (e.g. a csv line with too many commas) will by
+ default cause an exception to be raised, and no DataFrame will be returned.
+ If False, then these "bad lines" will be dropped from the DataFrame that is
+ returned.
+warn_bad_lines : bool, default True
+ If error_bad_lines is False, and warn_bad_lines is True, a warning for each
+ "bad line" will be output.
+delim_whitespace : bool, default False
+ Specifies whether or not whitespace (e.g. ``' '`` or ``'\t'``) will be
+ used as the sep. Equivalent to setting ``sep='\\s+'``. If this option
+ is set to True, nothing should be passed in for the ``delimiter``
+ parameter.
+
+ .. versionadded:: 0.18.1 support for the Python parser.
+
+low_memory : bool, default True
+ Internally process the file in chunks, resulting in lower memory use
+ while parsing, but possibly mixed type inference. To ensure no mixed
+ types either set False, or specify the type with the `dtype` parameter.
+ Note that the entire file is read into a single DataFrame regardless,
+ use the `chunksize` or `iterator` parameter to return the data in chunks.
+ (Only valid with C parser).
+memory_map : bool, default False
+ If a filepath is provided for `filepath_or_buffer`, map the file object
+ directly onto memory and access the data directly from there. Using this
+ option can improve performance because there is no longer any I/O overhead.
+float_precision : str, optional
+ Specifies which converter the C engine should use for floating-point
+ values. The options are `None` for the ordinary converter,
+ `high` for the high-precision converter, and `round_trip` for the
+ round-trip converter.
+
+Returns
+-------
+DataFrame or TextParser
+ A comma-separated values (csv) file is returned as a two-dimensional
+ data structure with labeled axes.
+
+See Also
+--------
+to_csv : Write DataFrame to a comma-separated values (csv) file.
+read_csv : Read a comma-separated values (csv) file into DataFrame.
+read_fwf : Read a table of fixed-width formatted lines into DataFrame.
+
+Examples
+--------
+>>> pd.{func_name}('data.csv') # doctest: +SKIP
+"""
+
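+# A short sketch of the `usecols` ordering note documented above
+# (illustrative; the inline CSV data is an assumption):
+#
+#   data = StringIO("foo,bar,baz\n1,2,3\n4,5,6\n")
+#   df = read_csv(data, usecols=['foo', 'bar'])[['bar', 'foo']]
+#   # element order inside `usecols` is ignored; the trailing selection is
+#   # what fixes the column order to ['bar', 'foo']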
+
+def _validate_integer(name, val, min_val=0):
+ """
+ Checks whether the 'name' parameter for parsing is either
+ an integer OR float that can SAFELY be cast to an integer
+ without losing accuracy. Raises a ValueError if that is
+ not the case.
+
+ Parameters
+ ----------
+ name : string
+ Parameter name (used for error reporting)
+ val : int or float
+ The value to check
+ min_val : int
+ Minimum allowed value (val < min_val will result in a ValueError)
+ """
+ msg = "'{name:s}' must be an integer >={min_val:d}".format(name=name,
+ min_val=min_val)
+
+ if val is not None:
+ if is_float(val):
+ if int(val) != val:
+ raise ValueError(msg)
+ val = int(val)
+ elif not (is_integer(val) and val >= min_val):
+ raise ValueError(msg)
+
+ return val
+
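+# Behaviour sketch for the validator above (illustrative):
+#
+#   _validate_integer('nrows', 3.0)                # -> 3 (safe float cast)
+#   _validate_integer('nrows', 3.5)                # -> ValueError
+#   _validate_integer('chunksize', 0, min_val=1)   # -> ValueError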
+
+def _validate_names(names):
+ """
+ Check if the `names` parameter contains duplicates.
+
+ If duplicates are found, we issue a warning before returning.
+
+ Parameters
+ ----------
+ names : array-like or None
+ An array containing a list of the names used for the output DataFrame.
+
+ Returns
+ -------
+ names : array-like or None
+ The original `names` parameter.
+ """
+
+ if names is not None:
+ if len(names) != len(set(names)):
+ msg = ("Duplicate names specified. This "
+ "will raise an error in the future.")
+ warnings.warn(msg, UserWarning, stacklevel=3)
+
+ return names
+
+
+def _read(filepath_or_buffer, kwds):
+ """Generic reader of line files."""
+ encoding = kwds.get('encoding', None)
+ if encoding is not None:
+ encoding = re.sub('_', '-', encoding).lower()
+ kwds['encoding'] = encoding
+
+ compression = kwds.get('compression', 'infer')
+ compression = _infer_compression(filepath_or_buffer, compression)
+ filepath_or_buffer, _, compression, should_close = get_filepath_or_buffer(
+ filepath_or_buffer, encoding, compression)
+ kwds['compression'] = compression
+
+ if kwds.get('date_parser', None) is not None:
+ if isinstance(kwds['parse_dates'], bool):
+ kwds['parse_dates'] = True
+
+ # Extract some of the arguments (pass chunksize on).
+ iterator = kwds.get('iterator', False)
+ chunksize = _validate_integer('chunksize', kwds.get('chunksize', None), 1)
+ nrows = kwds.get('nrows', None)
+
+ # Check for duplicates in names.
+ _validate_names(kwds.get("names", None))
+
+ # Create the parser.
+ parser = TextFileReader(filepath_or_buffer, **kwds)
+
+ if chunksize or iterator:
+ return parser
+
+ try:
+ data = parser.read(nrows)
+ finally:
+ parser.close()
+
+ if should_close:
+ try:
+ filepath_or_buffer.close()
+ except ValueError:
+ pass
+
+ return data
+
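+# Sketch of the iterator/chunksize branch above (illustrative; the inline CSV
+# data and the 'process' callable are assumptions):
+#
+#   reader = read_csv(StringIO("a,b\n1,2\n3,4\n"), chunksize=1)
+#   for chunk in reader:        # each chunk is a one-row DataFrame
+#       process(chunk)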
+
+_parser_defaults = {
+ 'delimiter': None,
+
+ 'escapechar': None,
+ 'quotechar': '"',
+ 'quoting': csv.QUOTE_MINIMAL,
+ 'doublequote': True,
+ 'skipinitialspace': False,
+ 'lineterminator': None,
+
+ 'header': 'infer',
+ 'index_col': None,
+ 'names': None,
+ 'prefix': None,
+ 'skiprows': None,
+ 'skipfooter': 0,
+ 'nrows': None,
+ 'na_values': None,
+ 'keep_default_na': True,
+
+ 'true_values': None,
+ 'false_values': None,
+ 'converters': None,
+ 'dtype': None,
+
+ 'thousands': None,
+ 'comment': None,
+ 'decimal': b'.',
+
+ # 'engine': 'c',
+ 'parse_dates': False,
+ 'keep_date_col': False,
+ 'dayfirst': False,
+ 'date_parser': None,
+ 'usecols': None,
+
+ # 'iterator': False,
+ 'chunksize': None,
+ 'verbose': False,
+ 'encoding': None,
+ 'squeeze': False,
+ 'compression': None,
+ 'mangle_dupe_cols': True,
+ 'tupleize_cols': False,
+ 'infer_datetime_format': False,
+ 'skip_blank_lines': True
+}
+
+
+_c_parser_defaults = {
+ 'delim_whitespace': False,
+ 'na_filter': True,
+ 'low_memory': True,
+ 'memory_map': False,
+ 'error_bad_lines': True,
+ 'warn_bad_lines': True,
+ 'tupleize_cols': False,
+ 'float_precision': None
+}
+
+_fwf_defaults = {
+ 'colspecs': 'infer',
+ 'infer_nrows': 100,
+ 'widths': None,
+}
+
+_c_unsupported = {'skipfooter'}
+_python_unsupported = {
+ 'low_memory',
+ 'float_precision',
+}
+
+_deprecated_defaults = {
+ 'tupleize_cols': None
+}
+_deprecated_args = {
+ 'tupleize_cols',
+}
+
+
+def _make_parser_function(name, default_sep=','):
+
+ # prepare read_table deprecation
+ if name == "read_table":
+ sep = False
+ else:
+ sep = default_sep
+
+ def parser_f(filepath_or_buffer,
+ sep=sep,
+ delimiter=None,
+
+ # Column and Index Locations and Names
+ header='infer',
+ names=None,
+ index_col=None,
+ usecols=None,
+ squeeze=False,
+ prefix=None,
+ mangle_dupe_cols=True,
+
+ # General Parsing Configuration
+ dtype=None,
+ engine=None,
+ converters=None,
+ true_values=None,
+ false_values=None,
+ skipinitialspace=False,
+ skiprows=None,
+ skipfooter=0,
+ nrows=None,
+
+ # NA and Missing Data Handling
+ na_values=None,
+ keep_default_na=True,
+ na_filter=True,
+ verbose=False,
+ skip_blank_lines=True,
+
+ # Datetime Handling
+ parse_dates=False,
+ infer_datetime_format=False,
+ keep_date_col=False,
+ date_parser=None,
+ dayfirst=False,
+
+ # Iteration
+ iterator=False,
+ chunksize=None,
+
+ # Quoting, Compression, and File Format
+ compression='infer',
+ thousands=None,
+ decimal=b'.',
+ lineterminator=None,
+ quotechar='"',
+ quoting=csv.QUOTE_MINIMAL,
+ doublequote=True,
+ escapechar=None,
+ comment=None,
+ encoding=None,
+ dialect=None,
+ tupleize_cols=None,
+
+ # Error Handling
+ error_bad_lines=True,
+ warn_bad_lines=True,
+
+ # Internal
+ delim_whitespace=False,
+ low_memory=_c_parser_defaults['low_memory'],
+ memory_map=False,
+ float_precision=None):
+
+ # deprecate read_table GH21948
+ if name == "read_table":
+ if sep is False and delimiter is None:
+ warnings.warn("read_table is deprecated, use read_csv "
+ "instead, passing sep='\\t'.",
+ FutureWarning, stacklevel=2)
+ else:
+ warnings.warn("read_table is deprecated, use read_csv "
+ "instead.",
+ FutureWarning, stacklevel=2)
+ if sep is False:
+ sep = default_sep
+
+ # gh-23761
+ #
+ # When a dialect is passed, it overrides any of the overlapping
+ # parameters passed in directly. We don't want to warn if the
+ # default parameters were passed in (since it probably means
+ # that the user didn't pass them in explicitly in the first place).
+ #
+ # "delimiter" is the annoying corner case because we alias it to
+ # "sep" before doing comparison to the dialect values later on.
+ # Thus, we need a flag to indicate that we need to "override"
+ # the comparison to dialect values by checking if default values
+ # for BOTH "delimiter" and "sep" were provided.
+ if dialect is not None:
+ sep_override = delimiter is None and sep == default_sep
+ kwds = dict(sep_override=sep_override)
+ else:
+ kwds = dict()
+
+ # Alias sep -> delimiter.
+ if delimiter is None:
+ delimiter = sep
+
+ if delim_whitespace and delimiter != default_sep:
+ raise ValueError("Specified a delimiter with both sep and"
+ " delim_whitespace=True; you can only"
+ " specify one.")
+
+ if engine is not None:
+ engine_specified = True
+ else:
+ engine = 'c'
+ engine_specified = False
+
+ kwds.update(delimiter=delimiter,
+ engine=engine,
+ dialect=dialect,
+ compression=compression,
+ engine_specified=engine_specified,
+
+ doublequote=doublequote,
+ escapechar=escapechar,
+ quotechar=quotechar,
+ quoting=quoting,
+ skipinitialspace=skipinitialspace,
+ lineterminator=lineterminator,
+
+ header=header,
+ index_col=index_col,
+ names=names,
+ prefix=prefix,
+ skiprows=skiprows,
+ skipfooter=skipfooter,
+ na_values=na_values,
+ true_values=true_values,
+ false_values=false_values,
+ keep_default_na=keep_default_na,
+ thousands=thousands,
+ comment=comment,
+ decimal=decimal,
+
+ parse_dates=parse_dates,
+ keep_date_col=keep_date_col,
+ dayfirst=dayfirst,
+ date_parser=date_parser,
+
+ nrows=nrows,
+ iterator=iterator,
+ chunksize=chunksize,
+ converters=converters,
+ dtype=dtype,
+ usecols=usecols,
+ verbose=verbose,
+ encoding=encoding,
+ squeeze=squeeze,
+ memory_map=memory_map,
+ float_precision=float_precision,
+
+ na_filter=na_filter,
+ delim_whitespace=delim_whitespace,
+ warn_bad_lines=warn_bad_lines,
+ error_bad_lines=error_bad_lines,
+ low_memory=low_memory,
+ mangle_dupe_cols=mangle_dupe_cols,
+ tupleize_cols=tupleize_cols,
+ infer_datetime_format=infer_datetime_format,
+ skip_blank_lines=skip_blank_lines)
+
+ return _read(filepath_or_buffer, kwds)
+
+ parser_f.__name__ = name
+
+ return parser_f
+
+
+read_csv = _make_parser_function('read_csv', default_sep=',')
+read_csv = Appender(_doc_read_csv_and_table.format(
+ func_name='read_csv',
+ summary=('Read a comma-separated values (csv) file '
+ 'into DataFrame.'),
+ _default_sep="','")
+ )(read_csv)
+
+read_table = _make_parser_function('read_table', default_sep='\t')
+read_table = Appender(_doc_read_csv_and_table.format(
+ func_name='read_table',
+ summary="""Read general delimited file into DataFrame.
+
+.. deprecated:: 0.24.0
+Use :func:`pandas.read_csv` instead, passing ``sep='\\t'`` if necessary.""",
+ _default_sep=r"'\\t' (tab-stop)")
+ )(read_table)
+
+
+def read_fwf(filepath_or_buffer, colspecs='infer', widths=None,
+ infer_nrows=100, **kwds):
+
+ r"""
+ Read a table of fixed-width formatted lines into DataFrame.
+
+ Also supports optionally iterating or breaking of the file
+ into chunks.
+
+ Additional help can be found in the `online docs for IO Tools
+ <http://pandas.pydata.org/pandas-docs/stable/io.html>`_.
+
+ Parameters
+ ----------
+ filepath_or_buffer : str, path object, or file-like object
+ Any valid string path is acceptable. The string could be a URL. Valid
+ URL schemes include http, ftp, s3, and file. For file URLs, a host is
+ expected. A local file could be: file://localhost/path/to/table.csv.
+
+ If you want to pass in a path object, pandas accepts either
+ ``pathlib.Path`` or ``py._path.local.LocalPath``.
+
+ By file-like object, we refer to objects with a ``read()`` method,
+ such as a file handler (e.g. via builtin ``open`` function)
+ or ``StringIO``.
+ colspecs : list of tuple (int, int) or 'infer', optional
+ A list of tuples giving the extents of the fixed-width
+ fields of each line as half-open intervals (i.e., [from, to[ ).
+ String value 'infer' can be used to instruct the parser to try
+ detecting the column specifications from the first 100 rows of
+ the data which are not being skipped via skiprows (default='infer').
+ widths : list of int, optional
+ A list of field widths which can be used instead of 'colspecs' if
+ the intervals are contiguous.
+ infer_nrows : int, default 100
+ The number of rows to consider when letting the parser determine the
+ `colspecs`.
+
+ .. versionadded:: 0.24.0
+ **kwds : optional
+ Optional keyword arguments can be passed to ``TextFileReader``.
+
+ Returns
+ -------
+ DataFrame or TextParser
+ A comma-separated values (csv) file is returned as a two-dimensional
+ data structure with labeled axes.
+
+ See Also
+ --------
+ to_csv : Write DataFrame to a comma-separated values (csv) file.
+ read_csv : Read a comma-separated values (csv) file into DataFrame.
+
+ Examples
+ --------
+ >>> pd.read_fwf('data.csv') # doctest: +SKIP
+ """
+
+ # Check input arguments.
+ if colspecs is None and widths is None:
+ raise ValueError("Must specify either colspecs or widths")
+ elif colspecs not in (None, 'infer') and widths is not None:
+ raise ValueError("You must specify only one of 'widths' and "
+ "'colspecs'")
+
+ # Compute 'colspecs' from 'widths', if specified.
+ if widths is not None:
+ colspecs, col = [], 0
+ for w in widths:
+ colspecs.append((col, col + w))
+ col += w
+
+ kwds['colspecs'] = colspecs
+ kwds['infer_nrows'] = infer_nrows
+ kwds['engine'] = 'python-fwf'
+ return _read(filepath_or_buffer, kwds)
+
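+# Sketch of the widths -> colspecs conversion performed above (illustrative;
+# the fixed-width sample is an assumption):
+#
+#   sample = StringIO("id  name\n 1  ab\n 2  cd\n")
+#   read_fwf(sample, widths=[4, 4])   # same as colspecs=[(0, 4), (4, 8)]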
+
+class TextFileReader(BaseIterator):
+ """
+
+ Passed dialect overrides any of the related parser options
+
+ """
+
+ def __init__(self, f, engine=None, **kwds):
+
+ self.f = f
+
+ if engine is not None:
+ engine_specified = True
+ else:
+ engine = 'python'
+ engine_specified = False
+
+ self._engine_specified = kwds.get('engine_specified', engine_specified)
+
+ if kwds.get('dialect') is not None:
+ dialect = kwds['dialect']
+ if dialect in csv.list_dialects():
+ dialect = csv.get_dialect(dialect)
+
+ # Any valid dialect should have these attributes.
+ # If any are missing, we will raise automatically.
+ for param in ('delimiter', 'doublequote', 'escapechar',
+ 'skipinitialspace', 'quotechar', 'quoting'):
+ try:
+ dialect_val = getattr(dialect, param)
+ except AttributeError:
+ raise ValueError("Invalid dialect '{dialect}' provided"
+ .format(dialect=kwds['dialect']))
+ parser_default = _parser_defaults[param]
+ provided = kwds.get(param, parser_default)
+
+ # Messages for conflicting values between the dialect
+ # instance and the actual parameters provided.
+ conflict_msgs = []
+
+ # Don't warn if the default parameter was passed in,
+ # even if it conflicts with the dialect (gh-23761).
+ if provided != parser_default and provided != dialect_val:
+ msg = ("Conflicting values for '{param}': '{val}' was "
+ "provided, but the dialect specifies '{diaval}'. "
+ "Using the dialect-specified value.".format(
+ param=param, val=provided, diaval=dialect_val))
+
+ # Annoying corner case for not warning about
+ # conflicts between dialect and delimiter parameter.
+ # Refer to the outer "_read_" function for more info.
+ if not (param == "delimiter" and
+ kwds.pop("sep_override", False)):
+ conflict_msgs.append(msg)
+
+ if conflict_msgs:
+ warnings.warn('\n\n'.join(conflict_msgs), ParserWarning,
+ stacklevel=2)
+ kwds[param] = dialect_val
+
+ if kwds.get("skipfooter"):
+ if kwds.get("iterator") or kwds.get("chunksize"):
+ raise ValueError("'skipfooter' not supported for 'iteration'")
+ if kwds.get("nrows"):
+ raise ValueError("'skipfooter' not supported with 'nrows'")
+
+ if kwds.get('header', 'infer') == 'infer':
+ kwds['header'] = 0 if kwds.get('names') is None else None
+
+ self.orig_options = kwds
+
+ # miscellanea
+ self.engine = engine
+ self._engine = None
+ self._currow = 0
+
+ options = self._get_options_with_defaults(engine)
+
+ self.chunksize = options.pop('chunksize', None)
+ self.nrows = options.pop('nrows', None)
+ self.squeeze = options.pop('squeeze', False)
+
+ # might mutate self.engine
+ self.engine = self._check_file_or_buffer(f, engine)
+ self.options, self.engine = self._clean_options(options, engine)
+
+ if 'has_index_names' in kwds:
+ self.options['has_index_names'] = kwds['has_index_names']
+
+ self._make_engine(self.engine)
+
+ def close(self):
+ self._engine.close()
+
+ def _get_options_with_defaults(self, engine):
+ kwds = self.orig_options
+
+ options = {}
+
+ for argname, default in compat.iteritems(_parser_defaults):
+ value = kwds.get(argname, default)
+
+ # see gh-12935
+ if argname == 'mangle_dupe_cols' and not value:
+ raise ValueError('Setting mangle_dupe_cols=False is '
+ 'not supported yet')
+ else:
+ options[argname] = value
+
+ for argname, default in compat.iteritems(_c_parser_defaults):
+ if argname in kwds:
+ value = kwds[argname]
+
+ if engine != 'c' and value != default:
+ if ('python' in engine and
+ argname not in _python_unsupported):
+ pass
+ elif value == _deprecated_defaults.get(argname, default):
+ pass
+ else:
+ raise ValueError(
+ 'The %r option is not supported with the'
+ ' %r engine' % (argname, engine))
+ else:
+ value = _deprecated_defaults.get(argname, default)
+ options[argname] = value
+
+ if engine == 'python-fwf':
+ for argname, default in compat.iteritems(_fwf_defaults):
+ options[argname] = kwds.get(argname, default)
+
+ return options
+
+ def _check_file_or_buffer(self, f, engine):
+ # see gh-16530
+ if is_file_like(f):
+ next_attr = "__next__" if PY3 else "next"
+
+ # The C engine doesn't need the file-like to have the "next" or
+ # "__next__" attribute. However, the Python engine explicitly calls
+ # "next(...)" when iterating through such an object, meaning it
+ # needs to have that attribute ("next" for Python 2.x, "__next__"
+ # for Python 3.x)
+ if engine != "c" and not hasattr(f, next_attr):
+ msg = ("The 'python' engine cannot iterate "
+ "through this file buffer.")
+ raise ValueError(msg)
+
+ return engine
+
+ def _clean_options(self, options, engine):
+ result = options.copy()
+
+ engine_specified = self._engine_specified
+ fallback_reason = None
+
+ sep = options['delimiter']
+ delim_whitespace = options['delim_whitespace']
+
+ # C engine not supported yet
+ if engine == 'c':
+ if options['skipfooter'] > 0:
+ fallback_reason = ("the 'c' engine does not support"
+ " skipfooter")
+ engine = 'python'
+
+ encoding = sys.getfilesystemencoding() or 'utf-8'
+ if sep is None and not delim_whitespace:
+ if engine == 'c':
+ fallback_reason = ("the 'c' engine does not support"
+ " sep=None with delim_whitespace=False")
+ engine = 'python'
+ elif sep is not None and len(sep) > 1:
+ if engine == 'c' and sep == r'\s+':
+ result['delim_whitespace'] = True
+ del result['delimiter']
+ elif engine not in ('python', 'python-fwf'):
+ # wait until regex engine integrated
+ fallback_reason = ("the 'c' engine does not support"
+ " regex separators (separators > 1 char and"
+ r" different from '\s+' are"
+ " interpreted as regex)")
+ engine = 'python'
+ elif delim_whitespace:
+ if 'python' in engine:
+ result['delimiter'] = r'\s+'
+ elif sep is not None:
+ encodeable = True
+ try:
+ if len(sep.encode(encoding)) > 1:
+ encodeable = False
+ except UnicodeDecodeError:
+ encodeable = False
+ if not encodeable and engine not in ('python', 'python-fwf'):
+ fallback_reason = ("the separator encoded in {encoding}"
+ " is > 1 char long, and the 'c' engine"
+ " does not support such separators"
+ .format(encoding=encoding))
+ engine = 'python'
+
+ quotechar = options['quotechar']
+ if (quotechar is not None and
+ isinstance(quotechar, (str, compat.text_type, bytes))):
+ if (len(quotechar) == 1 and ord(quotechar) > 127 and
+ engine not in ('python', 'python-fwf')):
+ fallback_reason = ("ord(quotechar) > 127, meaning the "
+ "quotechar is larger than one byte, "
+ "and the 'c' engine does not support "
+ "such quotechars")
+ engine = 'python'
+
+ if fallback_reason and engine_specified:
+ raise ValueError(fallback_reason)
+
+ if engine == 'c':
+ for arg in _c_unsupported:
+ del result[arg]
+
+ if 'python' in engine:
+ for arg in _python_unsupported:
+ if fallback_reason and result[arg] != _c_parser_defaults[arg]:
+ msg = ("Falling back to the 'python' engine because"
+ " {reason}, but this causes {option!r} to be"
+ " ignored as it is not supported by the 'python'"
+ " engine.").format(reason=fallback_reason,
+ option=arg)
+ raise ValueError(msg)
+ del result[arg]
+
+ if fallback_reason:
+ warnings.warn(("Falling back to the 'python' engine because"
+ " {0}; you can avoid this warning by specifying"
+ " engine='python'.").format(fallback_reason),
+ ParserWarning, stacklevel=5)
+
+ index_col = options['index_col']
+ names = options['names']
+ converters = options['converters']
+ na_values = options['na_values']
+ skiprows = options['skiprows']
+
+ _validate_header_arg(options['header'])
+
+ depr_warning = ''
+
+ for arg in _deprecated_args:
+ parser_default = _c_parser_defaults[arg]
+ depr_default = _deprecated_defaults[arg]
+
+ msg = ("The '{arg}' argument has been deprecated "
+ "and will be removed in a future version."
+ .format(arg=arg))
+
+ if arg == 'tupleize_cols':
+ msg += (' Column tuples will then '
+ 'always be converted to MultiIndex.')
+
+ if result.get(arg, depr_default) != depr_default:
+ # raise Exception(result.get(arg, depr_default), depr_default)
+ depr_warning += msg + '\n\n'
+ else:
+ result[arg] = parser_default
+
+ if depr_warning != '':
+ warnings.warn(depr_warning, FutureWarning, stacklevel=2)
+
+ if index_col is True:
+ raise ValueError("The value of index_col couldn't be 'True'")
+ if _is_index_col(index_col):
+ if not isinstance(index_col, (list, tuple, np.ndarray)):
+ index_col = [index_col]
+ result['index_col'] = index_col
+
+ names = list(names) if names is not None else names
+
+ # type conversion-related
+ if converters is not None:
+ if not isinstance(converters, dict):
+ raise TypeError('Type converters must be a dict or'
+ ' subclass, input was '
+ 'a {0!r}'.format(type(converters).__name__))
+ else:
+ converters = {}
+
+ # Converting values to NA
+ keep_default_na = options['keep_default_na']
+ na_values, na_fvalues = _clean_na_values(na_values, keep_default_na)
+
+ # handle skiprows; this is internally handled by the
+ # c-engine, so only need for python parsers
+ if engine != 'c':
+ if is_integer(skiprows):
+ skiprows = lrange(skiprows)
+ if skiprows is None:
+ skiprows = set()
+ elif not callable(skiprows):
+ skiprows = set(skiprows)
+
+ # put stuff back
+ result['names'] = names
+ result['converters'] = converters
+ result['na_values'] = na_values
+ result['na_fvalues'] = na_fvalues
+ result['skiprows'] = skiprows
+
+ return result, engine
+
+ def __next__(self):
+ try:
+ return self.get_chunk()
+ except StopIteration:
+ self.close()
+ raise
+
+ def _make_engine(self, engine='c'):
+ if engine == 'c':
+ self._engine = CParserWrapper(self.f, **self.options)
+ else:
+ if engine == 'python':
+ klass = PythonParser
+ elif engine == 'python-fwf':
+ klass = FixedWidthFieldParser
+ else:
+ raise ValueError('Unknown engine: {engine} (valid options are'
+ ' "c", "python", or "python-fwf")'.format(
+ engine=engine))
+ self._engine = klass(self.f, **self.options)
+
+ def _failover_to_python(self):
+ raise AbstractMethodError(self)
+
+ def read(self, nrows=None):
+ nrows = _validate_integer('nrows', nrows)
+ ret = self._engine.read(nrows)
+
+ # May alter columns / col_dict
+ index, columns, col_dict = self._create_index(ret)
+
+ if index is None:
+ if col_dict:
+ # Any column is actually fine:
+ new_rows = len(compat.next(compat.itervalues(col_dict)))
+ index = RangeIndex(self._currow, self._currow + new_rows)
+ else:
+ new_rows = 0
+ else:
+ new_rows = len(index)
+
+ df = DataFrame(col_dict, columns=columns, index=index)
+
+ self._currow += new_rows
+
+ if self.squeeze and len(df.columns) == 1:
+ return df[df.columns[0]].copy()
+ return df
+
+ def _create_index(self, ret):
+ index, columns, col_dict = ret
+ return index, columns, col_dict
+
+ def get_chunk(self, size=None):
+ if size is None:
+ size = self.chunksize
+ if self.nrows is not None:
+ if self._currow >= self.nrows:
+ raise StopIteration
+ size = min(size, self.nrows - self._currow)
+ return self.read(nrows=size)
+
+
+def _is_index_col(col):
+ return col is not None and col is not False
+
+
+def _is_potential_multi_index(columns):
+ """
+ Check whether or not the `columns` parameter
+ could be converted into a MultiIndex.
+
+ Parameters
+ ----------
+ columns : array-like
+ Object which may or may not be convertible into a MultiIndex
+
+ Returns
+ -------
+ boolean : Whether or not columns could become a MultiIndex
+ """
+ return (len(columns) and not isinstance(columns, MultiIndex) and
+ all(isinstance(c, tuple) for c in columns))
+
+
+def _evaluate_usecols(usecols, names):
+ """
+ Check whether or not the 'usecols' parameter
+ is a callable. If so, enumerates the 'names'
+ parameter and returns a set of indices for
+ each entry in 'names' that evaluates to True.
+ If not a callable, returns 'usecols'.
+ """
+ if callable(usecols):
+ return {i for i, name in enumerate(names) if usecols(name)}
+ return usecols
+
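+# Behaviour sketch for the helper above (illustrative):
+#
+#   _evaluate_usecols(lambda name: name.startswith('f'), ['foo', 'bar', 'faz'])
+#   # -> {0, 2}
+#   _evaluate_usecols(['foo', 'bar'], ['foo', 'bar', 'faz'])
+#   # -> ['foo', 'bar']  (non-callables pass through unchanged)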
+
+def _validate_usecols_names(usecols, names):
+ """
+ Validates that all usecols are present in a given
+ list of names. If not, raise a ValueError that
+ shows what usecols are missing.
+
+ Parameters
+ ----------
+ usecols : iterable of usecols
+ The columns to validate are present in names.
+ names : iterable of names
+ The column names to check against.
+
+ Returns
+ -------
+ usecols : iterable of usecols
+ The `usecols` parameter if the validation succeeds.
+
+ Raises
+ ------
+ ValueError : Columns were missing. Error message will list them.
+ """
+ missing = [c for c in usecols if c not in names]
+ if len(missing) > 0:
+ raise ValueError(
+ "Usecols do not match columns, "
+ "columns expected but not found: {missing}".format(missing=missing)
+ )
+
+ return usecols
+
+
+def _validate_skipfooter_arg(skipfooter):
+ """
+ Validate the 'skipfooter' parameter.
+
+ Checks whether 'skipfooter' is a non-negative integer.
+ Raises a ValueError if that is not the case.
+
+ Parameters
+ ----------
+ skipfooter : non-negative integer
+ The number of rows to skip at the end of the file.
+
+ Returns
+ -------
+ validated_skipfooter : non-negative integer
+ The original input if the validation succeeds.
+
+ Raises
+ ------
+ ValueError : 'skipfooter' was not a non-negative integer.
+ """
+
+ if not is_integer(skipfooter):
+ raise ValueError("skipfooter must be an integer")
+
+ if skipfooter < 0:
+ raise ValueError("skipfooter cannot be negative")
+
+ return skipfooter
+
+
+def _validate_usecols_arg(usecols):
+ """
+ Validate the 'usecols' parameter.
+
+ Checks whether or not the 'usecols' parameter contains all integers
+ (column selection by index), strings (column by name) or is a callable.
+ Raises a ValueError if that is not the case.
+
+ Parameters
+ ----------
+ usecols : list-like, callable, or None
+ List of columns to use when parsing or a callable that can be used
+ to filter a list of table columns.
+
+ Returns
+ -------
+ usecols_tuple : tuple
+ A tuple of (verified_usecols, usecols_dtype).
+
+ 'verified_usecols' is either a set if an array-like is passed in or
+ 'usecols' if a callable or None is passed in.
+
+ 'usecols_dtype` is the inferred dtype of 'usecols' if an array-like
+ is passed in or None if a callable or None is passed in.
+ """
+ msg = ("'usecols' must either be list-like of all strings, all unicode, "
+ "all integers or a callable.")
+ if usecols is not None:
+ if callable(usecols):
+ return usecols, None
+
+ if not is_list_like(usecols):
+ # see gh-20529
+ #
+ # Ensure it is iterable container but not string.
+ raise ValueError(msg)
+
+ usecols_dtype = lib.infer_dtype(usecols, skipna=False)
+
+ if usecols_dtype not in ("empty", "integer",
+ "string", "unicode"):
+ raise ValueError(msg)
+
+ usecols = set(usecols)
+
+ if usecols_dtype == "unicode":
+ # see gh-13253
+ #
+ # Python 2.x compatibility
+ usecols = {col.encode("utf-8") for col in usecols}
+
+ return usecols, usecols_dtype
+ return usecols, None
+
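+# Behaviour sketch for the validator above (illustrative):
+#
+#   _validate_usecols_arg([0, 1, 2])          # -> ({0, 1, 2}, 'integer')
+#   _validate_usecols_arg(['a', 'b'])         # -> ({'a', 'b'}, 'string')
+#   _validate_usecols_arg(lambda c: True)     # -> (<callable>, None)
+#   _validate_usecols_arg([0, 'a'])           # -> ValueError (mixed types)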
+
+def _validate_parse_dates_arg(parse_dates):
+ """
+ Check whether or not the 'parse_dates' parameter
+ is a non-boolean scalar. Raises a ValueError if
+ that is the case.
+ """
+ msg = ("Only booleans, lists, and "
+ "dictionaries are accepted "
+ "for the 'parse_dates' parameter")
+
+ if parse_dates is not None:
+ if is_scalar(parse_dates):
+ if not lib.is_bool(parse_dates):
+ raise TypeError(msg)
+
+ elif not isinstance(parse_dates, (list, dict)):
+ raise TypeError(msg)
+
+ return parse_dates
+
+
+class ParserBase(object):
+
+ def __init__(self, kwds):
+ self.names = kwds.get('names')
+ self.orig_names = None
+ self.prefix = kwds.pop('prefix', None)
+
+ self.index_col = kwds.get('index_col', None)
+ self.unnamed_cols = set()
+ self.index_names = None
+ self.col_names = None
+
+ self.parse_dates = _validate_parse_dates_arg(
+ kwds.pop('parse_dates', False))
+ self.date_parser = kwds.pop('date_parser', None)
+ self.dayfirst = kwds.pop('dayfirst', False)
+ self.keep_date_col = kwds.pop('keep_date_col', False)
+
+ self.na_values = kwds.get('na_values')
+ self.na_fvalues = kwds.get('na_fvalues')
+ self.na_filter = kwds.get('na_filter', False)
+ self.keep_default_na = kwds.get('keep_default_na', True)
+
+ self.true_values = kwds.get('true_values')
+ self.false_values = kwds.get('false_values')
+ self.tupleize_cols = kwds.get('tupleize_cols', False)
+ self.mangle_dupe_cols = kwds.get('mangle_dupe_cols', True)
+ self.infer_datetime_format = kwds.pop('infer_datetime_format', False)
+
+ self._date_conv = _make_date_converter(
+ date_parser=self.date_parser,
+ dayfirst=self.dayfirst,
+ infer_datetime_format=self.infer_datetime_format
+ )
+
+ # validate header options for mi
+ self.header = kwds.get('header')
+ if isinstance(self.header, (list, tuple, np.ndarray)):
+ if not all(map(is_integer, self.header)):
+ raise ValueError("header must be integer or list of integers")
+ if kwds.get('usecols'):
+ raise ValueError("cannot specify usecols when "
+ "specifying a multi-index header")
+ if kwds.get('names'):
+ raise ValueError("cannot specify names when "
+ "specifying a multi-index header")
+
+ # validate index_col that only contains integers
+ if self.index_col is not None:
+ is_sequence = isinstance(self.index_col, (list, tuple,
+ np.ndarray))
+ if not (is_sequence and
+ all(map(is_integer, self.index_col)) or
+ is_integer(self.index_col)):
+ raise ValueError("index_col must only contain row numbers "
+ "when specifying a multi-index header")
+
+ # GH 16338
+ elif self.header is not None and not is_integer(self.header):
+ raise ValueError("header must be integer or list of integers")
+
+ self._name_processed = False
+
+ self._first_chunk = True
+
+ # GH 13932
+ # keep references to file handles opened by the parser itself
+ self.handles = []
+
+ def close(self):
+ for f in self.handles:
+ f.close()
+
+ @property
+ def _has_complex_date_col(self):
+ return (isinstance(self.parse_dates, dict) or
+ (isinstance(self.parse_dates, list) and
+ len(self.parse_dates) > 0 and
+ isinstance(self.parse_dates[0], list)))
+
+ def _should_parse_dates(self, i):
+ if isinstance(self.parse_dates, bool):
+ return self.parse_dates
+ else:
+ if self.index_names is not None:
+ name = self.index_names[i]
+ else:
+ name = None
+ j = self.index_col[i]
+
+ if is_scalar(self.parse_dates):
+ return ((j == self.parse_dates) or
+ (name is not None and name == self.parse_dates))
+ else:
+ return ((j in self.parse_dates) or
+ (name is not None and name in self.parse_dates))
+
+ def _extract_multi_indexer_columns(self, header, index_names, col_names,
+ passed_names=False):
+ """ extract and return the names, index_names, col_names
+ header is a list-of-lists returned from the parsers """
+ if len(header) < 2:
+ return header[0], index_names, col_names, passed_names
+
+ # the names are the tuples of the header that are not the index cols
+ # 0 is the name of the index, assuming index_col is a list of column
+ # numbers
+ ic = self.index_col
+ if ic is None:
+ ic = []
+
+ if not isinstance(ic, (list, tuple, np.ndarray)):
+ ic = [ic]
+ sic = set(ic)
+
+ # clean the index_names
+ index_names = header.pop(-1)
+ index_names, names, index_col = _clean_index_names(index_names,
+ self.index_col,
+ self.unnamed_cols)
+
+ # extract the columns
+ field_count = len(header[0])
+
+ def extract(r):
+ return tuple(r[i] for i in range(field_count) if i not in sic)
+
+ columns = lzip(*[extract(r) for r in header])
+ names = ic + columns
+
+ # If we find unnamed columns all in a single
+ # level, then our header was too long.
+ for n in range(len(columns[0])):
+ if all(compat.to_str(c[n]) in self.unnamed_cols for c in columns):
+ raise ParserError(
+ "Passed header=[%s] are too many rows for this "
+ "multi_index of columns"
+ % ','.join(str(x) for x in self.header)
+ )
+
+ # Clean the column names (if we have an index_col).
+ if len(ic):
+ col_names = [r[0] if (len(r[0]) and
+ r[0] not in self.unnamed_cols) else None
+ for r in header]
+ else:
+ col_names = [None] * len(header)
+
+ passed_names = True
+
+ return names, index_names, col_names, passed_names
+
+ def _maybe_dedup_names(self, names):
+ # see gh-7160 and gh-9424: this helps to provide
+ # immediate alleviation of the duplicate names
+ # issue and appears to be satisfactory to users,
+ # but ultimately, not needing to butcher the names
+ # would be nice!
+ if self.mangle_dupe_cols:
+ names = list(names) # so we can index
+ counts = defaultdict(int)
+ is_potential_mi = _is_potential_multi_index(names)
+
+ for i, col in enumerate(names):
+ cur_count = counts[col]
+
+ while cur_count > 0:
+ counts[col] = cur_count + 1
+
+ if is_potential_mi:
+ col = col[:-1] + ('%s.%d' % (col[-1], cur_count),)
+ else:
+ col = '%s.%d' % (col, cur_count)
+ cur_count = counts[col]
+
+ names[i] = col
+ counts[col] = cur_count + 1
+
+ return names
+
+ def _maybe_make_multi_index_columns(self, columns, col_names=None):
+ # possibly create a column mi here
+ if _is_potential_multi_index(columns):
+ columns = MultiIndex.from_tuples(columns, names=col_names)
+ return columns
+
+ def _make_index(self, data, alldata, columns, indexnamerow=False):
+ if not _is_index_col(self.index_col) or not self.index_col:
+ index = None
+
+ elif not self._has_complex_date_col:
+ index = self._get_simple_index(alldata, columns)
+ index = self._agg_index(index)
+ elif self._has_complex_date_col:
+ if not self._name_processed:
+ (self.index_names, _,
+ self.index_col) = _clean_index_names(list(columns),
+ self.index_col,
+ self.unnamed_cols)
+ self._name_processed = True
+ index = self._get_complex_date_index(data, columns)
+ index = self._agg_index(index, try_parse_dates=False)
+
+ # add names for the index
+ if indexnamerow:
+ coffset = len(indexnamerow) - len(columns)
+ index = index.set_names(indexnamerow[:coffset])
+
+ # maybe create a mi on the columns
+ columns = self._maybe_make_multi_index_columns(columns, self.col_names)
+
+ return index, columns
+
+ _implicit_index = False
+
+ def _get_simple_index(self, data, columns):
+ def ix(col):
+ if not isinstance(col, compat.string_types):
+ return col
+ raise ValueError('Index %s invalid' % col)
+
+ to_remove = []
+ index = []
+ for idx in self.index_col:
+ i = ix(idx)
+ to_remove.append(i)
+ index.append(data[i])
+
+ # remove index items from content and columns, don't pop in
+ # loop
+ for i in reversed(sorted(to_remove)):
+ data.pop(i)
+ if not self._implicit_index:
+ columns.pop(i)
+
+ return index
+
+ def _get_complex_date_index(self, data, col_names):
+ def _get_name(icol):
+ if isinstance(icol, compat.string_types):
+ return icol
+
+ if col_names is None:
+ raise ValueError(('Must supply column order to use %s as '
+ 'index') % str(icol))
+
+ for i, c in enumerate(col_names):
+ if i == icol:
+ return c
+
+ to_remove = []
+ index = []
+ for idx in self.index_col:
+ name = _get_name(idx)
+ to_remove.append(name)
+ index.append(data[name])
+
+ # remove index items from content and columns, don't pop in
+ # loop
+ for c in reversed(sorted(to_remove)):
+ data.pop(c)
+ col_names.remove(c)
+
+ return index
+
+ def _agg_index(self, index, try_parse_dates=True):
+ arrays = []
+
+ for i, arr in enumerate(index):
+
+ if try_parse_dates and self._should_parse_dates(i):
+ arr = self._date_conv(arr)
+
+ if self.na_filter:
+ col_na_values = self.na_values
+ col_na_fvalues = self.na_fvalues
+ else:
+ col_na_values = set()
+ col_na_fvalues = set()
+
+ if isinstance(self.na_values, dict):
+ col_name = self.index_names[i]
+ if col_name is not None:
+ col_na_values, col_na_fvalues = _get_na_values(
+ col_name, self.na_values, self.na_fvalues,
+ self.keep_default_na)
+
+ arr, _ = self._infer_types(arr, col_na_values | col_na_fvalues)
+ arrays.append(arr)
+
+ names = self.index_names
+ index = ensure_index_from_sequences(arrays, names)
+
+ return index
+
+ def _convert_to_ndarrays(self, dct, na_values, na_fvalues, verbose=False,
+ converters=None, dtypes=None):
+ result = {}
+ for c, values in compat.iteritems(dct):
+ conv_f = None if converters is None else converters.get(c, None)
+ if isinstance(dtypes, dict):
+ cast_type = dtypes.get(c, None)
+ else:
+ # single dtype or None
+ cast_type = dtypes
+
+ if self.na_filter:
+ col_na_values, col_na_fvalues = _get_na_values(
+ c, na_values, na_fvalues, self.keep_default_na)
+ else:
+ col_na_values, col_na_fvalues = set(), set()
+
+ if conv_f is not None:
+ # conv_f applied to data before inference
+ if cast_type is not None:
+ warnings.warn(("Both a converter and dtype were specified "
+ "for column {0} - only the converter will "
+ "be used").format(c), ParserWarning,
+ stacklevel=7)
+
+ try:
+ values = lib.map_infer(values, conv_f)
+ except ValueError:
+ mask = algorithms.isin(
+ values, list(na_values)).view(np.uint8)
+ values = lib.map_infer_mask(values, conv_f, mask)
+
+ cvals, na_count = self._infer_types(
+ values, set(col_na_values) | col_na_fvalues,
+ try_num_bool=False)
+ else:
+ is_str_or_ea_dtype = (is_string_dtype(cast_type)
+ or is_extension_array_dtype(cast_type))
+ # skip inference if specified dtype is object
+ # or casting to an EA
+ try_num_bool = not (cast_type and is_str_or_ea_dtype)
+
+ # general type inference and conversion
+ cvals, na_count = self._infer_types(
+ values, set(col_na_values) | col_na_fvalues,
+ try_num_bool)
+
+ # type specified in dtype param or cast_type is an EA
+ if cast_type and (not is_dtype_equal(cvals, cast_type)
+ or is_extension_array_dtype(cast_type)):
+ try:
+ if (is_bool_dtype(cast_type) and
+ not is_categorical_dtype(cast_type)
+ and na_count > 0):
+ raise ValueError("Bool column has NA values in "
+ "column {column}"
+ .format(column=c))
+ except (AttributeError, TypeError):
+ # invalid input to is_bool_dtype
+ pass
+ cvals = self._cast_types(cvals, cast_type, c)
+
+ result[c] = cvals
+ if verbose and na_count:
+ print('Filled %d NA values in column %s' % (na_count, str(c)))
+ return result
+
+ def _infer_types(self, values, na_values, try_num_bool=True):
+ """
+ Infer types of values, possibly casting
+
+ Parameters
+ ----------
+ values : ndarray
+ na_values : set
+        try_num_bool : bool, default True
+            Try to cast values to numeric (first preference) or boolean.
+
+        Returns
+        -------
+ converted : ndarray
+ na_count : int
+ """
+ na_count = 0
+ if issubclass(values.dtype.type, (np.number, np.bool_)):
+ mask = algorithms.isin(values, list(na_values))
+ na_count = mask.sum()
+ if na_count > 0:
+ if is_integer_dtype(values):
+ values = values.astype(np.float64)
+ np.putmask(values, mask, np.nan)
+ return values, na_count
+
+ if try_num_bool:
+ try:
+ result = lib.maybe_convert_numeric(values, na_values, False)
+ na_count = isna(result).sum()
+ except Exception:
+ result = values
+ if values.dtype == np.object_:
+ na_count = parsers.sanitize_objects(result,
+ na_values, False)
+ else:
+ result = values
+ if values.dtype == np.object_:
+ na_count = parsers.sanitize_objects(values, na_values, False)
+
+ if result.dtype == np.object_ and try_num_bool:
+ result = libops.maybe_convert_bool(np.asarray(values),
+ true_values=self.true_values,
+ false_values=self.false_values)
+
+ return result, na_count
+
+ def _cast_types(self, values, cast_type, column):
+ """
+ Cast values to specified type
+
+ Parameters
+ ----------
+ values : ndarray
+ cast_type : string or np.dtype
+ dtype to cast values to
+ column : string
+ column name - used only for error reporting
+
+ Returns
+ -------
+ converted : ndarray
+ """
+
+ if is_categorical_dtype(cast_type):
+ known_cats = (isinstance(cast_type, CategoricalDtype) and
+ cast_type.categories is not None)
+
+ if not is_object_dtype(values) and not known_cats:
+ # XXX this is for consistency with
+ # c-parser which parses all categories
+ # as strings
+ values = astype_nansafe(values, str)
+
+ cats = Index(values).unique().dropna()
+ values = Categorical._from_inferred_categories(
+ cats, cats.get_indexer(values), cast_type,
+ true_values=self.true_values)
+
+ # use the EA's implementation of casting
+ elif is_extension_array_dtype(cast_type):
+ # ensure cast_type is an actual dtype and not a string
+ cast_type = pandas_dtype(cast_type)
+ array_type = cast_type.construct_array_type()
+ try:
+ return array_type._from_sequence_of_strings(values,
+ dtype=cast_type)
+ except NotImplementedError:
+ raise NotImplementedError(
+ "Extension Array: {ea} must implement "
+ "_from_sequence_of_strings in order "
+ "to be used in parser methods".format(ea=array_type))
+
+ else:
+ try:
+ values = astype_nansafe(values, cast_type,
+ copy=True, skipna=True)
+ except ValueError:
+ raise ValueError("Unable to convert column %s to "
+ "type %s" % (column, cast_type))
+ return values
+
+ def _do_date_conversions(self, names, data):
+ # returns data, columns
+
+ if self.parse_dates is not None:
+ data, names = _process_date_conversion(
+ data, self._date_conv, self.parse_dates, self.index_col,
+ self.index_names, names, keep_date_col=self.keep_date_col)
+
+ return names, data
+
+
+class CParserWrapper(ParserBase):
+ """
+
+ """
+
+ def __init__(self, src, **kwds):
+ self.kwds = kwds
+ kwds = kwds.copy()
+
+ ParserBase.__init__(self, kwds)
+
+ if (kwds.get('compression') is None
+ and 'utf-16' in (kwds.get('encoding') or '')):
+ # if source is utf-16 plain text, convert source to utf-8
+ if isinstance(src, compat.string_types):
+ src = open(src, 'rb')
+ self.handles.append(src)
+ src = UTF8Recoder(src, kwds['encoding'])
+ kwds['encoding'] = 'utf-8'
+
+ # #2442
+ kwds['allow_leading_cols'] = self.index_col is not False
+
+ # GH20529, validate usecol arg before TextReader
+ self.usecols, self.usecols_dtype = _validate_usecols_arg(
+ kwds['usecols'])
+ kwds['usecols'] = self.usecols
+
+ self._reader = parsers.TextReader(src, **kwds)
+ self.unnamed_cols = self._reader.unnamed_cols
+
+ passed_names = self.names is None
+
+ if self._reader.header is None:
+ self.names = None
+ else:
+ if len(self._reader.header) > 1:
+ # we have a multi index in the columns
+ self.names, self.index_names, self.col_names, passed_names = (
+ self._extract_multi_indexer_columns(
+ self._reader.header, self.index_names, self.col_names,
+ passed_names
+ )
+ )
+ else:
+ self.names = list(self._reader.header[0])
+
+ if self.names is None:
+ if self.prefix:
+ self.names = ['%s%d' % (self.prefix, i)
+ for i in range(self._reader.table_width)]
+ else:
+ self.names = lrange(self._reader.table_width)
+
+ # gh-9755
+ #
+ # need to set orig_names here first
+ # so that proper indexing can be done
+ # with _set_noconvert_columns
+ #
+ # once names has been filtered, we will
+ # then set orig_names again to names
+ self.orig_names = self.names[:]
+
+ if self.usecols:
+ usecols = _evaluate_usecols(self.usecols, self.orig_names)
+
+ # GH 14671
+ if (self.usecols_dtype == 'string' and
+ not set(usecols).issubset(self.orig_names)):
+ _validate_usecols_names(usecols, self.orig_names)
+
+ if len(self.names) > len(usecols):
+ self.names = [n for i, n in enumerate(self.names)
+ if (i in usecols or n in usecols)]
+
+ if len(self.names) < len(usecols):
+ _validate_usecols_names(usecols, self.names)
+
+ self._set_noconvert_columns()
+
+ self.orig_names = self.names
+
+ if not self._has_complex_date_col:
+ if (self._reader.leading_cols == 0 and
+ _is_index_col(self.index_col)):
+
+ self._name_processed = True
+ (index_names, self.names,
+ self.index_col) = _clean_index_names(self.names,
+ self.index_col,
+ self.unnamed_cols)
+
+ if self.index_names is None:
+ self.index_names = index_names
+
+ if self._reader.header is None and not passed_names:
+ self.index_names = [None] * len(self.index_names)
+
+ self._implicit_index = self._reader.leading_cols > 0
+
+ def close(self):
+ for f in self.handles:
+ f.close()
+
+ # close additional handles opened by C parser (for compression)
+ try:
+ self._reader.close()
+ except ValueError:
+ pass
+
+ def _set_noconvert_columns(self):
+ """
+ Set the columns that should not undergo dtype conversions.
+
+ Currently, any column that is involved with date parsing will not
+ undergo such conversions.
+ """
+ names = self.orig_names
+ if self.usecols_dtype == 'integer':
+ # A set of integers will be converted to a list in
+ # the correct order every single time.
+ usecols = list(self.usecols)
+ usecols.sort()
+ elif (callable(self.usecols) or
+ self.usecols_dtype not in ('empty', None)):
+ # The names attribute should have the correct columns
+ # in the proper order for indexing with parse_dates.
+ usecols = self.names[:]
+ else:
+ # Usecols is empty.
+ usecols = None
+
+ def _set(x):
+ if usecols is not None and is_integer(x):
+ x = usecols[x]
+
+ if not is_integer(x):
+ x = names.index(x)
+
+ self._reader.set_noconvert(x)
+
+ if isinstance(self.parse_dates, list):
+ for val in self.parse_dates:
+ if isinstance(val, list):
+ for k in val:
+ _set(k)
+ else:
+ _set(val)
+
+ elif isinstance(self.parse_dates, dict):
+ for val in self.parse_dates.values():
+ if isinstance(val, list):
+ for k in val:
+ _set(k)
+ else:
+ _set(val)
+
+ elif self.parse_dates:
+ if isinstance(self.index_col, list):
+ for k in self.index_col:
+ _set(k)
+ elif self.index_col is not None:
+ _set(self.index_col)
+
+ def set_error_bad_lines(self, status):
+ self._reader.set_error_bad_lines(int(status))
+
+ def read(self, nrows=None):
+ try:
+ data = self._reader.read(nrows)
+ except StopIteration:
+ if self._first_chunk:
+ self._first_chunk = False
+ names = self._maybe_dedup_names(self.orig_names)
+ index, columns, col_dict = _get_empty_meta(
+ names, self.index_col, self.index_names,
+ dtype=self.kwds.get('dtype'))
+ columns = self._maybe_make_multi_index_columns(
+ columns, self.col_names)
+
+ if self.usecols is not None:
+ columns = self._filter_usecols(columns)
+
+ col_dict = dict(filter(lambda item: item[0] in columns,
+ col_dict.items()))
+
+ return index, columns, col_dict
+
+ else:
+ raise
+
+ # Done with first read, next time raise StopIteration
+ self._first_chunk = False
+
+ names = self.names
+
+ if self._reader.leading_cols:
+ if self._has_complex_date_col:
+ raise NotImplementedError('file structure not yet supported')
+
+ # implicit index, no index names
+ arrays = []
+
+ for i in range(self._reader.leading_cols):
+ if self.index_col is None:
+ values = data.pop(i)
+ else:
+ values = data.pop(self.index_col[i])
+
+ values = self._maybe_parse_dates(values, i,
+ try_parse_dates=True)
+ arrays.append(values)
+
+ index = ensure_index_from_sequences(arrays)
+
+ if self.usecols is not None:
+ names = self._filter_usecols(names)
+
+ names = self._maybe_dedup_names(names)
+
+ # rename dict keys
+ data = sorted(data.items())
+ data = {k: v for k, (i, v) in zip(names, data)}
+
+ names, data = self._do_date_conversions(names, data)
+
+ else:
+ # rename dict keys
+ data = sorted(data.items())
+
+ # ugh, mutation
+ names = list(self.orig_names)
+ names = self._maybe_dedup_names(names)
+
+ if self.usecols is not None:
+ names = self._filter_usecols(names)
+
+ # columns as list
+ alldata = [x[1] for x in data]
+
+ data = {k: v for k, (i, v) in zip(names, data)}
+
+ names, data = self._do_date_conversions(names, data)
+ index, names = self._make_index(data, alldata, names)
+
+ # maybe create a mi on the columns
+ names = self._maybe_make_multi_index_columns(names, self.col_names)
+
+ return index, names, data
+
+ def _filter_usecols(self, names):
+ # hackish
+ usecols = _evaluate_usecols(self.usecols, names)
+ if usecols is not None and len(names) != len(usecols):
+ names = [name for i, name in enumerate(names)
+ if i in usecols or name in usecols]
+ return names
+
+ def _get_index_names(self):
+ names = list(self._reader.header[0])
+ idx_names = None
+
+ if self._reader.leading_cols == 0 and self.index_col is not None:
+ (idx_names, names,
+ self.index_col) = _clean_index_names(names, self.index_col,
+ self.unnamed_cols)
+
+ return names, idx_names
+
+ def _maybe_parse_dates(self, values, index, try_parse_dates=True):
+ if try_parse_dates and self._should_parse_dates(index):
+ values = self._date_conv(values)
+ return values
+
+
+def TextParser(*args, **kwds):
+ """
+ Converts lists of lists/tuples into DataFrames with proper type inference
+ and optional (e.g. string to datetime) conversion. Also enables iterating
+    lazily over chunks of large files.
+
+ Parameters
+ ----------
+ data : file-like object or list
+ delimiter : separator character to use
+ dialect : str or csv.Dialect instance, optional
+ Ignored if delimiter is longer than 1 character
+ names : sequence, default
+ header : int, default 0
+ Row to use to parse column labels. Defaults to the first row. Prior
+ rows will be discarded
+ index_col : int or list, optional
+ Column or columns to use as the (possibly hierarchical) index
+ has_index_names: bool, default False
+ True if the cols defined in index_col have an index name and are
+ not in the header.
+ na_values : scalar, str, list-like, or dict, optional
+ Additional strings to recognize as NA/NaN.
+ keep_default_na : bool, default True
+ thousands : str, optional
+ Thousands separator
+ comment : str, optional
+ Comment out remainder of line
+ parse_dates : bool, default False
+ keep_date_col : bool, default False
+ date_parser : function, optional
+ skiprows : list of integers
+ Row numbers to skip
+ skipfooter : int
+        Number of lines at the bottom of the file to skip
+ converters : dict, optional
+ Dict of functions for converting values in certain columns. Keys can
+ either be integers or column labels, values are functions that take one
+ input argument, the cell (not column) content, and return the
+ transformed content.
+ encoding : str, optional
+ Encoding to use for UTF when reading/writing (ex. 'utf-8')
+ squeeze : bool, default False
+        Return a Series if the parsed data contains only one column.
+ infer_datetime_format: bool, default False
+ If True and `parse_dates` is True for a column, try to infer the
+ datetime format based on the first datetime string. If the format
+ can be inferred, there often will be a large parsing speed-up.
+ float_precision : str, optional
+ Specifies which converter the C engine should use for floating-point
+ values. The options are None for the ordinary converter,
+ 'high' for the high-precision converter, and 'round_trip' for the
+ round-trip converter.
+ """
+ kwds['engine'] = 'python'
+ return TextFileReader(*args, **kwds)
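+
+
+# A minimal usage sketch (illustrative, with made-up data): pre-split rows
+# can be fed directly, and the first row is treated as the header by default:
+#
+#     rows = [['a', 'b'], ['1', '2'], ['3', '4']]
+#     frame = TextParser(rows).read()   # DataFrame with columns 'a' and 'b'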
+
+
+def count_empty_vals(vals):
+ return sum(1 for v in vals if v == '' or v is None)
+
+
+class PythonParser(ParserBase):
+
+ def __init__(self, f, **kwds):
+ """
+        Workhorse function for processing a nested list into a DataFrame
+
+ Should be replaced by np.genfromtxt eventually?
+ """
+ ParserBase.__init__(self, kwds)
+
+ self.data = None
+ self.buf = []
+ self.pos = 0
+ self.line_pos = 0
+
+ self.encoding = kwds['encoding']
+ self.compression = kwds['compression']
+ self.memory_map = kwds['memory_map']
+ self.skiprows = kwds['skiprows']
+
+ if callable(self.skiprows):
+ self.skipfunc = self.skiprows
+ else:
+ self.skipfunc = lambda x: x in self.skiprows
+
+ self.skipfooter = _validate_skipfooter_arg(kwds['skipfooter'])
+ self.delimiter = kwds['delimiter']
+
+ self.quotechar = kwds['quotechar']
+ if isinstance(self.quotechar, compat.text_type):
+ self.quotechar = str(self.quotechar)
+
+ self.escapechar = kwds['escapechar']
+ self.doublequote = kwds['doublequote']
+ self.skipinitialspace = kwds['skipinitialspace']
+ self.lineterminator = kwds['lineterminator']
+ self.quoting = kwds['quoting']
+ self.usecols, _ = _validate_usecols_arg(kwds['usecols'])
+ self.skip_blank_lines = kwds['skip_blank_lines']
+
+ self.warn_bad_lines = kwds['warn_bad_lines']
+ self.error_bad_lines = kwds['error_bad_lines']
+
+ self.names_passed = kwds['names'] or None
+
+ self.has_index_names = False
+ if 'has_index_names' in kwds:
+ self.has_index_names = kwds['has_index_names']
+
+ self.verbose = kwds['verbose']
+ self.converters = kwds['converters']
+
+ self.dtype = kwds['dtype']
+ self.thousands = kwds['thousands']
+ self.decimal = kwds['decimal']
+
+ self.comment = kwds['comment']
+ self._comment_lines = []
+
+ mode = 'r' if PY3 else 'rb'
+ f, handles = _get_handle(f, mode, encoding=self.encoding,
+ compression=self.compression,
+ memory_map=self.memory_map)
+ self.handles.extend(handles)
+
+ # Set self.data to something that can read lines.
+ if hasattr(f, 'readline'):
+ self._make_reader(f)
+ else:
+ self.data = f
+
+ # Get columns in two steps: infer from data, then
+ # infer column indices from self.usecols if it is specified.
+ self._col_indices = None
+ (self.columns, self.num_original_columns,
+ self.unnamed_cols) = self._infer_columns()
+
+ # Now self.columns has the set of columns that we will process.
+ # The original set is stored in self.original_columns.
+ if len(self.columns) > 1:
+ # we are processing a multi index column
+ self.columns, self.index_names, self.col_names, _ = (
+ self._extract_multi_indexer_columns(
+ self.columns, self.index_names, self.col_names
+ )
+ )
+ # Update list of original names to include all indices.
+ self.num_original_columns = len(self.columns)
+ else:
+ self.columns = self.columns[0]
+
+ # get popped off for index
+ self.orig_names = list(self.columns)
+
+ # needs to be cleaned/refactored
+ # multiple date column thing turning into a real spaghetti factory
+
+ if not self._has_complex_date_col:
+ (index_names, self.orig_names, self.columns) = (
+ self._get_index_name(self.columns))
+ self._name_processed = True
+ if self.index_names is None:
+ self.index_names = index_names
+
+ if self.parse_dates:
+ self._no_thousands_columns = self._set_no_thousands_columns()
+ else:
+ self._no_thousands_columns = None
+
+ if len(self.decimal) != 1:
+ raise ValueError('Only length-1 decimal markers supported')
+
+ if self.thousands is None:
+ self.nonnum = re.compile('[^-^0-9^%s]+' % self.decimal)
+ else:
+ self.nonnum = re.compile('[^-^0-9^%s^%s]+' % (self.thousands,
+ self.decimal))
+
+ def _set_no_thousands_columns(self):
+ # Create a set of column ids that are not to be stripped of thousands
+        # separators.
+ noconvert_columns = set()
+
+ def _set(x):
+ if is_integer(x):
+ noconvert_columns.add(x)
+ else:
+ noconvert_columns.add(self.columns.index(x))
+
+ if isinstance(self.parse_dates, list):
+ for val in self.parse_dates:
+ if isinstance(val, list):
+ for k in val:
+ _set(k)
+ else:
+ _set(val)
+
+ elif isinstance(self.parse_dates, dict):
+ for val in self.parse_dates.values():
+ if isinstance(val, list):
+ for k in val:
+ _set(k)
+ else:
+ _set(val)
+
+ elif self.parse_dates:
+ if isinstance(self.index_col, list):
+ for k in self.index_col:
+ _set(k)
+ elif self.index_col is not None:
+ _set(self.index_col)
+
+ return noconvert_columns
+
+ def _make_reader(self, f):
+ sep = self.delimiter
+
+ if sep is None or len(sep) == 1:
+ if self.lineterminator:
+ raise ValueError('Custom line terminators not supported in '
+ 'python parser (yet)')
+
+ class MyDialect(csv.Dialect):
+ delimiter = self.delimiter
+ quotechar = self.quotechar
+ escapechar = self.escapechar
+ doublequote = self.doublequote
+ skipinitialspace = self.skipinitialspace
+ quoting = self.quoting
+ lineterminator = '\n'
+
+ dia = MyDialect
+
+ sniff_sep = True
+
+ if sep is not None:
+ sniff_sep = False
+ dia.delimiter = sep
+ # attempt to sniff the delimiter
+ if sniff_sep:
+ line = f.readline()
+ while self.skipfunc(self.pos):
+ self.pos += 1
+ line = f.readline()
+
+ line = self._check_comments([line])[0]
+
+ self.pos += 1
+ self.line_pos += 1
+ sniffed = csv.Sniffer().sniff(line)
+ dia.delimiter = sniffed.delimiter
+ if self.encoding is not None:
+ self.buf.extend(list(
+ UnicodeReader(StringIO(line),
+ dialect=dia,
+ encoding=self.encoding)))
+ else:
+ self.buf.extend(list(csv.reader(StringIO(line),
+ dialect=dia)))
+
+ if self.encoding is not None:
+ reader = UnicodeReader(f, dialect=dia,
+ encoding=self.encoding,
+ strict=True)
+ else:
+ reader = csv.reader(f, dialect=dia,
+ strict=True)
+
+ else:
+ def _read():
+ line = f.readline()
+
+ if compat.PY2 and self.encoding:
+ line = line.decode(self.encoding)
+
+ pat = re.compile(sep)
+ yield pat.split(line.strip())
+ for line in f:
+ yield pat.split(line.strip())
+ reader = _read()
+
+ self.data = reader
+
+ def read(self, rows=None):
+ try:
+ content = self._get_lines(rows)
+ except StopIteration:
+ if self._first_chunk:
+ content = []
+ else:
+ raise
+
+ # done with first read, next time raise StopIteration
+ self._first_chunk = False
+
+ columns = list(self.orig_names)
+ if not len(content): # pragma: no cover
+ # DataFrame with the right metadata, even though it's length 0
+ names = self._maybe_dedup_names(self.orig_names)
+ index, columns, col_dict = _get_empty_meta(
+ names, self.index_col, self.index_names, self.dtype)
+ columns = self._maybe_make_multi_index_columns(
+ columns, self.col_names)
+ return index, columns, col_dict
+
+ # handle new style for names in index
+ count_empty_content_vals = count_empty_vals(content[0])
+ indexnamerow = None
+ if self.has_index_names and count_empty_content_vals == len(columns):
+ indexnamerow = content[0]
+ content = content[1:]
+
+ alldata = self._rows_to_cols(content)
+ data = self._exclude_implicit_index(alldata)
+
+ columns = self._maybe_dedup_names(self.columns)
+ columns, data = self._do_date_conversions(columns, data)
+
+ data = self._convert_data(data)
+ index, columns = self._make_index(data, alldata, columns, indexnamerow)
+
+ return index, columns, data
+
+ def _exclude_implicit_index(self, alldata):
+ names = self._maybe_dedup_names(self.orig_names)
+
+ if self._implicit_index:
+ excl_indices = self.index_col
+
+ data = {}
+ offset = 0
+ for i, col in enumerate(names):
+ while i + offset in excl_indices:
+ offset += 1
+ data[col] = alldata[i + offset]
+ else:
+ data = {k: v for k, v in zip(names, alldata)}
+
+ return data
+
+ # legacy
+ def get_chunk(self, size=None):
+ if size is None:
+ size = self.chunksize
+ return self.read(rows=size)
+
+ def _convert_data(self, data):
+ # apply converters
+ def _clean_mapping(mapping):
+ "converts col numbers to names"
+ clean = {}
+ for col, v in compat.iteritems(mapping):
+ if isinstance(col, int) and col not in self.orig_names:
+ col = self.orig_names[col]
+ clean[col] = v
+ return clean
+
+ clean_conv = _clean_mapping(self.converters)
+ if not isinstance(self.dtype, dict):
+ # handles single dtype applied to all columns
+ clean_dtypes = self.dtype
+ else:
+ clean_dtypes = _clean_mapping(self.dtype)
+
+ # Apply NA values.
+ clean_na_values = {}
+ clean_na_fvalues = {}
+
+ if isinstance(self.na_values, dict):
+ for col in self.na_values:
+ na_value = self.na_values[col]
+ na_fvalue = self.na_fvalues[col]
+
+ if isinstance(col, int) and col not in self.orig_names:
+ col = self.orig_names[col]
+
+ clean_na_values[col] = na_value
+ clean_na_fvalues[col] = na_fvalue
+ else:
+ clean_na_values = self.na_values
+ clean_na_fvalues = self.na_fvalues
+
+ return self._convert_to_ndarrays(data, clean_na_values,
+ clean_na_fvalues, self.verbose,
+ clean_conv, clean_dtypes)
+
+ def _infer_columns(self):
+ names = self.names
+ num_original_columns = 0
+ clear_buffer = True
+ unnamed_cols = set()
+
+ if self.header is not None:
+ header = self.header
+
+ if isinstance(header, (list, tuple, np.ndarray)):
+ have_mi_columns = len(header) > 1
+ # we have a mi columns, so read an extra line
+ if have_mi_columns:
+ header = list(header) + [header[-1] + 1]
+ else:
+ have_mi_columns = False
+ header = [header]
+
+ columns = []
+ for level, hr in enumerate(header):
+ try:
+ line = self._buffered_line()
+
+ while self.line_pos <= hr:
+ line = self._next_line()
+
+ except StopIteration:
+ if self.line_pos < hr:
+ raise ValueError(
+ 'Passed header=%s but only %d lines in file'
+ % (hr, self.line_pos + 1))
+
+ # We have an empty file, so check
+ # if columns are provided. That will
+ # serve as the 'line' for parsing
+ if have_mi_columns and hr > 0:
+ if clear_buffer:
+ self._clear_buffer()
+ columns.append([None] * len(columns[-1]))
+ return columns, num_original_columns, unnamed_cols
+
+ if not self.names:
+ raise EmptyDataError(
+ "No columns to parse from file")
+
+ line = self.names[:]
+
+ this_columns = []
+ this_unnamed_cols = []
+
+ for i, c in enumerate(line):
+ if c == '':
+ if have_mi_columns:
+ col_name = ("Unnamed: {i}_level_{level}"
+ .format(i=i, level=level))
+ else:
+ col_name = "Unnamed: {i}".format(i=i)
+
+ this_unnamed_cols.append(i)
+ this_columns.append(col_name)
+ else:
+ this_columns.append(c)
+
+ if not have_mi_columns and self.mangle_dupe_cols:
+ counts = defaultdict(int)
+
+ for i, col in enumerate(this_columns):
+ cur_count = counts[col]
+
+ while cur_count > 0:
+ counts[col] = cur_count + 1
+ col = "%s.%d" % (col, cur_count)
+ cur_count = counts[col]
+
+ this_columns[i] = col
+ counts[col] = cur_count + 1
+ elif have_mi_columns:
+
+                    # if we have grabbed an extra line, but it's not in our
+                    # format, save it in the buffer and create a blank extra
+                    # line for the rest of the parsing code
+ if hr == header[-1]:
+ lc = len(this_columns)
+ ic = (len(self.index_col)
+ if self.index_col is not None else 0)
+ unnamed_count = len(this_unnamed_cols)
+
+ if lc != unnamed_count and lc - ic > unnamed_count:
+ clear_buffer = False
+ this_columns = [None] * lc
+ self.buf = [self.buf[-1]]
+
+ columns.append(this_columns)
+ unnamed_cols.update({this_columns[i]
+ for i in this_unnamed_cols})
+
+ if len(columns) == 1:
+ num_original_columns = len(this_columns)
+
+ if clear_buffer:
+ self._clear_buffer()
+
+ if names is not None:
+ if ((self.usecols is not None and
+ len(names) != len(self.usecols)) or
+ (self.usecols is None and
+ len(names) != len(columns[0]))):
+ raise ValueError('Number of passed names did not match '
+ 'number of header fields in the file')
+ if len(columns) > 1:
+ raise TypeError('Cannot pass names with multi-index '
+ 'columns')
+
+ if self.usecols is not None:
+ # Set _use_cols. We don't store columns because they are
+ # overwritten.
+ self._handle_usecols(columns, names)
+ else:
+ self._col_indices = None
+ num_original_columns = len(names)
+ columns = [names]
+ else:
+ columns = self._handle_usecols(columns, columns[0])
+ else:
+ try:
+ line = self._buffered_line()
+
+ except StopIteration:
+ if not names:
+ raise EmptyDataError(
+ "No columns to parse from file")
+
+ line = names[:]
+
+ ncols = len(line)
+ num_original_columns = ncols
+
+ if not names:
+ if self.prefix:
+ columns = [['%s%d' % (self.prefix, i)
+ for i in range(ncols)]]
+ else:
+ columns = [lrange(ncols)]
+ columns = self._handle_usecols(columns, columns[0])
+ else:
+ if self.usecols is None or len(names) >= num_original_columns:
+ columns = self._handle_usecols([names], names)
+ num_original_columns = len(names)
+ else:
+ if (not callable(self.usecols) and
+ len(names) != len(self.usecols)):
+ raise ValueError(
+ 'Number of passed names did not match number of '
+ 'header fields in the file'
+ )
+ # Ignore output but set used columns.
+ self._handle_usecols([names], names)
+ columns = [names]
+ num_original_columns = ncols
+
+ return columns, num_original_columns, unnamed_cols
+
+ def _handle_usecols(self, columns, usecols_key):
+ """
+ Sets self._col_indices
+
+ usecols_key is used if there are string usecols.
+ """
+ if self.usecols is not None:
+ if callable(self.usecols):
+ col_indices = _evaluate_usecols(self.usecols, usecols_key)
+ elif any(isinstance(u, string_types) for u in self.usecols):
+ if len(columns) > 1:
+ raise ValueError("If using multiple headers, usecols must "
+ "be integers.")
+ col_indices = []
+
+ for col in self.usecols:
+ if isinstance(col, string_types):
+ try:
+ col_indices.append(usecols_key.index(col))
+ except ValueError:
+ _validate_usecols_names(self.usecols, usecols_key)
+ else:
+ col_indices.append(col)
+ else:
+ col_indices = self.usecols
+
+ columns = [[n for i, n in enumerate(column) if i in col_indices]
+ for column in columns]
+ self._col_indices = col_indices
+ return columns
+
+ def _buffered_line(self):
+ """
+ Return a line from buffer, filling buffer if required.
+ """
+ if len(self.buf) > 0:
+ return self.buf[0]
+ else:
+ return self._next_line()
+
+ def _check_for_bom(self, first_row):
+ """
+ Checks whether the file begins with the BOM character.
+ If it does, remove it. In addition, if there is quoting
+ in the field subsequent to the BOM, remove it as well
+ because it technically takes place at the beginning of
+ the name, not the middle of it.
+ """
+ # first_row will be a list, so we need to check
+ # that that list is not empty before proceeding.
+ if not first_row:
+ return first_row
+
+ # The first element of this row is the one that could have the
+ # BOM that we want to remove. Check that the first element is a
+ # string before proceeding.
+ if not isinstance(first_row[0], compat.string_types):
+ return first_row
+
+ # Check that the string is not empty, as that would
+ # obviously not have a BOM at the start of it.
+ if not first_row[0]:
+ return first_row
+
+ # Since the string is non-empty, check that it does
+ # in fact begin with a BOM.
+ first_elt = first_row[0][0]
+
+ # This is to avoid warnings we get in Python 2.x if
+ # we find ourselves comparing with non-Unicode
+ if compat.PY2 and not isinstance(first_elt, unicode): # noqa
+ try:
+ first_elt = u(first_elt)
+ except UnicodeDecodeError:
+ return first_row
+
+ if first_elt != _BOM:
+ return first_row
+
+ first_row = first_row[0]
+
+ if len(first_row) > 1 and first_row[1] == self.quotechar:
+ start = 2
+ quote = first_row[1]
+ end = first_row[2:].index(quote) + 2
+
+ # Extract the data between the quotation marks
+ new_row = first_row[start:end]
+
+ # Extract any remaining data after the second
+ # quotation mark.
+ if len(first_row) > end + 1:
+ new_row += first_row[end + 1:]
+ return [new_row]
+ elif len(first_row) > 1:
+ return [first_row[1:]]
+ else:
+ # First row is just the BOM, so we
+ # return an empty string.
+ return [""]
+
+ def _is_line_empty(self, line):
+ """
+ Check if a line is empty or not.
+
+ Parameters
+ ----------
+ line : str, array-like
+ The line of data to check.
+
+ Returns
+ -------
+ boolean : Whether or not the line is empty.
+ """
+ return not line or all(not x for x in line)
+
+ def _next_line(self):
+ if isinstance(self.data, list):
+ while self.skipfunc(self.pos):
+ self.pos += 1
+
+ while True:
+ try:
+ line = self._check_comments([self.data[self.pos]])[0]
+ self.pos += 1
+ # either uncommented or blank to begin with
+ if (not self.skip_blank_lines and
+ (self._is_line_empty(
+ self.data[self.pos - 1]) or line)):
+ break
+ elif self.skip_blank_lines:
+ ret = self._remove_empty_lines([line])
+ if ret:
+ line = ret[0]
+ break
+ except IndexError:
+ raise StopIteration
+ else:
+ while self.skipfunc(self.pos):
+ self.pos += 1
+ next(self.data)
+
+ while True:
+ orig_line = self._next_iter_line(row_num=self.pos + 1)
+ self.pos += 1
+
+ if orig_line is not None:
+ line = self._check_comments([orig_line])[0]
+
+ if self.skip_blank_lines:
+ ret = self._remove_empty_lines([line])
+
+ if ret:
+ line = ret[0]
+ break
+ elif self._is_line_empty(orig_line) or line:
+ break
+
+ # This was the first line of the file,
+ # which could contain the BOM at the
+ # beginning of it.
+ if self.pos == 1:
+ line = self._check_for_bom(line)
+
+ self.line_pos += 1
+ self.buf.append(line)
+ return line
+
+ def _alert_malformed(self, msg, row_num):
+ """
+ Alert a user about a malformed row.
+
+ If `self.error_bad_lines` is True, the alert will be `ParserError`.
+ If `self.warn_bad_lines` is True, the alert will be printed out.
+
+ Parameters
+ ----------
+ msg : The error message to display.
+ row_num : The row number where the parsing error occurred.
+ Because this row number is displayed, we 1-index,
+ even though we 0-index internally.
+ """
+
+ if self.error_bad_lines:
+ raise ParserError(msg)
+ elif self.warn_bad_lines:
+ base = 'Skipping line {row_num}: '.format(row_num=row_num)
+ sys.stderr.write(base + msg + '\n')
+
+ def _next_iter_line(self, row_num):
+ """
+ Wrapper around iterating through `self.data` (CSV source).
+
+ When a CSV error is raised, we check for specific
+ error messages that allow us to customize the
+ error message displayed to the user.
+
+ Parameters
+ ----------
+ row_num : The row number of the line being parsed.
+ """
+
+ try:
+ return next(self.data)
+ except csv.Error as e:
+ if self.warn_bad_lines or self.error_bad_lines:
+ msg = str(e)
+
+ if 'NULL byte' in msg:
+ msg = ('NULL byte detected. This byte '
+ 'cannot be processed in Python\'s '
+ 'native csv library at the moment, '
+ 'so please pass in engine=\'c\' instead')
+
+ if self.skipfooter > 0:
+ reason = ('Error could possibly be due to '
+ 'parsing errors in the skipped footer rows '
+ '(the skipfooter keyword is only applied '
+ 'after Python\'s csv library has parsed '
+ 'all rows).')
+ msg += '. ' + reason
+
+ self._alert_malformed(msg, row_num)
+ return None
+
+ def _check_comments(self, lines):
+ if self.comment is None:
+ return lines
+ ret = []
+ for l in lines:
+ rl = []
+ for x in l:
+ if (not isinstance(x, compat.string_types) or
+ self.comment not in x):
+ rl.append(x)
+ else:
+ x = x[:x.find(self.comment)]
+ if len(x) > 0:
+ rl.append(x)
+ break
+ ret.append(rl)
+ return ret
+
+ def _remove_empty_lines(self, lines):
+ """
+ Iterate through the lines and remove any that are
+ either empty or contain only one whitespace value
+
+ Parameters
+ ----------
+ lines : array-like
+ The array of lines that we are to filter.
+
+ Returns
+ -------
+ filtered_lines : array-like
+ The same array of lines with the "empty" ones removed.
+ """
+
+ ret = []
+ for l in lines:
+ # Remove empty lines and lines with only one whitespace value
+ if (len(l) > 1 or len(l) == 1 and
+ (not isinstance(l[0], compat.string_types) or
+ l[0].strip())):
+ ret.append(l)
+ return ret
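+
+    # Example (illustrative): lines that are empty or contain only a single
+    # whitespace-only value are dropped, e.g.
+    #     self._remove_empty_lines([['a', 'b'], [], ['   ']]) -> [['a', 'b']]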
+
+ def _check_thousands(self, lines):
+ if self.thousands is None:
+ return lines
+
+ return self._search_replace_num_columns(lines=lines,
+ search=self.thousands,
+ replace='')
+
+ def _search_replace_num_columns(self, lines, search, replace):
+ ret = []
+ for l in lines:
+ rl = []
+ for i, x in enumerate(l):
+ if (not isinstance(x, compat.string_types) or
+ search not in x or
+ (self._no_thousands_columns and
+ i in self._no_thousands_columns) or
+ self.nonnum.search(x.strip())):
+ rl.append(x)
+ else:
+ rl.append(x.replace(search, replace))
+ ret.append(rl)
+ return ret
+
+ def _check_decimal(self, lines):
+ if self.decimal == _parser_defaults['decimal']:
+ return lines
+
+ return self._search_replace_num_columns(lines=lines,
+ search=self.decimal,
+ replace='.')
+
+ def _clear_buffer(self):
+ self.buf = []
+
+ _implicit_index = False
+
+ def _get_index_name(self, columns):
+ """
+ Try several cases to get lines:
+
+ 0) There are headers on row 0 and row 1 and their
+ total summed lengths equals the length of the next line.
+ Treat row 0 as columns and row 1 as indices
+ 1) Look for implicit index: there are more columns
+ on row 1 than row 0. If this is true, assume that row
+ 1 lists index columns and row 0 lists normal columns.
+ 2) Get index from the columns if it was listed.
+ """
+ orig_names = list(columns)
+ columns = list(columns)
+
+ try:
+ line = self._next_line()
+ except StopIteration:
+ line = None
+
+ try:
+ next_line = self._next_line()
+ except StopIteration:
+ next_line = None
+
+ # implicitly index_col=0 b/c 1 fewer column names
+ implicit_first_cols = 0
+ if line is not None:
+ # leave it 0, #2442
+ # Case 1
+ if self.index_col is not False:
+ implicit_first_cols = len(line) - self.num_original_columns
+
+ # Case 0
+ if next_line is not None:
+ if len(next_line) == len(line) + self.num_original_columns:
+ # column and index names on diff rows
+ self.index_col = lrange(len(line))
+ self.buf = self.buf[1:]
+
+ for c in reversed(line):
+ columns.insert(0, c)
+
+ # Update list of original names to include all indices.
+ orig_names = list(columns)
+ self.num_original_columns = len(columns)
+ return line, orig_names, columns
+
+ if implicit_first_cols > 0:
+ # Case 1
+ self._implicit_index = True
+ if self.index_col is None:
+ self.index_col = lrange(implicit_first_cols)
+
+ index_name = None
+
+ else:
+ # Case 2
+ (index_name, columns_,
+ self.index_col) = _clean_index_names(columns, self.index_col,
+ self.unnamed_cols)
+
+ return index_name, orig_names, columns
+
+ def _rows_to_cols(self, content):
+ col_len = self.num_original_columns
+
+ if self._implicit_index:
+ col_len += len(self.index_col)
+
+ max_len = max(len(row) for row in content)
+
+ # Check that there are no rows with too many
+ # elements in their row (rows with too few
+ # elements are padded with NaN).
+ if (max_len > col_len and
+ self.index_col is not False and
+ self.usecols is None):
+
+ footers = self.skipfooter if self.skipfooter else 0
+ bad_lines = []
+
+ iter_content = enumerate(content)
+ content_len = len(content)
+ content = []
+
+ for (i, l) in iter_content:
+ actual_len = len(l)
+
+ if actual_len > col_len:
+ if self.error_bad_lines or self.warn_bad_lines:
+ row_num = self.pos - (content_len - i + footers)
+ bad_lines.append((row_num, actual_len))
+
+ if self.error_bad_lines:
+ break
+ else:
+ content.append(l)
+
+ for row_num, actual_len in bad_lines:
+ msg = ('Expected %d fields in line %d, saw %d' %
+ (col_len, row_num + 1, actual_len))
+ if (self.delimiter and
+ len(self.delimiter) > 1 and
+ self.quoting != csv.QUOTE_NONE):
+ # see gh-13374
+ reason = ('Error could possibly be due to quotes being '
+ 'ignored when a multi-char delimiter is used.')
+ msg += '. ' + reason
+
+ self._alert_malformed(msg, row_num + 1)
+
+ # see gh-13320
+ zipped_content = list(lib.to_object_array(
+ content, min_width=col_len).T)
+
+ if self.usecols:
+ if self._implicit_index:
+ zipped_content = [
+ a for i, a in enumerate(zipped_content)
+ if (i < len(self.index_col) or
+ i - len(self.index_col) in self._col_indices)]
+ else:
+ zipped_content = [a for i, a in enumerate(zipped_content)
+ if i in self._col_indices]
+ return zipped_content
+
+ def _get_lines(self, rows=None):
+ lines = self.buf
+ new_rows = None
+
+ # already fetched some number
+ if rows is not None:
+ # we already have the lines in the buffer
+ if len(self.buf) >= rows:
+ new_rows, self.buf = self.buf[:rows], self.buf[rows:]
+
+ # need some lines
+ else:
+ rows -= len(self.buf)
+
+ if new_rows is None:
+ if isinstance(self.data, list):
+ if self.pos > len(self.data):
+ raise StopIteration
+ if rows is None:
+ new_rows = self.data[self.pos:]
+ new_pos = len(self.data)
+ else:
+ new_rows = self.data[self.pos:self.pos + rows]
+ new_pos = self.pos + rows
+
+ # Check for stop rows. n.b.: self.skiprows is a set.
+ if self.skiprows:
+ new_rows = [row for i, row in enumerate(new_rows)
+ if not self.skipfunc(i + self.pos)]
+
+ lines.extend(new_rows)
+ self.pos = new_pos
+
+ else:
+ new_rows = []
+ try:
+ if rows is not None:
+ for _ in range(rows):
+ new_rows.append(next(self.data))
+ lines.extend(new_rows)
+ else:
+ rows = 0
+
+ while True:
+ new_row = self._next_iter_line(
+ row_num=self.pos + rows + 1)
+ rows += 1
+
+ if new_row is not None:
+ new_rows.append(new_row)
+
+ except StopIteration:
+ if self.skiprows:
+ new_rows = [row for i, row in enumerate(new_rows)
+ if not self.skipfunc(i + self.pos)]
+ lines.extend(new_rows)
+ if len(lines) == 0:
+ raise
+ self.pos += len(new_rows)
+
+ self.buf = []
+ else:
+ lines = new_rows
+
+ if self.skipfooter:
+ lines = lines[:-self.skipfooter]
+
+ lines = self._check_comments(lines)
+ if self.skip_blank_lines:
+ lines = self._remove_empty_lines(lines)
+ lines = self._check_thousands(lines)
+ return self._check_decimal(lines)
+
+
+def _make_date_converter(date_parser=None, dayfirst=False,
+ infer_datetime_format=False):
+ def converter(*date_cols):
+ if date_parser is None:
+ strs = _concat_date_cols(date_cols)
+
+ try:
+ return tools.to_datetime(
+ ensure_object(strs),
+ utc=None,
+ box=False,
+ dayfirst=dayfirst,
+ errors='ignore',
+ infer_datetime_format=infer_datetime_format
+ )
+ except ValueError:
+ return tools.to_datetime(
+ parsing.try_parse_dates(strs, dayfirst=dayfirst))
+ else:
+ try:
+ result = tools.to_datetime(
+ date_parser(*date_cols), errors='ignore')
+ if isinstance(result, datetime.datetime):
+ raise Exception('scalar parser')
+ return result
+ except Exception:
+ try:
+ return tools.to_datetime(
+ parsing.try_parse_dates(_concat_date_cols(date_cols),
+ parser=date_parser,
+ dayfirst=dayfirst),
+ errors='ignore')
+ except Exception:
+ return generic_parser(date_parser, *date_cols)
+
+ return converter
+
+
+def _process_date_conversion(data_dict, converter, parse_spec,
+ index_col, index_names, columns,
+ keep_date_col=False):
+ def _isindex(colspec):
+ return ((isinstance(index_col, list) and
+ colspec in index_col) or
+ (isinstance(index_names, list) and
+ colspec in index_names))
+
+ new_cols = []
+ new_data = {}
+
+ orig_names = columns
+ columns = list(columns)
+
+ date_cols = set()
+
+ if parse_spec is None or isinstance(parse_spec, bool):
+ return data_dict, columns
+
+ if isinstance(parse_spec, list):
+ # list of column lists
+ for colspec in parse_spec:
+ if is_scalar(colspec):
+ if isinstance(colspec, int) and colspec not in data_dict:
+ colspec = orig_names[colspec]
+ if _isindex(colspec):
+ continue
+ data_dict[colspec] = converter(data_dict[colspec])
+ else:
+ new_name, col, old_names = _try_convert_dates(
+ converter, colspec, data_dict, orig_names)
+ if new_name in data_dict:
+ raise ValueError('New date column already in dict %s' %
+ new_name)
+ new_data[new_name] = col
+ new_cols.append(new_name)
+ date_cols.update(old_names)
+
+ elif isinstance(parse_spec, dict):
+ # dict of new name to column list
+ for new_name, colspec in compat.iteritems(parse_spec):
+ if new_name in data_dict:
+ raise ValueError('Date column %s already in dict' %
+ new_name)
+
+ _, col, old_names = _try_convert_dates(converter, colspec,
+ data_dict, orig_names)
+
+ new_data[new_name] = col
+ new_cols.append(new_name)
+ date_cols.update(old_names)
+
+ data_dict.update(new_data)
+ new_cols.extend(columns)
+
+ if not keep_date_col:
+ for c in list(date_cols):
+ data_dict.pop(c)
+ new_cols.remove(c)
+
+ return data_dict, new_cols
+
+
+def _try_convert_dates(parser, colspec, data_dict, columns):
+ colset = set(columns)
+ colnames = []
+
+ for c in colspec:
+ if c in colset:
+ colnames.append(c)
+ elif isinstance(c, int) and c not in columns:
+ colnames.append(columns[c])
+ else:
+ colnames.append(c)
+
+ new_name = '_'.join(str(x) for x in colnames)
+ to_parse = [data_dict[c] for c in colnames if c in data_dict]
+
+ new_col = parser(*to_parse)
+ return new_name, new_col, colnames
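+
+
+# Example (illustrative), with a hypothetical date converter `conv` and a
+# hypothetical dict `data` holding both columns:
+#
+#     _try_convert_dates(conv, ['year', 'month'], data, ['year', 'month'])
+#     # -> ('year_month', conv(data['year'], data['month']), ['year', 'month'])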
+
+
+def _clean_na_values(na_values, keep_default_na=True):
+
+ if na_values is None:
+ if keep_default_na:
+ na_values = _NA_VALUES
+ else:
+ na_values = set()
+ na_fvalues = set()
+ elif isinstance(na_values, dict):
+ old_na_values = na_values.copy()
+ na_values = {} # Prevent aliasing.
+
+ # Convert the values in the na_values dictionary
+ # into array-likes for further use. This is also
+ # where we append the default NaN values, provided
+ # that `keep_default_na=True`.
+ for k, v in compat.iteritems(old_na_values):
+ if not is_list_like(v):
+ v = [v]
+
+ if keep_default_na:
+ v = set(v) | _NA_VALUES
+
+ na_values[k] = v
+ na_fvalues = {k: _floatify_na_values(v) for k, v in na_values.items()}
+ else:
+ if not is_list_like(na_values):
+ na_values = [na_values]
+ na_values = _stringify_na_values(na_values)
+ if keep_default_na:
+ na_values = na_values | _NA_VALUES
+
+ na_fvalues = _floatify_na_values(na_values)
+
+ return na_values, na_fvalues
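+
+
+# Example (illustrative): a scalar na_values is stringified, and with
+# keep_default_na=False no default sentinels are added:
+#
+#     _clean_na_values('NA', keep_default_na=False)   # -> ({'NA'}, set())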
+
+
+def _clean_index_names(columns, index_col, unnamed_cols):
+ if not _is_index_col(index_col):
+ return None, columns, index_col
+
+ columns = list(columns)
+
+ cp_cols = list(columns)
+ index_names = []
+
+ # don't mutate
+ index_col = list(index_col)
+
+ for i, c in enumerate(index_col):
+ if isinstance(c, compat.string_types):
+ index_names.append(c)
+ for j, name in enumerate(cp_cols):
+ if name == c:
+ index_col[i] = j
+ columns.remove(name)
+ break
+ else:
+ name = cp_cols[c]
+ columns.remove(name)
+ index_names.append(name)
+
+ # Only clean index names that were placeholders.
+ for i, name in enumerate(index_names):
+ if isinstance(name, compat.string_types) and name in unnamed_cols:
+ index_names[i] = None
+
+ return index_names, columns, index_col
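+
+
+# Example (illustrative): a string index_col entry is resolved to its
+# positional index and removed from the remaining columns:
+#
+#     _clean_index_names(['idx', 'a', 'b'], ['idx'], set())
+#     # -> (['idx'], ['a', 'b'], [0])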
+
+
+def _get_empty_meta(columns, index_col, index_names, dtype=None):
+ columns = list(columns)
+
+ # Convert `dtype` to a defaultdict of some kind.
+ # This will enable us to write `dtype[col_name]`
+ # without worrying about KeyError issues later on.
+ if not isinstance(dtype, dict):
+        # if dtype is None, the default will be np.object.
+ default_dtype = dtype or np.object
+ dtype = defaultdict(lambda: default_dtype)
+ else:
+ # Save a copy of the dictionary.
+ _dtype = dtype.copy()
+ dtype = defaultdict(lambda: np.object)
+
+ # Convert column indexes to column names.
+ for k, v in compat.iteritems(_dtype):
+ col = columns[k] if is_integer(k) else k
+ dtype[col] = v
+
+ # Even though we have no data, the "index" of the empty DataFrame
+ # could for example still be an empty MultiIndex. Thus, we need to
+ # check whether we have any index columns specified, via either:
+ #
+ # 1) index_col (column indices)
+ # 2) index_names (column names)
+ #
+ # Both must be non-null to ensure a successful construction. Otherwise,
+    # we have to create a generic empty Index.
+ if (index_col is None or index_col is False) or index_names is None:
+ index = Index([])
+ else:
+ data = [Series([], dtype=dtype[name]) for name in index_names]
+ index = ensure_index_from_sequences(data, names=index_names)
+ index_col.sort()
+
+ for i, n in enumerate(index_col):
+ columns.pop(n - i)
+
+ col_dict = {col_name: Series([], dtype=dtype[col_name])
+ for col_name in columns}
+
+ return index, columns, col_dict
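+
+
+# Example (illustrative): with no index columns specified, the result is an
+# empty Index plus one empty object-dtype Series per column:
+#
+#     index, cols, col_dict = _get_empty_meta(['a', 'b'], None, None)
+#     # index -> Index([]), cols -> ['a', 'b'],
+#     # col_dict -> {'a': Series([], dtype=object), 'b': Series([], dtype=object)}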
+
+
+def _floatify_na_values(na_values):
+ # create float versions of the na_values
+ result = set()
+ for v in na_values:
+ try:
+ v = float(v)
+ if not np.isnan(v):
+ result.add(v)
+ except (TypeError, ValueError, OverflowError):
+ pass
+ return result
+
+
+def _stringify_na_values(na_values):
+ """ return a stringified and numeric for these values """
+ result = []
+ for x in na_values:
+ result.append(str(x))
+ result.append(x)
+ try:
+ v = float(x)
+
+            # integer-like value, e.g. 999: also record its float/string forms
+ if v == int(v):
+ v = int(v)
+ result.append("%s.0" % v)
+ result.append(str(v))
+
+ result.append(v)
+ except (TypeError, ValueError, OverflowError):
+ pass
+ try:
+ result.append(int(x))
+ except (TypeError, ValueError, OverflowError):
+ pass
+ return set(result)
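+
+
+# Example (illustrative): numeric-looking strings also gain integer and
+# '<n>.0' variants so that both textual and parsed forms are matched:
+#
+#     _stringify_na_values(['-999'])   # -> {'-999', '-999.0', -999}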
+
+
+def _get_na_values(col, na_values, na_fvalues, keep_default_na):
+ """
+ Get the NaN values for a given column.
+
+ Parameters
+ ----------
+ col : str
+ The name of the column.
+ na_values : array-like, dict
+ The object listing the NaN values as strings.
+ na_fvalues : array-like, dict
+ The object listing the NaN values as floats.
+ keep_default_na : bool
+ If `na_values` is a dict, and the column is not mapped in the
+ dictionary, whether to return the default NaN values or the empty set.
+
+ Returns
+ -------
+ nan_tuple : A length-two tuple composed of
+
+ 1) na_values : the string NaN values for that column.
+ 2) na_fvalues : the float NaN values for that column.
+ """
+
+ if isinstance(na_values, dict):
+ if col in na_values:
+ return na_values[col], na_fvalues[col]
+ else:
+ if keep_default_na:
+ return _NA_VALUES, set()
+
+ return set(), set()
+ else:
+ return na_values, na_fvalues
+
+
+def _get_col_names(colspec, columns):
+ colset = set(columns)
+ colnames = []
+ for c in colspec:
+ if c in colset:
+ colnames.append(c)
+ elif isinstance(c, int):
+ colnames.append(columns[c])
+ return colnames
+
+
+def _concat_date_cols(date_cols):
+ if len(date_cols) == 1:
+ if compat.PY3:
+ return np.array([compat.text_type(x) for x in date_cols[0]],
+ dtype=object)
+ else:
+ return np.array([
+ str(x) if not isinstance(x, compat.string_types) else x
+ for x in date_cols[0]
+ ], dtype=object)
+
+ rs = np.array([' '.join(compat.text_type(y) for y in x)
+ for x in zip(*date_cols)], dtype=object)
+ return rs
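+
+
+# Example (illustrative): multiple date columns are joined element-wise with
+# a single space:
+#
+#     _concat_date_cols((['2019', '2020'], ['01', '02']))
+#     # -> array(['2019 01', '2020 02'], dtype=object)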
+
+
+class FixedWidthReader(BaseIterator):
+ """
+ A reader of fixed-width lines.
+ """
+
+ def __init__(self, f, colspecs, delimiter, comment, skiprows=None,
+ infer_nrows=100):
+ self.f = f
+ self.buffer = None
+ self.delimiter = '\r\n' + delimiter if delimiter else '\n\r\t '
+ self.comment = comment
+ if colspecs == 'infer':
+ self.colspecs = self.detect_colspecs(infer_nrows=infer_nrows,
+ skiprows=skiprows)
+ else:
+ self.colspecs = colspecs
+
+ if not isinstance(self.colspecs, (tuple, list)):
+ raise TypeError("column specifications must be a list or tuple, "
+ "input was a %r" % type(colspecs).__name__)
+
+ for colspec in self.colspecs:
+ if not (isinstance(colspec, (tuple, list)) and
+ len(colspec) == 2 and
+ isinstance(colspec[0], (int, np.integer, type(None))) and
+ isinstance(colspec[1], (int, np.integer, type(None)))):
+ raise TypeError('Each column specification must be '
+                                'a 2-element tuple or list of integers')
+
+ def get_rows(self, infer_nrows, skiprows=None):
+ """
+ Read rows from self.f, skipping as specified.
+
+ We distinguish buffer_rows (the first <= infer_nrows
+ lines) from the rows returned to detect_colspecs
+ because it's simpler to leave the other locations
+ with skiprows logic alone than to modify them to
+ deal with the fact we skipped some rows here as
+ well.
+
+ Parameters
+ ----------
+ infer_nrows : int
+ Number of rows to read from self.f, not counting
+ rows that are skipped.
+ skiprows: set, optional
+ Indices of rows to skip.
+
+ Returns
+ -------
+ detect_rows : list of str
+ A list containing the rows to read.
+
+ """
+ if skiprows is None:
+ skiprows = set()
+ buffer_rows = []
+ detect_rows = []
+ for i, row in enumerate(self.f):
+ if i not in skiprows:
+ detect_rows.append(row)
+ buffer_rows.append(row)
+ if len(detect_rows) >= infer_nrows:
+ break
+ self.buffer = iter(buffer_rows)
+ return detect_rows
+
+ def detect_colspecs(self, infer_nrows=100, skiprows=None):
+ # Regex escape the delimiters
+ delimiters = ''.join(r'\%s' % x for x in self.delimiter)
+ pattern = re.compile('([^%s]+)' % delimiters)
+ rows = self.get_rows(infer_nrows, skiprows)
+ if not rows:
+ raise EmptyDataError("No rows from which to infer column width")
+ max_len = max(map(len, rows))
+ mask = np.zeros(max_len + 1, dtype=int)
+ if self.comment is not None:
+ rows = [row.partition(self.comment)[0] for row in rows]
+ for row in rows:
+ for m in pattern.finditer(row):
+ mask[m.start():m.end()] = 1
+ shifted = np.roll(mask, 1)
+ shifted[0] = 0
+ edges = np.where((mask ^ shifted) == 1)[0]
+ edge_pairs = list(zip(edges[::2], edges[1::2]))
+ return edge_pairs
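+
+    # Example (illustrative): for a single row 'aa  bbb\n' and the default
+    # delimiters, the runs of non-delimiter characters yield the half-open
+    # column specs [(0, 2), (4, 7)].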
+
+ def __next__(self):
+ if self.buffer is not None:
+ try:
+ line = next(self.buffer)
+ except StopIteration:
+ self.buffer = None
+ line = next(self.f)
+ else:
+ line = next(self.f)
+ # Note: 'colspecs' is a sequence of half-open intervals.
+ return [line[fromm:to].strip(self.delimiter)
+ for (fromm, to) in self.colspecs]
+
+
+class FixedWidthFieldParser(PythonParser):
+ """
+    Specialization that converts fixed-width fields into DataFrames.
+ See PythonParser for details.
+ """
+
+ def __init__(self, f, **kwds):
+ # Support iterators, convert to a list.
+ self.colspecs = kwds.pop('colspecs')
+ self.infer_nrows = kwds.pop('infer_nrows')
+ PythonParser.__init__(self, f, **kwds)
+
+ def _make_reader(self, f):
+ self.data = FixedWidthReader(f, self.colspecs, self.delimiter,
+ self.comment, self.skiprows,
+ self.infer_nrows)
diff --git a/contrib/python/pandas/py2/pandas/io/pickle.py b/contrib/python/pandas/py2/pandas/io/pickle.py
new file mode 100644
index 00000000000..789f55a62dc
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/io/pickle.py
@@ -0,0 +1,201 @@
+""" pickle compat """
+import warnings
+
+import numpy as np
+from numpy.lib.format import read_array, write_array
+
+from pandas.compat import PY3, BytesIO, cPickle as pkl, pickle_compat as pc
+
+from pandas.io.common import _get_handle, _stringify_path
+
+
+def to_pickle(obj, path, compression='infer', protocol=pkl.HIGHEST_PROTOCOL):
+ """
+ Pickle (serialize) object to file.
+
+ Parameters
+ ----------
+ obj : any object
+ Any python object.
+ path : str
+ File path where the pickled object will be stored.
+ compression : {'infer', 'gzip', 'bz2', 'zip', 'xz', None}, default 'infer'
+ A string representing the compression to use in the output file. By
+ default, infers from the file extension in specified path.
+
+ .. versionadded:: 0.20.0
+ protocol : int
+ Int which indicates which protocol should be used by the pickler,
+ default HIGHEST_PROTOCOL (see [1], paragraph 12.1.2). The possible
+ values for this parameter depend on the version of Python. For Python
+ 2.x, possible values are 0, 1, 2. For Python>=3.0, 3 is a valid value.
+ For Python >= 3.4, 4 is a valid value. A negative value for the
+ protocol parameter is equivalent to setting its value to
+ HIGHEST_PROTOCOL.
+
+ .. [1] https://docs.python.org/3/library/pickle.html
+ .. versionadded:: 0.21.0
+
+ See Also
+ --------
+ read_pickle : Load pickled pandas object (or any object) from file.
+ DataFrame.to_hdf : Write DataFrame to an HDF5 file.
+ DataFrame.to_sql : Write DataFrame to a SQL database.
+ DataFrame.to_parquet : Write a DataFrame to the binary parquet format.
+
+ Examples
+ --------
+ >>> original_df = pd.DataFrame({"foo": range(5), "bar": range(5, 10)})
+ >>> original_df
+ foo bar
+ 0 0 5
+ 1 1 6
+ 2 2 7
+ 3 3 8
+ 4 4 9
+ >>> pd.to_pickle(original_df, "./dummy.pkl")
+
+ >>> unpickled_df = pd.read_pickle("./dummy.pkl")
+ >>> unpickled_df
+ foo bar
+ 0 0 5
+ 1 1 6
+ 2 2 7
+ 3 3 8
+ 4 4 9
+
+ >>> import os
+ >>> os.remove("./dummy.pkl")
+ """
+ path = _stringify_path(path)
+ f, fh = _get_handle(path, 'wb',
+ compression=compression,
+ is_text=False)
+ if protocol < 0:
+ protocol = pkl.HIGHEST_PROTOCOL
+ try:
+ f.write(pkl.dumps(obj, protocol=protocol))
+ finally:
+ for _f in fh:
+ _f.close()
+
+
+def read_pickle(path, compression='infer'):
+ """
+ Load pickled pandas object (or any object) from file.
+
+ .. warning::
+
+ Loading pickled data received from untrusted sources can be
+ unsafe. See `here <https://docs.python.org/3/library/pickle.html>`__.
+
+ Parameters
+ ----------
+ path : str
+ File path where the pickled object will be loaded.
+ compression : {'infer', 'gzip', 'bz2', 'zip', 'xz', None}, default 'infer'
+ For on-the-fly decompression of on-disk data. If 'infer', then use
+ gzip, bz2, xz or zip if path ends in '.gz', '.bz2', '.xz',
+ or '.zip' respectively, and no decompression otherwise.
+ Set to None for no decompression.
+
+ .. versionadded:: 0.20.0
+
+ Returns
+ -------
+ unpickled : same type as object stored in file
+
+ See Also
+ --------
+ DataFrame.to_pickle : Pickle (serialize) DataFrame object to file.
+ Series.to_pickle : Pickle (serialize) Series object to file.
+ read_hdf : Read HDF5 file into a DataFrame.
+ read_sql : Read SQL query or database table into a DataFrame.
+ read_parquet : Load a parquet object, returning a DataFrame.
+
+ Examples
+ --------
+ >>> original_df = pd.DataFrame({"foo": range(5), "bar": range(5, 10)})
+ >>> original_df
+ foo bar
+ 0 0 5
+ 1 1 6
+ 2 2 7
+ 3 3 8
+ 4 4 9
+ >>> pd.to_pickle(original_df, "./dummy.pkl")
+
+ >>> unpickled_df = pd.read_pickle("./dummy.pkl")
+ >>> unpickled_df
+ foo bar
+ 0 0 5
+ 1 1 6
+ 2 2 7
+ 3 3 8
+ 4 4 9
+
+ >>> import os
+ >>> os.remove("./dummy.pkl")
+ """
+ path = _stringify_path(path)
+
+ def read_wrapper(func):
+        # wrap the file handle open/close operations around func
+ f, fh = _get_handle(path, 'rb',
+ compression=compression,
+ is_text=False)
+ try:
+ return func(f)
+ finally:
+ for _f in fh:
+ _f.close()
+
+ def try_read(path, encoding=None):
+ # try with cPickle
+ # try with current pickle, if we have a Type Error then
+ # try with the compat pickle to handle subclass changes
+        # pass encoding only if it's not None, as py2 doesn't handle
+ # the param
+
+ # cpickle
+ # GH 6899
+ try:
+ with warnings.catch_warnings(record=True):
+ # We want to silence any warnings about, e.g. moved modules.
+ warnings.simplefilter("ignore", Warning)
+ return read_wrapper(lambda f: pkl.load(f))
+ except Exception: # noqa: E722
+ # reg/patched pickle
+ # compat not used in pandas/compat/pickle_compat.py::load
+ # TODO: remove except block OR modify pc.load to use compat
+ try:
+ return read_wrapper(
+ lambda f: pc.load(f, encoding=encoding, compat=False))
+ # compat pickle
+ except Exception: # noqa: E722
+ return read_wrapper(
+ lambda f: pc.load(f, encoding=encoding, compat=True))
+ try:
+ return try_read(path)
+ except Exception: # noqa: E722
+ if PY3:
+ return try_read(path, encoding='latin1')
+ raise
+
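+# Illustrative usage sketch (editorial addition, not part of the upstream
+# pandas source): round-tripping a frame through a compressed pickle, with
+# the compression inferred from the suffix as described in the docstrings
+# above.  The file name 'frame.pkl.gz' is hypothetical.
+#
+#     import pandas as pd
+#     df = pd.DataFrame({"foo": range(5), "bar": range(5, 10)})
+#     pd.to_pickle(df, "frame.pkl.gz")             # gzip inferred from '.gz'
+#     assert pd.read_pickle("frame.pkl.gz").equals(df)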
+
+# compat with sparse pickle / unpickle
+
+
+def _pickle_array(arr):
+ arr = arr.view(np.ndarray)
+
+ buf = BytesIO()
+ write_array(buf, arr)
+
+ return buf.getvalue()
+
+
+def _unpickle_array(bytes):
+ arr = read_array(BytesIO(bytes))
+
+ return arr
diff --git a/contrib/python/pandas/py2/pandas/io/pytables.py b/contrib/python/pandas/py2/pandas/io/pytables.py
new file mode 100644
index 00000000000..2ab6ddb5b25
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/io/pytables.py
@@ -0,0 +1,4890 @@
+# pylint: disable-msg=E1101,W0613,W0603
+"""
+High level interface to PyTables for reading and writing pandas data structures
+to disk
+"""
+
+import copy
+from datetime import date, datetime
+from distutils.version import LooseVersion
+import itertools
+import os
+import re
+import time
+import warnings
+
+import numpy as np
+
+from pandas._libs import algos, lib, writers as libwriters
+from pandas._libs.tslibs import timezones
+from pandas.compat import PY3, filter, lrange, range, string_types
+from pandas.errors import PerformanceWarning
+
+from pandas.core.dtypes.common import (
+ ensure_int64, ensure_object, ensure_platform_int, is_categorical_dtype,
+ is_datetime64_dtype, is_datetime64tz_dtype, is_list_like,
+ is_timedelta64_dtype)
+from pandas.core.dtypes.missing import array_equivalent
+
+from pandas import (
+ DataFrame, DatetimeIndex, Index, Int64Index, MultiIndex, Panel,
+ PeriodIndex, Series, SparseDataFrame, SparseSeries, TimedeltaIndex, compat,
+ concat, isna, to_datetime)
+from pandas.core import config
+from pandas.core.algorithms import match, unique
+from pandas.core.arrays.categorical import (
+ Categorical, _factorize_from_iterables)
+from pandas.core.arrays.sparse import BlockIndex, IntIndex
+from pandas.core.base import StringMixin
+import pandas.core.common as com
+from pandas.core.computation.pytables import Expr, maybe_expression
+from pandas.core.config import get_option
+from pandas.core.index import ensure_index
+from pandas.core.internals import (
+ BlockManager, _block2d_to_blocknd, _block_shape, _factor_indexer,
+ make_block)
+
+from pandas.io.common import _stringify_path
+from pandas.io.formats.printing import adjoin, pprint_thing
+
+# versioning attribute
+_version = '0.15.2'
+
+# encoding
+# PY3 encoding if we don't specify
+_default_encoding = 'UTF-8'
+
+
+def _ensure_decoded(s):
+ """ if we have bytes, decode them to unicode """
+ if isinstance(s, np.bytes_):
+ s = s.decode('UTF-8')
+ return s
+
+
+def _ensure_encoding(encoding):
+ # set the encoding if we need
+ if encoding is None:
+ if PY3:
+ encoding = _default_encoding
+ return encoding
+
+
+def _ensure_str(name):
+ """Ensure that an index / column name is a str (python 3) or
+    unicode (python 2); otherwise it may be of np.string_ dtype.
+ Non-string dtypes are passed through unchanged.
+
+ https://github.com/pandas-dev/pandas/issues/13492
+ """
+ if isinstance(name, compat.string_types):
+ name = compat.text_type(name)
+ return name
+
+
+Term = Expr
+
+
+def _ensure_term(where, scope_level):
+ """
+    ensure that ``where`` is a Term or a list of Terms;
+    this makes sure that we are capturing the scope of variables
+    that are passed.
+ create the terms here with a frame_level=2 (we are 2 levels down)
+ """
+
+ # only consider list/tuple here as an ndarray is automatically a coordinate
+ # list
+ level = scope_level + 1
+ if isinstance(where, (list, tuple)):
+ wlist = []
+ for w in filter(lambda x: x is not None, where):
+ if not maybe_expression(w):
+ wlist.append(w)
+ else:
+ wlist.append(Term(w, scope_level=level))
+ where = wlist
+ elif maybe_expression(where):
+ where = Term(where, scope_level=level)
+ return where
+
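+# Illustrative sketch (editorial addition, not in the upstream pandas source):
+# _ensure_term is what lets select/remove accept plain strings for ``where``;
+# each string is wrapped in a Term that can resolve variables from the
+# caller's scope.  ``store`` is a hypothetical open HDFStore with a
+# table-format node 'df'.
+#
+#     cutoff = 5
+#     store.select('df', where='index > cutoff')     # string -> Term
+#     store.select('df', where=['index > cutoff'])   # list of convertibles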
+
+class PossibleDataLossError(Exception):
+ pass
+
+
+class ClosedFileError(Exception):
+ pass
+
+
+class IncompatibilityWarning(Warning):
+ pass
+
+
+incompatibility_doc = """
+where criteria are being ignored as this version [%s] is too old (or
+not defined); read the file in and write it out to a new file to upgrade (with
+the copy_to method)
+"""
+
+
+class AttributeConflictWarning(Warning):
+ pass
+
+
+attribute_conflict_doc = """
+the [%s] attribute of the existing index is [%s] which conflicts with the new
+[%s], resetting the attribute to None
+"""
+
+
+class DuplicateWarning(Warning):
+ pass
+
+
+duplicate_doc = """
+duplicate entries in table, taking most recently appended
+"""
+
+performance_doc = """
+your performance may suffer as PyTables will pickle object types that it cannot
+map directly to c-types [inferred_type->%s,key->%s] [items->%s]
+"""
+
+# formats
+_FORMAT_MAP = {
+ u'f': 'fixed',
+ u'fixed': 'fixed',
+ u't': 'table',
+ u'table': 'table',
+}
+
+format_deprecate_doc = """
+the table keyword has been deprecated
+use the format='fixed(f)|table(t)' keyword instead
+ fixed(f) : specifies the Fixed format
+ and is the default for put operations
+ table(t) : specifies the Table format
+ and is the default for append operations
+"""
+
+# map object types
+_TYPE_MAP = {
+
+ Series: u'series',
+ SparseSeries: u'sparse_series',
+ DataFrame: u'frame',
+ SparseDataFrame: u'sparse_frame',
+ Panel: u'wide',
+}
+
+# storer class map
+_STORER_MAP = {
+ u'Series': 'LegacySeriesFixed',
+ u'DataFrame': 'LegacyFrameFixed',
+ u'DataMatrix': 'LegacyFrameFixed',
+ u'series': 'SeriesFixed',
+ u'sparse_series': 'SparseSeriesFixed',
+ u'frame': 'FrameFixed',
+ u'sparse_frame': 'SparseFrameFixed',
+ u'wide': 'PanelFixed',
+}
+
+# table class map
+_TABLE_MAP = {
+ u'generic_table': 'GenericTable',
+ u'appendable_series': 'AppendableSeriesTable',
+ u'appendable_multiseries': 'AppendableMultiSeriesTable',
+ u'appendable_frame': 'AppendableFrameTable',
+ u'appendable_multiframe': 'AppendableMultiFrameTable',
+ u'appendable_panel': 'AppendablePanelTable',
+ u'worm': 'WORMTable',
+ u'legacy_frame': 'LegacyFrameTable',
+ u'legacy_panel': 'LegacyPanelTable',
+}
+
+# axes map
+_AXES_MAP = {
+ DataFrame: [0],
+ Panel: [1, 2]
+}
+
+# register our configuration options
+dropna_doc = """
+: boolean
+ drop ALL nan rows when appending to a table
+"""
+format_doc = """
+: format
+    default writing format; if None, then
+ put will default to 'fixed' and append will default to 'table'
+"""
+
+with config.config_prefix('io.hdf'):
+ config.register_option('dropna_table', False, dropna_doc,
+ validator=config.is_bool)
+ config.register_option(
+ 'default_format', None, format_doc,
+ validator=config.is_one_of_factory(['fixed', 'table', None])
+ )
+
+# oh the troubles to reduce import time
+_table_mod = None
+_table_file_open_policy_is_strict = False
+
+
+def _tables():
+ global _table_mod
+ global _table_file_open_policy_is_strict
+ if _table_mod is None:
+ import tables
+ _table_mod = tables
+
+ # version requirements
+ if LooseVersion(tables.__version__) < LooseVersion('3.0.0'):
+ raise ImportError("PyTables version >= 3.0.0 is required")
+
+ # set the file open policy
+ # return the file open policy; this changes as of pytables 3.1
+ # depending on the HDF5 version
+ try:
+ _table_file_open_policy_is_strict = (
+ tables.file._FILE_OPEN_POLICY == 'strict')
+ except AttributeError:
+ pass
+
+ return _table_mod
+
+# interface to/from ###
+
+
+def to_hdf(path_or_buf, key, value, mode=None, complevel=None, complib=None,
+ append=None, **kwargs):
+ """ store this object, close it if we opened it """
+
+ if append:
+ f = lambda store: store.append(key, value, **kwargs)
+ else:
+ f = lambda store: store.put(key, value, **kwargs)
+
+ path_or_buf = _stringify_path(path_or_buf)
+ if isinstance(path_or_buf, string_types):
+ with HDFStore(path_or_buf, mode=mode, complevel=complevel,
+ complib=complib) as store:
+ f(store)
+ else:
+ f(path_or_buf)
+
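+# Illustrative usage sketch (editorial addition, not part of the upstream
+# source): the module-level to_hdf above is what DataFrame.to_hdf delegates
+# to.  'example.h5' is a hypothetical file name.
+#
+#     import pandas as pd
+#     df = pd.DataFrame({'A': range(3)})
+#     df.to_hdf('example.h5', key='df', mode='w', format='table')
+#     df.to_hdf('example.h5', key='df', append=True)   # routes to store.append
+#     pd.read_hdf('example.h5', 'df').shape            # -> (6, 1)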
+
+def read_hdf(path_or_buf, key=None, mode='r', **kwargs):
+ """
+ Read from the store, close it if we opened it.
+
+ Retrieve pandas object stored in file, optionally based on where
+ criteria
+
+ Parameters
+ ----------
+ path_or_buf : string, buffer or path object
+ Path to the file to open, or an open :class:`pandas.HDFStore` object.
+ Supports any object implementing the ``__fspath__`` protocol.
+ This includes :class:`pathlib.Path` and py._path.local.LocalPath
+ objects.
+
+ .. versionadded:: 0.19.0 support for pathlib, py.path.
+ .. versionadded:: 0.21.0 support for __fspath__ protocol.
+
+ key : object, optional
+ The group identifier in the store. Can be omitted if the HDF file
+ contains a single pandas object.
+ mode : {'r', 'r+', 'a'}, optional
+ Mode to use when opening the file. Ignored if path_or_buf is a
+ :class:`pandas.HDFStore`. Default is 'r'.
+ where : list, optional
+ A list of Term (or convertible) objects.
+ start : int, optional
+ Row number to start selection.
+ stop : int, optional
+ Row number to stop selection.
+ columns : list, optional
+ A list of columns names to return.
+ iterator : bool, optional
+ Return an iterator object.
+ chunksize : int, optional
+ Number of rows to include in an iteration when using an iterator.
+ errors : str, default 'strict'
+ Specifies how encoding and decoding errors are to be handled.
+ See the errors argument for :func:`open` for a full list
+ of options.
+ **kwargs
+ Additional keyword arguments passed to HDFStore.
+
+ Returns
+ -------
+ item : object
+ The selected object. Return type depends on the object stored.
+
+ See Also
+ --------
+ pandas.DataFrame.to_hdf : Write a HDF file from a DataFrame.
+ pandas.HDFStore : Low-level access to HDF files.
+
+ Examples
+ --------
+ >>> df = pd.DataFrame([[1, 1.0, 'a']], columns=['x', 'y', 'z'])
+ >>> df.to_hdf('./store.h5', 'data')
+ >>> reread = pd.read_hdf('./store.h5')
+ """
+
+ if mode not in ['r', 'r+', 'a']:
+ raise ValueError('mode {0} is not allowed while performing a read. '
+ 'Allowed modes are r, r+ and a.'.format(mode))
+ # grab the scope
+ if 'where' in kwargs:
+ kwargs['where'] = _ensure_term(kwargs['where'], scope_level=1)
+
+ if isinstance(path_or_buf, HDFStore):
+ if not path_or_buf.is_open:
+ raise IOError('The HDFStore must be open for reading.')
+
+ store = path_or_buf
+ auto_close = False
+ else:
+ path_or_buf = _stringify_path(path_or_buf)
+ if not isinstance(path_or_buf, string_types):
+ raise NotImplementedError('Support for generic buffers has not '
+ 'been implemented.')
+ try:
+ exists = os.path.exists(path_or_buf)
+
+ # if filepath is too long
+ except (TypeError, ValueError):
+ exists = False
+
+ if not exists:
+ raise compat.FileNotFoundError(
+ 'File {path} does not exist'.format(path=path_or_buf))
+
+ store = HDFStore(path_or_buf, mode=mode, **kwargs)
+ # can't auto open/close if we are using an iterator
+ # so delegate to the iterator
+ auto_close = True
+
+ try:
+ if key is None:
+ groups = store.groups()
+ if len(groups) == 0:
+ raise ValueError('No dataset in HDF5 file.')
+ candidate_only_group = groups[0]
+
+ # For the HDF file to have only one dataset, all other groups
+ # should then be metadata groups for that candidate group. (This
+ # assumes that the groups() method enumerates parent groups
+ # before their children.)
+ for group_to_check in groups[1:]:
+ if not _is_metadata_of(group_to_check, candidate_only_group):
+ raise ValueError('key must be provided when HDF5 file '
+ 'contains multiple datasets.')
+ key = candidate_only_group._v_pathname
+ return store.select(key, auto_close=auto_close, **kwargs)
+ except (ValueError, TypeError):
+ # if there is an error, close the store
+ try:
+ store.close()
+ except AttributeError:
+ pass
+
+ raise
+
+
+def _is_metadata_of(group, parent_group):
+ """Check if a given group is a metadata group for a given parent_group."""
+ if group._v_depth <= parent_group._v_depth:
+ return False
+
+ current = group
+ while current._v_depth > 1:
+ parent = current._v_parent
+ if parent == parent_group and current._v_name == 'meta':
+ return True
+ current = current._v_parent
+ return False
+
+
+class HDFStore(StringMixin):
+
+ """
+    Dict-like IO interface for storing pandas objects in PyTables,
+    in either Fixed or Table format.
+
+ Parameters
+ ----------
+ path : string
+ File path to HDF5 file
+ mode : {'a', 'w', 'r', 'r+'}, default 'a'
+
+ ``'r'``
+ Read-only; no data can be modified.
+ ``'w'``
+ Write; a new file is created (an existing file with the same
+ name would be deleted).
+ ``'a'``
+ Append; an existing file is opened for reading and writing,
+ and if the file does not exist it is created.
+ ``'r+'``
+ It is similar to ``'a'``, but the file must already exist.
+ complevel : int, 0-9, default None
+ Specifies a compression level for data.
+ A value of 0 disables compression.
+ complib : {'zlib', 'lzo', 'bzip2', 'blosc'}, default 'zlib'
+ Specifies the compression library to be used.
+ As of v0.20.2 these additional compressors for Blosc are supported
+ (default if no compressor specified: 'blosc:blosclz'):
+ {'blosc:blosclz', 'blosc:lz4', 'blosc:lz4hc', 'blosc:snappy',
+ 'blosc:zlib', 'blosc:zstd'}.
+ Specifying a compression library which is not available issues
+ a ValueError.
+ fletcher32 : bool, default False
+ If applying compression use the fletcher32 checksum
+
+ Examples
+ --------
+ >>> bar = pd.DataFrame(np.random.randn(10, 4))
+ >>> store = pd.HDFStore('test.h5')
+ >>> store['foo'] = bar # write to HDF5
+ >>> bar = store['foo'] # retrieve
+ >>> store.close()
+ """
+
+ def __init__(self, path, mode=None, complevel=None, complib=None,
+ fletcher32=False, **kwargs):
+
+ if 'format' in kwargs:
+ raise ValueError('format is not a defined argument for HDFStore')
+
+ try:
+ import tables # noqa
+ except ImportError as ex: # pragma: no cover
+ raise ImportError('HDFStore requires PyTables, "{ex!s}" problem '
+ 'importing'.format(ex=ex))
+
+ if complib is not None and complib not in tables.filters.all_complibs:
+ raise ValueError(
+ "complib only supports {libs} compression.".format(
+ libs=tables.filters.all_complibs))
+
+ if complib is None and complevel is not None:
+ complib = tables.filters.default_complib
+
+ self._path = _stringify_path(path)
+ if mode is None:
+ mode = 'a'
+ self._mode = mode
+ self._handle = None
+ self._complevel = complevel if complevel else 0
+ self._complib = complib
+ self._fletcher32 = fletcher32
+ self._filters = None
+ self.open(mode=mode, **kwargs)
+
+ def __fspath__(self):
+ return self._path
+
+ @property
+ def root(self):
+ """ return the root node """
+ self._check_if_open()
+ return self._handle.root
+
+ @property
+ def filename(self):
+ return self._path
+
+ def __getitem__(self, key):
+ return self.get(key)
+
+ def __setitem__(self, key, value):
+ self.put(key, value)
+
+ def __delitem__(self, key):
+ return self.remove(key)
+
+ def __getattr__(self, name):
+ """ allow attribute access to get stores """
+ try:
+ return self.get(name)
+ except (KeyError, ClosedFileError):
+ pass
+ raise AttributeError(
+ "'{object}' object has no attribute '{name}'".format(
+ object=type(self).__name__, name=name))
+
+ def __contains__(self, key):
+ """ check for existence of this key
+        can match the exact pathname or the pathname w/o the leading '/'
+ """
+ node = self.get_node(key)
+ if node is not None:
+ name = node._v_pathname
+ if name == key or name[1:] == key:
+ return True
+ return False
+
+ def __len__(self):
+ return len(self.groups())
+
+ def __unicode__(self):
+ return '{type}\nFile path: {path}\n'.format(
+ type=type(self), path=pprint_thing(self._path))
+
+ def __enter__(self):
+ return self
+
+ def __exit__(self, exc_type, exc_value, traceback):
+ self.close()
+
+ def keys(self):
+ """
+ Return a (potentially unordered) list of the keys corresponding to the
+ objects stored in the HDFStore. These are ABSOLUTE path-names (e.g.
+        have the leading '/').
+ """
+ return [n._v_pathname for n in self.groups()]
+
+ def __iter__(self):
+ return iter(self.keys())
+
+ def items(self):
+ """
+ iterate on key->group
+ """
+ for g in self.groups():
+ yield g._v_pathname, g
+
+ iteritems = items
+
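+    # Illustrative sketch (editorial addition, not in the upstream source) of
+    # the dict-like protocol implemented above; 'demo.h5' and ``df`` are
+    # hypothetical.
+    #
+    #     with pd.HDFStore('demo.h5') as store:
+    #         store['df'] = df       # __setitem__ -> put
+    #         'df' in store          # __contains__ -> True
+    #         list(store)            # __iter__ over keys, e.g. ['/df']
+    #         del store['df']        # __delitem__ -> remove
+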
+ def open(self, mode='a', **kwargs):
+ """
+ Open the file in the specified mode
+
+ Parameters
+ ----------
+ mode : {'a', 'w', 'r', 'r+'}, default 'a'
+ See HDFStore docstring or tables.open_file for info about modes
+ """
+ tables = _tables()
+
+ if self._mode != mode:
+
+ # if we are changing a write mode to read, ok
+ if self._mode in ['a', 'w'] and mode in ['r', 'r+']:
+ pass
+ elif mode in ['w']:
+
+ # this would truncate, raise here
+ if self.is_open:
+ raise PossibleDataLossError(
+ "Re-opening the file [{0}] with mode [{1}] "
+ "will delete the current file!"
+ .format(self._path, self._mode)
+ )
+
+ self._mode = mode
+
+ # close and reopen the handle
+ if self.is_open:
+ self.close()
+
+ if self._complevel and self._complevel > 0:
+ self._filters = _tables().Filters(self._complevel, self._complib,
+ fletcher32=self._fletcher32)
+
+ try:
+ self._handle = tables.open_file(self._path, self._mode, **kwargs)
+ except (IOError) as e: # pragma: no cover
+ if 'can not be written' in str(e):
+ print(
+ 'Opening {path} in read-only mode'.format(path=self._path))
+ self._handle = tables.open_file(self._path, 'r', **kwargs)
+ else:
+ raise
+
+ except (ValueError) as e:
+
+ # trap PyTables >= 3.1 FILE_OPEN_POLICY exception
+ # to provide an updated message
+ if 'FILE_OPEN_POLICY' in str(e):
+ e = ValueError(
+ "PyTables [{version}] no longer supports opening multiple "
+ "files\n"
+ "even in read-only mode on this HDF5 version "
+ "[{hdf_version}]. You can accept this\n"
+ "and not open the same file multiple times at once,\n"
+ "upgrade the HDF5 version, or downgrade to PyTables 3.0.0 "
+ "which allows\n"
+ "files to be opened multiple times at once\n"
+ .format(version=tables.__version__,
+ hdf_version=tables.get_hdf5_version()))
+
+ raise e
+
+ except (Exception) as e:
+
+ # trying to read from a non-existent file causes an error which
+ # is not part of IOError, make it one
+ if self._mode == 'r' and 'Unable to open/create file' in str(e):
+ raise IOError(str(e))
+ raise
+
+ def close(self):
+ """
+ Close the PyTables file handle
+ """
+ if self._handle is not None:
+ self._handle.close()
+ self._handle = None
+
+ @property
+ def is_open(self):
+ """
+ return a boolean indicating whether the file is open
+ """
+ if self._handle is None:
+ return False
+ return bool(self._handle.isopen)
+
+ def flush(self, fsync=False):
+ """
+ Force all buffered modifications to be written to disk.
+
+ Parameters
+ ----------
+ fsync : bool (default False)
+ call ``os.fsync()`` on the file handle to force writing to disk.
+
+ Notes
+ -----
+ Without ``fsync=True``, flushing may not guarantee that the OS writes
+ to disk. With fsync, the operation will block until the OS claims the
+ file has been written; however, other caching layers may still
+ interfere.
+ """
+ if self._handle is not None:
+ self._handle.flush()
+ if fsync:
+ try:
+ os.fsync(self._handle.fileno())
+ except OSError:
+ pass
+
+ def get(self, key):
+ """
+ Retrieve pandas object stored in file
+
+ Parameters
+ ----------
+ key : object
+
+ Returns
+ -------
+ obj : same type as object stored in file
+ """
+ group = self.get_node(key)
+ if group is None:
+ raise KeyError('No object named {key} in the file'.format(key=key))
+ return self._read_group(group)
+
+ def select(self, key, where=None, start=None, stop=None, columns=None,
+ iterator=False, chunksize=None, auto_close=False, **kwargs):
+ """
+ Retrieve pandas object stored in file, optionally based on where
+ criteria
+
+ Parameters
+ ----------
+ key : object
+ where : list of Term (or convertible) objects, optional
+ start : integer (defaults to None), row number to start selection
+ stop : integer (defaults to None), row number to stop selection
+ columns : a list of columns that if not None, will limit the return
+ columns
+ iterator : boolean, return an iterator, default False
+ chunksize : nrows to include in iteration, return an iterator
+ auto_close : boolean, should automatically close the store when
+ finished, default is False
+
+ Returns
+ -------
+ The selected object
+ """
+ group = self.get_node(key)
+ if group is None:
+ raise KeyError('No object named {key} in the file'.format(key=key))
+
+ # create the storer and axes
+ where = _ensure_term(where, scope_level=1)
+ s = self._create_storer(group)
+ s.infer_axes()
+
+ # function to call on iteration
+ def func(_start, _stop, _where):
+ return s.read(start=_start, stop=_stop,
+ where=_where,
+ columns=columns)
+
+ # create the iterator
+ it = TableIterator(self, s, func, where=where, nrows=s.nrows,
+ start=start, stop=stop, iterator=iterator,
+ chunksize=chunksize, auto_close=auto_close)
+
+ return it.get_result()
+
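+    # Illustrative sketch (editorial addition, not part of the upstream
+    # source): querying a table-format node with select.  The node 'df' and
+    # the data column 'B' are hypothetical and must have been written with
+    # format='table'.
+    #
+    #     store.append('df', df, data_columns=['B'])
+    #     store.select('df', where='B > 0', columns=['A', 'B'])
+    #     for chunk in store.select('df', chunksize=50000):
+    #         process(chunk)         # hypothetical per-chunk handler
+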
+ def select_as_coordinates(
+ self, key, where=None, start=None, stop=None, **kwargs):
+ """
+ return the selection as an Index
+
+ Parameters
+ ----------
+ key : object
+ where : list of Term (or convertible) objects, optional
+ start : integer (defaults to None), row number to start selection
+ stop : integer (defaults to None), row number to stop selection
+ """
+ where = _ensure_term(where, scope_level=1)
+ return self.get_storer(key).read_coordinates(where=where, start=start,
+ stop=stop, **kwargs)
+
+ def select_column(self, key, column, **kwargs):
+ """
+ return a single column from the table. This is generally only useful to
+ select an indexable
+
+ Parameters
+ ----------
+ key : object
+ column: the column of interest
+
+ Exceptions
+ ----------
+ raises KeyError if the column is not found (or key is not a valid
+ store)
+ raises ValueError if the column can not be extracted individually (it
+ is part of a data block)
+
+ """
+ return self.get_storer(key).read_column(column=column, **kwargs)
+
+ def select_as_multiple(self, keys, where=None, selector=None, columns=None,
+ start=None, stop=None, iterator=False,
+ chunksize=None, auto_close=False, **kwargs):
+ """ Retrieve pandas objects from multiple tables
+
+ Parameters
+ ----------
+ keys : a list of the tables
+ selector : the table to apply the where criteria (defaults to keys[0]
+ if not supplied)
+ columns : the columns I want back
+ start : integer (defaults to None), row number to start selection
+ stop : integer (defaults to None), row number to stop selection
+ iterator : boolean, return an iterator, default False
+ chunksize : nrows to include in iteration, return an iterator
+
+ Exceptions
+ ----------
+ raises KeyError if keys or selector is not found or keys is empty
+ raises TypeError if keys is not a list or tuple
+ raises ValueError if the tables are not ALL THE SAME DIMENSIONS
+ """
+
+ # default to single select
+ where = _ensure_term(where, scope_level=1)
+ if isinstance(keys, (list, tuple)) and len(keys) == 1:
+ keys = keys[0]
+ if isinstance(keys, string_types):
+ return self.select(key=keys, where=where, columns=columns,
+ start=start, stop=stop, iterator=iterator,
+ chunksize=chunksize, **kwargs)
+
+ if not isinstance(keys, (list, tuple)):
+ raise TypeError("keys must be a list/tuple")
+
+ if not len(keys):
+ raise ValueError("keys must have a non-zero length")
+
+ if selector is None:
+ selector = keys[0]
+
+ # collect the tables
+ tbls = [self.get_storer(k) for k in keys]
+ s = self.get_storer(selector)
+
+ # validate rows
+ nrows = None
+ for t, k in itertools.chain([(s, selector)], zip(tbls, keys)):
+ if t is None:
+ raise KeyError("Invalid table [{key}]".format(key=k))
+ if not t.is_table:
+ raise TypeError(
+ "object [{obj}] is not a table, and cannot be used in all "
+ "select as multiple".format(obj=t.pathname)
+ )
+
+ if nrows is None:
+ nrows = t.nrows
+ elif t.nrows != nrows:
+ raise ValueError(
+ "all tables must have exactly the same nrows!")
+
+        # axis is the concatenation axis
+ axis = list({t.non_index_axes[0][0] for t in tbls})[0]
+
+ def func(_start, _stop, _where):
+
+ # retrieve the objs, _where is always passed as a set of
+ # coordinates here
+ objs = [t.read(where=_where, columns=columns, start=_start,
+ stop=_stop, **kwargs) for t in tbls]
+
+ # concat and return
+ return concat(objs, axis=axis,
+ verify_integrity=False)._consolidate()
+
+ # create the iterator
+ it = TableIterator(self, s, func, where=where, nrows=nrows,
+ start=start, stop=stop, iterator=iterator,
+ chunksize=chunksize, auto_close=auto_close)
+
+ return it.get_result(coordinates=True)
+
+ def put(self, key, value, format=None, append=False, **kwargs):
+ """
+ Store object in HDFStore
+
+ Parameters
+ ----------
+ key : object
+ value : {Series, DataFrame, Panel}
+ format : 'fixed(f)|table(t)', default is 'fixed'
+ fixed(f) : Fixed format
+ Fast writing/reading. Not-appendable, nor searchable
+ table(t) : Table format
+ Write as a PyTables Table structure which may perform
+ worse but allow more flexible operations like searching
+ / selecting subsets of the data
+ append : boolean, default False
+ This will force Table format, append the input data to the
+ existing.
+ data_columns : list of columns to create as data columns, or True to
+ use all columns. See
+ `here <http://pandas.pydata.org/pandas-docs/stable/io.html#query-via-data-columns>`__ # noqa
+ encoding : default None, provide an encoding for strings
+ dropna : boolean, default False, do not write an ALL nan row to
+ the store settable by the option 'io.hdf.dropna_table'
+ """
+ if format is None:
+ format = get_option("io.hdf.default_format") or 'fixed'
+ kwargs = self._validate_format(format, kwargs)
+ self._write_to_group(key, value, append=append, **kwargs)
+
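+    # Illustrative sketch (editorial addition, not in the upstream source):
+    # 'fixed' vs 'table' format with put; the node names are hypothetical.
+    #
+    #     store.put('df_fixed', df)                   # fast, not queryable
+    #     store.put('df_table', df, format='table',
+    #               data_columns=True)                # queryable / appendable
+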
+ def remove(self, key, where=None, start=None, stop=None):
+ """
+ Remove pandas object partially by specifying the where condition
+
+ Parameters
+ ----------
+ key : string
+ Node to remove or delete rows from
+ where : list of Term (or convertible) objects, optional
+ start : integer (defaults to None), row number to start selection
+ stop : integer (defaults to None), row number to stop selection
+
+ Returns
+ -------
+ number of rows removed (or None if not a Table)
+
+ Exceptions
+ ----------
+ raises KeyError if key is not a valid store
+
+ """
+ where = _ensure_term(where, scope_level=1)
+ try:
+ s = self.get_storer(key)
+ except KeyError:
+ # the key is not a valid store, re-raising KeyError
+ raise
+ except Exception:
+
+ if where is not None:
+ raise ValueError(
+ "trying to remove a node with a non-None where clause!")
+
+ # we are actually trying to remove a node (with children)
+ s = self.get_node(key)
+ if s is not None:
+ s._f_remove(recursive=True)
+ return None
+
+ # remove the node
+ if com._all_none(where, start, stop):
+ s.group._f_remove(recursive=True)
+
+ # delete from the table
+ else:
+ if not s.is_table:
+ raise ValueError(
+ 'can only remove with where on objects written as tables')
+ return s.delete(where=where, start=start, stop=stop)
+
+ def append(self, key, value, format=None, append=True, columns=None,
+ dropna=None, **kwargs):
+ """
+ Append to Table in file. Node must already exist and be Table
+ format.
+
+ Parameters
+ ----------
+ key : object
+ value : {Series, DataFrame, Panel}
+ format : 'table' is the default
+ table(t) : table format
+ Write as a PyTables Table structure which may perform
+ worse but allow more flexible operations like searching
+ / selecting subsets of the data
+ append : boolean, default True, append the input data to the
+ existing
+ data_columns : list of columns, or True, default None
+ List of columns to create as indexed data columns for on-disk
+ queries, or True to use all columns. By default only the axes
+ of the object are indexed. See `here
+ <http://pandas.pydata.org/pandas-docs/stable/io.html#query-via-data-columns>`__.
+ min_itemsize : dict of columns that specify minimum string sizes
+        nan_rep : string to use as string nan representation
+ chunksize : size to chunk the writing
+ expectedrows : expected TOTAL row size of this table
+ encoding : default None, provide an encoding for strings
+ dropna : boolean, default False, do not write an ALL nan row to
+ the store settable by the option 'io.hdf.dropna_table'
+
+ Notes
+ -----
+ Does *not* check if data being appended overlaps with existing
+ data in the table, so be careful
+ """
+ if columns is not None:
+ raise TypeError("columns is not a supported keyword in append, "
+ "try data_columns")
+
+ if dropna is None:
+ dropna = get_option("io.hdf.dropna_table")
+ if format is None:
+ format = get_option("io.hdf.default_format") or 'table'
+ kwargs = self._validate_format(format, kwargs)
+ self._write_to_group(key, value, append=append, dropna=dropna,
+ **kwargs)
+
+ def append_to_multiple(self, d, value, selector, data_columns=None,
+ axes=None, dropna=False, **kwargs):
+ """
+ Append to multiple tables
+
+ Parameters
+ ----------
+ d : a dict of table_name to table_columns, None is acceptable as the
+ values of one node (this will get all the remaining columns)
+ value : a pandas object
+ selector : a string that designates the indexable table; all of its
+            columns will be designated as data_columns, unless data_columns is
+ passed, in which case these are used
+ data_columns : list of columns to create as data columns, or True to
+ use all columns
+ dropna : if evaluates to True, drop rows from all tables if any single
+ row in each table has all NaN. Default False.
+
+ Notes
+ -----
+ axes parameter is currently not accepted
+
+ """
+ if axes is not None:
+ raise TypeError("axes is currently not accepted as a parameter to"
+ " append_to_multiple; you can create the "
+ "tables independently instead")
+
+ if not isinstance(d, dict):
+ raise ValueError(
+ "append_to_multiple must have a dictionary specified as the "
+ "way to split the value"
+ )
+
+ if selector not in d:
+ raise ValueError(
+ "append_to_multiple requires a selector that is in passed dict"
+ )
+
+ # figure out the splitting axis (the non_index_axis)
+ axis = list(set(range(value.ndim)) - set(_AXES_MAP[type(value)]))[0]
+
+ # figure out how to split the value
+ remain_key = None
+ remain_values = []
+ for k, v in d.items():
+ if v is None:
+ if remain_key is not None:
+ raise ValueError(
+ "append_to_multiple can only have one value in d that "
+ "is None"
+ )
+ remain_key = k
+ else:
+ remain_values.extend(v)
+ if remain_key is not None:
+ ordered = value.axes[axis]
+ ordd = ordered.difference(Index(remain_values))
+ ordd = sorted(ordered.get_indexer(ordd))
+ d[remain_key] = ordered.take(ordd)
+
+ # data_columns
+ if data_columns is None:
+ data_columns = d[selector]
+
+ # ensure rows are synchronized across the tables
+ if dropna:
+ idxs = (value[cols].dropna(how='all').index for cols in d.values())
+ valid_index = next(idxs)
+ for index in idxs:
+ valid_index = valid_index.intersection(index)
+ value = value.loc[valid_index]
+
+ # append
+ for k, v in d.items():
+ dc = data_columns if k == selector else None
+
+ # compute the val
+ val = value.reindex(v, axis=axis)
+
+ self.append(k, val, data_columns=dc, **kwargs)
+
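+    # Illustrative sketch (editorial addition, not part of the upstream
+    # source): splitting one frame across two tables and reading it back with
+    # select_as_multiple.  Keys and column names are hypothetical.
+    #
+    #     store.append_to_multiple(
+    #         {'df_meta': ['A', 'B'], 'df_values': None},   # None = the rest
+    #         value=df, selector='df_meta')
+    #     store.select_as_multiple(['df_meta', 'df_values'],
+    #                              where='A > 0', selector='df_meta')
+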
+ def create_table_index(self, key, **kwargs):
+ """ Create a pytables index on the table
+ Parameters
+ ----------
+ key : object (the node to index)
+
+ Exceptions
+ ----------
+ raises if the node is not a table
+
+ """
+
+ # version requirements
+ _tables()
+ s = self.get_storer(key)
+ if s is None:
+ return
+
+ if not s.is_table:
+ raise TypeError(
+ "cannot create table index on a Fixed format store")
+ s.create_index(**kwargs)
+
+ def groups(self):
+ """return a list of all the top-level nodes (that are not themselves a
+ pandas storage object)
+ """
+ _tables()
+ self._check_if_open()
+ return [
+ g for g in self._handle.walk_groups()
+ if (not isinstance(g, _table_mod.link.Link) and
+ (getattr(g._v_attrs, 'pandas_type', None) or
+ getattr(g, 'table', None) or
+ (isinstance(g, _table_mod.table.Table) and
+ g._v_name != u'table')))
+ ]
+
+ def walk(self, where="/"):
+ """ Walk the pytables group hierarchy for pandas objects
+
+ This generator will yield the group path, subgroups and pandas object
+ names for each group.
+ Any non-pandas PyTables objects that are not a group will be ignored.
+
+ The `where` group itself is listed first (preorder), then each of its
+ child groups (following an alphanumerical order) is also traversed,
+ following the same procedure.
+
+ .. versionadded:: 0.24.0
+
+ Parameters
+ ----------
+ where : str, optional
+ Group where to start walking.
+ If not supplied, the root group is used.
+
+ Yields
+ ------
+ path : str
+ Full path to a group (without trailing '/')
+ groups : list of str
+ names of the groups contained in `path`
+ leaves : list of str
+ names of the pandas objects contained in `path`
+ """
+ _tables()
+ self._check_if_open()
+ for g in self._handle.walk_groups(where):
+ if getattr(g._v_attrs, 'pandas_type', None) is not None:
+ continue
+
+ groups = []
+ leaves = []
+ for child in g._v_children.values():
+ pandas_type = getattr(child._v_attrs, 'pandas_type', None)
+ if pandas_type is None:
+ if isinstance(child, _table_mod.group.Group):
+ groups.append(child._v_name)
+ else:
+ leaves.append(child._v_name)
+
+ yield (g._v_pathname.rstrip('/'), groups, leaves)
+
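+    # Illustrative sketch (editorial addition, not in the upstream source):
+    # walking the group hierarchy, analogous to os.walk.
+    #
+    #     for path, groups, leaves in store.walk():
+    #         for leaf in leaves:
+    #             obj = store.get('/'.join([path, leaf]))
+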
+ def get_node(self, key):
+ """ return the node with the key or None if it does not exist """
+ self._check_if_open()
+ try:
+ if not key.startswith('/'):
+ key = '/' + key
+ return self._handle.get_node(self.root, key)
+ except _table_mod.exceptions.NoSuchNodeError:
+ return None
+
+ def get_storer(self, key):
+ """ return the storer object for a key, raise if not in the file """
+ group = self.get_node(key)
+ if group is None:
+ raise KeyError('No object named {key} in the file'.format(key=key))
+
+ s = self._create_storer(group)
+ s.infer_axes()
+ return s
+
+ def copy(self, file, mode='w', propindexes=True, keys=None, complib=None,
+ complevel=None, fletcher32=False, overwrite=True):
+ """ copy the existing store to a new file, upgrading in place
+
+ Parameters
+ ----------
+ propindexes: restore indexes in copied file (defaults to True)
+ keys : list of keys to include in the copy (defaults to all)
+ overwrite : overwrite (remove and replace) existing nodes in the
+ new store (default is True)
+ mode, complib, complevel, fletcher32 same as in HDFStore.__init__
+
+ Returns
+ -------
+ open file handle of the new store
+
+ """
+ new_store = HDFStore(
+ file,
+ mode=mode,
+ complib=complib,
+ complevel=complevel,
+ fletcher32=fletcher32)
+ if keys is None:
+ keys = list(self.keys())
+ if not isinstance(keys, (tuple, list)):
+ keys = [keys]
+ for k in keys:
+ s = self.get_storer(k)
+ if s is not None:
+
+ if k in new_store:
+ if overwrite:
+ new_store.remove(k)
+
+ data = self.select(k)
+ if s.is_table:
+
+ index = False
+ if propindexes:
+ index = [a.name for a in s.axes if a.is_indexed]
+ new_store.append(
+ k, data, index=index,
+ data_columns=getattr(s, 'data_columns', None),
+ encoding=s.encoding
+ )
+ else:
+ new_store.put(k, data, encoding=s.encoding)
+
+ return new_store
+
+ def info(self):
+ """
+ Print detailed information on the store.
+
+ .. versionadded:: 0.21.0
+ """
+ output = '{type}\nFile path: {path}\n'.format(
+ type=type(self), path=pprint_thing(self._path))
+ if self.is_open:
+ lkeys = sorted(list(self.keys()))
+ if len(lkeys):
+ keys = []
+ values = []
+
+ for k in lkeys:
+ try:
+ s = self.get_storer(k)
+ if s is not None:
+ keys.append(pprint_thing(s.pathname or k))
+ values.append(
+ pprint_thing(s or 'invalid_HDFStore node'))
+ except Exception as detail:
+ keys.append(k)
+ values.append(
+ "[invalid_HDFStore node: {detail}]".format(
+ detail=pprint_thing(detail)))
+
+ output += adjoin(12, keys, values)
+ else:
+ output += 'Empty'
+ else:
+ output += "File is CLOSED"
+
+ return output
+
+ # private methods ######
+ def _check_if_open(self):
+ if not self.is_open:
+ raise ClosedFileError("{0} file is not open!".format(self._path))
+
+ def _validate_format(self, format, kwargs):
+ """ validate / deprecate formats; return the new kwargs """
+ kwargs = kwargs.copy()
+
+ # validate
+ try:
+ kwargs['format'] = _FORMAT_MAP[format.lower()]
+ except KeyError:
+ raise TypeError("invalid HDFStore format specified [{0}]"
+ .format(format))
+
+ return kwargs
+
+ def _create_storer(self, group, format=None, value=None, append=False,
+ **kwargs):
+ """ return a suitable class to operate """
+
+ def error(t):
+ raise TypeError(
+ "cannot properly create the storer for: [{t}] [group->"
+ "{group},value->{value},format->{format},append->{append},"
+ "kwargs->{kwargs}]".format(t=t, group=group,
+ value=type(value), format=format,
+ append=append, kwargs=kwargs))
+
+ pt = _ensure_decoded(getattr(group._v_attrs, 'pandas_type', None))
+ tt = _ensure_decoded(getattr(group._v_attrs, 'table_type', None))
+
+ # infer the pt from the passed value
+ if pt is None:
+ if value is None:
+
+ _tables()
+ if (getattr(group, 'table', None) or
+ isinstance(group, _table_mod.table.Table)):
+ pt = u'frame_table'
+ tt = u'generic_table'
+ else:
+ raise TypeError(
+ "cannot create a storer if the object is not existing "
+ "nor a value are passed")
+ else:
+
+ try:
+ pt = _TYPE_MAP[type(value)]
+ except KeyError:
+ error('_TYPE_MAP')
+
+ # we are actually a table
+ if format == 'table':
+ pt += u'_table'
+
+ # a storer node
+ if u'table' not in pt:
+ try:
+ return globals()[_STORER_MAP[pt]](self, group, **kwargs)
+ except KeyError:
+ error('_STORER_MAP')
+
+ # existing node (and must be a table)
+ if tt is None:
+
+ # if we are a writer, determine the tt
+ if value is not None:
+
+ if pt == u'series_table':
+ index = getattr(value, 'index', None)
+ if index is not None:
+ if index.nlevels == 1:
+ tt = u'appendable_series'
+ elif index.nlevels > 1:
+ tt = u'appendable_multiseries'
+ elif pt == u'frame_table':
+ index = getattr(value, 'index', None)
+ if index is not None:
+ if index.nlevels == 1:
+ tt = u'appendable_frame'
+ elif index.nlevels > 1:
+ tt = u'appendable_multiframe'
+ elif pt == u'wide_table':
+ tt = u'appendable_panel'
+ elif pt == u'ndim_table':
+ tt = u'appendable_ndim'
+
+ else:
+
+                # distinguish between a frame/table
+ tt = u'legacy_panel'
+ try:
+ fields = group.table._v_attrs.fields
+ if len(fields) == 1 and fields[0] == u'value':
+ tt = u'legacy_frame'
+ except IndexError:
+ pass
+
+ try:
+ return globals()[_TABLE_MAP[tt]](self, group, **kwargs)
+ except KeyError:
+ error('_TABLE_MAP')
+
+ def _write_to_group(self, key, value, format, index=True, append=False,
+ complib=None, encoding=None, **kwargs):
+ group = self.get_node(key)
+
+ # remove the node if we are not appending
+ if group is not None and not append:
+ self._handle.remove_node(group, recursive=True)
+ group = None
+
+        # we don't want to store a table node at all if our object is 0-len,
+        # as there are no dtypes
+ if getattr(value, 'empty', None) and (format == 'table' or append):
+ return
+
+ if group is None:
+ paths = key.split('/')
+
+ # recursively create the groups
+ path = '/'
+ for p in paths:
+ if not len(p):
+ continue
+ new_path = path
+ if not path.endswith('/'):
+ new_path += '/'
+ new_path += p
+ group = self.get_node(new_path)
+ if group is None:
+ group = self._handle.create_group(path, p)
+ path = new_path
+
+ s = self._create_storer(group, format, value, append=append,
+ encoding=encoding, **kwargs)
+ if append:
+ # raise if we are trying to append to a Fixed format,
+ # or a table that exists (and we are putting)
+ if (not s.is_table or
+ (s.is_table and format == 'fixed' and s.is_exists)):
+ raise ValueError('Can only append to Tables')
+ if not s.is_exists:
+ s.set_object_info()
+ else:
+ s.set_object_info()
+
+ if not s.is_table and complib:
+ raise ValueError(
+ 'Compression not supported on Fixed format stores'
+ )
+
+ # write the object
+ s.write(obj=value, append=append, complib=complib, **kwargs)
+
+ if s.is_table and index:
+ s.create_index(columns=index)
+
+ def _read_group(self, group, **kwargs):
+ s = self._create_storer(group)
+ s.infer_axes()
+ return s.read(**kwargs)
+
+
+class TableIterator(object):
+
+ """ define the iteration interface on a table
+
+ Parameters
+ ----------
+
+ store : the reference store
+ s : the referred storer
+ func : the function to execute the query
+ where : the where of the query
+ nrows : the rows to iterate on
+ start : the passed start value (default is None)
+ stop : the passed stop value (default is None)
+ iterator : boolean, whether to use the default iterator
+    chunksize : the passed chunking value (default is 100000)
+ auto_close : boolean, automatically close the store at the end of
+ iteration, default is False
+ kwargs : the passed kwargs
+ """
+
+ def __init__(self, store, s, func, where, nrows, start=None, stop=None,
+ iterator=False, chunksize=None, auto_close=False):
+ self.store = store
+ self.s = s
+ self.func = func
+ self.where = where
+
+ # set start/stop if they are not set if we are a table
+ if self.s.is_table:
+ if nrows is None:
+ nrows = 0
+ if start is None:
+ start = 0
+ if stop is None:
+ stop = nrows
+ stop = min(nrows, stop)
+
+ self.nrows = nrows
+ self.start = start
+ self.stop = stop
+
+ self.coordinates = None
+ if iterator or chunksize is not None:
+ if chunksize is None:
+ chunksize = 100000
+ self.chunksize = int(chunksize)
+ else:
+ self.chunksize = None
+
+ self.auto_close = auto_close
+
+ def __iter__(self):
+
+ # iterate
+ current = self.start
+ while current < self.stop:
+
+ stop = min(current + self.chunksize, self.stop)
+ value = self.func(None, None, self.coordinates[current:stop])
+ current = stop
+ if value is None or not len(value):
+ continue
+
+ yield value
+
+ self.close()
+
+ def close(self):
+ if self.auto_close:
+ self.store.close()
+
+ def get_result(self, coordinates=False):
+
+ # return the actual iterator
+ if self.chunksize is not None:
+ if not self.s.is_table:
+ raise TypeError(
+ "can only use an iterator or chunksize on a table")
+
+ self.coordinates = self.s.read_coordinates(where=self.where)
+
+ return self
+
+        # if specified, read via coordinates (necessary for multiple selections)
+ if coordinates:
+ where = self.s.read_coordinates(where=self.where, start=self.start,
+ stop=self.stop)
+ else:
+ where = self.where
+
+ # directly return the result
+ results = self.func(self.start, self.stop, where)
+ self.close()
+ return results
+
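+# Illustrative sketch (editorial addition, not part of the upstream source):
+# TableIterator is not used directly; select returns one when iterator=True
+# or a chunksize is given.  The node 'df' is hypothetical.
+#
+#     total = 0
+#     for chunk in store.select('df', iterator=True, chunksize=100000):
+#         total += len(chunk)        # each chunk is a DataFrame slice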
+
+class IndexCol(StringMixin):
+
+ """ an index column description class
+
+ Parameters
+ ----------
+
+ axis : axis which I reference
+ values : the ndarray like converted values
+ kind : a string description of this type
+ typ : the pytables type
+ pos : the position in the pytables
+
+ """
+ is_an_indexable = True
+ is_data_indexable = True
+ _info_fields = ['freq', 'tz', 'index_name']
+
+ def __init__(self, values=None, kind=None, typ=None, cname=None,
+ itemsize=None, name=None, axis=None, kind_attr=None,
+ pos=None, freq=None, tz=None, index_name=None, **kwargs):
+ self.values = values
+ self.kind = kind
+ self.typ = typ
+ self.itemsize = itemsize
+ self.name = name
+ self.cname = cname
+ self.kind_attr = kind_attr
+ self.axis = axis
+ self.pos = pos
+ self.freq = freq
+ self.tz = tz
+ self.index_name = index_name
+ self.table = None
+ self.meta = None
+ self.metadata = None
+
+ if name is not None:
+ self.set_name(name, kind_attr)
+ if pos is not None:
+ self.set_pos(pos)
+
+ def set_name(self, name, kind_attr=None):
+ """ set the name of this indexer """
+ self.name = name
+ self.kind_attr = kind_attr or "{name}_kind".format(name=name)
+ if self.cname is None:
+ self.cname = name
+
+ return self
+
+ def set_axis(self, axis):
+ """ set the axis over which I index """
+ self.axis = axis
+
+ return self
+
+ def set_pos(self, pos):
+ """ set the position of this column in the Table """
+ self.pos = pos
+ if pos is not None and self.typ is not None:
+ self.typ._v_pos = pos
+ return self
+
+ def set_table(self, table):
+ self.table = table
+ return self
+
+ def __unicode__(self):
+ temp = tuple(
+ map(pprint_thing,
+ (self.name,
+ self.cname,
+ self.axis,
+ self.pos,
+ self.kind)))
+ return ','.join(("{key}->{value}".format(key=key, value=value)
+ for key, value in zip(
+ ['name', 'cname', 'axis', 'pos', 'kind'], temp)))
+
+ def __eq__(self, other):
+ """ compare 2 col items """
+ return all(getattr(self, a, None) == getattr(other, a, None)
+ for a in ['name', 'cname', 'axis', 'pos'])
+
+ def __ne__(self, other):
+ return not self.__eq__(other)
+
+ @property
+ def is_indexed(self):
+ """ return whether I am an indexed column """
+ try:
+ return getattr(self.table.cols, self.cname).is_indexed
+ except AttributeError:
+            return False
+
+ def copy(self):
+ new_self = copy.copy(self)
+ return new_self
+
+ def infer(self, handler):
+ """infer this column from the table: create and return a new object"""
+ table = handler.table
+ new_self = self.copy()
+ new_self.set_table(table)
+ new_self.get_attr()
+ new_self.read_metadata(handler)
+ return new_self
+
+ def convert(self, values, nan_rep, encoding, errors):
+ """ set the values from this selection: take = take ownership """
+
+ # values is a recarray
+ if values.dtype.fields is not None:
+ values = values[self.cname]
+
+ values = _maybe_convert(values, self.kind, encoding, errors)
+
+ kwargs = dict()
+ if self.freq is not None:
+ kwargs['freq'] = _ensure_decoded(self.freq)
+ if self.index_name is not None:
+ kwargs['name'] = _ensure_decoded(self.index_name)
+ # making an Index instance could throw a number of different errors
+ try:
+ self.values = Index(values, **kwargs)
+ except Exception: # noqa: E722
+
+            # if the output freq is different from what we recorded,
+ # it should be None (see also 'doc example part 2')
+ if 'freq' in kwargs:
+ kwargs['freq'] = None
+ self.values = Index(values, **kwargs)
+
+ self.values = _set_tz(self.values, self.tz)
+
+ return self
+
+ def take_data(self):
+ """ return the values & release the memory """
+ self.values, values = None, self.values
+ return values
+
+ @property
+ def attrs(self):
+ return self.table._v_attrs
+
+ @property
+ def description(self):
+ return self.table.description
+
+ @property
+ def col(self):
+ """ return my current col description """
+ return getattr(self.description, self.cname, None)
+
+ @property
+ def cvalues(self):
+ """ return my cython values """
+ return self.values
+
+ def __iter__(self):
+ return iter(self.values)
+
+ def maybe_set_size(self, min_itemsize=None):
+ """ maybe set a string col itemsize:
+            min_itemsize can be an integer or a dict mapping this column's
+            name to an integer size """
+ if _ensure_decoded(self.kind) == u'string':
+
+ if isinstance(min_itemsize, dict):
+ min_itemsize = min_itemsize.get(self.name)
+
+ if min_itemsize is not None and self.typ.itemsize < min_itemsize:
+ self.typ = _tables(
+ ).StringCol(itemsize=min_itemsize, pos=self.pos)
+
+ def validate(self, handler, append):
+ self.validate_names()
+
+ def validate_names(self):
+ pass
+
+ def validate_and_set(self, handler, append):
+ self.set_table(handler.table)
+ self.validate_col()
+ self.validate_attr(append)
+ self.validate_metadata(handler)
+ self.write_metadata(handler)
+ self.set_attr()
+
+ def validate_col(self, itemsize=None):
+ """ validate this column: return the compared against itemsize """
+
+ # validate this column for string truncation (or reset to the max size)
+ if _ensure_decoded(self.kind) == u'string':
+ c = self.col
+ if c is not None:
+ if itemsize is None:
+ itemsize = self.itemsize
+ if c.itemsize < itemsize:
+ raise ValueError(
+ "Trying to store a string with len [{itemsize}] in "
+ "[{cname}] column but\nthis column has a limit of "
+ "[{c_itemsize}]!\nConsider using min_itemsize to "
+ "preset the sizes on these columns".format(
+ itemsize=itemsize, cname=self.cname,
+ c_itemsize=c.itemsize))
+ return c.itemsize
+
+ return None
+
+ def validate_attr(self, append):
+ # check for backwards incompatibility
+ if append:
+ existing_kind = getattr(self.attrs, self.kind_attr, None)
+ if existing_kind is not None and existing_kind != self.kind:
+ raise TypeError(
+ "incompatible kind in col [{existing} - "
+ "{self_kind}]".format(
+ existing=existing_kind, self_kind=self.kind))
+
+ def update_info(self, info):
+ """ set/update the info for this indexable with the key/value
+ if there is a conflict raise/warn as needed """
+
+ for key in self._info_fields:
+
+ value = getattr(self, key, None)
+ idx = _get_info(info, self.name)
+
+ existing_value = idx.get(key)
+ if key in idx and value is not None and existing_value != value:
+
+ # frequency/name just warn
+ if key in ['freq', 'index_name']:
+ ws = attribute_conflict_doc % (key, existing_value, value)
+ warnings.warn(ws, AttributeConflictWarning, stacklevel=6)
+
+ # reset
+ idx[key] = None
+ setattr(self, key, None)
+
+ else:
+ raise ValueError(
+ "invalid info for [{name}] for [{key}], "
+ "existing_value [{existing_value}] conflicts with "
+ "new value [{value}]".format(
+ name=self.name, key=key,
+ existing_value=existing_value, value=value))
+ else:
+ if value is not None or existing_value is not None:
+ idx[key] = value
+
+ return self
+
+ def set_info(self, info):
+ """ set my state from the passed info """
+ idx = info.get(self.name)
+ if idx is not None:
+ self.__dict__.update(idx)
+
+ def get_attr(self):
+ """ set the kind for this column """
+ self.kind = getattr(self.attrs, self.kind_attr, None)
+
+ def set_attr(self):
+ """ set the kind for this column """
+ setattr(self.attrs, self.kind_attr, self.kind)
+
+ def read_metadata(self, handler):
+ """ retrieve the metadata for this columns """
+ self.metadata = handler.read_metadata(self.cname)
+
+ def validate_metadata(self, handler):
+ """ validate that kind=category does not change the categories """
+ if self.meta == 'category':
+ new_metadata = self.metadata
+ cur_metadata = handler.read_metadata(self.cname)
+ if (new_metadata is not None and cur_metadata is not None and
+ not array_equivalent(new_metadata, cur_metadata)):
+ raise ValueError("cannot append a categorical with "
+ "different categories to the existing")
+
+ def write_metadata(self, handler):
+ """ set the meta data """
+ if self.metadata is not None:
+ handler.write_metadata(self.cname, self.metadata)
+
+
+class GenericIndexCol(IndexCol):
+
+ """ an index which is not represented in the data of the table """
+
+ @property
+ def is_indexed(self):
+ return False
+
+ def convert(self, values, nan_rep, encoding, errors):
+ """ set the values from this selection: take = take ownership """
+
+ self.values = Int64Index(np.arange(self.table.nrows))
+ return self
+
+ def get_attr(self):
+ pass
+
+ def set_attr(self):
+ pass
+
+
+class DataCol(IndexCol):
+
+ """ a data holding column, by definition this is not indexable
+
+ Parameters
+ ----------
+
+ data : the actual data
+ cname : the column name in the table to hold the data (typically
+ values)
+ meta : a string description of the metadata
+ metadata : the actual metadata
+ """
+ is_an_indexable = False
+ is_data_indexable = False
+ _info_fields = ['tz', 'ordered']
+
+ @classmethod
+ def create_for_block(
+ cls, i=None, name=None, cname=None, version=None, **kwargs):
+ """ return a new datacol with the block i """
+
+ if cname is None:
+ cname = name or 'values_block_{idx}'.format(idx=i)
+ if name is None:
+ name = cname
+
+        # prior to 0.10.1, we named values blocks like: values_block_0 and the
+        # name values_0
+ try:
+ if version[0] == 0 and version[1] <= 10 and version[2] == 0:
+ m = re.search(r"values_block_(\d+)", name)
+ if m:
+ name = "values_{group}".format(group=m.groups()[0])
+ except IndexError:
+ pass
+
+ return cls(name=name, cname=cname, **kwargs)
+
+ def __init__(self, values=None, kind=None, typ=None,
+ cname=None, data=None, meta=None, metadata=None,
+ block=None, **kwargs):
+ super(DataCol, self).__init__(values=values, kind=kind, typ=typ,
+ cname=cname, **kwargs)
+ self.dtype = None
+ self.dtype_attr = u'{name}_dtype'.format(name=self.name)
+ self.meta = meta
+ self.meta_attr = u'{name}_meta'.format(name=self.name)
+ self.set_data(data)
+ self.set_metadata(metadata)
+
+ def __unicode__(self):
+ temp = tuple(
+ map(pprint_thing,
+ (self.name,
+ self.cname,
+ self.dtype,
+ self.kind,
+ self.shape)))
+ return ','.join(("{key}->{value}".format(key=key, value=value)
+ for key, value in zip(
+ ['name', 'cname', 'dtype', 'kind', 'shape'], temp)))
+
+ def __eq__(self, other):
+ """ compare 2 col items """
+ return all(getattr(self, a, None) == getattr(other, a, None)
+ for a in ['name', 'cname', 'dtype', 'pos'])
+
+ def set_data(self, data, dtype=None):
+ self.data = data
+ if data is not None:
+ if dtype is not None:
+ self.dtype = dtype
+ self.set_kind()
+ elif self.dtype is None:
+ self.dtype = data.dtype.name
+ self.set_kind()
+
+ def take_data(self):
+ """ return the data & release the memory """
+ self.data, data = None, self.data
+ return data
+
+ def set_metadata(self, metadata):
+ """ record the metadata """
+ if metadata is not None:
+ metadata = np.array(metadata, copy=False).ravel()
+ self.metadata = metadata
+
+ def set_kind(self):
+ # set my kind if we can
+
+ if self.dtype is not None:
+ dtype = _ensure_decoded(self.dtype)
+
+ if dtype.startswith(u'string') or dtype.startswith(u'bytes'):
+ self.kind = 'string'
+ elif dtype.startswith(u'float'):
+ self.kind = 'float'
+ elif dtype.startswith(u'complex'):
+ self.kind = 'complex'
+ elif dtype.startswith(u'int') or dtype.startswith(u'uint'):
+ self.kind = 'integer'
+ elif dtype.startswith(u'date'):
+ self.kind = 'datetime'
+ elif dtype.startswith(u'timedelta'):
+ self.kind = 'timedelta'
+ elif dtype.startswith(u'bool'):
+ self.kind = 'bool'
+ else:
+ raise AssertionError(
+ "cannot interpret dtype of [{dtype}] in [{obj}]".format(
+ dtype=dtype, obj=self))
+
+ # set my typ if we need
+ if self.typ is None:
+ self.typ = getattr(self.description, self.cname, None)
+
+ def set_atom(self, block, block_items, existing_col, min_itemsize,
+ nan_rep, info, encoding=None, errors='strict'):
+ """ create and setup my atom from the block b """
+
+ self.values = list(block_items)
+
+ # short-cut certain block types
+ if block.is_categorical:
+ return self.set_atom_categorical(block, items=block_items,
+ info=info)
+ elif block.is_datetimetz:
+ return self.set_atom_datetime64tz(block, info=info)
+ elif block.is_datetime:
+ return self.set_atom_datetime64(block)
+ elif block.is_timedelta:
+ return self.set_atom_timedelta64(block)
+ elif block.is_complex:
+ return self.set_atom_complex(block)
+
+ dtype = block.dtype.name
+ inferred_type = lib.infer_dtype(block.values, skipna=False)
+
+ if inferred_type == 'date':
+ raise TypeError(
+ "[date] is not implemented as a table column")
+ elif inferred_type == 'datetime':
+ # after 8260
+            # this would only be hit for a multi-timezone dtype
+ # which is an error
+
+ raise TypeError(
+ "too many timezones in this block, create separate "
+ "data columns"
+ )
+ elif inferred_type == 'unicode':
+ raise TypeError(
+ "[unicode] is not implemented as a table column")
+
+        # this is basically a catchall; if say a datetime64 has nans then we
+        # will end up here ###
+ elif inferred_type == 'string' or dtype == 'object':
+ self.set_atom_string(
+ block, block_items,
+ existing_col,
+ min_itemsize,
+ nan_rep,
+ encoding,
+ errors)
+
+ # set as a data block
+ else:
+ self.set_atom_data(block)
+
+ def get_atom_string(self, block, itemsize):
+ return _tables().StringCol(itemsize=itemsize, shape=block.shape[0])
+
+ def set_atom_string(self, block, block_items, existing_col, min_itemsize,
+ nan_rep, encoding, errors):
+ # fill nan items with myself, don't disturb the blocks by
+ # trying to downcast
+ block = block.fillna(nan_rep, downcast=False)
+ if isinstance(block, list):
+ block = block[0]
+ data = block.values
+
+ # see if we have a valid string type
+ inferred_type = lib.infer_dtype(data.ravel(), skipna=False)
+ if inferred_type != 'string':
+
+ # we cannot serialize this data, so report an exception on a column
+ # by column basis
+ for i, item in enumerate(block_items):
+
+ col = block.iget(i)
+ inferred_type = lib.infer_dtype(col.ravel(), skipna=False)
+ if inferred_type != 'string':
+ raise TypeError(
+ "Cannot serialize the column [{item}] because\n"
+ "its data contents are [{type}] object dtype".format(
+ item=item, type=inferred_type)
+ )
+
+ # itemsize is the maximum length of a string (along any dimension)
+ data_converted = _convert_string_array(data, encoding, errors)
+ itemsize = data_converted.itemsize
+
+ # specified min_itemsize?
+ if isinstance(min_itemsize, dict):
+ min_itemsize = int(min_itemsize.get(
+ self.name) or min_itemsize.get('values') or 0)
+ itemsize = max(min_itemsize or 0, itemsize)
+
+ # check for column in the values conflicts
+ if existing_col is not None:
+ eci = existing_col.validate_col(itemsize)
+ if eci > itemsize:
+ itemsize = eci
+
+ self.itemsize = itemsize
+ self.kind = 'string'
+ self.typ = self.get_atom_string(block, itemsize)
+ self.set_data(data_converted.astype(
+ '|S{size}'.format(size=itemsize), copy=False))
+
+ def get_atom_coltype(self, kind=None):
+ """ return the PyTables column class for this column """
+ if kind is None:
+ kind = self.kind
+ if self.kind.startswith('uint'):
+ col_name = "UInt{name}Col".format(name=kind[4:])
+ else:
+ col_name = "{name}Col".format(name=kind.capitalize())
+
+ return getattr(_tables(), col_name)
+
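+ # Illustrative note (a sketch, not from the original module): get_atom_coltype
+ # above maps a pandas kind string onto a PyTables Col class by name, e.g.
+ # (assuming PyTables is the package returned by _tables()):
+ #   'int64'   -> tables.Int64Col
+ #   'uint32'  -> tables.UInt32Col
+ #   'float64' -> tables.Float64Col
+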
+ def get_atom_data(self, block, kind=None):
+ return self.get_atom_coltype(kind=kind)(shape=block.shape[0])
+
+ def set_atom_complex(self, block):
+ self.kind = block.dtype.name
+ itemsize = int(self.kind.split('complex')[-1]) // 8
+ self.typ = _tables().ComplexCol(
+ itemsize=itemsize, shape=block.shape[0])
+ self.set_data(block.values.astype(self.typ.type, copy=False))
+
+ def set_atom_data(self, block):
+ self.kind = block.dtype.name
+ self.typ = self.get_atom_data(block)
+ self.set_data(block.values.astype(self.typ.type, copy=False))
+
+ def set_atom_categorical(self, block, items, info=None, values=None):
+ # currently only supports a 1-D categorical
+ # in a 1-D block
+
+ values = block.values
+ codes = values.codes
+ self.kind = 'integer'
+ self.dtype = codes.dtype.name
+ if values.ndim > 1:
+ raise NotImplementedError("only support 1-d categoricals")
+ if len(items) > 1:
+ raise NotImplementedError("only support single block categoricals")
+
+ # write the codes; must be in a block shape
+ self.ordered = values.ordered
+ self.typ = self.get_atom_data(block, kind=codes.dtype.name)
+ self.set_data(_block_shape(codes))
+
+ # write the categories
+ self.meta = 'category'
+ self.set_metadata(block.values.categories)
+
+ # update the info
+ self.update_info(info)
+
+ def get_atom_datetime64(self, block):
+ return _tables().Int64Col(shape=block.shape[0])
+
+ def set_atom_datetime64(self, block, values=None):
+ self.kind = 'datetime64'
+ self.typ = self.get_atom_datetime64(block)
+ if values is None:
+ values = block.values.view('i8')
+ self.set_data(values, 'datetime64')
+
+ def set_atom_datetime64tz(self, block, info, values=None):
+
+ if values is None:
+ values = block.values
+
+ # convert this column to i8 in UTC, and save the tz
+ values = values.asi8.reshape(block.shape)
+
+ # store a converted timezone
+ self.tz = _get_tz(block.values.tz)
+ self.update_info(info)
+
+ self.kind = 'datetime64'
+ self.typ = self.get_atom_datetime64(block)
+ self.set_data(values, 'datetime64')
+
+ def get_atom_timedelta64(self, block):
+ return _tables().Int64Col(shape=block.shape[0])
+
+ def set_atom_timedelta64(self, block, values=None):
+ self.kind = 'timedelta64'
+ self.typ = self.get_atom_timedelta64(block)
+ if values is None:
+ values = block.values.view('i8')
+ self.set_data(values, 'timedelta64')
+
+ @property
+ def shape(self):
+ return getattr(self.data, 'shape', None)
+
+ @property
+ def cvalues(self):
+ """ return my cython values """
+ return self.data
+
+ def validate_attr(self, append):
+ """validate that we have the same order as the existing & same dtype"""
+ if append:
+ existing_fields = getattr(self.attrs, self.kind_attr, None)
+ if (existing_fields is not None and
+ existing_fields != list(self.values)):
+ raise ValueError("appended items do not match existing items"
+ " in table!")
+
+ existing_dtype = getattr(self.attrs, self.dtype_attr, None)
+ if (existing_dtype is not None and
+ existing_dtype != self.dtype):
+ raise ValueError("appended items dtype do not match existing "
+ "items dtype in table!")
+
+ def convert(self, values, nan_rep, encoding, errors):
+ """set the data from this selection (and convert to the correct dtype
+ if we can)
+ """
+
+ # values is a recarray
+ if values.dtype.fields is not None:
+ values = values[self.cname]
+
+ self.set_data(values)
+
+ # use the meta if needed
+ meta = _ensure_decoded(self.meta)
+
+ # convert to the correct dtype
+ if self.dtype is not None:
+ dtype = _ensure_decoded(self.dtype)
+
+ # reverse converts
+ if dtype == u'datetime64':
+
+ # recreate with tz if indicated
+ self.data = _set_tz(self.data, self.tz, coerce=True)
+
+ elif dtype == u'timedelta64':
+ self.data = np.asarray(self.data, dtype='m8[ns]')
+ elif dtype == u'date':
+ try:
+ self.data = np.asarray(
+ [date.fromordinal(v) for v in self.data], dtype=object)
+ except ValueError:
+ self.data = np.asarray(
+ [date.fromtimestamp(v) for v in self.data],
+ dtype=object)
+ elif dtype == u'datetime':
+ self.data = np.asarray(
+ [datetime.fromtimestamp(v) for v in self.data],
+ dtype=object)
+
+ elif meta == u'category':
+
+ # we have a categorical
+ categories = self.metadata
+ codes = self.data.ravel()
+
+ # if we have stored a NaN in the categories
+ # then strip it; in theory we could have BOTH
+ # -1s in the codes and nulls :<
+ if categories is None:
+ # Handle case of NaN-only categorical columns in which case
+ # the categories are an empty array; when this is stored,
+ # pytables cannot write a zero-len array, so on readback
+ # the categories would be None and `read_hdf()` would fail.
+ categories = Index([], dtype=np.float64)
+ else:
+ mask = isna(categories)
+ if mask.any():
+ categories = categories[~mask]
+ codes[codes != -1] -= mask.astype(int).cumsum().values
+
+ self.data = Categorical.from_codes(codes,
+ categories=categories,
+ ordered=self.ordered)
+
+ else:
+
+ try:
+ self.data = self.data.astype(dtype, copy=False)
+ except TypeError:
+ self.data = self.data.astype('O', copy=False)
+
+ # convert nans / decode
+ if _ensure_decoded(self.kind) == u'string':
+ self.data = _unconvert_string_array(
+ self.data, nan_rep=nan_rep, encoding=encoding, errors=errors)
+
+ return self
+
+ def get_attr(self):
+ """ get the data for this column """
+ self.values = getattr(self.attrs, self.kind_attr, None)
+ self.dtype = getattr(self.attrs, self.dtype_attr, None)
+ self.meta = getattr(self.attrs, self.meta_attr, None)
+ self.set_kind()
+
+ def set_attr(self):
+ """ set the data for this column """
+ setattr(self.attrs, self.kind_attr, self.values)
+ setattr(self.attrs, self.meta_attr, self.meta)
+ if self.dtype is not None:
+ setattr(self.attrs, self.dtype_attr, self.dtype)
+
+
+class DataIndexableCol(DataCol):
+
+ """ represent a data column that can be indexed """
+ is_data_indexable = True
+
+ def validate_names(self):
+ if not Index(self.values).is_object():
+ raise ValueError("cannot have non-object label DataIndexableCol")
+
+ def get_atom_string(self, block, itemsize):
+ return _tables().StringCol(itemsize=itemsize)
+
+ def get_atom_data(self, block, kind=None):
+ return self.get_atom_coltype(kind=kind)()
+
+ def get_atom_datetime64(self, block):
+ return _tables().Int64Col()
+
+ def get_atom_timedelta64(self, block):
+ return _tables().Int64Col()
+
+
+class GenericDataIndexableCol(DataIndexableCol):
+
+ """ represent a generic pytables data column """
+
+ def get_attr(self):
+ pass
+
+
+class Fixed(StringMixin):
+
+ """ represent an object in my store
+ facilitate read/write of various types of objects
+ this is an abstract base class
+
+ Parameters
+ ----------
+
+ parent : my parent HDFStore
+ group : the group node where the table resides
+ """
+ pandas_kind = None
+ obj_type = None
+ ndim = None
+ is_table = False
+
+ def __init__(self, parent, group, encoding=None, errors='strict',
+ **kwargs):
+ self.parent = parent
+ self.group = group
+ self.encoding = _ensure_encoding(encoding)
+ self.errors = errors
+ self.set_version()
+
+ @property
+ def is_old_version(self):
+ return (self.version[0] <= 0 and self.version[1] <= 10 and
+ self.version[2] < 1)
+
+ def set_version(self):
+ """ compute and set our version """
+ version = _ensure_decoded(
+ getattr(self.group._v_attrs, 'pandas_version', None))
+ try:
+ self.version = tuple(int(x) for x in version.split('.'))
+ if len(self.version) == 2:
+ self.version = self.version + (0,)
+ except AttributeError:
+ self.version = (0, 0, 0)
+
+ @property
+ def pandas_type(self):
+ return _ensure_decoded(getattr(self.group._v_attrs,
+ 'pandas_type', None))
+
+ @property
+ def format_type(self):
+ return 'fixed'
+
+ def __unicode__(self):
+ """ return a pretty representation of myself """
+ self.infer_axes()
+ s = self.shape
+ if s is not None:
+ if isinstance(s, (list, tuple)):
+ s = "[{shape}]".format(
+ shape=','.join(pprint_thing(x) for x in s))
+ return "{type:12.12} (shape->{shape})".format(
+ type=self.pandas_type, shape=s)
+ return self.pandas_type
+
+ def set_object_info(self):
+ """ set my pandas type & version """
+ self.attrs.pandas_type = str(self.pandas_kind)
+ self.attrs.pandas_version = str(_version)
+ self.set_version()
+
+ def copy(self):
+ new_self = copy.copy(self)
+ return new_self
+
+ @property
+ def storage_obj_type(self):
+ return self.obj_type
+
+ @property
+ def shape(self):
+ return self.nrows
+
+ @property
+ def pathname(self):
+ return self.group._v_pathname
+
+ @property
+ def _handle(self):
+ return self.parent._handle
+
+ @property
+ def _filters(self):
+ return self.parent._filters
+
+ @property
+ def _complevel(self):
+ return self.parent._complevel
+
+ @property
+ def _fletcher32(self):
+ return self.parent._fletcher32
+
+ @property
+ def _complib(self):
+ return self.parent._complib
+
+ @property
+ def attrs(self):
+ return self.group._v_attrs
+
+ def set_attrs(self):
+ """ set our object attributes """
+ pass
+
+ def get_attrs(self):
+ """ get our object attributes """
+ pass
+
+ @property
+ def storable(self):
+ """ return my storable """
+ return self.group
+
+ @property
+ def is_exists(self):
+ return False
+
+ @property
+ def nrows(self):
+ return getattr(self.storable, 'nrows', None)
+
+ def validate(self, other):
+ """ validate against an existing storable """
+ if other is None:
+ return
+ return True
+
+ def validate_version(self, where=None):
+ """ are we trying to operate on an old version? """
+ return True
+
+ def infer_axes(self):
+ """ infer the axes of my storer
+ return a boolean indicating if we have a valid storer or not """
+
+ s = self.storable
+ if s is None:
+ return False
+ self.get_attrs()
+ return True
+
+ def read(self, **kwargs):
+ raise NotImplementedError(
+ "cannot read on an abstract storer: subclasses should implement")
+
+ def write(self, **kwargs):
+ raise NotImplementedError(
+ "cannot write on an abstract storer: sublcasses should implement")
+
+ def delete(self, where=None, start=None, stop=None, **kwargs):
+ """
+ support fully deleting the node in its entirety (only) - where
+ specification must be None
+ """
+ if com._all_none(where, start, stop):
+ self._handle.remove_node(self.group, recursive=True)
+ return None
+
+ raise TypeError("cannot delete on an abstract storer")
+
+
+class GenericFixed(Fixed):
+
+ """ a generified fixed version """
+ _index_type_map = {DatetimeIndex: 'datetime', PeriodIndex: 'period'}
+ _reverse_index_map = {v: k for k, v in compat.iteritems(_index_type_map)}
+ attributes = []
+
+ # indexer helpers
+ def _class_to_alias(self, cls):
+ return self._index_type_map.get(cls, '')
+
+ def _alias_to_class(self, alias):
+ if isinstance(alias, type): # pragma: no cover
+ # compat: for a short period of time master stored types
+ return alias
+ return self._reverse_index_map.get(alias, Index)
+
+ def _get_index_factory(self, klass):
+ if klass == DatetimeIndex:
+ def f(values, freq=None, tz=None):
+ # data are already in UTC, localize and convert if tz present
+ result = DatetimeIndex._simple_new(values.values, name=None,
+ freq=freq)
+ if tz is not None:
+ result = result.tz_localize('UTC').tz_convert(tz)
+ return result
+ return f
+ elif klass == PeriodIndex:
+ def f(values, freq=None, tz=None):
+ return PeriodIndex._simple_new(values, name=None, freq=freq)
+ return f
+
+ return klass
+
+ def validate_read(self, kwargs):
+ """
+ remove table keywords from kwargs and return;
+ raise if any keywords are passed which are not None
+ """
+ kwargs = copy.copy(kwargs)
+
+ columns = kwargs.pop('columns', None)
+ if columns is not None:
+ raise TypeError("cannot pass a column specification when reading "
+ "a Fixed format store. this store must be "
+ "selected in its entirety")
+ where = kwargs.pop('where', None)
+ if where is not None:
+ raise TypeError("cannot pass a where specification when reading "
+ "from a Fixed format store. this store must be "
+ "selected in its entirety")
+ return kwargs
+
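+ # Illustrative sketch (hypothetical file/key names, assuming
+ # `import pandas as pd`): a Fixed format store must be read in its
+ # entirety, so a selection through the public API raises in
+ # validate_read above:
+ #   >>> df.to_hdf('example.h5', 'df', format='fixed')
+ #   >>> pd.read_hdf('example.h5', 'df')                     # whole object
+ #   >>> pd.read_hdf('example.h5', 'df', where='index > 2')  # TypeError
+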
+ @property
+ def is_exists(self):
+ return True
+
+ def set_attrs(self):
+ """ set our object attributes """
+ self.attrs.encoding = self.encoding
+ self.attrs.errors = self.errors
+
+ def get_attrs(self):
+ """ retrieve our attributes """
+ self.encoding = _ensure_encoding(getattr(self.attrs, 'encoding', None))
+ self.errors = _ensure_decoded(getattr(self.attrs, 'errors', 'strict'))
+ for n in self.attributes:
+ setattr(self, n, _ensure_decoded(getattr(self.attrs, n, None)))
+
+ def write(self, obj, **kwargs):
+ self.set_attrs()
+
+ def read_array(self, key, start=None, stop=None):
+ """ read an array for the specified node (off of group """
+ import tables
+ node = getattr(self.group, key)
+ attrs = node._v_attrs
+
+ transposed = getattr(attrs, 'transposed', False)
+
+ if isinstance(node, tables.VLArray):
+ ret = node[0][start:stop]
+ else:
+ dtype = getattr(attrs, 'value_type', None)
+ shape = getattr(attrs, 'shape', None)
+
+ if shape is not None:
+ # length 0 axis
+ ret = np.empty(shape, dtype=dtype)
+ else:
+ ret = node[start:stop]
+
+ if dtype == u'datetime64':
+
+ # reconstruct a timezone if indicated
+ ret = _set_tz(ret, getattr(attrs, 'tz', None), coerce=True)
+
+ elif dtype == u'timedelta64':
+ ret = np.asarray(ret, dtype='m8[ns]')
+
+ if transposed:
+ return ret.T
+ else:
+ return ret
+
+ def read_index(self, key, **kwargs):
+ variety = _ensure_decoded(
+ getattr(self.attrs, '{key}_variety'.format(key=key)))
+
+ if variety == u'multi':
+ return self.read_multi_index(key, **kwargs)
+ elif variety == u'block':
+ return self.read_block_index(key, **kwargs)
+ elif variety == u'sparseint':
+ return self.read_sparse_intindex(key, **kwargs)
+ elif variety == u'regular':
+ _, index = self.read_index_node(getattr(self.group, key), **kwargs)
+ return index
+ else: # pragma: no cover
+ raise TypeError(
+ 'unrecognized index variety: {variety}'.format(
+ variety=variety))
+
+ def write_index(self, key, index):
+ if isinstance(index, MultiIndex):
+ setattr(self.attrs, '{key}_variety'.format(key=key), 'multi')
+ self.write_multi_index(key, index)
+ elif isinstance(index, BlockIndex):
+ setattr(self.attrs, '{key}_variety'.format(key=key), 'block')
+ self.write_block_index(key, index)
+ elif isinstance(index, IntIndex):
+ setattr(self.attrs, '{key}_variety'.format(key=key), 'sparseint')
+ self.write_sparse_intindex(key, index)
+ else:
+ setattr(self.attrs, '{key}_variety'.format(key=key), 'regular')
+ converted = _convert_index(index, self.encoding, self.errors,
+ self.format_type).set_name('index')
+
+ self.write_array(key, converted.values)
+
+ node = getattr(self.group, key)
+ node._v_attrs.kind = converted.kind
+ node._v_attrs.name = index.name
+
+ if isinstance(index, (DatetimeIndex, PeriodIndex)):
+ node._v_attrs.index_class = self._class_to_alias(type(index))
+
+ if hasattr(index, 'freq'):
+ node._v_attrs.freq = index.freq
+
+ if hasattr(index, 'tz') and index.tz is not None:
+ node._v_attrs.tz = _get_tz(index.tz)
+
+ def write_block_index(self, key, index):
+ self.write_array('{key}_blocs'.format(key=key), index.blocs)
+ self.write_array('{key}_blengths'.format(key=key), index.blengths)
+ setattr(self.attrs, '{key}_length'.format(key=key), index.length)
+
+ def read_block_index(self, key, **kwargs):
+ length = getattr(self.attrs, '{key}_length'.format(key=key))
+ blocs = self.read_array('{key}_blocs'.format(key=key), **kwargs)
+ blengths = self.read_array('{key}_blengths'.format(key=key), **kwargs)
+ return BlockIndex(length, blocs, blengths)
+
+ def write_sparse_intindex(self, key, index):
+ self.write_array('{key}_indices'.format(key=key), index.indices)
+ setattr(self.attrs, '{key}_length'.format(key=key), index.length)
+
+ def read_sparse_intindex(self, key, **kwargs):
+ length = getattr(self.attrs, '{key}_length'.format(key=key))
+ indices = self.read_array('{key}_indices'.format(key=key), **kwargs)
+ return IntIndex(length, indices)
+
+ def write_multi_index(self, key, index):
+ setattr(self.attrs, '{key}_nlevels'.format(key=key), index.nlevels)
+
+ for i, (lev, level_codes, name) in enumerate(zip(index.levels,
+ index.codes,
+ index.names)):
+ # write the level
+ level_key = '{key}_level{idx}'.format(key=key, idx=i)
+ conv_level = _convert_index(lev, self.encoding, self.errors,
+ self.format_type).set_name(level_key)
+ self.write_array(level_key, conv_level.values)
+ node = getattr(self.group, level_key)
+ node._v_attrs.kind = conv_level.kind
+ node._v_attrs.name = name
+
+ # write the name
+ setattr(node._v_attrs, '{key}_name{name}'.format(
+ key=key, name=name), name)
+
+ # write the labels
+ label_key = '{key}_label{idx}'.format(key=key, idx=i)
+ self.write_array(label_key, level_codes)
+
+ def read_multi_index(self, key, **kwargs):
+ nlevels = getattr(self.attrs, '{key}_nlevels'.format(key=key))
+
+ levels = []
+ codes = []
+ names = []
+ for i in range(nlevels):
+ level_key = '{key}_level{idx}'.format(key=key, idx=i)
+ name, lev = self.read_index_node(getattr(self.group, level_key),
+ **kwargs)
+ levels.append(lev)
+ names.append(name)
+
+ label_key = '{key}_label{idx}'.format(key=key, idx=i)
+ level_codes = self.read_array(label_key, **kwargs)
+ codes.append(level_codes)
+
+ return MultiIndex(levels=levels, codes=codes, names=names,
+ verify_integrity=True)
+
+ def read_index_node(self, node, start=None, stop=None):
+ data = node[start:stop]
+ # If the index was an empty array write_array_empty() will
+ # have written a sentinel. Here we replace it with the original.
+ if ('shape' in node._v_attrs and
+ self._is_empty_array(getattr(node._v_attrs, 'shape'))):
+ data = np.empty(getattr(node._v_attrs, 'shape'),
+ dtype=getattr(node._v_attrs, 'value_type'))
+ kind = _ensure_decoded(node._v_attrs.kind)
+ name = None
+
+ if 'name' in node._v_attrs:
+ name = _ensure_str(node._v_attrs.name)
+ name = _ensure_decoded(name)
+
+ index_class = self._alias_to_class(_ensure_decoded(
+ getattr(node._v_attrs, 'index_class', '')))
+ factory = self._get_index_factory(index_class)
+
+ kwargs = {}
+ if u'freq' in node._v_attrs:
+ kwargs['freq'] = node._v_attrs['freq']
+
+ if u'tz' in node._v_attrs:
+ kwargs['tz'] = node._v_attrs['tz']
+
+ if kind in (u'date', u'datetime'):
+ index = factory(_unconvert_index(data, kind,
+ encoding=self.encoding,
+ errors=self.errors),
+ dtype=object, **kwargs)
+ else:
+ index = factory(_unconvert_index(data, kind,
+ encoding=self.encoding,
+ errors=self.errors), **kwargs)
+
+ index.name = name
+
+ return name, index
+
+ def write_array_empty(self, key, value):
+ """ write a 0-len array """
+
+ # ugly hack for length 0 axes
+ arr = np.empty((1,) * value.ndim)
+ self._handle.create_array(self.group, key, arr)
+ getattr(self.group, key)._v_attrs.value_type = str(value.dtype)
+ getattr(self.group, key)._v_attrs.shape = value.shape
+
+ def _is_empty_array(self, shape):
+ """Returns true if any axis is zero length."""
+ return any(x == 0 for x in shape)
+
+ def write_array(self, key, value, items=None):
+ if key in self.group:
+ self._handle.remove_node(self.group, key)
+
+ # Transform needed to interface with pytables row/col notation
+ empty_array = self._is_empty_array(value.shape)
+ transposed = False
+
+ if is_categorical_dtype(value):
+ raise NotImplementedError('Cannot store a category dtype in '
+ 'a HDF5 dataset that uses format='
+ '"fixed". Use format="table".')
+ if not empty_array:
+ if hasattr(value, 'T'):
+ # ExtensionArrays (1d) may not have transpose.
+ value = value.T
+ transposed = True
+
+ if self._filters is not None:
+ atom = None
+ try:
+ # get the atom for this datatype
+ atom = _tables().Atom.from_dtype(value.dtype)
+ except ValueError:
+ pass
+
+ if atom is not None:
+ # create an empty chunked array and fill it from value
+ if not empty_array:
+ ca = self._handle.create_carray(self.group, key, atom,
+ value.shape,
+ filters=self._filters)
+ ca[:] = value
+ getattr(self.group, key)._v_attrs.transposed = transposed
+
+ else:
+ self.write_array_empty(key, value)
+
+ return
+
+ if value.dtype.type == np.object_:
+
+ # infer the type, warn if we have a non-string type here (for
+ # performance)
+ inferred_type = lib.infer_dtype(value.ravel(), skipna=False)
+ if empty_array:
+ pass
+ elif inferred_type == 'string':
+ pass
+ else:
+ try:
+ items = list(items)
+ except TypeError:
+ pass
+ ws = performance_doc % (inferred_type, key, items)
+ warnings.warn(ws, PerformanceWarning, stacklevel=7)
+
+ vlarr = self._handle.create_vlarray(self.group, key,
+ _tables().ObjectAtom())
+ vlarr.append(value)
+ else:
+ if empty_array:
+ self.write_array_empty(key, value)
+ else:
+ if is_datetime64_dtype(value.dtype):
+ self._handle.create_array(
+ self.group, key, value.view('i8'))
+ getattr(
+ self.group, key)._v_attrs.value_type = 'datetime64'
+ elif is_datetime64tz_dtype(value.dtype):
+ # store as UTC
+ # with a zone
+ self._handle.create_array(self.group, key,
+ value.asi8)
+
+ node = getattr(self.group, key)
+ node._v_attrs.tz = _get_tz(value.tz)
+ node._v_attrs.value_type = 'datetime64'
+ elif is_timedelta64_dtype(value.dtype):
+ self._handle.create_array(
+ self.group, key, value.view('i8'))
+ getattr(
+ self.group, key)._v_attrs.value_type = 'timedelta64'
+ else:
+ self._handle.create_array(self.group, key, value)
+
+ getattr(self.group, key)._v_attrs.transposed = transposed
+
+
+class LegacyFixed(GenericFixed):
+
+ def read_index_legacy(self, key, start=None, stop=None):
+ node = getattr(self.group, key)
+ data = node[start:stop]
+ kind = node._v_attrs.kind
+ return _unconvert_index_legacy(data, kind, encoding=self.encoding,
+ errors=self.errors)
+
+
+class LegacySeriesFixed(LegacyFixed):
+
+ def read(self, **kwargs):
+ kwargs = self.validate_read(kwargs)
+ index = self.read_index_legacy('index')
+ values = self.read_array('values')
+ return Series(values, index=index)
+
+
+class LegacyFrameFixed(LegacyFixed):
+
+ def read(self, **kwargs):
+ kwargs = self.validate_read(kwargs)
+ index = self.read_index_legacy('index')
+ columns = self.read_index_legacy('columns')
+ values = self.read_array('values')
+ return DataFrame(values, index=index, columns=columns)
+
+
+class SeriesFixed(GenericFixed):
+ pandas_kind = u'series'
+ attributes = ['name']
+
+ @property
+ def shape(self):
+ try:
+ return len(getattr(self.group, 'values')),
+ except (TypeError, AttributeError):
+ return None
+
+ def read(self, **kwargs):
+ kwargs = self.validate_read(kwargs)
+ index = self.read_index('index', **kwargs)
+ values = self.read_array('values', **kwargs)
+ return Series(values, index=index, name=self.name)
+
+ def write(self, obj, **kwargs):
+ super(SeriesFixed, self).write(obj, **kwargs)
+ self.write_index('index', obj.index)
+ self.write_array('values', obj.values)
+ self.attrs.name = obj.name
+
+
+class SparseFixed(GenericFixed):
+
+ def validate_read(self, kwargs):
+ """
+ we don't support start, stop kwds in Sparse
+ """
+ kwargs = super(SparseFixed, self).validate_read(kwargs)
+ if 'start' in kwargs or 'stop' in kwargs:
+ raise NotImplementedError("start and/or stop are not supported "
+ "in fixed Sparse reading")
+ return kwargs
+
+
+class SparseSeriesFixed(SparseFixed):
+ pandas_kind = u'sparse_series'
+ attributes = ['name', 'fill_value', 'kind']
+
+ def read(self, **kwargs):
+ kwargs = self.validate_read(kwargs)
+ index = self.read_index('index')
+ sp_values = self.read_array('sp_values')
+ sp_index = self.read_index('sp_index')
+ return SparseSeries(sp_values, index=index, sparse_index=sp_index,
+ kind=self.kind or u'block',
+ fill_value=self.fill_value,
+ name=self.name)
+
+ def write(self, obj, **kwargs):
+ super(SparseSeriesFixed, self).write(obj, **kwargs)
+ self.write_index('index', obj.index)
+ self.write_index('sp_index', obj.sp_index)
+ self.write_array('sp_values', obj.sp_values)
+ self.attrs.name = obj.name
+ self.attrs.fill_value = obj.fill_value
+ self.attrs.kind = obj.kind
+
+
+class SparseFrameFixed(SparseFixed):
+ pandas_kind = u'sparse_frame'
+ attributes = ['default_kind', 'default_fill_value']
+
+ def read(self, **kwargs):
+ kwargs = self.validate_read(kwargs)
+ columns = self.read_index('columns')
+ sdict = {}
+ for c in columns:
+ key = 'sparse_series_{columns}'.format(columns=c)
+ s = SparseSeriesFixed(self.parent, getattr(self.group, key))
+ s.infer_axes()
+ sdict[c] = s.read()
+ return SparseDataFrame(sdict, columns=columns,
+ default_kind=self.default_kind,
+ default_fill_value=self.default_fill_value)
+
+ def write(self, obj, **kwargs):
+ """ write it as a collection of individual sparse series """
+ super(SparseFrameFixed, self).write(obj, **kwargs)
+ for name, ss in compat.iteritems(obj):
+ key = 'sparse_series_{name}'.format(name=name)
+ if key not in self.group._v_children:
+ node = self._handle.create_group(self.group, key)
+ else:
+ node = getattr(self.group, key)
+ s = SparseSeriesFixed(self.parent, node)
+ s.write(ss)
+ self.attrs.default_fill_value = obj.default_fill_value
+ self.attrs.default_kind = obj.default_kind
+ self.write_index('columns', obj.columns)
+
+
+class BlockManagerFixed(GenericFixed):
+ attributes = ['ndim', 'nblocks']
+ is_shape_reversed = False
+
+ @property
+ def shape(self):
+ try:
+ ndim = self.ndim
+
+ # items
+ items = 0
+ for i in range(self.nblocks):
+ node = getattr(self.group, 'block{idx}_items'.format(idx=i))
+ shape = getattr(node, 'shape', None)
+ if shape is not None:
+ items += shape[0]
+
+ # data shape
+ node = getattr(self.group, 'block0_values')
+ shape = getattr(node, 'shape', None)
+ if shape is not None:
+ shape = list(shape[0:(ndim - 1)])
+ else:
+ shape = []
+
+ shape.append(items)
+
+ # hacky - this works for frames, but is reversed for panels
+ if self.is_shape_reversed:
+ shape = shape[::-1]
+
+ return shape
+ except AttributeError:
+ return None
+
+ def read(self, start=None, stop=None, **kwargs):
+ # start, stop applied to rows, so 0th axis only
+
+ kwargs = self.validate_read(kwargs)
+ select_axis = self.obj_type()._get_block_manager_axis(0)
+
+ axes = []
+ for i in range(self.ndim):
+
+ _start, _stop = (start, stop) if i == select_axis else (None, None)
+ ax = self.read_index('axis{idx}'.format(
+ idx=i), start=_start, stop=_stop)
+ axes.append(ax)
+
+ items = axes[0]
+ blocks = []
+ for i in range(self.nblocks):
+
+ blk_items = self.read_index('block{idx}_items'.format(idx=i))
+ values = self.read_array('block{idx}_values'.format(idx=i),
+ start=_start, stop=_stop)
+ blk = make_block(values,
+ placement=items.get_indexer(blk_items))
+ blocks.append(blk)
+
+ return self.obj_type(BlockManager(blocks, axes))
+
+ def write(self, obj, **kwargs):
+ super(BlockManagerFixed, self).write(obj, **kwargs)
+ data = obj._data
+ if not data.is_consolidated():
+ data = data.consolidate()
+
+ self.attrs.ndim = data.ndim
+ for i, ax in enumerate(data.axes):
+ if i == 0:
+ if not ax.is_unique:
+ raise ValueError(
+ "Columns index has to be unique for fixed format")
+ self.write_index('axis{idx}'.format(idx=i), ax)
+
+ # Supporting mixed-type DataFrame objects...nontrivial
+ self.attrs.nblocks = len(data.blocks)
+ for i, blk in enumerate(data.blocks):
+ # I have no idea why, but writing values before items fixed #2299
+ blk_items = data.items.take(blk.mgr_locs)
+ self.write_array('block{idx}_values'.format(idx=i),
+ blk.values, items=blk_items)
+ self.write_index('block{idx}_items'.format(idx=i), blk_items)
+
+
+class FrameFixed(BlockManagerFixed):
+ pandas_kind = u'frame'
+ obj_type = DataFrame
+
+
+class PanelFixed(BlockManagerFixed):
+ pandas_kind = u'wide'
+ obj_type = Panel
+ is_shape_reversed = True
+
+ def write(self, obj, **kwargs):
+ obj._consolidate_inplace()
+ return super(PanelFixed, self).write(obj, **kwargs)
+
+
+class Table(Fixed):
+
+ """ represent a table:
+ facilitate read/write of various types of tables
+
+ Attrs in Table Node
+ -------------------
+ These are attributes that are stored in the main table node; they are
+ necessary to recreate these tables when read back in.
+
+ index_axes : a list of tuples of the (original indexing axis and
+ index column)
+ non_index_axes: a list of tuples of the (original index axis and
+ columns on a non-indexing axis)
+ values_axes : a list of the columns which comprise the data of this
+ table
+ data_columns : a list of the columns that we are allowing indexing
+ (these become single columns in values_axes), or True to force all
+ columns
+ nan_rep : the string to use for nan representations for string
+ objects
+ levels : the names of levels
+ metadata : the names of the metadata columns
+
+ """
+ pandas_kind = u'wide_table'
+ table_type = None
+ levels = 1
+ is_table = True
+ is_shape_reversed = False
+
+ def __init__(self, *args, **kwargs):
+ super(Table, self).__init__(*args, **kwargs)
+ self.index_axes = []
+ self.non_index_axes = []
+ self.values_axes = []
+ self.data_columns = []
+ self.metadata = []
+ self.info = dict()
+ self.nan_rep = None
+ self.selection = None
+
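+ # Illustrative sketch (hypothetical names, assuming `import pandas as pd`):
+ # the attributes initialized above are filled in when a table-format object
+ # is written through the public API, e.g.:
+ #   >>> store = pd.HDFStore('example.h5')
+ #   >>> store.put('df', df, format='table', data_columns=['B'])
+ #   >>> store.select('df', where='B > 0')   # queries the 'B' data column
+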
+ @property
+ def table_type_short(self):
+ return self.table_type.split('_')[0]
+
+ @property
+ def format_type(self):
+ return 'table'
+
+ def __unicode__(self):
+ """ return a pretty representatgion of myself """
+ self.infer_axes()
+ dc = ",dc->[{columns}]".format(columns=(','.join(
+ self.data_columns) if len(self.data_columns) else ''))
+
+ ver = ''
+ if self.is_old_version:
+ ver = "[{version}]".format(
+ version='.'.join(str(x) for x in self.version))
+
+ return (
+ "{pandas_type:12.12}{ver} (typ->{table_type},nrows->{nrows},"
+ "ncols->{ncols},indexers->[{index_axes}]{dc})".format(
+ pandas_type=self.pandas_type, ver=ver,
+ table_type=self.table_type_short, nrows=self.nrows,
+ ncols=self.ncols,
+ index_axes=(','.join(a.name for a in self.index_axes)), dc=dc
+ ))
+
+ def __getitem__(self, c):
+ """ return the axis for c """
+ for a in self.axes:
+ if c == a.name:
+ return a
+ return None
+
+ def validate(self, other):
+ """ validate against an existing table """
+ if other is None:
+ return
+
+ if other.table_type != self.table_type:
+ raise TypeError(
+ "incompatible table_type with existing "
+ "[{other} - {self}]".format(
+ other=other.table_type, self=self.table_type))
+
+ for c in ['index_axes', 'non_index_axes', 'values_axes']:
+ sv = getattr(self, c, None)
+ ov = getattr(other, c, None)
+ if sv != ov:
+
+ # show the error for the specific axes
+ for i, sax in enumerate(sv):
+ oax = ov[i]
+ if sax != oax:
+ raise ValueError(
+ "invalid combinate of [{c}] on appending data "
+ "[{sax}] vs current table [{oax}]".format(
+ c=c, sax=sax, oax=oax))
+
+ # should never get here
+ raise Exception(
+ "invalid combinate of [{c}] on appending data [{sv}] vs "
+ "current table [{ov}]".format(c=c, sv=sv, ov=ov))
+
+ @property
+ def is_multi_index(self):
+ """the levels attribute is 1 or a list in the case of a multi-index"""
+ return isinstance(self.levels, list)
+
+ def validate_metadata(self, existing):
+ """ create / validate metadata """
+ self.metadata = [
+ c.name for c in self.values_axes if c.metadata is not None]
+
+ def validate_multiindex(self, obj):
+ """validate that we can store the multi-index; reset and return the
+ new object
+ """
+ levels = [l if l is not None else "level_{0}".format(i)
+ for i, l in enumerate(obj.index.names)]
+ try:
+ return obj.reset_index(), levels
+ except ValueError:
+ raise ValueError("duplicate names/columns in the multi-index when "
+ "storing as a table")
+
+ @property
+ def nrows_expected(self):
+ """ based on our axes, compute the expected nrows """
+ return np.prod([i.cvalues.shape[0] for i in self.index_axes])
+
+ @property
+ def is_exists(self):
+ """ has this table been created """
+ return u'table' in self.group
+
+ @property
+ def storable(self):
+ return getattr(self.group, 'table', None)
+
+ @property
+ def table(self):
+ """ return the table group (this is my storable) """
+ return self.storable
+
+ @property
+ def dtype(self):
+ return self.table.dtype
+
+ @property
+ def description(self):
+ return self.table.description
+
+ @property
+ def axes(self):
+ return itertools.chain(self.index_axes, self.values_axes)
+
+ @property
+ def ncols(self):
+ """ the number of total columns in the values axes """
+ return sum(len(a.values) for a in self.values_axes)
+
+ @property
+ def is_transposed(self):
+ return False
+
+ @property
+ def data_orientation(self):
+ """return a tuple of my permutated axes, non_indexable at the front"""
+ return tuple(itertools.chain([int(a[0]) for a in self.non_index_axes],
+ [int(a.axis) for a in self.index_axes]))
+
+ def queryables(self):
+ """ return a dict of the kinds allowable columns for this object """
+
+ # compute the values_axes queryables
+ return dict(
+ [(a.cname, a) for a in self.index_axes] +
+ [(self.storage_obj_type._AXIS_NAMES[axis], None)
+ for axis, values in self.non_index_axes] +
+ [(v.cname, v) for v in self.values_axes
+ if v.name in set(self.data_columns)]
+ )
+
+ def index_cols(self):
+ """ return a list of my index cols """
+ return [(i.axis, i.cname) for i in self.index_axes]
+
+ def values_cols(self):
+ """ return a list of my values cols """
+ return [i.cname for i in self.values_axes]
+
+ def _get_metadata_path(self, key):
+ """ return the metadata pathname for this key """
+ return "{group}/meta/{key}/meta".format(group=self.group._v_pathname,
+ key=key)
+
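+ # Illustrative note (hypothetical names): for a table stored at '/df', the
+ # metadata of a column named 'cat' (e.g. the categories of a Categorical)
+ # would live at '/df/meta/cat/meta', written below as its own table-format
+ # Series.
+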
+ def write_metadata(self, key, values):
+ """
+ write out a meta data array to the key as a fixed-format Series
+
+ Parameters
+ ----------
+ key : string
+ values : ndarray
+
+ """
+ values = Series(values)
+ self.parent.put(self._get_metadata_path(key), values, format='table',
+ encoding=self.encoding, errors=self.errors,
+ nan_rep=self.nan_rep)
+
+ def read_metadata(self, key):
+ """ return the meta data array for this key """
+ if getattr(getattr(self.group, 'meta', None), key, None) is not None:
+ return self.parent.select(self._get_metadata_path(key))
+ return None
+
+ def set_info(self):
+ """ update our table index info """
+ self.attrs.info = self.info
+
+ def set_attrs(self):
+ """ set our table type & indexables """
+ self.attrs.table_type = str(self.table_type)
+ self.attrs.index_cols = self.index_cols()
+ self.attrs.values_cols = self.values_cols()
+ self.attrs.non_index_axes = self.non_index_axes
+ self.attrs.data_columns = self.data_columns
+ self.attrs.nan_rep = self.nan_rep
+ self.attrs.encoding = self.encoding
+ self.attrs.errors = self.errors
+ self.attrs.levels = self.levels
+ self.attrs.metadata = self.metadata
+ self.set_info()
+
+ def get_attrs(self):
+ """ retrieve our attributes """
+ self.non_index_axes = getattr(
+ self.attrs, 'non_index_axes', None) or []
+ self.data_columns = getattr(
+ self.attrs, 'data_columns', None) or []
+ self.info = getattr(
+ self.attrs, 'info', None) or dict()
+ self.nan_rep = getattr(self.attrs, 'nan_rep', None)
+ self.encoding = _ensure_encoding(
+ getattr(self.attrs, 'encoding', None))
+ self.errors = _ensure_decoded(getattr(self.attrs, 'errors', 'strict'))
+ self.levels = getattr(
+ self.attrs, 'levels', None) or []
+ self.index_axes = [
+ a.infer(self) for a in self.indexables if a.is_an_indexable
+ ]
+ self.values_axes = [
+ a.infer(self) for a in self.indexables if not a.is_an_indexable
+ ]
+ self.metadata = getattr(
+ self.attrs, 'metadata', None) or []
+
+ def validate_version(self, where=None):
+ """ are we trying to operate on an old version? """
+ if where is not None:
+ if (self.version[0] <= 0 and self.version[1] <= 10 and
+ self.version[2] < 1):
+ ws = incompatibility_doc % '.'.join(
+ [str(x) for x in self.version])
+ warnings.warn(ws, IncompatibilityWarning)
+
+ def validate_min_itemsize(self, min_itemsize):
+ """validate the min_itemisze doesn't contain items that are not in the
+ axes this needs data_columns to be defined
+ """
+ if min_itemsize is None:
+ return
+ if not isinstance(min_itemsize, dict):
+ return
+
+ q = self.queryables()
+ for k, v in min_itemsize.items():
+
+ # ok, apply generally
+ if k == 'values':
+ continue
+ if k not in q:
+ raise ValueError(
+ "min_itemsize has the key [{key}] which is not an axis or "
+ "data_column".format(key=k))
+
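+ # Illustrative sketch (hypothetical names): min_itemsize reaches the check
+ # above from the public append/put API; keys must be 'values', an axis, or
+ # a declared data column, e.g.:
+ #   >>> store.append('df', df, data_columns=['B'],
+ #   ...              min_itemsize={'values': 50, 'B': 30})
+ # Any other key raises the ValueError above.
+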
+ @property
+ def indexables(self):
+ """ create/cache the indexables if they don't exist """
+ if self._indexables is None:
+
+ self._indexables = []
+
+ # index columns
+ self._indexables.extend([
+ IndexCol(name=name, axis=axis, pos=i)
+ for i, (axis, name) in enumerate(self.attrs.index_cols)
+ ])
+
+ # values columns
+ dc = set(self.data_columns)
+ base_pos = len(self._indexables)
+
+ def f(i, c):
+ klass = DataCol
+ if c in dc:
+ klass = DataIndexableCol
+ return klass.create_for_block(i=i, name=c, pos=base_pos + i,
+ version=self.version)
+
+ self._indexables.extend(
+ [f(i, c) for i, c in enumerate(self.attrs.values_cols)])
+
+ return self._indexables
+
+ def create_index(self, columns=None, optlevel=None, kind=None):
+ """
+ Create a pytables index on the specified columns
+ note: cannot index Time64Col() or ComplexCol currently;
+ PyTables must be >= 3.0
+
+ Parameters
+ ----------
+ columns : False (don't create an index), True (create all columns
+ index), None or list_like (the indexers to index)
+ optlevel: optimization level (defaults to 6)
+ kind : kind of index (defaults to 'medium')
+
+ Exceptions
+ ----------
+ raises if the node is not a table
+
+ """
+
+ if not self.infer_axes():
+ return
+ if columns is False:
+ return
+
+ # index all indexables and data_columns
+ if columns is None or columns is True:
+ columns = [a.cname for a in self.axes if a.is_data_indexable]
+ if not isinstance(columns, (tuple, list)):
+ columns = [columns]
+
+ kw = dict()
+ if optlevel is not None:
+ kw['optlevel'] = optlevel
+ if kind is not None:
+ kw['kind'] = kind
+
+ table = self.table
+ for c in columns:
+ v = getattr(table.cols, c, None)
+ if v is not None:
+
+ # remove the index if the kind/optlevel have changed
+ if v.is_indexed:
+ index = v.index
+ cur_optlevel = index.optlevel
+ cur_kind = index.kind
+
+ if kind is not None and cur_kind != kind:
+ v.remove_index()
+ else:
+ kw['kind'] = cur_kind
+
+ if optlevel is not None and cur_optlevel != optlevel:
+ v.remove_index()
+ else:
+ kw['optlevel'] = cur_optlevel
+
+ # create the index
+ if not v.is_indexed:
+ if v.type.startswith('complex'):
+ raise TypeError(
+ 'Columns containing complex values can be stored '
+ 'but cannot'
+ ' be indexed when using table format. Either use '
+ 'fixed format, set index=False, or do not include '
+ 'the columns containing complex values to '
+ 'data_columns when initializing the table.')
+ v.create_index(**kw)
+
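+ # Illustrative sketch (hypothetical names): create_index above is what the
+ # public HDFStore.create_table_index call drives, e.g.:
+ #   >>> store.append('df', df, data_columns=['B'], index=False)
+ #   >>> store.create_table_index('df', columns=['B'],
+ #   ...                          optlevel=9, kind='full')
+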
+ def read_axes(self, where, **kwargs):
+ """create and return the axes sniffed from the table: return boolean
+ for success
+ """
+
+ # validate the version
+ self.validate_version(where)
+
+ # infer the data kind
+ if not self.infer_axes():
+ return False
+
+ # create the selection
+ self.selection = Selection(self, where=where, **kwargs)
+ values = self.selection.select()
+
+ # convert the data
+ for a in self.axes:
+ a.set_info(self.info)
+ a.convert(values, nan_rep=self.nan_rep, encoding=self.encoding,
+ errors=self.errors)
+
+ return True
+
+ def get_object(self, obj):
+ """ return the data for this obj """
+ return obj
+
+ def validate_data_columns(self, data_columns, min_itemsize):
+ """take the input data_columns and min_itemize and create a data
+ columns spec
+ """
+
+ if not len(self.non_index_axes):
+ return []
+
+ axis, axis_labels = self.non_index_axes[0]
+ info = self.info.get(axis, dict())
+ if info.get('type') == 'MultiIndex' and data_columns:
+ raise ValueError("cannot use a multi-index on axis [{0}] with "
+ "data_columns {1}".format(axis, data_columns))
+
+ # evaluate the passed data_columns, True == use all columns
+ # take only valid axis labels
+ if data_columns is True:
+ data_columns = list(axis_labels)
+ elif data_columns is None:
+ data_columns = []
+
+ # if min_itemsize is a dict, add the keys (exclude 'values')
+ if isinstance(min_itemsize, dict):
+
+ existing_data_columns = set(data_columns)
+ data_columns.extend([
+ k for k in min_itemsize.keys()
+ if k != 'values' and k not in existing_data_columns
+ ])
+
+ # return valid columns in the order of our axis
+ return [c for c in data_columns if c in axis_labels]
+
+ def create_axes(self, axes, obj, validate=True, nan_rep=None,
+ data_columns=None, min_itemsize=None, **kwargs):
+ """ create and return the axes
+ legacy tables create an indexable column, indexable index,
+ non-indexable fields
+
+ Parameters
+ ----------
+ axes: a list of the axes in order to create (names or numbers of
+ the axes)
+ obj : the object to create axes on
+ validate: validate the obj against an existing object already
+ written
+ min_itemsize: a dict of the min size for a column in bytes
+ nan_rep : a value to use for string column nan_rep
+ encoding : the encoding for string values
+ data_columns : a list of columns that we want to create separate to
+ allow indexing (or True will force all columns)
+
+ """
+
+ # set the default axes if needed
+ if axes is None:
+ try:
+ axes = _AXES_MAP[type(obj)]
+ except KeyError:
+ raise TypeError(
+ "cannot properly create the storer for: [group->{group},"
+ "value->{value}]".format(
+ group=self.group._v_name, value=type(obj)))
+
+ # map axes to numbers
+ axes = [obj._get_axis_number(a) for a in axes]
+
+ # do we have an existing table (if so, use its axes & data_columns)
+ if self.infer_axes():
+ existing_table = self.copy()
+ existing_table.infer_axes()
+ axes = [a.axis for a in existing_table.index_axes]
+ data_columns = existing_table.data_columns
+ nan_rep = existing_table.nan_rep
+ self.encoding = existing_table.encoding
+ self.errors = existing_table.errors
+ self.info = copy.copy(existing_table.info)
+ else:
+ existing_table = None
+
+ # currently only support ndim-1 axes
+ if len(axes) != self.ndim - 1:
+ raise ValueError(
+ "currently only support ndim-1 indexers in an AppendableTable")
+
+ # create according to the new data
+ self.non_index_axes = []
+ self.data_columns = []
+
+ # nan_representation
+ if nan_rep is None:
+ nan_rep = 'nan'
+
+ self.nan_rep = nan_rep
+
+ # create axes to index and non_index
+ index_axes_map = dict()
+ for i, a in enumerate(obj.axes):
+
+ if i in axes:
+ name = obj._AXIS_NAMES[i]
+ index_axes_map[i] = _convert_index(
+ a, self.encoding, self.errors, self.format_type
+ ).set_name(name).set_axis(i)
+ else:
+
+ # we might be able to change the axes on the appending data if
+ # necessary
+ append_axis = list(a)
+ if existing_table is not None:
+ indexer = len(self.non_index_axes)
+ exist_axis = existing_table.non_index_axes[indexer][1]
+ if not array_equivalent(np.array(append_axis),
+ np.array(exist_axis)):
+
+ # ahah! -> reindex
+ if array_equivalent(np.array(sorted(append_axis)),
+ np.array(sorted(exist_axis))):
+ append_axis = exist_axis
+
+ # the non_index_axes info
+ info = _get_info(self.info, i)
+ info['names'] = list(a.names)
+ info['type'] = a.__class__.__name__
+
+ self.non_index_axes.append((i, append_axis))
+
+ # set axis positions (based on the axes)
+ self.index_axes = [
+ index_axes_map[a].set_pos(j).update_info(self.info)
+ for j, a in enumerate(axes)
+ ]
+ j = len(self.index_axes)
+
+ # check for column conflicts
+ for a in self.axes:
+ a.maybe_set_size(min_itemsize=min_itemsize)
+
+ # reindex by our non_index_axes & compute data_columns
+ for a in self.non_index_axes:
+ obj = _reindex_axis(obj, a[0], a[1])
+
+ def get_blk_items(mgr, blocks):
+ return [mgr.items.take(blk.mgr_locs) for blk in blocks]
+
+ # figure out data_columns and get out blocks
+ block_obj = self.get_object(obj)._consolidate()
+ blocks = block_obj._data.blocks
+ blk_items = get_blk_items(block_obj._data, blocks)
+ if len(self.non_index_axes):
+ axis, axis_labels = self.non_index_axes[0]
+ data_columns = self.validate_data_columns(
+ data_columns, min_itemsize)
+ if len(data_columns):
+ mgr = block_obj.reindex(
+ Index(axis_labels).difference(Index(data_columns)),
+ axis=axis
+ )._data
+
+ blocks = list(mgr.blocks)
+ blk_items = get_blk_items(mgr, blocks)
+ for c in data_columns:
+ mgr = block_obj.reindex([c], axis=axis)._data
+ blocks.extend(mgr.blocks)
+ blk_items.extend(get_blk_items(mgr, mgr.blocks))
+
+ # reorder the blocks in the same order as the existing_table if we can
+ if existing_table is not None:
+ by_items = {tuple(b_items.tolist()): (b, b_items)
+ for b, b_items in zip(blocks, blk_items)}
+ new_blocks = []
+ new_blk_items = []
+ for ea in existing_table.values_axes:
+ items = tuple(ea.values)
+ try:
+ b, b_items = by_items.pop(items)
+ new_blocks.append(b)
+ new_blk_items.append(b_items)
+ except (IndexError, KeyError):
+ raise ValueError(
+ "cannot match existing table structure for [{items}] "
+ "on appending data".format(
+ items=(','.join(pprint_thing(item) for
+ item in items))))
+ blocks = new_blocks
+ blk_items = new_blk_items
+
+ # add my values
+ self.values_axes = []
+ for i, (b, b_items) in enumerate(zip(blocks, blk_items)):
+
+ # shape of the data column are the indexable axes
+ klass = DataCol
+ name = None
+
+ # we have a data_column
+ if (data_columns and len(b_items) == 1 and
+ b_items[0] in data_columns):
+ klass = DataIndexableCol
+ name = b_items[0]
+ self.data_columns.append(name)
+
+ # make sure that we match up the existing columns
+ # if we have an existing table
+ if existing_table is not None and validate:
+ try:
+ existing_col = existing_table.values_axes[i]
+ except (IndexError, KeyError):
+ raise ValueError(
+ "Incompatible appended table [{blocks}]"
+ "with existing table [{table}]".format(
+ blocks=blocks,
+ table=existing_table.values_axes))
+ else:
+ existing_col = None
+
+ try:
+ col = klass.create_for_block(
+ i=i, name=name, version=self.version)
+ col.set_atom(block=b, block_items=b_items,
+ existing_col=existing_col,
+ min_itemsize=min_itemsize,
+ nan_rep=nan_rep,
+ encoding=self.encoding,
+ errors=self.errors,
+ info=self.info)
+ col.set_pos(j)
+
+ self.values_axes.append(col)
+ except (NotImplementedError, ValueError, TypeError) as e:
+ raise e
+ except Exception as detail:
+ raise Exception(
+ "cannot find the correct atom type -> "
+ "[dtype->{name},items->{items}] {detail!s}".format(
+ name=b.dtype.name, items=b_items, detail=detail))
+ j += 1
+
+ # validate our min_itemsize
+ self.validate_min_itemsize(min_itemsize)
+
+ # validate our metadata
+ self.validate_metadata(existing_table)
+
+ # validate the axes if we have an existing table
+ if validate:
+ self.validate(existing_table)
+
+ def process_axes(self, obj, columns=None):
+ """ process axes filters """
+
+ # make a copy to avoid side effects
+ if columns is not None:
+ columns = list(columns)
+
+ # make sure to include levels if we have them
+ if columns is not None and self.is_multi_index:
+ for n in self.levels:
+ if n not in columns:
+ columns.insert(0, n)
+
+ # reorder by any non_index_axes & limit to the select columns
+ for axis, labels in self.non_index_axes:
+ obj = _reindex_axis(obj, axis, labels, columns)
+
+ # apply the selection filters (but keep in the same order)
+ if self.selection.filter is not None:
+ for field, op, filt in self.selection.filter.format():
+
+ def process_filter(field, filt):
+
+ for axis_name in obj._AXIS_NAMES.values():
+ axis_number = obj._get_axis_number(axis_name)
+ axis_values = obj._get_axis(axis_name)
+
+ # see if the field is the name of an axis
+ if field == axis_name:
+
+ # if we have a multi-index, then need to include
+ # the levels
+ if self.is_multi_index:
+ filt = filt.union(Index(self.levels))
+
+ takers = op(axis_values, filt)
+ return obj.loc._getitem_axis(takers,
+ axis=axis_number)
+
+ # this might be the name of a field IN an axis
+ elif field in axis_values:
+
+ # we need to filter on this dimension
+ values = ensure_index(getattr(obj, field).values)
+ filt = ensure_index(filt)
+
+ # hack until we support reversed dim flags
+ if isinstance(obj, DataFrame):
+ axis_number = 1 - axis_number
+ takers = op(values, filt)
+ return obj.loc._getitem_axis(takers,
+ axis=axis_number)
+
+ raise ValueError("cannot find the field [{field}] for "
+ "filtering!".format(field=field))
+
+ obj = process_filter(field, filt)
+
+ return obj
+
+ def create_description(self, complib=None, complevel=None,
+ fletcher32=False, expectedrows=None):
+ """ create the description of the table from the axes & values """
+
+ # use the provided expectedrows if it was passed
+ if expectedrows is None:
+ expectedrows = max(self.nrows_expected, 10000)
+
+ d = dict(name='table', expectedrows=expectedrows)
+
+ # description from the axes & values
+ d['description'] = {a.cname: a.typ for a in self.axes}
+
+ if complib:
+ if complevel is None:
+ complevel = self._complevel or 9
+ filters = _tables().Filters(
+ complevel=complevel, complib=complib,
+ fletcher32=fletcher32 or self._fletcher32)
+ d['filters'] = filters
+ elif self._filters is not None:
+ d['filters'] = self._filters
+
+ return d
+
+ def read_coordinates(self, where=None, start=None, stop=None, **kwargs):
+ """select coordinates (row numbers) from a table; return the
+ coordinates object
+ """
+
+ # validate the version
+ self.validate_version(where)
+
+ # infer the data kind
+ if not self.infer_axes():
+ return False
+
+ # create the selection
+ self.selection = Selection(
+ self, where=where, start=start, stop=stop, **kwargs)
+ coords = self.selection.select_coords()
+ if self.selection.filter is not None:
+ for field, op, filt in self.selection.filter.format():
+ data = self.read_column(
+ field, start=coords.min(), stop=coords.max() + 1)
+ coords = coords[
+ op(data.iloc[coords - coords.min()], filt).values]
+
+ return Index(coords)
+
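+ # Illustrative sketch (hypothetical names): coordinates surface through the
+ # public select_as_coordinates API and can be passed back into select:
+ #   >>> coords = store.select_as_coordinates('df', 'index > 5')
+ #   >>> store.select('df', where=coords)
+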
+ def read_column(self, column, where=None, start=None, stop=None):
+ """return a single column from the table, generally only indexables
+ are interesting
+ """
+
+ # validate the version
+ self.validate_version()
+
+ # infer the data kind
+ if not self.infer_axes():
+ return False
+
+ if where is not None:
+ raise TypeError("read_column does not currently accept a where "
+ "clause")
+
+ # find the axes
+ for a in self.axes:
+ if column == a.name:
+
+ if not a.is_data_indexable:
+ raise ValueError(
+ "column [{column}] can not be extracted individually; "
+ "it is not data indexable".format(column=column))
+
+ # column must be an indexable or a data column
+ c = getattr(self.table.cols, column)
+ a.set_info(self.info)
+ return Series(_set_tz(a.convert(c[start:stop],
+ nan_rep=self.nan_rep,
+ encoding=self.encoding,
+ errors=self.errors
+ ).take_data(),
+ a.tz, True), name=column)
+
+ raise KeyError(
+ "column [{column}] not found in the table".format(column=column))
+
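+ # Illustrative sketch (hypothetical names): read_column above backs the
+ # public HDFStore.select_column API; the column must be an indexable or a
+ # declared data column:
+ #   >>> store.append('df', df, data_columns=['B'])
+ #   >>> store.select_column('df', 'index')   # the row index as a Series
+ #   >>> store.select_column('df', 'B')       # works: 'B' is a data column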
+
+class WORMTable(Table):
+
+ """ a write-once read-many table: this format DOES NOT ALLOW appending to a
+ table. writing is a one-time operation; the data are stored in a format
+ that allows for searching the data on disk
+ """
+ table_type = u'worm'
+
+ def read(self, **kwargs):
+ """ read the indices and the indexing array, calculate offset rows and
+ return """
+ raise NotImplementedError("WORMTable needs to implement read")
+
+ def write(self, **kwargs):
+ """ write in a format that we can search later on (but cannot append
+ to): write out the indices and the values using _write_array
+ (e.g. a CArray); create an indexing table so that we can search
+ """
+ raise NotImplementedError("WORKTable needs to implement write")
+
+
+class LegacyTable(Table):
+
+ """ an appendable table: allow append/query/delete operations to a
+ (possibly) already existing appendable table this table ALLOWS
+ append (but doesn't require them), and stores the data in a format
+ that can be easily searched
+
+ """
+ _indexables = [
+ IndexCol(name='index', axis=1, pos=0),
+ IndexCol(name='column', axis=2, pos=1, index_kind='columns_kind'),
+ DataCol(name='fields', cname='values', kind_attr='fields', pos=2)
+ ]
+ table_type = u'legacy'
+ ndim = 3
+
+ def write(self, **kwargs):
+ raise TypeError("write operations are not allowed on legacy tables!")
+
+ def read(self, where=None, columns=None, **kwargs):
+ """we have n indexable columns, with an arbitrary number of data
+ axes
+ """
+
+ if not self.read_axes(where=where, **kwargs):
+ return None
+
+ lst_vals = [a.values for a in self.index_axes]
+ labels, levels = _factorize_from_iterables(lst_vals)
+ # labels and levels are tuples but lists are expected
+ labels = list(labels)
+ levels = list(levels)
+ N = [len(lvl) for lvl in levels]
+
+ # compute the key
+ key = _factor_indexer(N[1:], labels)
+
+ objs = []
+ if len(unique(key)) == len(key):
+
+ sorter, _ = algos.groupsort_indexer(
+ ensure_int64(key), np.prod(N))
+ sorter = ensure_platform_int(sorter)
+
+ # create the objs
+ for c in self.values_axes:
+
+ # the data need to be sorted
+ sorted_values = c.take_data().take(sorter, axis=0)
+ if sorted_values.ndim == 1:
+ sorted_values = sorted_values.reshape(
+ (sorted_values.shape[0], 1))
+
+ take_labels = [l.take(sorter) for l in labels]
+ items = Index(c.values)
+ block = _block2d_to_blocknd(
+ values=sorted_values, placement=np.arange(len(items)),
+ shape=tuple(N), labels=take_labels, ref_items=items)
+
+ # create the object
+ mgr = BlockManager([block], [items] + levels)
+ obj = self.obj_type(mgr)
+
+ # permute if needed
+ if self.is_transposed:
+ obj = obj.transpose(
+ *tuple(Series(self.data_orientation).argsort()))
+
+ objs.append(obj)
+
+ else:
+ warnings.warn(duplicate_doc, DuplicateWarning, stacklevel=5)
+
+ # reconstruct
+ long_index = MultiIndex.from_arrays(
+ [i.values for i in self.index_axes])
+
+ for c in self.values_axes:
+ lp = DataFrame(c.data, index=long_index, columns=c.values)
+
+ # need a better algorithm
+ tuple_index = long_index.values
+
+ unique_tuples = unique(tuple_index)
+ unique_tuples = com.asarray_tuplesafe(unique_tuples)
+
+ indexer = match(unique_tuples, tuple_index)
+ indexer = ensure_platform_int(indexer)
+
+ new_index = long_index.take(indexer)
+ new_values = lp.values.take(indexer, axis=0)
+
+ lp = DataFrame(new_values, index=new_index, columns=lp.columns)
+ objs.append(lp.to_panel())
+
+ # create the composite object
+ if len(objs) == 1:
+ wp = objs[0]
+ else:
+ wp = concat(objs, axis=0, verify_integrity=False)._consolidate()
+
+ # apply the selection filters & axis orderings
+ wp = self.process_axes(wp, columns=columns)
+
+ return wp
+
+
+class LegacyFrameTable(LegacyTable):
+
+ """ support the legacy frame table """
+ pandas_kind = u'frame_table'
+ table_type = u'legacy_frame'
+ obj_type = Panel
+
+ def read(self, *args, **kwargs):
+ return super(LegacyFrameTable, self).read(*args, **kwargs)['value']
+
+
+class LegacyPanelTable(LegacyTable):
+
+ """ support the legacy panel table """
+ table_type = u'legacy_panel'
+ obj_type = Panel
+
+
+class AppendableTable(LegacyTable):
+
+ """ suppor the new appendable table formats """
+ _indexables = None
+ table_type = u'appendable'
+
+ def write(self, obj, axes=None, append=False, complib=None,
+ complevel=None, fletcher32=None, min_itemsize=None,
+ chunksize=None, expectedrows=None, dropna=False, **kwargs):
+
+ if not append and self.is_exists:
+ self._handle.remove_node(self.group, 'table')
+
+ # create the axes
+ self.create_axes(axes=axes, obj=obj, validate=append,
+ min_itemsize=min_itemsize,
+ **kwargs)
+
+ for a in self.axes:
+ a.validate(self, append)
+
+ if not self.is_exists:
+
+ # create the table
+ options = self.create_description(complib=complib,
+ complevel=complevel,
+ fletcher32=fletcher32,
+ expectedrows=expectedrows)
+
+ # set the table attributes
+ self.set_attrs()
+
+ # create the table
+ self._handle.create_table(self.group, **options)
+ else:
+ pass
+ # table = self.table
+
+ # update my info
+ self.set_info()
+
+ # validate the axes and set the kinds
+ for a in self.axes:
+ a.validate_and_set(self, append)
+
+ # add the rows
+ self.write_data(chunksize, dropna=dropna)
+
+ def write_data(self, chunksize, dropna=False):
+ """ we form the data into a 2-d including indexes,values,mask
+ write chunk-by-chunk """
+
+ names = self.dtype.names
+ nrows = self.nrows_expected
+
+ # if dropna==True, then drop ALL nan rows
+ masks = []
+ if dropna:
+
+ for a in self.values_axes:
+
+ # figure the mask: only do if we can successfully process this
+ # column, otherwise ignore the mask
+ mask = isna(a.data).all(axis=0)
+ if isinstance(mask, np.ndarray):
+ masks.append(mask.astype('u1', copy=False))
+
+ # consolidate masks
+ if len(masks):
+ mask = masks[0]
+ for m in masks[1:]:
+ mask = mask & m
+ mask = mask.ravel()
+ else:
+ mask = None
+
+ # broadcast the indexes if needed
+ indexes = [a.cvalues for a in self.index_axes]
+ nindexes = len(indexes)
+ bindexes = []
+ for i, idx in enumerate(indexes):
+
+ # broadcast to all other indexes except myself
+ if i > 0 and i < nindexes:
+ repeater = np.prod(
+ [indexes[bi].shape[0] for bi in range(0, i)])
+ idx = np.tile(idx, repeater)
+
+ if i < nindexes - 1:
+ repeater = np.prod([indexes[bi].shape[0]
+ for bi in range(i + 1, nindexes)])
+ idx = np.repeat(idx, repeater)
+
+ bindexes.append(idx)
+
+ # transpose the values so first dimension is last
+ # reshape the values if needed
+ values = [a.take_data() for a in self.values_axes]
+ values = [v.transpose(np.roll(np.arange(v.ndim), v.ndim - 1))
+ for v in values]
+ bvalues = []
+ for i, v in enumerate(values):
+ new_shape = (nrows,) + self.dtype[names[nindexes + i]].shape
+ bvalues.append(values[i].reshape(new_shape))
+
+ # write the chunks
+ if chunksize is None:
+ chunksize = 100000
+
+ rows = np.empty(min(chunksize, nrows), dtype=self.dtype)
+ chunks = int(nrows / chunksize) + 1
+ for i in range(chunks):
+ start_i = i * chunksize
+ end_i = min((i + 1) * chunksize, nrows)
+ if start_i >= end_i:
+ break
+
+ self.write_data_chunk(
+ rows,
+ indexes=[a[start_i:end_i] for a in bindexes],
+ mask=mask[start_i:end_i] if mask is not None else None,
+ values=[v[start_i:end_i] for v in bvalues])
+
+ def write_data_chunk(self, rows, indexes, mask, values):
+ """
+ Parameters
+ ----------
+        rows : an empty record array used as the write buffer for the chunk
+        indexes : a list of arrays of the index values
+        mask : an array marking the all-nan rows to drop, or None
+        values : a list of arrays of the values
+ """
+
+ # 0 len
+ for v in values:
+ if not np.prod(v.shape):
+ return
+
+ try:
+ nrows = indexes[0].shape[0]
+ if nrows != len(rows):
+ rows = np.empty(nrows, dtype=self.dtype)
+ names = self.dtype.names
+ nindexes = len(indexes)
+
+ # indexes
+ for i, idx in enumerate(indexes):
+ rows[names[i]] = idx
+
+ # values
+ for i, v in enumerate(values):
+ rows[names[i + nindexes]] = v
+
+ # mask
+ if mask is not None:
+ m = ~mask.ravel().astype(bool, copy=False)
+ if not m.all():
+ rows = rows[m]
+
+ except Exception as detail:
+ raise Exception(
+ "cannot create row-data -> {detail}".format(detail=detail))
+
+ try:
+ if len(rows):
+ self.table.append(rows)
+ self.table.flush()
+ except Exception as detail:
+ raise TypeError(
+ "tables cannot write this data -> {detail}".format(
+ detail=detail))
+
+ def delete(self, where=None, start=None, stop=None, **kwargs):
+
+ # delete all rows (and return the nrows)
+ if where is None or not len(where):
+ if start is None and stop is None:
+ nrows = self.nrows
+ self._handle.remove_node(self.group, recursive=True)
+ else:
+ # pytables<3.0 would remove a single row with stop=None
+ if stop is None:
+ stop = self.nrows
+ nrows = self.table.remove_rows(start=start, stop=stop)
+ self.table.flush()
+ return nrows
+
+ # infer the data kind
+ if not self.infer_axes():
+ return None
+
+ # create the selection
+ table = self.table
+ self.selection = Selection(
+ self, where, start=start, stop=stop, **kwargs)
+ values = self.selection.select_coords()
+
+ # delete the rows in reverse order
+ sorted_series = Series(values).sort_values()
+ ln = len(sorted_series)
+
+ if ln:
+
+ # construct groups of consecutive rows
+ diff = sorted_series.diff()
+ groups = list(diff[diff > 1].index)
+
+ # 1 group
+ if not len(groups):
+ groups = [0]
+
+ # final element
+ if groups[-1] != ln:
+ groups.append(ln)
+
+ # initial element
+ if groups[0] != 0:
+ groups.insert(0, 0)
+
+ # we must remove in reverse order!
+ pg = groups.pop()
+ for g in reversed(groups):
+ rows = sorted_series.take(lrange(g, pg))
+                table.remove_rows(start=rows[rows.index[0]],
+                                  stop=rows[rows.index[-1]] + 1)
+ pg = g
+
+ self.table.flush()
+
+ # return the number of rows removed
+ return ln
+
+
+class AppendableFrameTable(AppendableTable):
+
+ """ suppor the new appendable table formats """
+ pandas_kind = u'frame_table'
+ table_type = u'appendable_frame'
+ ndim = 2
+ obj_type = DataFrame
+
+ @property
+ def is_transposed(self):
+ return self.index_axes[0].axis == 1
+
+ def get_object(self, obj):
+ """ these are written transposed """
+ if self.is_transposed:
+ obj = obj.T
+ return obj
+
+ def read(self, where=None, columns=None, **kwargs):
+
+ if not self.read_axes(where=where, **kwargs):
+ return None
+
+ info = (self.info.get(self.non_index_axes[0][0], dict())
+ if len(self.non_index_axes) else dict())
+ index = self.index_axes[0].values
+ frames = []
+ for a in self.values_axes:
+
+ # we could have a multi-index constructor here
+            # ensure_index doesn't recognize our list-of-tuples here
+ if info.get('type') == 'MultiIndex':
+ cols = MultiIndex.from_tuples(a.values)
+ else:
+ cols = Index(a.values)
+ names = info.get('names')
+ if names is not None:
+ cols.set_names(names, inplace=True)
+
+ if self.is_transposed:
+ values = a.cvalues
+ index_ = cols
+ cols_ = Index(index, name=getattr(index, 'name', None))
+ else:
+ values = a.cvalues.T
+ index_ = Index(index, name=getattr(index, 'name', None))
+ cols_ = cols
+
+ # if we have a DataIndexableCol, its shape will only be 1 dim
+ if values.ndim == 1 and isinstance(values, np.ndarray):
+ values = values.reshape((1, values.shape[0]))
+
+ block = make_block(values, placement=np.arange(len(cols_)))
+ mgr = BlockManager([block], [cols_, index_])
+ frames.append(DataFrame(mgr))
+
+ if len(frames) == 1:
+ df = frames[0]
+ else:
+ df = concat(frames, axis=1)
+
+ # apply the selection filters & axis orderings
+ df = self.process_axes(df, columns=columns)
+
+ return df
+
+
+class AppendableSeriesTable(AppendableFrameTable):
+ """ support the new appendable table formats """
+ pandas_kind = u'series_table'
+ table_type = u'appendable_series'
+ ndim = 2
+ obj_type = Series
+ storage_obj_type = DataFrame
+
+ @property
+ def is_transposed(self):
+ return False
+
+ def get_object(self, obj):
+ return obj
+
+ def write(self, obj, data_columns=None, **kwargs):
+ """ we are going to write this as a frame table """
+ if not isinstance(obj, DataFrame):
+ name = obj.name or 'values'
+ obj = DataFrame({name: obj}, index=obj.index)
+ obj.columns = [name]
+ return super(AppendableSeriesTable, self).write(
+ obj=obj, data_columns=obj.columns.tolist(), **kwargs)
+
+ def read(self, columns=None, **kwargs):
+
+ is_multi_index = self.is_multi_index
+ if columns is not None and is_multi_index:
+ for n in self.levels:
+ if n not in columns:
+ columns.insert(0, n)
+ s = super(AppendableSeriesTable, self).read(columns=columns, **kwargs)
+ if is_multi_index:
+ s.set_index(self.levels, inplace=True)
+
+ s = s.iloc[:, 0]
+
+ # remove the default name
+ if s.name == 'values':
+ s.name = None
+ return s
+
+
+class AppendableMultiSeriesTable(AppendableSeriesTable):
+ """ support the new appendable table formats """
+ pandas_kind = u'series_table'
+ table_type = u'appendable_multiseries'
+
+ def write(self, obj, **kwargs):
+ """ we are going to write this as a frame table """
+ name = obj.name or 'values'
+ obj, self.levels = self.validate_multiindex(obj)
+ cols = list(self.levels)
+ cols.append(name)
+ obj.columns = cols
+ return super(AppendableMultiSeriesTable, self).write(obj=obj, **kwargs)
+
+
+class GenericTable(AppendableFrameTable):
+ """ a table that read/writes the generic pytables table format """
+ pandas_kind = u'frame_table'
+ table_type = u'generic_table'
+ ndim = 2
+ obj_type = DataFrame
+
+ @property
+ def pandas_type(self):
+ return self.pandas_kind
+
+ @property
+ def storable(self):
+ return getattr(self.group, 'table', None) or self.group
+
+ def get_attrs(self):
+ """ retrieve our attributes """
+ self.non_index_axes = []
+ self.nan_rep = None
+ self.levels = []
+
+ self.index_axes = [a.infer(self)
+ for a in self.indexables if a.is_an_indexable]
+ self.values_axes = [a.infer(self)
+ for a in self.indexables if not a.is_an_indexable]
+ self.data_columns = [a.name for a in self.values_axes]
+
+ @property
+ def indexables(self):
+ """ create the indexables from the table description """
+ if self._indexables is None:
+
+ d = self.description
+
+            # the index column is just a simple index
+ self._indexables = [GenericIndexCol(name='index', axis=0)]
+
+ for i, n in enumerate(d._v_names):
+
+ dc = GenericDataIndexableCol(
+ name=n, pos=i, values=[n], version=self.version)
+ self._indexables.append(dc)
+
+ return self._indexables
+
+ def write(self, **kwargs):
+ raise NotImplementedError("cannot write on an generic table")
+
+
+class AppendableMultiFrameTable(AppendableFrameTable):
+
+ """ a frame with a multi-index """
+ table_type = u'appendable_multiframe'
+ obj_type = DataFrame
+ ndim = 2
+ _re_levels = re.compile(r"^level_\d+$")
+
+ @property
+ def table_type_short(self):
+ return u'appendable_multi'
+
+ def write(self, obj, data_columns=None, **kwargs):
+ if data_columns is None:
+ data_columns = []
+ elif data_columns is True:
+ data_columns = obj.columns.tolist()
+ obj, self.levels = self.validate_multiindex(obj)
+ for n in self.levels:
+ if n not in data_columns:
+ data_columns.insert(0, n)
+ return super(AppendableMultiFrameTable, self).write(
+ obj=obj, data_columns=data_columns, **kwargs)
+
+ def read(self, **kwargs):
+
+ df = super(AppendableMultiFrameTable, self).read(**kwargs)
+ df = df.set_index(self.levels)
+
+ # remove names for 'level_%d'
+ df.index = df.index.set_names([
+ None if self._re_levels.search(l) else l for l in df.index.names
+ ])
+
+ return df
+
+
+class AppendablePanelTable(AppendableTable):
+
+ """ suppor the new appendable table formats """
+ table_type = u'appendable_panel'
+ ndim = 3
+ obj_type = Panel
+
+ def get_object(self, obj):
+ """ these are written transposed """
+ if self.is_transposed:
+ obj = obj.transpose(*self.data_orientation)
+ return obj
+
+ @property
+ def is_transposed(self):
+ return self.data_orientation != tuple(range(self.ndim))
+
+
+def _reindex_axis(obj, axis, labels, other=None):
+ ax = obj._get_axis(axis)
+ labels = ensure_index(labels)
+
+ # try not to reindex even if other is provided
+ # if it equals our current index
+ if other is not None:
+ other = ensure_index(other)
+ if (other is None or labels.equals(other)) and labels.equals(ax):
+ return obj
+
+ labels = ensure_index(labels.unique())
+ if other is not None:
+ labels = ensure_index(other.unique()).intersection(labels, sort=False)
+ if not labels.equals(ax):
+ slicer = [slice(None, None)] * obj.ndim
+ slicer[axis] = labels
+ obj = obj.loc[tuple(slicer)]
+ return obj
+
+
+def _get_info(info, name):
+ """ get/create the info for this name """
+ try:
+ idx = info[name]
+ except KeyError:
+ idx = info[name] = dict()
+ return idx
+
+# tz to/from coercion
+
+
+def _get_tz(tz):
+ """ for a tz-aware type, return an encoded zone """
+ zone = timezones.get_timezone(tz)
+ if zone is None:
+ zone = tz.utcoffset().total_seconds()
+ return zone
+
+
+def _set_tz(values, tz, preserve_UTC=False, coerce=False):
+ """
+ coerce the values to a DatetimeIndex if tz is set
+ preserve the input shape if possible
+
+ Parameters
+ ----------
+ values : ndarray
+ tz : string/pickled tz object
+ preserve_UTC : boolean,
+ preserve the UTC of the result
+ coerce : if we do not have a passed timezone, coerce to M8[ns] ndarray
+ """
+ if tz is not None:
+ name = getattr(values, 'name', None)
+ values = values.ravel()
+ tz = timezones.get_timezone(_ensure_decoded(tz))
+ values = DatetimeIndex(values, name=name)
+ if values.tz is None:
+ values = values.tz_localize('UTC').tz_convert(tz)
+ if preserve_UTC:
+ if tz == 'UTC':
+ values = list(values)
+ elif coerce:
+ values = np.asarray(values, dtype='M8[ns]')
+
+ return values
+
+
+def _convert_index(index, encoding=None, errors='strict', format_type=None):
+ index_name = getattr(index, 'name', None)
+
+ if isinstance(index, DatetimeIndex):
+ converted = index.asi8
+ return IndexCol(converted, 'datetime64', _tables().Int64Col(),
+ freq=getattr(index, 'freq', None),
+ tz=getattr(index, 'tz', None),
+ index_name=index_name)
+ elif isinstance(index, TimedeltaIndex):
+ converted = index.asi8
+ return IndexCol(converted, 'timedelta64', _tables().Int64Col(),
+ freq=getattr(index, 'freq', None),
+ index_name=index_name)
+ elif isinstance(index, (Int64Index, PeriodIndex)):
+ atom = _tables().Int64Col()
+        # avoid storing an ndarray of Period objects
+ return IndexCol(index._ndarray_values, 'integer', atom,
+ freq=getattr(index, 'freq', None),
+ index_name=index_name)
+
+ if isinstance(index, MultiIndex):
+ raise TypeError('MultiIndex not supported here!')
+
+ inferred_type = lib.infer_dtype(index, skipna=False)
+
+ values = np.asarray(index)
+
+ if inferred_type == 'datetime64':
+ converted = values.view('i8')
+ return IndexCol(converted, 'datetime64', _tables().Int64Col(),
+ freq=getattr(index, 'freq', None),
+ tz=getattr(index, 'tz', None),
+ index_name=index_name)
+ elif inferred_type == 'timedelta64':
+ converted = values.view('i8')
+ return IndexCol(converted, 'timedelta64', _tables().Int64Col(),
+ freq=getattr(index, 'freq', None),
+ index_name=index_name)
+ elif inferred_type == 'datetime':
+ converted = np.asarray([(time.mktime(v.timetuple()) +
+ v.microsecond / 1E6) for v in values],
+ dtype=np.float64)
+ return IndexCol(converted, 'datetime', _tables().Time64Col(),
+ index_name=index_name)
+ elif inferred_type == 'date':
+ converted = np.asarray([v.toordinal() for v in values],
+ dtype=np.int32)
+ return IndexCol(converted, 'date', _tables().Time32Col(),
+ index_name=index_name)
+ elif inferred_type == 'string':
+ # atom = _tables().ObjectAtom()
+ # return np.asarray(values, dtype='O'), 'object', atom
+
+ converted = _convert_string_array(values, encoding, errors)
+ itemsize = converted.dtype.itemsize
+ return IndexCol(
+ converted, 'string', _tables().StringCol(itemsize),
+ itemsize=itemsize, index_name=index_name
+ )
+ elif inferred_type == 'unicode':
+ if format_type == 'fixed':
+ atom = _tables().ObjectAtom()
+ return IndexCol(np.asarray(values, dtype='O'), 'object', atom,
+ index_name=index_name)
+ raise TypeError(
+ "[unicode] is not supported as a in index type for [{0}] formats"
+ .format(format_type)
+ )
+
+ elif inferred_type == 'integer':
+ # take a guess for now, hope the values fit
+ atom = _tables().Int64Col()
+ return IndexCol(np.asarray(values, dtype=np.int64), 'integer', atom,
+ index_name=index_name)
+ elif inferred_type == 'floating':
+ atom = _tables().Float64Col()
+ return IndexCol(np.asarray(values, dtype=np.float64), 'float', atom,
+ index_name=index_name)
+ else: # pragma: no cover
+ atom = _tables().ObjectAtom()
+ return IndexCol(np.asarray(values, dtype='O'), 'object', atom,
+ index_name=index_name)
+
+
+def _unconvert_index(data, kind, encoding=None, errors='strict'):
+ kind = _ensure_decoded(kind)
+ if kind == u'datetime64':
+ index = DatetimeIndex(data)
+ elif kind == u'timedelta64':
+ index = TimedeltaIndex(data)
+ elif kind == u'datetime':
+ index = np.asarray([datetime.fromtimestamp(v) for v in data],
+ dtype=object)
+ elif kind == u'date':
+ try:
+ index = np.asarray(
+ [date.fromordinal(v) for v in data], dtype=object)
+ except (ValueError):
+ index = np.asarray(
+ [date.fromtimestamp(v) for v in data], dtype=object)
+ elif kind in (u'integer', u'float'):
+ index = np.asarray(data)
+    elif kind == u'string':
+ index = _unconvert_string_array(data, nan_rep=None, encoding=encoding,
+ errors=errors)
+ elif kind == u'object':
+ index = np.asarray(data[0])
+ else: # pragma: no cover
+ raise ValueError('unrecognized index type {kind}'.format(kind=kind))
+ return index
+
+
+def _unconvert_index_legacy(data, kind, legacy=False, encoding=None,
+ errors='strict'):
+ kind = _ensure_decoded(kind)
+ if kind == u'datetime':
+ index = to_datetime(data)
+    elif kind == u'integer':
+        index = np.asarray(data, dtype=object)
+    elif kind == u'string':
+ index = _unconvert_string_array(data, nan_rep=None, encoding=encoding,
+ errors=errors)
+ else: # pragma: no cover
+ raise ValueError('unrecognized index type {kind}'.format(kind=kind))
+ return index
+
+
+def _convert_string_array(data, encoding, errors, itemsize=None):
+ """
+ we take a string-like that is object dtype and coerce to a fixed size
+ string type
+
+ Parameters
+ ----------
+ data : a numpy array of object dtype
+ encoding : None or string-encoding
+ errors : handler for encoding errors
+ itemsize : integer, optional, defaults to the max length of the strings
+
+ Returns
+ -------
+ data in a fixed-length string dtype, encoded to bytes if needed
+ """
+
+ # encode if needed
+ if encoding is not None and len(data):
+ data = Series(data.ravel()).str.encode(
+ encoding, errors).values.reshape(data.shape)
+
+ # create the sized dtype
+ if itemsize is None:
+ ensured = ensure_object(data.ravel())
+ itemsize = max(1, libwriters.max_len_string_array(ensured))
+
+ data = np.asarray(data, dtype="S{size}".format(size=itemsize))
+ return data
+
+
+def _unconvert_string_array(data, nan_rep=None, encoding=None,
+ errors='strict'):
+ """
+ inverse of _convert_string_array
+
+ Parameters
+ ----------
+ data : fixed length string dtyped array
+ nan_rep : the storage repr of NaN, optional
+ encoding : the encoding of the data, optional
+ errors : handler for encoding errors, default 'strict'
+
+ Returns
+ -------
+ an object array of the decoded data
+
+ """
+ shape = data.shape
+ data = np.asarray(data.ravel(), dtype=object)
+
+ # guard against a None encoding in PY3 (because of a legacy
+ # where the passed encoding is actually None)
+ encoding = _ensure_encoding(encoding)
+ if encoding is not None and len(data):
+
+ itemsize = libwriters.max_len_string_array(ensure_object(data))
+ if compat.PY3:
+ dtype = "U{0}".format(itemsize)
+ else:
+ dtype = "S{0}".format(itemsize)
+
+ if isinstance(data[0], compat.binary_type):
+ data = Series(data).str.decode(encoding, errors=errors).values
+ else:
+ data = data.astype(dtype, copy=False).astype(object, copy=False)
+
+ if nan_rep is None:
+ nan_rep = 'nan'
+
+ data = libwriters.string_array_replace_from_nan_rep(data, nan_rep)
+ return data.reshape(shape)
+
+
+def _maybe_convert(values, val_kind, encoding, errors):
+ if _need_convert(val_kind):
+ conv = _get_converter(val_kind, encoding, errors)
+ # conv = np.frompyfunc(conv, 1, 1)
+ values = conv(values)
+ return values
+
+
+def _get_converter(kind, encoding, errors):
+ kind = _ensure_decoded(kind)
+ if kind == 'datetime64':
+ return lambda x: np.asarray(x, dtype='M8[ns]')
+ elif kind == 'datetime':
+ return lambda x: to_datetime(x, cache=True).to_pydatetime()
+ elif kind == 'string':
+ return lambda x: _unconvert_string_array(x, encoding=encoding,
+ errors=errors)
+ else: # pragma: no cover
+ raise ValueError('invalid kind {kind}'.format(kind=kind))
+
+
+def _need_convert(kind):
+ kind = _ensure_decoded(kind)
+ if kind in (u'datetime', u'datetime64', u'string'):
+ return True
+ return False
+
+
+class Selection(object):
+
+ """
+ Carries out a selection operation on a tables.Table object.
+
+ Parameters
+ ----------
+ table : a Table object
+ where : list of Terms (or convertible to)
+ start, stop: indices to start and/or stop selection
+
+ """
+
+ def __init__(self, table, where=None, start=None, stop=None):
+ self.table = table
+ self.where = where
+ self.start = start
+ self.stop = stop
+ self.condition = None
+ self.filter = None
+ self.terms = None
+ self.coordinates = None
+
+ if is_list_like(where):
+
+            # see if we have been passed something coordinate-like
+ try:
+ inferred = lib.infer_dtype(where, skipna=False)
+ if inferred == 'integer' or inferred == 'boolean':
+ where = np.asarray(where)
+ if where.dtype == np.bool_:
+ start, stop = self.start, self.stop
+ if start is None:
+ start = 0
+ if stop is None:
+ stop = self.table.nrows
+ self.coordinates = np.arange(start, stop)[where]
+ elif issubclass(where.dtype.type, np.integer):
+ if ((self.start is not None and
+ (where < self.start).any()) or
+ (self.stop is not None and
+ (where >= self.stop).any())):
+ raise ValueError(
+ "where must have index locations >= start and "
+ "< stop"
+ )
+ self.coordinates = where
+
+ except ValueError:
+ pass
+
+ if self.coordinates is None:
+
+ self.terms = self.generate(where)
+
+ # create the numexpr & the filter
+ if self.terms is not None:
+ self.condition, self.filter = self.terms.evaluate()
+
+ def generate(self, where):
+ """ where can be a : dict,list,tuple,string """
+ if where is None:
+ return None
+
+ q = self.table.queryables()
+ try:
+ return Expr(where, queryables=q, encoding=self.table.encoding)
+ except NameError:
+ # raise a nice message, suggesting that the user should use
+ # data_columns
+ raise ValueError(
+ "The passed where expression: {0}\n"
+ " contains an invalid variable reference\n"
+ " all of the variable references must be a "
+ "reference to\n"
+ " an axis (e.g. 'index' or 'columns'), or a "
+ "data_column\n"
+ " The currently defined references are: {1}\n"
+ .format(where, ','.join(q.keys()))
+ )
+
+ def select(self):
+ """
+ generate the selection
+ """
+ if self.condition is not None:
+ return self.table.table.read_where(self.condition.format(),
+ start=self.start,
+ stop=self.stop)
+ elif self.coordinates is not None:
+ return self.table.table.read_coordinates(self.coordinates)
+ return self.table.table.read(start=self.start, stop=self.stop)
+
+ def select_coords(self):
+ """
+ generate the selection
+ """
+ start, stop = self.start, self.stop
+ nrows = self.table.nrows
+ if start is None:
+ start = 0
+ elif start < 0:
+ start += nrows
+ if self.stop is None:
+ stop = nrows
+ elif stop < 0:
+ stop += nrows
+
+ if self.condition is not None:
+ return self.table.table.get_where_list(self.condition.format(),
+ start=start, stop=stop,
+ sort=True)
+ elif self.coordinates is not None:
+ return self.coordinates
+
+ return np.arange(start, stop)
+
+# utilities ###
+
+
+def timeit(key, df, fn=None, remove=True, **kwargs):
+ if fn is None:
+ fn = 'timeit.h5'
+ store = HDFStore(fn, mode='w')
+ store.append(key, df, **kwargs)
+ store.close()
+
+ if remove:
+ os.remove(fn)
diff --git a/contrib/python/pandas/py2/pandas/io/s3.py b/contrib/python/pandas/py2/pandas/io/s3.py
new file mode 100644
index 00000000000..4998e4c0400
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/io/s3.py
@@ -0,0 +1,40 @@
+""" s3 support for remote file interactivity """
+from pandas import compat
+
+try:
+ import s3fs
+ from botocore.exceptions import NoCredentialsError
+except ImportError:
+ raise ImportError("The s3fs library is required to handle s3 files")
+
+if compat.PY3:
+ from urllib.parse import urlparse as parse_url
+else:
+ from urlparse import urlparse as parse_url
+
+
+def _strip_schema(url):
+ """Returns the url without the s3:// part"""
+ result = parse_url(url)
+ return result.netloc + result.path
+
+
+def get_filepath_or_buffer(filepath_or_buffer, encoding=None,
+ compression=None, mode=None):
+
+ if mode is None:
+ mode = 'rb'
+
+ fs = s3fs.S3FileSystem(anon=False)
+ try:
+ filepath_or_buffer = fs.open(_strip_schema(filepath_or_buffer), mode)
+ except (compat.FileNotFoundError, NoCredentialsError):
+        # boto3 has trouble accessing a public file when credentials
+        # are supplied.
+ # An OSError is raised if you have credentials, but they
+ # aren't valid for that bucket.
+ # A NoCredentialsError is raised if you don't have creds
+ # for that bucket.
+ fs = s3fs.S3FileSystem(anon=True)
+ filepath_or_buffer = fs.open(_strip_schema(filepath_or_buffer), mode)
+ return filepath_or_buffer, None, compression, True
diff --git a/contrib/python/pandas/py2/pandas/io/sas/__init__.py b/contrib/python/pandas/py2/pandas/io/sas/__init__.py
new file mode 100644
index 00000000000..fa6b29a1a3f
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/io/sas/__init__.py
@@ -0,0 +1 @@
+from .sasreader import read_sas # noqa
diff --git a/contrib/python/pandas/py2/pandas/io/sas/sas.pyx b/contrib/python/pandas/py2/pandas/io/sas/sas.pyx
new file mode 100644
index 00000000000..a5bfd5866a2
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/io/sas/sas.pyx
@@ -0,0 +1,445 @@
+# cython: profile=False
+# cython: boundscheck=False, initializedcheck=False
+
+import numpy as np
+import sas_constants as const
+
+ctypedef signed long long int64_t
+ctypedef unsigned char uint8_t
+ctypedef unsigned short uint16_t
+
+# rle_decompress decompresses data using a Run Length Encoding
+# algorithm. It is partially documented here:
+#
+# https://cran.r-project.org/web/packages/sas7bdat/vignettes/sas7bdat.pdf
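+#
+# Summary of the dispatch below (added note, not from the upstream docs):
+# the high nibble of each control byte selects the command and the low
+# nibble is a length modifier; 0x00/0x40 and 0x80-0xB0 copy literal bytes,
+# 0xC0 repeats a single byte, and 0x60/0x70/0xD0/0xE0/0xF0 emit runs of
+# 0x20, 0x00 or 0x40 bytes.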
+cdef const uint8_t[:] rle_decompress(int result_length,
+ const uint8_t[:] inbuff):
+
+ cdef:
+ uint8_t control_byte, x
+ uint8_t[:] result = np.zeros(result_length, np.uint8)
+ int rpos = 0, ipos = 0, length = len(inbuff)
+ int i, nbytes, end_of_first_byte
+
+ while ipos < length:
+ control_byte = inbuff[ipos] & 0xF0
+ end_of_first_byte = <int>(inbuff[ipos] & 0x0F)
+ ipos += 1
+
+ if control_byte == 0x00:
+ if end_of_first_byte != 0:
+ raise ValueError("Unexpected non-zero end_of_first_byte")
+ nbytes = <int>(inbuff[ipos]) + 64
+ ipos += 1
+ for i in range(nbytes):
+ result[rpos] = inbuff[ipos]
+ rpos += 1
+ ipos += 1
+ elif control_byte == 0x40:
+ # not documented
+ nbytes = end_of_first_byte * 16
+ nbytes += <int>(inbuff[ipos])
+ ipos += 1
+ for i in range(nbytes):
+ result[rpos] = inbuff[ipos]
+ rpos += 1
+ ipos += 1
+ elif control_byte == 0x60:
+ nbytes = end_of_first_byte * 256 + <int>(inbuff[ipos]) + 17
+ ipos += 1
+ for i in range(nbytes):
+ result[rpos] = 0x20
+ rpos += 1
+ elif control_byte == 0x70:
+ nbytes = end_of_first_byte * 256 + <int>(inbuff[ipos]) + 17
+ ipos += 1
+ for i in range(nbytes):
+ result[rpos] = 0x00
+ rpos += 1
+ elif control_byte == 0x80:
+ nbytes = end_of_first_byte + 1
+ for i in range(nbytes):
+ result[rpos] = inbuff[ipos + i]
+ rpos += 1
+ ipos += nbytes
+ elif control_byte == 0x90:
+ nbytes = end_of_first_byte + 17
+ for i in range(nbytes):
+ result[rpos] = inbuff[ipos + i]
+ rpos += 1
+ ipos += nbytes
+ elif control_byte == 0xA0:
+ nbytes = end_of_first_byte + 33
+ for i in range(nbytes):
+ result[rpos] = inbuff[ipos + i]
+ rpos += 1
+ ipos += nbytes
+ elif control_byte == 0xB0:
+ nbytes = end_of_first_byte + 49
+ for i in range(nbytes):
+ result[rpos] = inbuff[ipos + i]
+ rpos += 1
+ ipos += nbytes
+ elif control_byte == 0xC0:
+ nbytes = end_of_first_byte + 3
+ x = inbuff[ipos]
+ ipos += 1
+ for i in range(nbytes):
+ result[rpos] = x
+ rpos += 1
+ elif control_byte == 0xD0:
+ nbytes = end_of_first_byte + 2
+ for i in range(nbytes):
+ result[rpos] = 0x40
+ rpos += 1
+ elif control_byte == 0xE0:
+ nbytes = end_of_first_byte + 2
+ for i in range(nbytes):
+ result[rpos] = 0x20
+ rpos += 1
+ elif control_byte == 0xF0:
+ nbytes = end_of_first_byte + 2
+ for i in range(nbytes):
+ result[rpos] = 0x00
+ rpos += 1
+ else:
+ raise ValueError("unknown control byte: {byte}"
+ .format(byte=control_byte))
+
+    # In py37 cython/clang sees `len(result)` as size_t and not Py_ssize_t
+ if <Py_ssize_t>len(result) != <Py_ssize_t>result_length:
+ raise ValueError("RLE: {got} != {expect}".format(got=len(result),
+ expect=result_length))
+
+ return np.asarray(result)
+
+
+# rdc_decompress decompresses data using the Ross Data Compression algorithm:
+#
+# http://collaboration.cmc.ec.gc.ca/science/rpn/biblio/ddj/Website/articles/CUJ/1992/9210/ross/ross.htm
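+#
+# Summary of the dispatch below (added note, not from the upstream docs):
+# a 16-bit control word is consumed one bit per token; a 0 bit copies a
+# literal byte, a 1 bit reads a command nibble (0 = short RLE, 1 = long
+# RLE, 2 = long pattern, 3-15 = short pattern back-reference).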
+cdef const uint8_t[:] rdc_decompress(int result_length,
+ const uint8_t[:] inbuff):
+
+ cdef:
+ uint8_t cmd
+ uint16_t ctrl_bits, ctrl_mask = 0, ofs, cnt
+ int ipos = 0, rpos = 0, k
+ uint8_t[:] outbuff = np.zeros(result_length, dtype=np.uint8)
+
+ ii = -1
+
+ while ipos < len(inbuff):
+ ii += 1
+ ctrl_mask = ctrl_mask >> 1
+ if ctrl_mask == 0:
+ ctrl_bits = ((<uint16_t>inbuff[ipos] << 8) +
+ <uint16_t>inbuff[ipos + 1])
+ ipos += 2
+ ctrl_mask = 0x8000
+
+ if ctrl_bits & ctrl_mask == 0:
+ outbuff[rpos] = inbuff[ipos]
+ ipos += 1
+ rpos += 1
+ continue
+
+ cmd = (inbuff[ipos] >> 4) & 0x0F
+ cnt = <uint16_t>(inbuff[ipos] & 0x0F)
+ ipos += 1
+
+ # short RLE
+ if cmd == 0:
+ cnt += 3
+ for k in range(cnt):
+ outbuff[rpos + k] = inbuff[ipos]
+ rpos += cnt
+ ipos += 1
+
+ # long RLE
+ elif cmd == 1:
+ cnt += <uint16_t>inbuff[ipos] << 4
+ cnt += 19
+ ipos += 1
+ for k in range(cnt):
+ outbuff[rpos + k] = inbuff[ipos]
+ rpos += cnt
+ ipos += 1
+
+ # long pattern
+ elif cmd == 2:
+ ofs = cnt + 3
+ ofs += <uint16_t>inbuff[ipos] << 4
+ ipos += 1
+ cnt = <uint16_t>inbuff[ipos]
+ ipos += 1
+ cnt += 16
+ for k in range(cnt):
+ outbuff[rpos + k] = outbuff[rpos - <int>ofs + k]
+ rpos += cnt
+
+ # short pattern
+ elif (cmd >= 3) & (cmd <= 15):
+ ofs = cnt + 3
+ ofs += <uint16_t>inbuff[ipos] << 4
+ ipos += 1
+ for k in range(cmd):
+ outbuff[rpos + k] = outbuff[rpos - <int>ofs + k]
+ rpos += cmd
+
+ else:
+ raise ValueError("unknown RDC command")
+
+ # In py37 cython/clang sees `len(outbuff)` as size_t and not Py_ssize_t
+ if <Py_ssize_t>len(outbuff) != <Py_ssize_t>result_length:
+ raise ValueError("RDC: {got} != {expect}\n"
+ .format(got=len(outbuff), expect=result_length))
+
+ return np.asarray(outbuff)
+
+
+cdef enum ColumnTypes:
+ column_type_decimal = 1
+ column_type_string = 2
+
+
+# type the page_data types
+cdef int page_meta_type = const.page_meta_type
+cdef int page_mix_types_0 = const.page_mix_types[0]
+cdef int page_mix_types_1 = const.page_mix_types[1]
+cdef int page_data_type = const.page_data_type
+cdef int subheader_pointers_offset = const.subheader_pointers_offset
+
+
+cdef class Parser(object):
+
+ cdef:
+ int column_count
+ int64_t[:] lengths
+ int64_t[:] offsets
+ int64_t[:] column_types
+ uint8_t[:, :] byte_chunk
+ object[:, :] string_chunk
+ char *cached_page
+ int current_row_on_page_index
+ int current_page_block_count
+ int current_page_data_subheader_pointers_len
+ int current_page_subheaders_count
+ int current_row_in_chunk_index
+ int current_row_in_file_index
+ int header_length
+ int row_length
+ int bit_offset
+ int subheader_pointer_length
+ int current_page_type
+ bint is_little_endian
+ const uint8_t[:] (*decompress)(int result_length,
+ const uint8_t[:] inbuff)
+ object parser
+
+ def __init__(self, object parser):
+ cdef:
+ int j
+ char[:] column_types
+
+ self.parser = parser
+ self.header_length = self.parser.header_length
+ self.column_count = parser.column_count
+ self.lengths = parser.column_data_lengths()
+ self.offsets = parser.column_data_offsets()
+ self.byte_chunk = parser._byte_chunk
+ self.string_chunk = parser._string_chunk
+ self.row_length = parser.row_length
+ self.bit_offset = self.parser._page_bit_offset
+ self.subheader_pointer_length = self.parser._subheader_pointer_length
+ self.is_little_endian = parser.byte_order == "<"
+ self.column_types = np.empty(self.column_count, dtype='int64')
+
+ # page indicators
+ self.update_next_page()
+
+ column_types = parser.column_types()
+
+ # map column types
+ for j in range(self.column_count):
+ if column_types[j] == b'd':
+ self.column_types[j] = column_type_decimal
+ elif column_types[j] == b's':
+ self.column_types[j] = column_type_string
+ else:
+ raise ValueError("unknown column type: "
+ "{typ}"
+ .format(typ=self.parser.columns[j].ctype))
+
+ # compression
+ if parser.compression == const.rle_compression:
+ self.decompress = rle_decompress
+ elif parser.compression == const.rdc_compression:
+ self.decompress = rdc_decompress
+ else:
+ self.decompress = NULL
+
+ # update to current state of the parser
+ self.current_row_in_chunk_index = parser._current_row_in_chunk_index
+ self.current_row_in_file_index = parser._current_row_in_file_index
+ self.current_row_on_page_index = parser._current_row_on_page_index
+
+ def read(self, int nrows):
+ cdef:
+ bint done
+ int i
+
+ for i in range(nrows):
+ done = self.readline()
+ if done:
+ break
+
+ # update the parser
+ self.parser._current_row_on_page_index = self.current_row_on_page_index
+ self.parser._current_row_in_chunk_index =\
+ self.current_row_in_chunk_index
+ self.parser._current_row_in_file_index = self.current_row_in_file_index
+
+ cdef bint read_next_page(self):
+ cdef done
+
+ done = self.parser._read_next_page()
+ if done:
+ self.cached_page = NULL
+ else:
+ self.update_next_page()
+ return done
+
+ cdef update_next_page(self):
+ # update data for the current page
+
+ self.cached_page = <char *>self.parser._cached_page
+ self.current_row_on_page_index = 0
+ self.current_page_type = self.parser._current_page_type
+ self.current_page_block_count = self.parser._current_page_block_count
+ self.current_page_data_subheader_pointers_len = len(
+ self.parser._current_page_data_subheader_pointers)
+ self.current_page_subheaders_count =\
+ self.parser._current_page_subheaders_count
+
+ cdef readline(self):
+
+ cdef:
+ int offset, bit_offset, align_correction
+ int subheader_pointer_length, mn
+ bint done, flag
+
+ bit_offset = self.bit_offset
+ subheader_pointer_length = self.subheader_pointer_length
+
+ # If there is no page, go to the end of the header and read a page.
+ if self.cached_page == NULL:
+ self.parser._path_or_buf.seek(self.header_length)
+ done = self.read_next_page()
+ if done:
+ return True
+
+ # Loop until a data row is read
+ while True:
+ if self.current_page_type == page_meta_type:
+ flag = self.current_row_on_page_index >=\
+ self.current_page_data_subheader_pointers_len
+ if flag:
+ done = self.read_next_page()
+ if done:
+ return True
+ continue
+ current_subheader_pointer = (
+ self.parser._current_page_data_subheader_pointers[
+ self.current_row_on_page_index])
+ self.process_byte_array_with_data(
+ current_subheader_pointer.offset,
+ current_subheader_pointer.length)
+ return False
+ elif (self.current_page_type == page_mix_types_0 or
+ self.current_page_type == page_mix_types_1):
+ align_correction = (bit_offset + subheader_pointers_offset +
+ self.current_page_subheaders_count *
+ subheader_pointer_length)
+ align_correction = align_correction % 8
+ offset = bit_offset + align_correction
+ offset += subheader_pointers_offset
+ offset += (self.current_page_subheaders_count *
+ subheader_pointer_length)
+ offset += self.current_row_on_page_index * self.row_length
+ self.process_byte_array_with_data(offset,
+ self.row_length)
+ mn = min(self.parser.row_count,
+ self.parser._mix_page_row_count)
+ if self.current_row_on_page_index == mn:
+ done = self.read_next_page()
+ if done:
+ return True
+ return False
+ elif self.current_page_type & page_data_type == page_data_type:
+ self.process_byte_array_with_data(
+ bit_offset + subheader_pointers_offset +
+ self.current_row_on_page_index * self.row_length,
+ self.row_length)
+ flag = (self.current_row_on_page_index ==
+ self.current_page_block_count)
+ if flag:
+ done = self.read_next_page()
+ if done:
+ return True
+ return False
+ else:
+ raise ValueError("unknown page type: {typ}"
+ .format(typ=self.current_page_type))
+
+ cdef void process_byte_array_with_data(self, int offset, int length):
+
+ cdef:
+ Py_ssize_t j
+ int s, k, m, jb, js, current_row
+ int64_t lngt, start, ct
+ const uint8_t[:] source
+ int64_t[:] column_types
+ int64_t[:] lengths
+ int64_t[:] offsets
+ uint8_t[:, :] byte_chunk
+ object[:, :] string_chunk
+
+ source = np.frombuffer(
+ self.cached_page[offset:offset + length], dtype=np.uint8)
+
+ if self.decompress != NULL and (length < self.row_length):
+ source = self.decompress(self.row_length, source)
+
+ current_row = self.current_row_in_chunk_index
+ column_types = self.column_types
+ lengths = self.lengths
+ offsets = self.offsets
+ byte_chunk = self.byte_chunk
+ string_chunk = self.string_chunk
+ s = 8 * self.current_row_in_chunk_index
+ js = 0
+ jb = 0
+ for j in range(self.column_count):
+ lngt = lengths[j]
+ if lngt == 0:
+ break
+ start = offsets[j]
+ ct = column_types[j]
+ if ct == column_type_decimal:
+ # decimal
+ if self.is_little_endian:
+ m = s + 8 - lngt
+ else:
+ m = s
+ for k in range(lngt):
+ byte_chunk[jb, m + k] = source[start + k]
+ jb += 1
+ elif column_types[j] == column_type_string:
+ # string
+ string_chunk[js, current_row] = np.array(source[start:(
+ start + lngt)]).tostring().rstrip(b"\x00 ")
+ js += 1
+
+ self.current_row_on_page_index += 1
+ self.current_row_in_chunk_index += 1
+ self.current_row_in_file_index += 1
diff --git a/contrib/python/pandas/py2/pandas/io/sas/sas7bdat.py b/contrib/python/pandas/py2/pandas/io/sas/sas7bdat.py
new file mode 100644
index 00000000000..eb77f79d38d
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/io/sas/sas7bdat.py
@@ -0,0 +1,703 @@
+"""
+Read SAS7BDAT files
+
+Based on code written by Jared Hobbs:
+ https://bitbucket.org/jaredhobbs/sas7bdat
+
+See also:
+ https://github.com/BioStatMatt/sas7bdat
+
+Partial documentation of the file format:
+ https://cran.r-project.org/web/packages/sas7bdat/vignettes/sas7bdat.pdf
+
+Reference for binary data compression:
+ http://collaboration.cmc.ec.gc.ca/science/rpn/biblio/ddj/Website/articles/CUJ/1992/9210/ross/ross.htm
+"""
+from datetime import datetime
+import struct
+
+import numpy as np
+
+from pandas.errors import EmptyDataError
+
+import pandas as pd
+from pandas import compat
+
+from pandas.io.common import BaseIterator, get_filepath_or_buffer
+from pandas.io.sas._sas import Parser
+import pandas.io.sas.sas_constants as const
+
+
+class _subheader_pointer(object):
+ pass
+
+
+class _column(object):
+ pass
+
+
+# SAS7BDATReader reads a SAS data file in SAS7BDAT format.
+class SAS7BDATReader(BaseIterator):
+ """
+ Read SAS files in SAS7BDAT format.
+
+ Parameters
+ ----------
+ path_or_buf : path name or buffer
+ Name of SAS file or file-like object pointing to SAS file
+ contents.
+ index : column identifier, defaults to None
+ Column to use as index.
+ convert_dates : boolean, defaults to True
+ Attempt to convert dates to Pandas datetime values. Note that
+ some rarely used SAS date formats may be unsupported.
+ blank_missing : boolean, defaults to True
+ Convert empty strings to missing values (SAS uses blanks to
+ indicate missing character variables).
+ chunksize : int, defaults to None
+        Return a SAS7BDATReader object for iteration; chunks are returned
+        with the given number of lines.
+ encoding : string, defaults to None
+ String encoding.
+ convert_text : bool, defaults to True
+ If False, text variables are left as raw bytes.
+ convert_header_text : bool, defaults to True
+        If False, header text, including column names, is left as raw
+        bytes.
+ """
+
+ def __init__(self, path_or_buf, index=None, convert_dates=True,
+ blank_missing=True, chunksize=None, encoding=None,
+ convert_text=True, convert_header_text=True):
+
+ self.index = index
+ self.convert_dates = convert_dates
+ self.blank_missing = blank_missing
+ self.chunksize = chunksize
+ self.encoding = encoding
+ self.convert_text = convert_text
+ self.convert_header_text = convert_header_text
+
+ self.default_encoding = "latin-1"
+ self.compression = ""
+ self.column_names_strings = []
+ self.column_names = []
+ self.column_formats = []
+ self.columns = []
+
+ self._current_page_data_subheader_pointers = []
+ self._cached_page = None
+ self._column_data_lengths = []
+ self._column_data_offsets = []
+ self._column_types = []
+
+        self._current_row_in_file_index = 0
+        self._current_row_on_page_index = 0
+
+ self._path_or_buf, _, _, _ = get_filepath_or_buffer(path_or_buf)
+ if isinstance(self._path_or_buf, compat.string_types):
+ self._path_or_buf = open(self._path_or_buf, 'rb')
+ self.handle = self._path_or_buf
+
+ self._get_properties()
+ self._parse_metadata()
+
+ def column_data_lengths(self):
+ """Return a numpy int64 array of the column data lengths"""
+ return np.asarray(self._column_data_lengths, dtype=np.int64)
+
+ def column_data_offsets(self):
+ """Return a numpy int64 array of the column offsets"""
+ return np.asarray(self._column_data_offsets, dtype=np.int64)
+
+ def column_types(self):
+ """Returns a numpy character array of the column types:
+ s (string) or d (double)"""
+ return np.asarray(self._column_types, dtype=np.dtype('S1'))
+
+ def close(self):
+ try:
+ self.handle.close()
+ except AttributeError:
+ pass
+
+ def _get_properties(self):
+
+ # Check magic number
+ self._path_or_buf.seek(0)
+ self._cached_page = self._path_or_buf.read(288)
+ if self._cached_page[0:len(const.magic)] != const.magic:
+ self.close()
+ raise ValueError("magic number mismatch (not a SAS file?)")
+
+ # Get alignment information
+ align1, align2 = 0, 0
+ buf = self._read_bytes(const.align_1_offset, const.align_1_length)
+ if buf == const.u64_byte_checker_value:
+ align2 = const.align_2_value
+ self.U64 = True
+ self._int_length = 8
+ self._page_bit_offset = const.page_bit_offset_x64
+ self._subheader_pointer_length = const.subheader_pointer_length_x64
+ else:
+ self.U64 = False
+ self._page_bit_offset = const.page_bit_offset_x86
+ self._subheader_pointer_length = const.subheader_pointer_length_x86
+ self._int_length = 4
+ buf = self._read_bytes(const.align_2_offset, const.align_2_length)
+ if buf == const.align_1_checker_value:
+ align1 = const.align_2_value
+ total_align = align1 + align2
+
+ # Get endianness information
+ buf = self._read_bytes(const.endianness_offset,
+ const.endianness_length)
+ if buf == b'\x01':
+ self.byte_order = "<"
+ else:
+ self.byte_order = ">"
+
+ # Get encoding information
+ buf = self._read_bytes(const.encoding_offset, const.encoding_length)[0]
+ if buf in const.encoding_names:
+ self.file_encoding = const.encoding_names[buf]
+ else:
+ self.file_encoding = "unknown (code={name!s})".format(name=buf)
+
+ # Get platform information
+ buf = self._read_bytes(const.platform_offset, const.platform_length)
+ if buf == b'1':
+ self.platform = "unix"
+ elif buf == b'2':
+ self.platform = "windows"
+ else:
+ self.platform = "unknown"
+
+ buf = self._read_bytes(const.dataset_offset, const.dataset_length)
+ self.name = buf.rstrip(b'\x00 ')
+ if self.convert_header_text:
+ self.name = self.name.decode(
+ self.encoding or self.default_encoding)
+
+ buf = self._read_bytes(const.file_type_offset, const.file_type_length)
+ self.file_type = buf.rstrip(b'\x00 ')
+ if self.convert_header_text:
+ self.file_type = self.file_type.decode(
+ self.encoding or self.default_encoding)
+
+ # Timestamp is epoch 01/01/1960
+ epoch = datetime(1960, 1, 1)
+ x = self._read_float(const.date_created_offset + align1,
+ const.date_created_length)
+ self.date_created = epoch + pd.to_timedelta(x, unit='s')
+ x = self._read_float(const.date_modified_offset + align1,
+ const.date_modified_length)
+ self.date_modified = epoch + pd.to_timedelta(x, unit='s')
+
+ self.header_length = self._read_int(const.header_size_offset + align1,
+ const.header_size_length)
+
+ # Read the rest of the header into cached_page.
+ buf = self._path_or_buf.read(self.header_length - 288)
+ self._cached_page += buf
+ if len(self._cached_page) != self.header_length:
+ self.close()
+ raise ValueError("The SAS7BDAT file appears to be truncated.")
+
+ self._page_length = self._read_int(const.page_size_offset + align1,
+ const.page_size_length)
+ self._page_count = self._read_int(const.page_count_offset + align1,
+ const.page_count_length)
+
+ buf = self._read_bytes(const.sas_release_offset + total_align,
+ const.sas_release_length)
+ self.sas_release = buf.rstrip(b'\x00 ')
+ if self.convert_header_text:
+ self.sas_release = self.sas_release.decode(
+ self.encoding or self.default_encoding)
+
+ buf = self._read_bytes(const.sas_server_type_offset + total_align,
+ const.sas_server_type_length)
+ self.server_type = buf.rstrip(b'\x00 ')
+ if self.convert_header_text:
+ self.server_type = self.server_type.decode(
+ self.encoding or self.default_encoding)
+
+ buf = self._read_bytes(const.os_version_number_offset + total_align,
+ const.os_version_number_length)
+ self.os_version = buf.rstrip(b'\x00 ')
+ if self.convert_header_text:
+ self.os_version = self.os_version.decode(
+ self.encoding or self.default_encoding)
+
+ buf = self._read_bytes(const.os_name_offset + total_align,
+ const.os_name_length)
+ buf = buf.rstrip(b'\x00 ')
+ if len(buf) > 0:
+ self.os_name = buf.decode(self.encoding or self.default_encoding)
+ else:
+ buf = self._read_bytes(const.os_maker_offset + total_align,
+ const.os_maker_length)
+ self.os_name = buf.rstrip(b'\x00 ')
+ if self.convert_header_text:
+ self.os_name = self.os_name.decode(
+ self.encoding or self.default_encoding)
+
+ def __next__(self):
+ da = self.read(nrows=self.chunksize or 1)
+ if da is None:
+ raise StopIteration
+ return da
+
+ # Read a single float of the given width (4 or 8).
+ def _read_float(self, offset, width):
+ if width not in (4, 8):
+ self.close()
+ raise ValueError("invalid float width")
+ buf = self._read_bytes(offset, width)
+ fd = "f" if width == 4 else "d"
+ return struct.unpack(self.byte_order + fd, buf)[0]
+
+ # Read a single signed integer of the given width (1, 2, 4 or 8).
+ def _read_int(self, offset, width):
+ if width not in (1, 2, 4, 8):
+ self.close()
+ raise ValueError("invalid int width")
+ buf = self._read_bytes(offset, width)
+ it = {1: "b", 2: "h", 4: "l", 8: "q"}[width]
+ iv = struct.unpack(self.byte_order + it, buf)[0]
+ return iv
+
+ def _read_bytes(self, offset, length):
+ if self._cached_page is None:
+ self._path_or_buf.seek(offset)
+ buf = self._path_or_buf.read(length)
+ if len(buf) < length:
+ self.close()
+ msg = "Unable to read {:d} bytes from file position {:d}."
+ raise ValueError(msg.format(length, offset))
+ return buf
+ else:
+ if offset + length > len(self._cached_page):
+ self.close()
+ raise ValueError("The cached page is too small.")
+ return self._cached_page[offset:offset + length]
+
+ def _parse_metadata(self):
+ done = False
+ while not done:
+ self._cached_page = self._path_or_buf.read(self._page_length)
+ if len(self._cached_page) <= 0:
+ break
+ if len(self._cached_page) != self._page_length:
+ self.close()
+ raise ValueError(
+ "Failed to read a meta data page from the SAS file.")
+ done = self._process_page_meta()
+
+ def _process_page_meta(self):
+ self._read_page_header()
+ pt = [const.page_meta_type, const.page_amd_type] + const.page_mix_types
+ if self._current_page_type in pt:
+ self._process_page_metadata()
+ is_data_page = self._current_page_type & const.page_data_type
+ is_mix_page = self._current_page_type in const.page_mix_types
+ return (is_data_page or is_mix_page
+ or self._current_page_data_subheader_pointers != [])
+
+ def _read_page_header(self):
+ bit_offset = self._page_bit_offset
+ tx = const.page_type_offset + bit_offset
+ self._current_page_type = self._read_int(tx, const.page_type_length)
+ tx = const.block_count_offset + bit_offset
+ self._current_page_block_count = self._read_int(
+ tx, const.block_count_length)
+ tx = const.subheader_count_offset + bit_offset
+ self._current_page_subheaders_count = (
+ self._read_int(tx, const.subheader_count_length))
+
+ def _process_page_metadata(self):
+ bit_offset = self._page_bit_offset
+
+ for i in range(self._current_page_subheaders_count):
+ pointer = self._process_subheader_pointers(
+ const.subheader_pointers_offset + bit_offset, i)
+ if pointer.length == 0:
+ continue
+ if pointer.compression == const.truncated_subheader_id:
+ continue
+ subheader_signature = self._read_subheader_signature(
+ pointer.offset)
+ subheader_index = (
+ self._get_subheader_index(subheader_signature,
+ pointer.compression, pointer.ptype))
+ self._process_subheader(subheader_index, pointer)
+
+ def _get_subheader_index(self, signature, compression, ptype):
+ index = const.subheader_signature_to_index.get(signature)
+ if index is None:
+ f1 = ((compression == const.compressed_subheader_id) or
+ (compression == 0))
+ f2 = (ptype == const.compressed_subheader_type)
+ if (self.compression != "") and f1 and f2:
+ index = const.SASIndex.data_subheader_index
+ else:
+ self.close()
+ raise ValueError("Unknown subheader signature")
+ return index
+
+ def _process_subheader_pointers(self, offset, subheader_pointer_index):
+
+ subheader_pointer_length = self._subheader_pointer_length
+ total_offset = (offset +
+ subheader_pointer_length * subheader_pointer_index)
+
+ subheader_offset = self._read_int(total_offset, self._int_length)
+ total_offset += self._int_length
+
+ subheader_length = self._read_int(total_offset, self._int_length)
+ total_offset += self._int_length
+
+ subheader_compression = self._read_int(total_offset, 1)
+ total_offset += 1
+
+ subheader_type = self._read_int(total_offset, 1)
+
+ x = _subheader_pointer()
+ x.offset = subheader_offset
+ x.length = subheader_length
+ x.compression = subheader_compression
+ x.ptype = subheader_type
+
+ return x
+
+ def _read_subheader_signature(self, offset):
+ subheader_signature = self._read_bytes(offset, self._int_length)
+ return subheader_signature
+
+ def _process_subheader(self, subheader_index, pointer):
+ offset = pointer.offset
+ length = pointer.length
+
+ if subheader_index == const.SASIndex.row_size_index:
+ processor = self._process_rowsize_subheader
+ elif subheader_index == const.SASIndex.column_size_index:
+ processor = self._process_columnsize_subheader
+ elif subheader_index == const.SASIndex.column_text_index:
+ processor = self._process_columntext_subheader
+ elif subheader_index == const.SASIndex.column_name_index:
+ processor = self._process_columnname_subheader
+ elif subheader_index == const.SASIndex.column_attributes_index:
+ processor = self._process_columnattributes_subheader
+ elif subheader_index == const.SASIndex.format_and_label_index:
+ processor = self._process_format_subheader
+ elif subheader_index == const.SASIndex.column_list_index:
+ processor = self._process_columnlist_subheader
+ elif subheader_index == const.SASIndex.subheader_counts_index:
+ processor = self._process_subheader_counts
+ elif subheader_index == const.SASIndex.data_subheader_index:
+ self._current_page_data_subheader_pointers.append(pointer)
+ return
+ else:
+ raise ValueError("unknown subheader index")
+
+ processor(offset, length)
+
+ def _process_rowsize_subheader(self, offset, length):
+
+ int_len = self._int_length
+ lcs_offset = offset
+ lcp_offset = offset
+ if self.U64:
+ lcs_offset += 682
+ lcp_offset += 706
+ else:
+ lcs_offset += 354
+ lcp_offset += 378
+
+ self.row_length = self._read_int(
+ offset + const.row_length_offset_multiplier * int_len, int_len)
+ self.row_count = self._read_int(
+ offset + const.row_count_offset_multiplier * int_len, int_len)
+ self.col_count_p1 = self._read_int(
+ offset + const.col_count_p1_multiplier * int_len, int_len)
+ self.col_count_p2 = self._read_int(
+ offset + const.col_count_p2_multiplier * int_len, int_len)
+ mx = const.row_count_on_mix_page_offset_multiplier * int_len
+ self._mix_page_row_count = self._read_int(offset + mx, int_len)
+ self._lcs = self._read_int(lcs_offset, 2)
+ self._lcp = self._read_int(lcp_offset, 2)
+
+ def _process_columnsize_subheader(self, offset, length):
+ int_len = self._int_length
+ offset += int_len
+ self.column_count = self._read_int(offset, int_len)
+ if (self.col_count_p1 + self.col_count_p2 !=
+ self.column_count):
+ print(
+ "Warning: column count mismatch ({p1} + {p2} != "
+ "{column_count})\n".format(
+ p1=self.col_count_p1, p2=self.col_count_p2,
+ column_count=self.column_count))
+
+ # Unknown purpose
+ def _process_subheader_counts(self, offset, length):
+ pass
+
+ def _process_columntext_subheader(self, offset, length):
+
+ offset += self._int_length
+ text_block_size = self._read_int(offset, const.text_block_size_length)
+
+ buf = self._read_bytes(offset, text_block_size)
+ cname_raw = buf[0:text_block_size].rstrip(b"\x00 ")
+ cname = cname_raw
+ if self.convert_header_text:
+ cname = cname.decode(self.encoding or self.default_encoding)
+ self.column_names_strings.append(cname)
+
+ if len(self.column_names_strings) == 1:
+ compression_literal = ""
+ for cl in const.compression_literals:
+ if cl in cname_raw:
+ compression_literal = cl
+ self.compression = compression_literal
+ offset -= self._int_length
+
+ offset1 = offset + 16
+ if self.U64:
+ offset1 += 4
+
+ buf = self._read_bytes(offset1, self._lcp)
+ compression_literal = buf.rstrip(b"\x00")
+ if compression_literal == "":
+ self._lcs = 0
+ offset1 = offset + 32
+ if self.U64:
+ offset1 += 4
+ buf = self._read_bytes(offset1, self._lcp)
+ self.creator_proc = buf[0:self._lcp]
+ elif compression_literal == const.rle_compression:
+ offset1 = offset + 40
+ if self.U64:
+ offset1 += 4
+ buf = self._read_bytes(offset1, self._lcp)
+ self.creator_proc = buf[0:self._lcp]
+ elif self._lcs > 0:
+ self._lcp = 0
+ offset1 = offset + 16
+ if self.U64:
+ offset1 += 4
+ buf = self._read_bytes(offset1, self._lcs)
+ self.creator_proc = buf[0:self._lcp]
+ if self.convert_header_text:
+ if hasattr(self, "creator_proc"):
+ self.creator_proc = self.creator_proc.decode(
+ self.encoding or self.default_encoding)
+
+ def _process_columnname_subheader(self, offset, length):
+ int_len = self._int_length
+ offset += int_len
+ column_name_pointers_count = (length - 2 * int_len - 12) // 8
+ for i in range(column_name_pointers_count):
+ text_subheader = offset + const.column_name_pointer_length * \
+ (i + 1) + const.column_name_text_subheader_offset
+ col_name_offset = offset + const.column_name_pointer_length * \
+ (i + 1) + const.column_name_offset_offset
+ col_name_length = offset + const.column_name_pointer_length * \
+ (i + 1) + const.column_name_length_offset
+
+ idx = self._read_int(
+ text_subheader, const.column_name_text_subheader_length)
+ col_offset = self._read_int(
+ col_name_offset, const.column_name_offset_length)
+ col_len = self._read_int(
+ col_name_length, const.column_name_length_length)
+
+ name_str = self.column_names_strings[idx]
+ self.column_names.append(name_str[col_offset:col_offset + col_len])
+
+ def _process_columnattributes_subheader(self, offset, length):
+ int_len = self._int_length
+ column_attributes_vectors_count = (
+ length - 2 * int_len - 12) // (int_len + 8)
+ for i in range(column_attributes_vectors_count):
+ col_data_offset = (offset + int_len +
+ const.column_data_offset_offset +
+ i * (int_len + 8))
+ col_data_len = (offset + 2 * int_len +
+ const.column_data_length_offset +
+ i * (int_len + 8))
+ col_types = (offset + 2 * int_len +
+ const.column_type_offset + i * (int_len + 8))
+
+ x = self._read_int(col_data_offset, int_len)
+ self._column_data_offsets.append(x)
+
+ x = self._read_int(col_data_len, const.column_data_length_length)
+ self._column_data_lengths.append(x)
+
+ x = self._read_int(col_types, const.column_type_length)
+ self._column_types.append(b'd' if x == 1 else b's')
+
+ def _process_columnlist_subheader(self, offset, length):
+ # unknown purpose
+ pass
+
+ def _process_format_subheader(self, offset, length):
+ int_len = self._int_length
+ text_subheader_format = (
+ offset +
+ const.column_format_text_subheader_index_offset +
+ 3 * int_len)
+ col_format_offset = (offset +
+ const.column_format_offset_offset +
+ 3 * int_len)
+ col_format_len = (offset +
+ const.column_format_length_offset +
+ 3 * int_len)
+ text_subheader_label = (
+ offset +
+ const.column_label_text_subheader_index_offset +
+ 3 * int_len)
+ col_label_offset = (offset +
+ const.column_label_offset_offset +
+ 3 * int_len)
+ col_label_len = offset + const.column_label_length_offset + 3 * int_len
+
+ x = self._read_int(text_subheader_format,
+ const.column_format_text_subheader_index_length)
+ format_idx = min(x, len(self.column_names_strings) - 1)
+
+ format_start = self._read_int(
+ col_format_offset, const.column_format_offset_length)
+ format_len = self._read_int(
+ col_format_len, const.column_format_length_length)
+
+ label_idx = self._read_int(
+ text_subheader_label,
+ const.column_label_text_subheader_index_length)
+ label_idx = min(label_idx, len(self.column_names_strings) - 1)
+
+ label_start = self._read_int(
+ col_label_offset, const.column_label_offset_length)
+ label_len = self._read_int(col_label_len,
+ const.column_label_length_length)
+
+ label_names = self.column_names_strings[label_idx]
+ column_label = label_names[label_start: label_start + label_len]
+ format_names = self.column_names_strings[format_idx]
+ column_format = format_names[format_start: format_start + format_len]
+ current_column_number = len(self.columns)
+
+ col = _column()
+ col.col_id = current_column_number
+ col.name = self.column_names[current_column_number]
+ col.label = column_label
+ col.format = column_format
+ col.ctype = self._column_types[current_column_number]
+ col.length = self._column_data_lengths[current_column_number]
+
+ self.column_formats.append(column_format)
+ self.columns.append(col)
+
+ def read(self, nrows=None):
+
+ if (nrows is None) and (self.chunksize is not None):
+ nrows = self.chunksize
+ elif nrows is None:
+ nrows = self.row_count
+
+ if len(self._column_types) == 0:
+ self.close()
+ raise EmptyDataError("No columns to parse from file")
+
+ if self._current_row_in_file_index >= self.row_count:
+ return None
+
+ m = self.row_count - self._current_row_in_file_index
+ if nrows > m:
+ nrows = m
+
+ nd = self._column_types.count(b'd')
+ ns = self._column_types.count(b's')
+
+ self._string_chunk = np.empty((ns, nrows), dtype=np.object)
+ self._byte_chunk = np.zeros((nd, 8 * nrows), dtype=np.uint8)
+
+ self._current_row_in_chunk_index = 0
+ p = Parser(self)
+ p.read(nrows)
+
+ rslt = self._chunk_to_dataframe()
+ if self.index is not None:
+ rslt = rslt.set_index(self.index)
+
+ return rslt
+
+ def _read_next_page(self):
+ self._current_page_data_subheader_pointers = []
+ self._cached_page = self._path_or_buf.read(self._page_length)
+ if len(self._cached_page) <= 0:
+ return True
+ elif len(self._cached_page) != self._page_length:
+ self.close()
+ msg = ("failed to read complete page from file "
+ "(read {:d} of {:d} bytes)")
+ raise ValueError(msg.format(len(self._cached_page),
+ self._page_length))
+
+ self._read_page_header()
+ page_type = self._current_page_type
+ if page_type == const.page_meta_type:
+ self._process_page_metadata()
+
+ is_data_page = page_type & const.page_data_type
+ pt = [const.page_meta_type] + const.page_mix_types
+ if not is_data_page and self._current_page_type not in pt:
+ return self._read_next_page()
+
+ return False
+
+ def _chunk_to_dataframe(self):
+
+ n = self._current_row_in_chunk_index
+ m = self._current_row_in_file_index
+ ix = range(m - n, m)
+ rslt = pd.DataFrame(index=ix)
+
+ js, jb = 0, 0
+ for j in range(self.column_count):
+
+ name = self.column_names[j]
+
+ if self._column_types[j] == b'd':
+ rslt[name] = self._byte_chunk[jb, :].view(
+ dtype=self.byte_order + 'd')
+ rslt[name] = np.asarray(rslt[name], dtype=np.float64)
+ if self.convert_dates:
+ unit = None
+ if self.column_formats[j] in const.sas_date_formats:
+ unit = 'd'
+ elif self.column_formats[j] in const.sas_datetime_formats:
+ unit = 's'
+ if unit:
+ rslt[name] = pd.to_datetime(rslt[name], unit=unit,
+ origin="1960-01-01")
+ jb += 1
+ elif self._column_types[j] == b's':
+ rslt[name] = self._string_chunk[js, :]
+ if self.convert_text and (self.encoding is not None):
+ rslt[name] = rslt[name].str.decode(
+ self.encoding or self.default_encoding)
+ if self.blank_missing:
+ ii = rslt[name].str.len() == 0
+ rslt.loc[ii, name] = np.nan
+ js += 1
+ else:
+ self.close()
+ raise ValueError("unknown column type {type}".format(
+ type=self._column_types[j]))
+
+ return rslt
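
The convert_dates branch in _chunk_to_dataframe above works because SAS stores dates as day counts and datetimes as second counts from the SAS epoch of 1960-01-01, hence the origin="1960-01-01" passed to pd.to_datetime. A minimal sketch of that conversion on its own (the sample values are illustrative):

    import pandas as pd

    # Raw SAS date values: days since 1960-01-01 (unit 'd' in the reader above).
    sas_days = pd.Series([0.0, 21915.0])
    print(pd.to_datetime(sas_days, unit='d', origin="1960-01-01"))
    # -> 1960-01-01 and 2020-01-01

    # Raw SAS datetime values: seconds since the same epoch (unit 's').
    sas_secs = pd.Series([0.0, 86400.0])
    print(pd.to_datetime(sas_secs, unit='s', origin="1960-01-01"))
    # -> 1960-01-01 00:00:00 and 1960-01-02 00:00:00
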
diff --git a/contrib/python/pandas/py2/pandas/io/sas/sas_constants.py b/contrib/python/pandas/py2/pandas/io/sas/sas_constants.py
new file mode 100644
index 00000000000..98502d32d39
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/io/sas/sas_constants.py
@@ -0,0 +1,171 @@
+magic = (b"\x00\x00\x00\x00\x00\x00\x00\x00" +
+ b"\x00\x00\x00\x00\xc2\xea\x81\x60" +
+ b"\xb3\x14\x11\xcf\xbd\x92\x08\x00" +
+ b"\x09\xc7\x31\x8c\x18\x1f\x10\x11")
+
+align_1_checker_value = b'3'
+align_1_offset = 32
+align_1_length = 1
+align_1_value = 4
+u64_byte_checker_value = b'3'
+align_2_offset = 35
+align_2_length = 1
+align_2_value = 4
+endianness_offset = 37
+endianness_length = 1
+platform_offset = 39
+platform_length = 1
+encoding_offset = 70
+encoding_length = 1
+dataset_offset = 92
+dataset_length = 64
+file_type_offset = 156
+file_type_length = 8
+date_created_offset = 164
+date_created_length = 8
+date_modified_offset = 172
+date_modified_length = 8
+header_size_offset = 196
+header_size_length = 4
+page_size_offset = 200
+page_size_length = 4
+page_count_offset = 204
+page_count_length = 4
+sas_release_offset = 216
+sas_release_length = 8
+sas_server_type_offset = 224
+sas_server_type_length = 16
+os_version_number_offset = 240
+os_version_number_length = 16
+os_maker_offset = 256
+os_maker_length = 16
+os_name_offset = 272
+os_name_length = 16
+page_bit_offset_x86 = 16
+page_bit_offset_x64 = 32
+subheader_pointer_length_x86 = 12
+subheader_pointer_length_x64 = 24
+page_type_offset = 0
+page_type_length = 2
+block_count_offset = 2
+block_count_length = 2
+subheader_count_offset = 4
+subheader_count_length = 2
+page_meta_type = 0
+page_data_type = 256
+page_amd_type = 1024
+page_metc_type = 16384
+page_comp_type = -28672
+page_mix_types = [512, 640]
+subheader_pointers_offset = 8
+truncated_subheader_id = 1
+compressed_subheader_id = 4
+compressed_subheader_type = 1
+text_block_size_length = 2
+row_length_offset_multiplier = 5
+row_count_offset_multiplier = 6
+col_count_p1_multiplier = 9
+col_count_p2_multiplier = 10
+row_count_on_mix_page_offset_multiplier = 15
+column_name_pointer_length = 8
+column_name_text_subheader_offset = 0
+column_name_text_subheader_length = 2
+column_name_offset_offset = 2
+column_name_offset_length = 2
+column_name_length_offset = 4
+column_name_length_length = 2
+column_data_offset_offset = 8
+column_data_length_offset = 8
+column_data_length_length = 4
+column_type_offset = 14
+column_type_length = 1
+column_format_text_subheader_index_offset = 22
+column_format_text_subheader_index_length = 2
+column_format_offset_offset = 24
+column_format_offset_length = 2
+column_format_length_offset = 26
+column_format_length_length = 2
+column_label_text_subheader_index_offset = 28
+column_label_text_subheader_index_length = 2
+column_label_offset_offset = 30
+column_label_offset_length = 2
+column_label_length_offset = 32
+column_label_length_length = 2
+rle_compression = b'SASYZCRL'
+rdc_compression = b'SASYZCR2'
+
+compression_literals = [rle_compression, rdc_compression]
+
+# Incomplete list of encodings, using SAS nomenclature:
+# http://support.sas.com/documentation/cdl/en/nlsref/61893/HTML/default/viewer.htm#a002607278.htm
+encoding_names = {29: "latin1", 20: "utf-8", 33: "cyrillic", 60: "wlatin2",
+ 61: "wcyrillic", 62: "wlatin1", 90: "ebcdic870"}
+
+
+class SASIndex(object):
+ row_size_index = 0
+ column_size_index = 1
+ subheader_counts_index = 2
+ column_text_index = 3
+ column_name_index = 4
+ column_attributes_index = 5
+ format_and_label_index = 6
+ column_list_index = 7
+ data_subheader_index = 8
+
+
+subheader_signature_to_index = {
+ b"\xF7\xF7\xF7\xF7": SASIndex.row_size_index,
+ b"\x00\x00\x00\x00\xF7\xF7\xF7\xF7": SASIndex.row_size_index,
+ b"\xF7\xF7\xF7\xF7\x00\x00\x00\x00": SASIndex.row_size_index,
+ b"\xF7\xF7\xF7\xF7\xFF\xFF\xFB\xFE": SASIndex.row_size_index,
+ b"\xF6\xF6\xF6\xF6": SASIndex.column_size_index,
+ b"\x00\x00\x00\x00\xF6\xF6\xF6\xF6": SASIndex.column_size_index,
+ b"\xF6\xF6\xF6\xF6\x00\x00\x00\x00": SASIndex.column_size_index,
+ b"\xF6\xF6\xF6\xF6\xFF\xFF\xFB\xFE": SASIndex.column_size_index,
+ b"\x00\xFC\xFF\xFF": SASIndex.subheader_counts_index,
+ b"\xFF\xFF\xFC\x00": SASIndex.subheader_counts_index,
+ b"\x00\xFC\xFF\xFF\xFF\xFF\xFF\xFF": SASIndex.subheader_counts_index,
+ b"\xFF\xFF\xFF\xFF\xFF\xFF\xFC\x00": SASIndex.subheader_counts_index,
+ b"\xFD\xFF\xFF\xFF": SASIndex.column_text_index,
+ b"\xFF\xFF\xFF\xFD": SASIndex.column_text_index,
+ b"\xFD\xFF\xFF\xFF\xFF\xFF\xFF\xFF": SASIndex.column_text_index,
+ b"\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFD": SASIndex.column_text_index,
+ b"\xFF\xFF\xFF\xFF": SASIndex.column_name_index,
+ b"\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF": SASIndex.column_name_index,
+ b"\xFC\xFF\xFF\xFF": SASIndex.column_attributes_index,
+ b"\xFF\xFF\xFF\xFC": SASIndex.column_attributes_index,
+ b"\xFC\xFF\xFF\xFF\xFF\xFF\xFF\xFF": SASIndex.column_attributes_index,
+ b"\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFC": SASIndex.column_attributes_index,
+ b"\xFE\xFB\xFF\xFF": SASIndex.format_and_label_index,
+ b"\xFF\xFF\xFB\xFE": SASIndex.format_and_label_index,
+ b"\xFE\xFB\xFF\xFF\xFF\xFF\xFF\xFF": SASIndex.format_and_label_index,
+ b"\xFF\xFF\xFF\xFF\xFF\xFF\xFB\xFE": SASIndex.format_and_label_index,
+ b"\xFE\xFF\xFF\xFF": SASIndex.column_list_index,
+ b"\xFF\xFF\xFF\xFE": SASIndex.column_list_index,
+ b"\xFE\xFF\xFF\xFF\xFF\xFF\xFF\xFF": SASIndex.column_list_index,
+ b"\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFE": SASIndex.column_list_index}
+
+
+# List of frequently used SAS date and datetime formats
+# http://support.sas.com/documentation/cdl/en/etsug/60372/HTML/default/viewer.htm#etsug_intervals_sect009.htm
+# https://github.com/epam/parso/blob/master/src/main/java/com/epam/parso/impl/SasFileConstants.java
+sas_date_formats = ("DATE", "DAY", "DDMMYY", "DOWNAME", "JULDAY", "JULIAN",
+ "MMDDYY", "MMYY", "MMYYC", "MMYYD", "MMYYP", "MMYYS",
+ "MMYYN", "MONNAME", "MONTH", "MONYY", "QTR", "QTRR",
+ "NENGO", "WEEKDATE", "WEEKDATX", "WEEKDAY", "WEEKV",
+ "WORDDATE", "WORDDATX", "YEAR", "YYMM", "YYMMC", "YYMMD",
+ "YYMMP", "YYMMS", "YYMMN", "YYMON", "YYMMDD", "YYQ",
+ "YYQC", "YYQD", "YYQP", "YYQS", "YYQN", "YYQR", "YYQRC",
+ "YYQRD", "YYQRP", "YYQRS", "YYQRN",
+ "YYMMDDP", "YYMMDDC", "E8601DA", "YYMMDDN", "MMDDYYC",
+ "MMDDYYS", "MMDDYYD", "YYMMDDS", "B8601DA", "DDMMYYN",
+ "YYMMDDD", "DDMMYYB", "DDMMYYP", "MMDDYYP", "YYMMDDB",
+ "MMDDYYN", "DDMMYYC", "DDMMYYD", "DDMMYYS",
+ "MINGUO")
+
+sas_datetime_formats = ("DATETIME", "DTWKDATX",
+ "B8601DN", "B8601DT", "B8601DX", "B8601DZ", "B8601LX",
+ "E8601DN", "E8601DT", "E8601DX", "E8601DZ", "E8601LX",
+ "DATEAMPM", "DTDATE", "DTMONYY", "DTMONYY", "DTWKDATX",
+ "DTYEAR", "TOD", "MDYAMPM")
diff --git a/contrib/python/pandas/py2/pandas/io/sas/sas_xport.py b/contrib/python/pandas/py2/pandas/io/sas/sas_xport.py
new file mode 100644
index 00000000000..3c607d62b42
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/io/sas/sas_xport.py
@@ -0,0 +1,464 @@
+"""
+Read a SAS XPort format file into a Pandas DataFrame.
+
+Based on code from Jack Cushman (github.com/jcushman/xport).
+
+The file format is defined here:
+
+https://support.sas.com/techsup/technote/ts140.pdf
+"""
+
+from datetime import datetime
+import struct
+import warnings
+
+import numpy as np
+
+from pandas.util._decorators import Appender
+
+import pandas as pd
+from pandas import compat
+
+from pandas.io.common import BaseIterator, get_filepath_or_buffer
+
+_correct_line1 = ("HEADER RECORD*******LIBRARY HEADER RECORD!!!!!!!"
+ "000000000000000000000000000000 ")
+_correct_header1 = ("HEADER RECORD*******MEMBER HEADER RECORD!!!!!!!"
+ "000000000000000001600000000")
+_correct_header2 = ("HEADER RECORD*******DSCRPTR HEADER RECORD!!!!!!!"
+ "000000000000000000000000000000 ")
+_correct_obs_header = ("HEADER RECORD*******OBS HEADER RECORD!!!!!!!"
+ "000000000000000000000000000000 ")
+_fieldkeys = ['ntype', 'nhfun', 'field_length', 'nvar0', 'name', 'label',
+ 'nform', 'nfl', 'num_decimals', 'nfj', 'nfill', 'niform',
+ 'nifl', 'nifd', 'npos', '_']
+
+
+_base_params_doc = """\
+Parameters
+----------
+filepath_or_buffer : string or file-like object
+ Path to SAS file or object implementing binary read method."""
+
+_params2_doc = """\
+index : identifier of index column
+ Identifier of column that should be used as index of the DataFrame.
+encoding : string
+ Encoding for text data.
+chunksize : int
+ Read file `chunksize` lines at a time, returns iterator."""
+
+_format_params_doc = """\
+format : string
+ File format, only `xport` is currently supported."""
+
+_iterator_doc = """\
+iterator : boolean, default False
+ Return XportReader object for reading file incrementally."""
+
+
+_read_sas_doc = """Read a SAS file into a DataFrame.
+
+%(_base_params_doc)s
+%(_format_params_doc)s
+%(_params2_doc)s
+%(_iterator_doc)s
+
+Returns
+-------
+DataFrame or XportReader
+
+Examples
+--------
+Read a SAS Xport file:
+
+>>> df = pd.read_sas('filename.XPT')
+
+Read an Xport file in 10,000-line chunks:
+
+>>> itr = pd.read_sas('filename.XPT', chunksize=10000)
+>>> for chunk in itr:
+...     do_something(chunk)
+
+""" % {"_base_params_doc": _base_params_doc,
+ "_format_params_doc": _format_params_doc,
+ "_params2_doc": _params2_doc,
+ "_iterator_doc": _iterator_doc}
+
+
+_xport_reader_doc = """\
+Class for reading SAS Xport files.
+
+%(_base_params_doc)s
+%(_params2_doc)s
+
+Attributes
+----------
+member_info : list
+ Contains information about the file
+fields : list
+ Contains information about the variables in the file
+""" % {"_base_params_doc": _base_params_doc,
+ "_params2_doc": _params2_doc}
+
+
+_read_method_doc = """\
+Read observations from SAS Xport file, returning as data frame.
+
+Parameters
+----------
+nrows : int
+ Number of rows to read from data file; if None, read whole
+ file.
+
+Returns
+-------
+A DataFrame.
+"""
+
+
+def _parse_date(datestr):
+ """ Given a date in xport format, return Python date. """
+ try:
+ # e.g. "16FEB11:10:07:55"
+ return datetime.strptime(datestr, "%d%b%y:%H:%M:%S")
+ except ValueError:
+ return pd.NaT
+
+
+def _split_line(s, parts):
+ """
+ Parameters
+ ----------
+ s: string
+ Fixed-length string to split
+ parts: list of (name, length) pairs
+ Used to break up the string; the name '_' is filtered from the output.
+
+ Returns
+ -------
+ Dict of name:contents of string at given location.
+ """
+ out = {}
+ start = 0
+ for name, length in parts:
+ out[name] = s[start:start + length].strip()
+ start += length
+ del out['_']
+ return out
+
+
+def _handle_truncated_float_vec(vec, nbytes):
+ # This feature is not well documented, but some SAS XPORT files
+ # have 2-7 byte "truncated" floats. To read these truncated
+ # floats, pad them with zeros on the right to make 8 byte floats.
+ #
+ # References:
+ # https://github.com/jcushman/xport/pull/3
+ # The R "foreign" library
+
+ if nbytes != 8:
+ vec1 = np.zeros(len(vec), np.dtype('S8'))
+ dtype = np.dtype('S%d,S%d' % (nbytes, 8 - nbytes))
+ vec2 = vec1.view(dtype=dtype)
+ vec2['f0'] = vec
+ return vec2
+
+ return vec
+
+
+def _parse_float_vec(vec):
+ """
+ Parse a vector of float values representing IBM 8 byte floats into
+ native 8 byte floats.
+ """
+
+ dtype = np.dtype('>u4,>u4')
+ vec1 = vec.view(dtype=dtype)
+ xport1 = vec1['f0']
+ xport2 = vec1['f1']
+
+ # Start by setting first half of ieee number to first half of IBM
+ # number sans exponent
+ ieee1 = xport1 & 0x00ffffff
+
+ # The fraction bit to the left of the binary point in the ieee
+ # format was set and the number was shifted 0, 1, 2, or 3
+ # places. This will tell us how to adjust the ibm exponent to be a
+ # power of 2 ieee exponent and how to shift the fraction bits to
+ # restore the correct magnitude.
+ shift = np.zeros(len(vec), dtype=np.uint8)
+ shift[np.where(xport1 & 0x00200000)] = 1
+ shift[np.where(xport1 & 0x00400000)] = 2
+ shift[np.where(xport1 & 0x00800000)] = 3
+
+ # shift the ieee number down the correct number of places then
+ # set the second half of the ieee number to be the second half
+ # of the ibm number shifted appropriately, ored with the bits
+ # from the first half that would have been shifted in if we
+ # could shift a double. All we are worried about are the low
+ # order 3 bits of the first half since we're only shifting by
+ # 1, 2, or 3.
+ ieee1 >>= shift
+ ieee2 = (xport2 >> shift) | ((xport1 & 0x00000007) << (29 + (3 - shift)))
+
+ # clear the 1 bit to the left of the binary point
+ ieee1 &= 0xffefffff
+
+ # set the exponent of the ieee number to be the actual exponent
+ # plus the shift count + 1023. Or this into the first half of the
+ # ieee number. The ibm exponent is excess 64 but is adjusted by 65
+ # since during conversion to ibm format the exponent is
+ # incremented by 1 and the fraction bits left 4 positions to the
+ # right of the radix point. (had to add >> 24 because C treats &
+ # 0x7f as 0x7f000000 and Python doesn't)
+ ieee1 |= ((((((xport1 >> 24) & 0x7f) - 65) << 2) +
+ shift + 1023) << 20) | (xport1 & 0x80000000)
+
+ ieee = np.empty((len(ieee1),), dtype='>u4,>u4')
+ ieee['f0'] = ieee1
+ ieee['f1'] = ieee2
+ ieee = ieee.view(dtype='>f8')
+ ieee = ieee.astype('f8')
+
+ return ieee
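
As a concrete check on the bit manipulation above: the IBM hexadecimal-float encoding of 1.0 is the byte sequence 41 10 00 00 00 00 00 00 (sign 0, excess-64 exponent 0x41, fraction 1/16), and running it through the routine should produce an IEEE 1.0. A small sketch that calls the private helper directly, purely for illustration:

    import numpy as np
    from pandas.io.sas.sas_xport import _parse_float_vec

    # One IBM double: 0x4110000000000000 encodes 1.0
    ibm_one = np.frombuffer(b"\x41\x10\x00\x00\x00\x00\x00\x00", dtype="S8")
    print(_parse_float_vec(ibm_one))    # expected: [1.]
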
+
+
+class XportReader(BaseIterator):
+ __doc__ = _xport_reader_doc
+
+ def __init__(self, filepath_or_buffer, index=None, encoding='ISO-8859-1',
+ chunksize=None):
+
+ self._encoding = encoding
+ self._lines_read = 0
+ self._index = index
+ self._chunksize = chunksize
+
+ if isinstance(filepath_or_buffer, str):
+ (filepath_or_buffer, encoding,
+ compression, should_close) = get_filepath_or_buffer(
+ filepath_or_buffer, encoding=encoding)
+
+ if isinstance(filepath_or_buffer, (str, compat.text_type, bytes)):
+ self.filepath_or_buffer = open(filepath_or_buffer, 'rb')
+ else:
+ # Copy to BytesIO, and ensure no encoding
+ contents = filepath_or_buffer.read()
+ try:
+ contents = contents.encode(self._encoding)
+ except UnicodeEncodeError:
+ pass
+ self.filepath_or_buffer = compat.BytesIO(contents)
+
+ self._read_header()
+
+ def close(self):
+ self.filepath_or_buffer.close()
+
+ def _get_row(self):
+ return self.filepath_or_buffer.read(80).decode()
+
+ def _read_header(self):
+ self.filepath_or_buffer.seek(0)
+
+ # read file header
+ line1 = self._get_row()
+ if line1 != _correct_line1:
+ self.close()
+ raise ValueError("Header record is not an XPORT file.")
+
+ line2 = self._get_row()
+ fif = [['prefix', 24], ['version', 8], ['OS', 8],
+ ['_', 24], ['created', 16]]
+ file_info = _split_line(line2, fif)
+ if file_info['prefix'] != "SAS SAS SASLIB":
+ self.close()
+ raise ValueError("Header record has invalid prefix.")
+ file_info['created'] = _parse_date(file_info['created'])
+ self.file_info = file_info
+
+ line3 = self._get_row()
+ file_info['modified'] = _parse_date(line3[:16])
+
+ # read member header
+ header1 = self._get_row()
+ header2 = self._get_row()
+ headflag1 = header1.startswith(_correct_header1)
+ headflag2 = (header2 == _correct_header2)
+ if not (headflag1 and headflag2):
+ self.close()
+ raise ValueError("Member header not found")
+ # usually 140, could be 135
+ fieldnamelength = int(header1[-5:-2])
+
+ # member info
+ mem = [['prefix', 8], ['set_name', 8], ['sasdata', 8],
+ ['version', 8], ['OS', 8], ['_', 24], ['created', 16]]
+ member_info = _split_line(self._get_row(), mem)
+ mem = [['modified', 16], ['_', 16], ['label', 40], ['type', 8]]
+ member_info.update(_split_line(self._get_row(), mem))
+ member_info['modified'] = _parse_date(member_info['modified'])
+ member_info['created'] = _parse_date(member_info['created'])
+ self.member_info = member_info
+
+ # read field names
+ types = {1: 'numeric', 2: 'char'}
+ fieldcount = int(self._get_row()[54:58])
+ datalength = fieldnamelength * fieldcount
+ # round up to nearest 80
+ if datalength % 80:
+ datalength += 80 - datalength % 80
+ fielddata = self.filepath_or_buffer.read(datalength)
+ fields = []
+ obs_length = 0
+ while len(fielddata) >= fieldnamelength:
+ # pull data for one field
+ field, fielddata = (fielddata[:fieldnamelength],
+ fielddata[fieldnamelength:])
+
+ # rest at end gets ignored, so if field is short, pad out
+ # to match struct pattern below
+ field = field.ljust(140)
+
+ fieldstruct = struct.unpack('>hhhh8s40s8shhh2s8shhl52s', field)
+ field = dict(zip(_fieldkeys, fieldstruct))
+ del field['_']
+ field['ntype'] = types[field['ntype']]
+ fl = field['field_length']
+ if field['ntype'] == 'numeric' and ((fl < 2) or (fl > 8)):
+ self.close()
+ msg = "Floating field width {0} is not between 2 and 8."
+ raise TypeError(msg.format(fl))
+
+ for k, v in field.items():
+ try:
+ field[k] = v.strip()
+ except AttributeError:
+ pass
+
+ obs_length += field['field_length']
+ fields += [field]
+
+ header = self._get_row()
+ if not header == _correct_obs_header:
+ self.close()
+ raise ValueError("Observation header not found.")
+
+ self.fields = fields
+ self.record_length = obs_length
+ self.record_start = self.filepath_or_buffer.tell()
+
+ self.nobs = self._record_count()
+ self.columns = [x['name'].decode() for x in self.fields]
+
+ # Setup the dtype.
+ dtypel = [('s' + str(i), "S" + str(field['field_length']))
+ for i, field in enumerate(self.fields)]
+ dtype = np.dtype(dtypel)
+ self._dtype = dtype
+
+ def __next__(self):
+ return self.read(nrows=self._chunksize or 1)
+
+ def _record_count(self):
+ """
+ Get number of records in file.
+
+ This may be suboptimal because we have to seek to the end of
+ the file.
+
+ Side effect: returns file position to record_start.
+ """
+
+ self.filepath_or_buffer.seek(0, 2)
+ total_records_length = (self.filepath_or_buffer.tell() -
+ self.record_start)
+
+ if total_records_length % 80 != 0:
+ warnings.warn("xport file may be corrupted")
+
+ if self.record_length > 80:
+ self.filepath_or_buffer.seek(self.record_start)
+ return total_records_length // self.record_length
+
+ self.filepath_or_buffer.seek(-80, 2)
+ last_card = self.filepath_or_buffer.read(80)
+ last_card = np.frombuffer(last_card, dtype=np.uint64)
+
+ # 8 byte blank
+ ix = np.flatnonzero(last_card == 2314885530818453536)
+
+ if len(ix) == 0:
+ tail_pad = 0
+ else:
+ tail_pad = 8 * len(ix)
+
+ self.filepath_or_buffer.seek(self.record_start)
+
+ return (total_records_length - tail_pad) // self.record_length
+
+ def get_chunk(self, size=None):
+ """
+ Read lines from the Xport file and return them as a DataFrame.
+
+ Parameters
+ ----------
+ size : int, defaults to None
+ Number of lines to read. If None, reads whole file.
+
+ Returns
+ -------
+ DataFrame
+ """
+ if size is None:
+ size = self._chunksize
+ return self.read(nrows=size)
+
+ def _missing_double(self, vec):
+ v = vec.view(dtype='u1,u1,u2,u4')
+ miss = (v['f1'] == 0) & (v['f2'] == 0) & (v['f3'] == 0)
+ miss1 = (((v['f0'] >= 0x41) & (v['f0'] <= 0x5a)) |
+ (v['f0'] == 0x5f) | (v['f0'] == 0x2e))
+ miss &= miss1
+ return miss
+
+ @Appender(_read_method_doc)
+ def read(self, nrows=None):
+
+ if nrows is None:
+ nrows = self.nobs
+
+ read_lines = min(nrows, self.nobs - self._lines_read)
+ read_len = read_lines * self.record_length
+ if read_len <= 0:
+ self.close()
+ raise StopIteration
+ raw = self.filepath_or_buffer.read(read_len)
+ data = np.frombuffer(raw, dtype=self._dtype, count=read_lines)
+
+ df = pd.DataFrame(index=range(read_lines))
+ for j, x in enumerate(self.columns):
+ vec = data['s%d' % j]
+ ntype = self.fields[j]['ntype']
+ if ntype == "numeric":
+ vec = _handle_truncated_float_vec(
+ vec, self.fields[j]['field_length'])
+ miss = self._missing_double(vec)
+ v = _parse_float_vec(vec)
+ v[miss] = np.nan
+ elif self.fields[j]['ntype'] == 'char':
+ v = [y.rstrip() for y in vec]
+ if compat.PY3:
+ if self._encoding is not None:
+ v = [y.decode(self._encoding) for y in v]
+ df[x] = v
+
+ if self._index is None:
+ df.index = range(self._lines_read, self._lines_read + read_lines)
+ else:
+ df = df.set_index(self._index)
+
+ self._lines_read += read_lines
+
+ return df
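
A hedged usage sketch of XportReader following the chunked pattern from the module docstring; the file name and the per-chunk handler are placeholders:

    from pandas.io.sas.sas_xport import XportReader

    def handle(chunk):
        # placeholder per-chunk handler
        print("{} rows".format(len(chunk)))

    reader = XportReader("example.XPT", encoding="ISO-8859-1", chunksize=10000)
    for chunk in reader:      # each chunk is a DataFrame of at most 10000 rows
        handle(chunk)
    reader.close()            # read() also closes the handle once it is exhausted
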
diff --git a/contrib/python/pandas/py2/pandas/io/sas/sasreader.py b/contrib/python/pandas/py2/pandas/io/sas/sasreader.py
new file mode 100644
index 00000000000..9fae0da670b
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/io/sas/sasreader.py
@@ -0,0 +1,68 @@
+"""
+Read SAS sas7bdat or xport files.
+"""
+from pandas import compat
+
+from pandas.io.common import _stringify_path
+
+
+def read_sas(filepath_or_buffer, format=None, index=None, encoding=None,
+ chunksize=None, iterator=False):
+ """
+ Read SAS files stored as either XPORT or SAS7BDAT format files.
+
+ Parameters
+ ----------
+ filepath_or_buffer : string or file-like object
+ Path to the SAS file.
+ format : string {'xport', 'sas7bdat'} or None
+ If None, file format is inferred from file extension. If 'xport' or
+ 'sas7bdat', uses the corresponding format.
+ index : identifier of index column, defaults to None
+ Identifier of column that should be used as index of the DataFrame.
+ encoding : string, default is None
+ Encoding for text data. If None, text data are stored as raw bytes.
+ chunksize : int
+ Read file `chunksize` lines at a time, returns iterator.
+ iterator : bool, defaults to False
+ If True, returns an iterator for reading the file incrementally.
+
+ Returns
+ -------
+ DataFrame if iterator=False and chunksize=None, else SAS7BDATReader
+ or XportReader
+ """
+ if format is None:
+ buffer_error_msg = ("If this is a buffer object rather "
+ "than a string name, you must specify "
+ "a format string")
+ filepath_or_buffer = _stringify_path(filepath_or_buffer)
+ if not isinstance(filepath_or_buffer, compat.string_types):
+ raise ValueError(buffer_error_msg)
+ fname = filepath_or_buffer.lower()
+ if fname.endswith(".xpt"):
+ format = "xport"
+ elif fname.endswith(".sas7bdat"):
+ format = "sas7bdat"
+ else:
+ raise ValueError("unable to infer format of SAS file")
+
+ if format.lower() == 'xport':
+ from pandas.io.sas.sas_xport import XportReader
+ reader = XportReader(filepath_or_buffer, index=index,
+ encoding=encoding,
+ chunksize=chunksize)
+ elif format.lower() == 'sas7bdat':
+ from pandas.io.sas.sas7bdat import SAS7BDATReader
+ reader = SAS7BDATReader(filepath_or_buffer, index=index,
+ encoding=encoding,
+ chunksize=chunksize)
+ else:
+ raise ValueError('unknown SAS format')
+
+ if iterator or chunksize:
+ return reader
+
+ data = reader.read()
+ reader.close()
+ return data
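
A short usage sketch of read_sas; because format inference relies only on the file extension, passing a buffer (or an unusually named file) requires an explicit format argument. File names below are illustrative:

    import pandas as pd

    # Extension-based dispatch: '.sas7bdat' -> SAS7BDATReader, '.xpt' -> XportReader.
    df = pd.read_sas("airline.sas7bdat", encoding="latin-1")

    # A buffer has no extension, so the format must be stated explicitly.
    with open("airline.xpt", "rb") as f:
        df2 = pd.read_sas(f, format="xport")

    # iterator=True or chunksize=... returns the reader instead of a DataFrame.
    reader = pd.read_sas("airline.sas7bdat", chunksize=5000, encoding="latin-1")
    first_chunk = reader.read()
    reader.close()
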
diff --git a/contrib/python/pandas/py2/pandas/io/sql.py b/contrib/python/pandas/py2/pandas/io/sql.py
new file mode 100644
index 00000000000..aaface54153
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/io/sql.py
@@ -0,0 +1,1596 @@
+# -*- coding: utf-8 -*-
+"""
+Collection of query wrappers / abstractions to both facilitate data
+retrieval and to reduce dependency on DB-specific API.
+"""
+
+from __future__ import division, print_function
+
+from contextlib import contextmanager
+from datetime import date, datetime, time
+from functools import partial
+import re
+import warnings
+
+import numpy as np
+
+import pandas._libs.lib as lib
+from pandas.compat import (
+ map, raise_with_traceback, string_types, text_type, zip)
+
+from pandas.core.dtypes.common import (
+ is_datetime64tz_dtype, is_dict_like, is_list_like)
+from pandas.core.dtypes.dtypes import DatetimeTZDtype
+from pandas.core.dtypes.missing import isna
+
+from pandas.core.api import DataFrame, Series
+from pandas.core.base import PandasObject
+from pandas.core.tools.datetimes import to_datetime
+
+
+class SQLAlchemyRequired(ImportError):
+ pass
+
+
+class DatabaseError(IOError):
+ pass
+
+
+# -----------------------------------------------------------------------------
+# -- Helper functions
+
+_SQLALCHEMY_INSTALLED = None
+
+
+def _is_sqlalchemy_connectable(con):
+ global _SQLALCHEMY_INSTALLED
+ if _SQLALCHEMY_INSTALLED is None:
+ try:
+ import sqlalchemy
+ _SQLALCHEMY_INSTALLED = True
+
+ from distutils.version import LooseVersion
+ ver = sqlalchemy.__version__
+ # For sqlalchemy versions < 0.8.2, the BIGINT type is recognized
+ # for a sqlite engine, which results in a warning when trying to
+ # read/write a DataFrame with int64 values. (GH7433)
+ if LooseVersion(ver) < LooseVersion('0.8.2'):
+ from sqlalchemy import BigInteger
+ from sqlalchemy.ext.compiler import compiles
+
+ @compiles(BigInteger, 'sqlite')
+ def compile_big_int_sqlite(type_, compiler, **kw):
+ return 'INTEGER'
+ except ImportError:
+ _SQLALCHEMY_INSTALLED = False
+
+ if _SQLALCHEMY_INSTALLED:
+ import sqlalchemy
+ return isinstance(con, sqlalchemy.engine.Connectable)
+ else:
+ return False
+
+
+def _convert_params(sql, params):
+ """Convert SQL and params args to DBAPI2.0 compliant format."""
+ args = [sql]
+ if params is not None:
+ if hasattr(params, 'keys'): # test if params is a mapping
+ args += [params]
+ else:
+ args += [list(params)]
+ return args
+
+
+def _process_parse_dates_argument(parse_dates):
+ """Process parse_dates argument for read_sql functions"""
+ # handle non-list entries for parse_dates gracefully
+ if parse_dates is True or parse_dates is None or parse_dates is False:
+ parse_dates = []
+
+ elif not hasattr(parse_dates, '__iter__'):
+ parse_dates = [parse_dates]
+ return parse_dates
+
+
+def _handle_date_column(col, utc=None, format=None):
+ if isinstance(format, dict):
+ return to_datetime(col, errors='ignore', **format)
+ else:
+ # Allow passing of formatting string for integers
+ # GH17855
+ if format is None and (issubclass(col.dtype.type, np.floating) or
+ issubclass(col.dtype.type, np.integer)):
+ format = 's'
+ if format in ['D', 'd', 'h', 'm', 's', 'ms', 'us', 'ns']:
+ return to_datetime(col, errors='coerce', unit=format, utc=utc)
+ elif is_datetime64tz_dtype(col):
+ # coerce to UTC timezone
+ # GH11216
+ return to_datetime(col, utc=True)
+ else:
+ return to_datetime(col, errors='coerce', format=format, utc=utc)
+
+
+def _parse_date_columns(data_frame, parse_dates):
+ """
+ Force columns designated in parse_dates to be parsed as datetimes.
+ Supports both string formatted and integer timestamp columns.
+ """
+ parse_dates = _process_parse_dates_argument(parse_dates)
+
+ # we want to coerce datetime64_tz dtypes for now to UTC
+ # we could in theory do a 'nice' conversion from a FixedOffset tz
+ # GH11216
+ for col_name, df_col in data_frame.iteritems():
+ if is_datetime64tz_dtype(df_col) or col_name in parse_dates:
+ try:
+ fmt = parse_dates[col_name]
+ except TypeError:
+ fmt = None
+ data_frame[col_name] = _handle_date_column(df_col, format=fmt)
+
+ return data_frame
+
+
+def _wrap_result(data, columns, index_col=None, coerce_float=True,
+ parse_dates=None):
+ """Wrap result set of query in a DataFrame."""
+
+ frame = DataFrame.from_records(data, columns=columns,
+ coerce_float=coerce_float)
+
+ frame = _parse_date_columns(frame, parse_dates)
+
+ if index_col is not None:
+ frame.set_index(index_col, inplace=True)
+
+ return frame
+
+
+def execute(sql, con, cur=None, params=None):
+ """
+ Execute the given SQL query using the provided connection object.
+
+ Parameters
+ ----------
+ sql : string
+ SQL query to be executed.
+ con : SQLAlchemy connectable(engine/connection) or sqlite3 connection
+ Using SQLAlchemy makes it possible to use any DB supported by the
+ library.
+ If a DBAPI2 object, only sqlite3 is supported.
+ cur : deprecated, cursor is obtained from connection, default: None
+ params : list or tuple, optional, default: None
+ List of parameters to pass to execute method.
+
+ Returns
+ -------
+ Results Iterable
+ """
+ if cur is None:
+ pandas_sql = pandasSQL_builder(con)
+ else:
+ pandas_sql = pandasSQL_builder(cur, is_cursor=True)
+ args = _convert_params(sql, params)
+ return pandas_sql.execute(*args)
+
+
+# -----------------------------------------------------------------------------
+# -- Read and write to DataFrames
+
+def read_sql_table(table_name, con, schema=None, index_col=None,
+ coerce_float=True, parse_dates=None, columns=None,
+ chunksize=None):
+ """Read SQL database table into a DataFrame.
+
+ Given a table name and a SQLAlchemy connectable, returns a DataFrame.
+ This function does not support DBAPI connections.
+
+ Parameters
+ ----------
+ table_name : string
+ Name of SQL table in database.
+ con : SQLAlchemy connectable (or database string URI)
+ SQLite DBAPI connection mode not supported.
+ schema : string, default None
+ Name of SQL schema in database to query (if database flavor
+ supports this). Uses default schema if None (default).
+ index_col : string or list of strings, optional, default: None
+ Column(s) to set as index(MultiIndex).
+ coerce_float : boolean, default True
+ Attempts to convert values of non-string, non-numeric objects (like
+ decimal.Decimal) to floating point. Can result in loss of precision.
+ parse_dates : list or dict, default: None
+ - List of column names to parse as dates.
+ - Dict of ``{column_name: format string}`` where format string is
+ strftime compatible in case of parsing string times or is one of
+ (D, s, ns, ms, us) in case of parsing integer timestamps.
+ - Dict of ``{column_name: arg dict}``, where the arg dict corresponds
+ to the keyword arguments of :func:`pandas.to_datetime`
+ Especially useful with databases without native Datetime support,
+ such as SQLite.
+ columns : list, default: None
+ List of column names to select from SQL table
+ chunksize : int, default None
+ If specified, returns an iterator where `chunksize` is the number of
+ rows to include in each chunk.
+
+ Returns
+ -------
+ DataFrame
+
+ See Also
+ --------
+ read_sql_query : Read SQL query into a DataFrame.
+ read_sql
+
+ Notes
+ -----
+ Any datetime values with time zone information will be converted to UTC.
+ """
+
+ con = _engine_builder(con)
+ if not _is_sqlalchemy_connectable(con):
+ raise NotImplementedError("read_sql_table only supported for "
+ "SQLAlchemy connectable.")
+ import sqlalchemy
+ from sqlalchemy.schema import MetaData
+ meta = MetaData(con, schema=schema)
+ try:
+ meta.reflect(only=[table_name], views=True)
+ except sqlalchemy.exc.InvalidRequestError:
+ raise ValueError("Table {name} not found".format(name=table_name))
+
+ pandas_sql = SQLDatabase(con, meta=meta)
+ table = pandas_sql.read_table(
+ table_name, index_col=index_col, coerce_float=coerce_float,
+ parse_dates=parse_dates, columns=columns, chunksize=chunksize)
+
+ if table is not None:
+ return table
+ else:
+ raise ValueError("Table {name} not found".format(name=table_name), con)
+
+
+def read_sql_query(sql, con, index_col=None, coerce_float=True, params=None,
+ parse_dates=None, chunksize=None):
+ """Read SQL query into a DataFrame.
+
+ Returns a DataFrame corresponding to the result set of the query
+ string. Optionally provide an `index_col` parameter to use one of the
+ columns as the index, otherwise default integer index will be used.
+
+ Parameters
+ ----------
+ sql : string SQL query or SQLAlchemy Selectable (select or text object)
+ SQL query to be executed.
+ con : SQLAlchemy connectable(engine/connection), database string URI,
+ or sqlite3 DBAPI2 connection
+ Using SQLAlchemy makes it possible to use any DB supported by that
+ library.
+ If a DBAPI2 object, only sqlite3 is supported.
+ index_col : string or list of strings, optional, default: None
+ Column(s) to set as index(MultiIndex).
+ coerce_float : boolean, default True
+ Attempts to convert values of non-string, non-numeric objects (like
+ decimal.Decimal) to floating point. Useful for SQL result sets.
+ params : list, tuple or dict, optional, default: None
+ List of parameters to pass to execute method. The syntax used
+ to pass parameters is database driver dependent. Check your
+ database driver documentation for which of the five syntax styles,
+ described in PEP 249's paramstyle, is supported.
+ Eg. for psycopg2, uses %(name)s so use params={'name' : 'value'}
+ parse_dates : list or dict, default: None
+ - List of column names to parse as dates.
+ - Dict of ``{column_name: format string}`` where format string is
+ strftime compatible in case of parsing string times, or is one of
+ (D, s, ns, ms, us) in case of parsing integer timestamps.
+ - Dict of ``{column_name: arg dict}``, where the arg dict corresponds
+ to the keyword arguments of :func:`pandas.to_datetime`
+ Especially useful with databases without native Datetime support,
+ such as SQLite.
+ chunksize : int, default None
+ If specified, return an iterator where `chunksize` is the number of
+ rows to include in each chunk.
+
+ Returns
+ -------
+ DataFrame
+
+ See Also
+ --------
+ read_sql_table : Read SQL database table into a DataFrame.
+ read_sql
+
+ Notes
+ -----
+ Any datetime values with time zone information parsed via the `parse_dates`
+ parameter will be converted to UTC.
+ """
+ pandas_sql = pandasSQL_builder(con)
+ return pandas_sql.read_query(
+ sql, index_col=index_col, params=params, coerce_float=coerce_float,
+ parse_dates=parse_dates, chunksize=chunksize)
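
The params syntax is driver dependent, as the docstring notes; sqlite3 uses the qmark style, so a positional sequence is passed. A self-contained sketch against an in-memory database (table and column names are illustrative):

    import sqlite3

    from pandas.io import sql

    con = sqlite3.connect(":memory:")
    con.execute("CREATE TABLE t (x INTEGER)")
    con.executemany("INSERT INTO t VALUES (?)", [(1,), (2,), (3,)])

    # sqlite3's paramstyle is 'qmark', so parameters are positional.
    df = sql.read_sql_query("SELECT x FROM t WHERE x > ?", con, params=(2,))
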
+
+
+def read_sql(sql, con, index_col=None, coerce_float=True, params=None,
+ parse_dates=None, columns=None, chunksize=None):
+ """
+ Read SQL query or database table into a DataFrame.
+
+ This function is a convenience wrapper around ``read_sql_table`` and
+ ``read_sql_query`` (for backward compatibility). It will delegate
+ to the specific function depending on the provided input. A SQL query
+ will be routed to ``read_sql_query``, while a database table name will
+ be routed to ``read_sql_table``. Note that the delegated function might
+ have more specific notes about their functionality not listed here.
+
+ Parameters
+ ----------
+ sql : string or SQLAlchemy Selectable (select or text object)
+ SQL query to be executed or a table name.
+ con : SQLAlchemy connectable (engine/connection) or database string URI
+ or DBAPI2 connection (fallback mode)
+
+ Using SQLAlchemy makes it possible to use any DB supported by that
+ library. If a DBAPI2 object, only sqlite3 is supported.
+ index_col : string or list of strings, optional, default: None
+ Column(s) to set as index(MultiIndex).
+ coerce_float : boolean, default True
+ Attempts to convert values of non-string, non-numeric objects (like
+ decimal.Decimal) to floating point, useful for SQL result sets.
+ params : list, tuple or dict, optional, default: None
+ List of parameters to pass to execute method. The syntax used
+ to pass parameters is database driver dependent. Check your
+ database driver documentation for which of the five syntax styles,
+ described in PEP 249's paramstyle, is supported.
+ Eg. for psycopg2, uses %(name)s so use params={'name' : 'value'}
+ parse_dates : list or dict, default: None
+ - List of column names to parse as dates.
+ - Dict of ``{column_name: format string}`` where format string is
+ strftime compatible in case of parsing string times, or is one of
+ (D, s, ns, ms, us) in case of parsing integer timestamps.
+ - Dict of ``{column_name: arg dict}``, where the arg dict corresponds
+ to the keyword arguments of :func:`pandas.to_datetime`
+ Especially useful with databases without native Datetime support,
+ such as SQLite.
+ columns : list, default: None
+ List of column names to select from SQL table (only used when reading
+ a table).
+ chunksize : int, default None
+ If specified, return an iterator where `chunksize` is the
+ number of rows to include in each chunk.
+
+ Returns
+ -------
+ DataFrame
+
+ See Also
+ --------
+ read_sql_table : Read SQL database table into a DataFrame.
+ read_sql_query : Read SQL query into a DataFrame.
+ """
+ pandas_sql = pandasSQL_builder(con)
+
+ if isinstance(pandas_sql, SQLiteDatabase):
+ return pandas_sql.read_query(
+ sql, index_col=index_col, params=params,
+ coerce_float=coerce_float, parse_dates=parse_dates,
+ chunksize=chunksize)
+
+ try:
+ _is_table_name = pandas_sql.has_table(sql)
+ except Exception:
+ # using generic exception to catch errors from sql drivers (GH24988)
+ _is_table_name = False
+
+ if _is_table_name:
+ pandas_sql.meta.reflect(only=[sql])
+ return pandas_sql.read_table(
+ sql, index_col=index_col, coerce_float=coerce_float,
+ parse_dates=parse_dates, columns=columns, chunksize=chunksize)
+ else:
+ return pandas_sql.read_query(
+ sql, index_col=index_col, params=params,
+ coerce_float=coerce_float, parse_dates=parse_dates,
+ chunksize=chunksize)
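
As the dispatch above shows, a plain DBAPI2 connection always takes the SQLiteDatabase query path, while table-name dispatch needs a SQLAlchemy connectable. A hedged sketch of the fallback path together with parse_dates (names are illustrative):

    import sqlite3

    from pandas.io import sql

    con = sqlite3.connect(":memory:")
    con.execute("CREATE TABLE events (id INTEGER, happened_at TEXT)")
    con.execute("INSERT INTO events VALUES (1, '2019-01-02 03:04:05')")
    con.commit()

    # With a DBAPI2 connection this goes through SQLiteDatabase.read_query;
    # parse_dates sends the named column through _handle_date_column/to_datetime.
    df = sql.read_sql("SELECT * FROM events", con, parse_dates=["happened_at"])
    print(df.dtypes)          # happened_at becomes datetime64[ns]
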
+
+
+def to_sql(frame, name, con, schema=None, if_exists='fail', index=True,
+ index_label=None, chunksize=None, dtype=None, method=None):
+ """
+ Write records stored in a DataFrame to a SQL database.
+
+ Parameters
+ ----------
+ frame : DataFrame, Series
+ name : string
+ Name of SQL table.
+ con : SQLAlchemy connectable(engine/connection) or database string URI
+ or sqlite3 DBAPI2 connection
+ Using SQLAlchemy makes it possible to use any DB supported by that
+ library.
+ If a DBAPI2 object, only sqlite3 is supported.
+ schema : string, default None
+ Name of SQL schema in database to write to (if database flavor
+ supports this). If None, use default schema (default).
+ if_exists : {'fail', 'replace', 'append'}, default 'fail'
+ - fail: If table exists, do nothing.
+ - replace: If table exists, drop it, recreate it, and insert data.
+ - append: If table exists, insert data. Create if does not exist.
+ index : boolean, default True
+ Write DataFrame index as a column.
+ index_label : string or sequence, default None
+ Column label for index column(s). If None is given (default) and
+ `index` is True, then the index names are used.
+ A sequence should be given if the DataFrame uses MultiIndex.
+ chunksize : int, default None
+ If not None, then rows will be written in batches of this size at a
+ time. If None, all rows will be written at once.
+ dtype : single SQLtype or dict of column name to SQL type, default None
+ Optionally specify the datatype for columns. The SQL type should
+ be a SQLAlchemy type, or a string for sqlite3 fallback connection.
+ If all columns are of the same type, one single value can be used.
+ method : {None, 'multi', callable}, default None
+ Controls the SQL insertion clause used:
+
+ - None : Uses standard SQL ``INSERT`` clause (one per row).
+ - 'multi': Pass multiple values in a single ``INSERT`` clause.
+ - callable with signature ``(pd_table, conn, keys, data_iter)``.
+
+ Details and a sample callable implementation can be found in the
+ section :ref:`insert method <io.sql.method>`.
+
+ .. versionadded:: 0.24.0
+ """
+ if if_exists not in ('fail', 'replace', 'append'):
+ raise ValueError("'{0}' is not valid for if_exists".format(if_exists))
+
+ pandas_sql = pandasSQL_builder(con, schema=schema)
+
+ if isinstance(frame, Series):
+ frame = frame.to_frame()
+ elif not isinstance(frame, DataFrame):
+ raise NotImplementedError("'frame' argument should be either a "
+ "Series or a DataFrame")
+
+ pandas_sql.to_sql(frame, name, if_exists=if_exists, index=index,
+ index_label=index_label, schema=schema,
+ chunksize=chunksize, dtype=dtype, method=method)
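
The method parameter accepts a callable with the signature (pd_table, conn, keys, data_iter) described above. A minimal sketch of such a callable that counts the rows of each chunk and then delegates to the table's own _execute_insert; the function name and table name are made up, and leaning on the private helper is only for illustration:

    import sqlite3

    import pandas as pd
    from pandas.io import sql

    def insert_with_count(pd_table, conn, keys, data_iter):
        # Materialise the chunk, report its size, then reuse the default insert.
        rows = list(data_iter)
        print("inserting {} rows into {}".format(len(rows), pd_table.name))
        pd_table._execute_insert(conn, keys, rows)

    con = sqlite3.connect(":memory:")
    frame = pd.DataFrame({"a": [1, 2, 3], "b": ["x", "y", "z"]})
    sql.to_sql(frame, "demo", con, index=False, method=insert_with_count)
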
+
+
+def has_table(table_name, con, schema=None):
+ """
+ Check if DataBase has named table.
+
+ Parameters
+ ----------
+ table_name : string
+ Name of SQL table.
+ con : SQLAlchemy connectable(engine/connection) or sqlite3 DBAPI2 connection
+ Using SQLAlchemy makes it possible to use any DB supported by that
+ library.
+ If a DBAPI2 object, only sqlite3 is supported.
+ schema : string, default None
+ Name of SQL schema in database to write to (if database flavor supports
+ this). If None, use default schema (default).
+
+ Returns
+ -------
+ boolean
+ """
+ pandas_sql = pandasSQL_builder(con, schema=schema)
+ return pandas_sql.has_table(table_name)
+
+
+table_exists = has_table
+
+
+def _engine_builder(con):
+ """
+ Returns a SQLAlchemy engine from a URI (if con is a string)
+ else it just returns con without modifying it.
+ """
+ global _SQLALCHEMY_INSTALLED
+ if isinstance(con, string_types):
+ try:
+ import sqlalchemy
+ except ImportError:
+ _SQLALCHEMY_INSTALLED = False
+ else:
+ con = sqlalchemy.create_engine(con)
+ return con
+
+ return con
+
+
+def pandasSQL_builder(con, schema=None, meta=None,
+ is_cursor=False):
+ """
+ Convenience function to return the correct PandasSQL subclass based on the
+ provided parameters.
+ """
+ # When support for DBAPI connections is removed,
+ # is_cursor should not be necessary.
+ con = _engine_builder(con)
+ if _is_sqlalchemy_connectable(con):
+ return SQLDatabase(con, schema=schema, meta=meta)
+ elif isinstance(con, string_types):
+ raise ImportError("Using URI string without sqlalchemy installed.")
+ else:
+ return SQLiteDatabase(con, is_cursor=is_cursor)
+
+
+class SQLTable(PandasObject):
+ """
+ For mapping Pandas tables to SQL tables.
+ Uses the fact that the table is reflected by SQLAlchemy to
+ do better type conversions.
+ Also holds various flags needed to avoid having to
+ pass them between functions all the time.
+ """
+ # TODO: support for multiIndex
+
+ def __init__(self, name, pandas_sql_engine, frame=None, index=True,
+ if_exists='fail', prefix='pandas', index_label=None,
+ schema=None, keys=None, dtype=None):
+ self.name = name
+ self.pd_sql = pandas_sql_engine
+ self.prefix = prefix
+ self.frame = frame
+ self.index = self._index_name(index, index_label)
+ self.schema = schema
+ self.if_exists = if_exists
+ self.keys = keys
+ self.dtype = dtype
+
+ if frame is not None:
+ # We want to initialize based on a dataframe
+ self.table = self._create_table_setup()
+ else:
+ # no data provided, read-only mode
+ self.table = self.pd_sql.get_table(self.name, self.schema)
+
+ if self.table is None:
+ raise ValueError(
+ "Could not init table '{name}'".format(name=name))
+
+ def exists(self):
+ return self.pd_sql.has_table(self.name, self.schema)
+
+ def sql_schema(self):
+ from sqlalchemy.schema import CreateTable
+ return str(CreateTable(self.table).compile(self.pd_sql.connectable))
+
+ def _execute_create(self):
+ # Inserting table into database, add to MetaData object
+ self.table = self.table.tometadata(self.pd_sql.meta)
+ self.table.create()
+
+ def create(self):
+ if self.exists():
+ if self.if_exists == 'fail':
+ raise ValueError(
+ "Table '{name}' already exists.".format(name=self.name))
+ elif self.if_exists == 'replace':
+ self.pd_sql.drop_table(self.name, self.schema)
+ self._execute_create()
+ elif self.if_exists == 'append':
+ pass
+ else:
+ raise ValueError(
+ "'{0}' is not valid for if_exists".format(self.if_exists))
+ else:
+ self._execute_create()
+
+ def _execute_insert(self, conn, keys, data_iter):
+ """Execute SQL statement inserting data
+
+ Parameters
+ ----------
+ conn : sqlalchemy.engine.Engine or sqlalchemy.engine.Connection
+ keys : list of str
+ Column names
+ data_iter : generator of list
+ Each item contains a list of values to be inserted
+ """
+ data = [dict(zip(keys, row)) for row in data_iter]
+ conn.execute(self.table.insert(), data)
+
+ def _execute_insert_multi(self, conn, keys, data_iter):
+ """Alternative to _execute_insert for DBs support multivalue INSERT.
+
+ Note: multi-value insert is usually faster for analytics DBs
+ and tables containing a few columns,
+ but performance degrades quickly as the number of columns grows.
+ """
+ data = [dict(zip(keys, row)) for row in data_iter]
+ conn.execute(self.table.insert(data))
+
+ def insert_data(self):
+ if self.index is not None:
+ temp = self.frame.copy()
+ temp.index.names = self.index
+ try:
+ temp.reset_index(inplace=True)
+ except ValueError as err:
+ raise ValueError(
+ "duplicate name in index/columns: {0}".format(err))
+ else:
+ temp = self.frame
+
+ column_names = list(map(text_type, temp.columns))
+ ncols = len(column_names)
+ data_list = [None] * ncols
+ blocks = temp._data.blocks
+
+ for b in blocks:
+ if b.is_datetime:
+ # return datetime.datetime objects
+ if b.is_datetimetz:
+ # GH 9086: Ensure we return datetimes with timezone info
+ # Need to return 2-D data; DatetimeIndex is 1D
+ d = b.values.to_pydatetime()
+ d = np.expand_dims(d, axis=0)
+ else:
+ # convert to microsecond resolution for datetime.datetime
+ d = b.values.astype('M8[us]').astype(object)
+ else:
+ d = np.array(b.get_values(), dtype=object)
+
+ # replace NaN with None
+ if b._can_hold_na:
+ mask = isna(d)
+ d[mask] = None
+
+ for col_loc, col in zip(b.mgr_locs, d):
+ data_list[col_loc] = col
+
+ return column_names, data_list
+
+ def insert(self, chunksize=None, method=None):
+
+ # set insert method
+ if method is None:
+ exec_insert = self._execute_insert
+ elif method == 'multi':
+ exec_insert = self._execute_insert_multi
+ elif callable(method):
+ exec_insert = partial(method, self)
+ else:
+ raise ValueError('Invalid parameter `method`: {}'.format(method))
+
+ keys, data_list = self.insert_data()
+
+ nrows = len(self.frame)
+
+ if nrows == 0:
+ return
+
+ if chunksize is None:
+ chunksize = nrows
+ elif chunksize == 0:
+ raise ValueError('chunksize argument should be non-zero')
+
+ chunks = int(nrows / chunksize) + 1
+
+ with self.pd_sql.run_transaction() as conn:
+ for i in range(chunks):
+ start_i = i * chunksize
+ end_i = min((i + 1) * chunksize, nrows)
+ if start_i >= end_i:
+ break
+
+ chunk_iter = zip(*[arr[start_i:end_i] for arr in data_list])
+ exec_insert(conn, keys, chunk_iter)
+
+ def _query_iterator(self, result, chunksize, columns, coerce_float=True,
+ parse_dates=None):
+ """Return generator through chunked result set."""
+
+ while True:
+ data = result.fetchmany(chunksize)
+ if not data:
+ break
+ else:
+ self.frame = DataFrame.from_records(
+ data, columns=columns, coerce_float=coerce_float)
+
+ self._harmonize_columns(parse_dates=parse_dates)
+
+ if self.index is not None:
+ self.frame.set_index(self.index, inplace=True)
+
+ yield self.frame
+
+ def read(self, coerce_float=True, parse_dates=None, columns=None,
+ chunksize=None):
+
+ if columns is not None and len(columns) > 0:
+ from sqlalchemy import select
+ cols = [self.table.c[n] for n in columns]
+ if self.index is not None:
+ [cols.insert(0, self.table.c[idx]) for idx in self.index[::-1]]
+ sql_select = select(cols)
+ else:
+ sql_select = self.table.select()
+
+ result = self.pd_sql.execute(sql_select)
+ column_names = result.keys()
+
+ if chunksize is not None:
+ return self._query_iterator(result, chunksize, column_names,
+ coerce_float=coerce_float,
+ parse_dates=parse_dates)
+ else:
+ data = result.fetchall()
+ self.frame = DataFrame.from_records(
+ data, columns=column_names, coerce_float=coerce_float)
+
+ self._harmonize_columns(parse_dates=parse_dates)
+
+ if self.index is not None:
+ self.frame.set_index(self.index, inplace=True)
+
+ return self.frame
+
+ def _index_name(self, index, index_label):
+ # for writing: index=True to include index in sql table
+ if index is True:
+ nlevels = self.frame.index.nlevels
+ # if index_label is specified, set this as index name(s)
+ if index_label is not None:
+ if not isinstance(index_label, list):
+ index_label = [index_label]
+ if len(index_label) != nlevels:
+ raise ValueError(
+ "Length of 'index_label' should match number of "
+ "levels, which is {0}".format(nlevels))
+ else:
+ return index_label
+ # return the used column labels for the index columns
+ if (nlevels == 1 and 'index' not in self.frame.columns and
+ self.frame.index.name is None):
+ return ['index']
+ else:
+ return [l if l is not None else "level_{0}".format(i)
+ for i, l in enumerate(self.frame.index.names)]
+
+ # for reading: index=(list of) string to specify column to set as index
+ elif isinstance(index, string_types):
+ return [index]
+ elif isinstance(index, list):
+ return index
+ else:
+ return None
+
+ def _get_column_names_and_types(self, dtype_mapper):
+ column_names_and_types = []
+ if self.index is not None:
+ for i, idx_label in enumerate(self.index):
+ idx_type = dtype_mapper(
+ self.frame.index._get_level_values(i))
+ column_names_and_types.append((text_type(idx_label),
+ idx_type, True))
+
+ column_names_and_types += [
+ (text_type(self.frame.columns[i]),
+ dtype_mapper(self.frame.iloc[:, i]),
+ False)
+ for i in range(len(self.frame.columns))
+ ]
+
+ return column_names_and_types
+
+ def _create_table_setup(self):
+ from sqlalchemy import Table, Column, PrimaryKeyConstraint
+
+ column_names_and_types = self._get_column_names_and_types(
+ self._sqlalchemy_type
+ )
+
+ columns = [Column(name, typ, index=is_index)
+ for name, typ, is_index in column_names_and_types]
+
+ if self.keys is not None:
+ if not is_list_like(self.keys):
+ keys = [self.keys]
+ else:
+ keys = self.keys
+ pkc = PrimaryKeyConstraint(*keys, name=self.name + '_pk')
+ columns.append(pkc)
+
+ schema = self.schema or self.pd_sql.meta.schema
+
+ # At this point, attach to new metadata, only attach to self.meta
+ # once table is created.
+ from sqlalchemy.schema import MetaData
+ meta = MetaData(self.pd_sql, schema=schema)
+
+ return Table(self.name, meta, *columns, schema=schema)
+
+ def _harmonize_columns(self, parse_dates=None):
+ """
+ Make the DataFrame's column types align with the SQL table
+ column types.
+ We need to work around limited NA value support: floats are always
+ fine, but ints must be converted to floats if there are null values.
+ Booleans are hard because converting a bool column containing None
+ replaces every None with False, so bools are only converted when there
+ are no NA values.
+ Datetimes should already be converted to np.datetime64 if supported,
+ but here we also force conversion if required.
+ """
+ parse_dates = _process_parse_dates_argument(parse_dates)
+
+ for sql_col in self.table.columns:
+ col_name = sql_col.name
+ try:
+ df_col = self.frame[col_name]
+
+ # Handle date parsing upfront; don't try to convert columns
+ # twice
+ if col_name in parse_dates:
+ try:
+ fmt = parse_dates[col_name]
+ except TypeError:
+ fmt = None
+ self.frame[col_name] = _handle_date_column(
+ df_col, format=fmt)
+ continue
+
+ # the type the dataframe column should have
+ col_type = self._get_dtype(sql_col.type)
+
+ if (col_type is datetime or col_type is date or
+ col_type is DatetimeTZDtype):
+ # Convert tz-aware Datetime SQL columns to UTC
+ utc = col_type is DatetimeTZDtype
+ self.frame[col_name] = _handle_date_column(df_col, utc=utc)
+ elif col_type is float:
+ # floats support NA, can always convert!
+ self.frame[col_name] = df_col.astype(col_type, copy=False)
+
+ elif len(df_col) == df_col.count():
+ # No NA values, can convert ints and bools
+ if col_type is np.dtype('int64') or col_type is bool:
+ self.frame[col_name] = df_col.astype(
+ col_type, copy=False)
+ except KeyError:
+ pass # this column not in results
+
+ def _sqlalchemy_type(self, col):
+
+ dtype = self.dtype or {}
+ if col.name in dtype:
+ return self.dtype[col.name]
+
+ # Infer type of column, while ignoring missing values.
+ # Needed for inserting typed data containing NULLs, GH 8778.
+ col_type = lib.infer_dtype(col, skipna=True)
+
+ from sqlalchemy.types import (BigInteger, Integer, Float,
+ Text, Boolean,
+ DateTime, Date, Time, TIMESTAMP)
+
+ if col_type == 'datetime64' or col_type == 'datetime':
+ # GH 9086: TIMESTAMP is the suggested type if the column contains
+ # timezone information
+ try:
+ if col.dt.tz is not None:
+ return TIMESTAMP(timezone=True)
+ except AttributeError:
+ # The column is actually a DatetimeIndex
+ if col.tz is not None:
+ return TIMESTAMP(timezone=True)
+ return DateTime
+ if col_type == 'timedelta64':
+ warnings.warn("the 'timedelta' type is not supported, and will be "
+ "written as integer values (ns frequency) to the "
+ "database.", UserWarning, stacklevel=8)
+ return BigInteger
+ elif col_type == 'floating':
+ if col.dtype == 'float32':
+ return Float(precision=23)
+ else:
+ return Float(precision=53)
+ elif col_type == 'integer':
+ if col.dtype == 'int32':
+ return Integer
+ else:
+ return BigInteger
+ elif col_type == 'boolean':
+ return Boolean
+ elif col_type == 'date':
+ return Date
+ elif col_type == 'time':
+ return Time
+ elif col_type == 'complex':
+ raise ValueError('Complex datatypes not supported')
+
+ return Text
+
+ def _get_dtype(self, sqltype):
+ from sqlalchemy.types import (Integer, Float, Boolean, DateTime,
+ Date, TIMESTAMP)
+
+ if isinstance(sqltype, Float):
+ return float
+ elif isinstance(sqltype, Integer):
+ # TODO: Refine integer size.
+ return np.dtype('int64')
+ elif isinstance(sqltype, TIMESTAMP):
+ # we have a timezone capable type
+ if not sqltype.timezone:
+ return datetime
+ return DatetimeTZDtype
+ elif isinstance(sqltype, DateTime):
+ # Caution: np.datetime64 is also a subclass of np.number.
+ return datetime
+ elif isinstance(sqltype, Date):
+ return date
+ elif isinstance(sqltype, Boolean):
+ return bool
+ return object
+
+
+class PandasSQL(PandasObject):
+ """
+ Subclasses should define read_sql and to_sql.
+ """
+
+ def read_sql(self, *args, **kwargs):
+ raise ValueError("PandasSQL must be created with an SQLAlchemy "
+ "connectable or sqlite connection")
+
+ def to_sql(self, *args, **kwargs):
+ raise ValueError("PandasSQL must be created with an SQLAlchemy "
+ "connectable or sqlite connection")
+
+
+class SQLDatabase(PandasSQL):
+ """
+ This class enables conversion between DataFrame and SQL databases
+ using SQLAlchemy to handle database abstraction.
+
+ Parameters
+ ----------
+ engine : SQLAlchemy connectable
+ Connectable to connect with the database. Using SQLAlchemy makes it
+ possible to use any DB supported by that library.
+ schema : string, default None
+ Name of SQL schema in database to write to (if database flavor
+ supports this). If None, use default schema (default).
+ meta : SQLAlchemy MetaData object, default None
+ If provided, this MetaData object is used instead of a newly
+ created one. This allows specifying database-flavor-specific
+ arguments in the MetaData object.
+
+ """
+
+ def __init__(self, engine, schema=None, meta=None):
+ self.connectable = engine
+ if not meta:
+ from sqlalchemy.schema import MetaData
+ meta = MetaData(self.connectable, schema=schema)
+
+ self.meta = meta
+
+ @contextmanager
+ def run_transaction(self):
+ with self.connectable.begin() as tx:
+ if hasattr(tx, 'execute'):
+ yield tx
+ else:
+ yield self.connectable
+
+ def execute(self, *args, **kwargs):
+ """Simple passthrough to SQLAlchemy connectable"""
+ return self.connectable.execute(*args, **kwargs)
+
+ def read_table(self, table_name, index_col=None, coerce_float=True,
+ parse_dates=None, columns=None, schema=None,
+ chunksize=None):
+ """Read SQL database table into a DataFrame.
+
+ Parameters
+ ----------
+ table_name : string
+ Name of SQL table in database.
+ index_col : string, optional, default: None
+ Column to set as index.
+ coerce_float : boolean, default True
+ Attempts to convert values of non-string, non-numeric objects
+ (like decimal.Decimal) to floating point. This can result in
+ loss of precision.
+ parse_dates : list or dict, default: None
+ - List of column names to parse as dates.
+ - Dict of ``{column_name: format string}`` where format string is
+ strftime compatible in case of parsing string times, or is one of
+ (D, s, ns, ms, us) in case of parsing integer timestamps.
+ - Dict of ``{column_name: arg}``, where the arg corresponds
+ to the keyword arguments of :func:`pandas.to_datetime`.
+ Especially useful with databases without native Datetime support,
+ such as SQLite.
+ columns : list, default: None
+ List of column names to select from SQL table.
+ schema : string, default None
+ Name of SQL schema in database to query (if database flavor
+ supports this). If specified, this overwrites the default
+ schema of the SQL database object.
+ chunksize : int, default None
+ If specified, return an iterator where `chunksize` is the number
+ of rows to include in each chunk.
+
+ Returns
+ -------
+ DataFrame
+
+ See Also
+ --------
+ pandas.read_sql_table
+ SQLDatabase.read_query
+
+ """
+ table = SQLTable(table_name, self, index=index_col, schema=schema)
+ return table.read(coerce_float=coerce_float,
+ parse_dates=parse_dates, columns=columns,
+ chunksize=chunksize)
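+
+    # A minimal usage sketch (editorial note, not part of the vendored
+    # pandas source). It assumes a SQLAlchemy engine and an existing table
+    # named 'my_table' with an 'id' column; those names are hypothetical.
+    #
+    # >>> from sqlalchemy import create_engine
+    # >>> engine = create_engine('sqlite:///:memory:')
+    # >>> db = SQLDatabase(engine)
+    # >>> frame = db.read_table('my_table', index_col='id')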
+
+ @staticmethod
+ def _query_iterator(result, chunksize, columns, index_col=None,
+ coerce_float=True, parse_dates=None):
+ """Return generator through chunked result set"""
+
+ while True:
+ data = result.fetchmany(chunksize)
+ if not data:
+ break
+ else:
+ yield _wrap_result(data, columns, index_col=index_col,
+ coerce_float=coerce_float,
+ parse_dates=parse_dates)
+
+ def read_query(self, sql, index_col=None, coerce_float=True,
+ parse_dates=None, params=None, chunksize=None):
+ """Read SQL query into a DataFrame.
+
+ Parameters
+ ----------
+ sql : string
+ SQL query to be executed.
+ index_col : string, optional, default: None
+ Column name to use as index for the returned DataFrame object.
+ coerce_float : boolean, default True
+ Attempt to convert values of non-string, non-numeric objects (like
+ decimal.Decimal) to floating point, useful for SQL result sets.
+ params : list, tuple or dict, optional, default: None
+ List of parameters to pass to execute method. The syntax used
+ to pass parameters is database driver dependent. Check your
+ database driver documentation for which of the five syntax styles,
+ described in PEP 249's paramstyle, is supported.
+            E.g. for psycopg2, which uses %(name)s, pass params={'name': 'value'}.
+ parse_dates : list or dict, default: None
+ - List of column names to parse as dates.
+ - Dict of ``{column_name: format string}`` where format string is
+ strftime compatible in case of parsing string times, or is one of
+ (D, s, ns, ms, us) in case of parsing integer timestamps.
+ - Dict of ``{column_name: arg dict}``, where the arg dict
+ corresponds to the keyword arguments of
+            :func:`pandas.to_datetime`. Especially useful with databases
+ without native Datetime support, such as SQLite.
+ chunksize : int, default None
+ If specified, return an iterator where `chunksize` is the number
+ of rows to include in each chunk.
+
+ Returns
+ -------
+ DataFrame
+
+ See Also
+ --------
+ read_sql_table : Read SQL database table into a DataFrame.
+ read_sql
+
+ """
+ args = _convert_params(sql, params)
+
+ result = self.execute(*args)
+ columns = result.keys()
+
+ if chunksize is not None:
+ return self._query_iterator(result, chunksize, columns,
+ index_col=index_col,
+ coerce_float=coerce_float,
+ parse_dates=parse_dates)
+ else:
+ data = result.fetchall()
+ frame = _wrap_result(data, columns, index_col=index_col,
+ coerce_float=coerce_float,
+ parse_dates=parse_dates)
+ return frame
+
+ read_sql = read_query
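+
+    # Illustrative sketch (editorial note, not part of the vendored pandas
+    # source). Parameter placeholders are driver dependent; the qmark style
+    # below assumes a SQLite-backed engine, and ``db`` is the hypothetical
+    # SQLDatabase instance from the sketch above.
+    #
+    # >>> db.read_query("SELECT * FROM my_table WHERE id > ?",
+    # ...               params=(10,))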
+
+ def to_sql(self, frame, name, if_exists='fail', index=True,
+ index_label=None, schema=None, chunksize=None, dtype=None,
+ method=None):
+ """
+ Write records stored in a DataFrame to a SQL database.
+
+ Parameters
+ ----------
+ frame : DataFrame
+ name : string
+ Name of SQL table.
+ if_exists : {'fail', 'replace', 'append'}, default 'fail'
+            - fail: If table exists, raise a ValueError.
+ - replace: If table exists, drop it, recreate it, and insert data.
+ - append: If table exists, insert data. Create if does not exist.
+ index : boolean, default True
+ Write DataFrame index as a column.
+ index_label : string or sequence, default None
+ Column label for index column(s). If None is given (default) and
+ `index` is True, then the index names are used.
+ A sequence should be given if the DataFrame uses MultiIndex.
+ schema : string, default None
+ Name of SQL schema in database to write to (if database flavor
+ supports this). If specified, this overwrites the default
+ schema of the SQLDatabase object.
+ chunksize : int, default None
+ If not None, then rows will be written in batches of this size at a
+ time. If None, all rows will be written at once.
+ dtype : single type or dict of column name to SQL type, default None
+            Optionally specify the datatype for columns. The SQL type should
+ be a SQLAlchemy type. If all columns are of the same type, one
+ single value can be used.
+        method : {None, 'multi', callable}, default None
+ Controls the SQL insertion clause used:
+
+ * None : Uses standard SQL ``INSERT`` clause (one per row).
+ * 'multi': Pass multiple values in a single ``INSERT`` clause.
+ * callable with signature ``(pd_table, conn, keys, data_iter)``.
+
+ Details and a sample callable implementation can be found in the
+ section :ref:`insert method <io.sql.method>`.
+
+ .. versionadded:: 0.24.0
+ """
+ if dtype and not is_dict_like(dtype):
+ dtype = {col_name: dtype for col_name in frame}
+
+ if dtype is not None:
+ from sqlalchemy.types import to_instance, TypeEngine
+ for col, my_type in dtype.items():
+ if not isinstance(to_instance(my_type), TypeEngine):
+ raise ValueError('The type of {column} is not a '
+ 'SQLAlchemy type '.format(column=col))
+
+ table = SQLTable(name, self, frame=frame, index=index,
+ if_exists=if_exists, index_label=index_label,
+ schema=schema, dtype=dtype)
+ table.create()
+ table.insert(chunksize, method=method)
+ if (not name.isdigit() and not name.islower()):
+            # check for potential case sensitivity issues (GH7815)
+ # Only check when name is not a number and name is not lower case
+ engine = self.connectable.engine
+ with self.connectable.connect() as conn:
+ table_names = engine.table_names(
+ schema=schema or self.meta.schema,
+ connection=conn,
+ )
+ if name not in table_names:
+ msg = (
+ "The provided table name '{0}' is not found exactly as "
+ "such in the database after writing the table, possibly "
+ "due to case sensitivity issues. Consider using lower "
+ "case table names."
+ ).format(name)
+ warnings.warn(msg, UserWarning)
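+
+    # Illustrative sketch (editorial note, not part of the vendored pandas
+    # source): write a DataFrame while forcing one column to a specific
+    # SQLAlchemy type. ``db`` and ``df`` are hypothetical.
+    #
+    # >>> from sqlalchemy.types import Text
+    # >>> db.to_sql(df, 'my_table', if_exists='replace',
+    # ...           dtype={'notes': Text})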
+
+ @property
+ def tables(self):
+ return self.meta.tables
+
+ def has_table(self, name, schema=None):
+ return self.connectable.run_callable(
+ self.connectable.dialect.has_table,
+ name,
+ schema or self.meta.schema,
+ )
+
+ def get_table(self, table_name, schema=None):
+ schema = schema or self.meta.schema
+ if schema:
+ tbl = self.meta.tables.get('.'.join([schema, table_name]))
+ else:
+ tbl = self.meta.tables.get(table_name)
+
+ # Avoid casting double-precision floats into decimals
+ from sqlalchemy import Numeric
+ for column in tbl.columns:
+ if isinstance(column.type, Numeric):
+ column.type.asdecimal = False
+
+ return tbl
+
+ def drop_table(self, table_name, schema=None):
+ schema = schema or self.meta.schema
+ if self.has_table(table_name, schema):
+ self.meta.reflect(only=[table_name], schema=schema)
+ self.get_table(table_name, schema).drop()
+ self.meta.clear()
+
+ def _create_sql_schema(self, frame, table_name, keys=None, dtype=None):
+ table = SQLTable(table_name, self, frame=frame, index=False, keys=keys,
+ dtype=dtype)
+ return str(table.sql_schema())
+
+
+# ---- SQL without SQLAlchemy ---
+# sqlite-specific sql strings and handler class
+# dictionary used for readability purposes
+_SQL_TYPES = {
+ 'string': 'TEXT',
+ 'floating': 'REAL',
+ 'integer': 'INTEGER',
+ 'datetime': 'TIMESTAMP',
+ 'date': 'DATE',
+ 'time': 'TIME',
+ 'boolean': 'INTEGER',
+}
+
+
+def _get_unicode_name(name):
+ try:
+ uname = text_type(name).encode("utf-8", "strict").decode("utf-8")
+ except UnicodeError:
+ raise ValueError(
+ "Cannot convert identifier to UTF-8: '{name}'".format(name=name))
+ return uname
+
+
+def _get_valid_sqlite_name(name):
+ # See http://stackoverflow.com/questions/6514274/how-do-you-escape-strings\
+ # -for-sqlite-table-column-names-in-python
+ # Ensure the string can be encoded as UTF-8.
+ # Ensure the string does not include any NUL characters.
+ # Replace all " with "".
+ # Wrap the entire thing in double quotes.
+
+ uname = _get_unicode_name(name)
+ if not len(uname):
+ raise ValueError("Empty table or column name specified")
+
+ nul_index = uname.find("\x00")
+ if nul_index >= 0:
+ raise ValueError('SQLite identifier cannot contain NULs')
+ return '"' + uname.replace('"', '""') + '"'
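+
+# Illustrative behaviour sketch (editorial note, not part of the vendored
+# pandas source):
+#
+# >>> _get_valid_sqlite_name('my "quoted" column')
+# '"my ""quoted"" column"'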
+
+
+_SAFE_NAMES_WARNING = ("The spaces in these column names will not be changed. "
+ "In pandas versions < 0.14, spaces were converted to "
+ "underscores.")
+
+
+class SQLiteTable(SQLTable):
+ """
+ Patch the SQLTable for fallback support.
+    Instead of a table variable just use the CREATE TABLE statement.
+ """
+
+ def __init__(self, *args, **kwargs):
+ # GH 8341
+ # register an adapter callable for datetime.time object
+ import sqlite3
+ # this will transform time(12,34,56,789) into '12:34:56.000789'
+ # (this is what sqlalchemy does)
+ sqlite3.register_adapter(time, lambda _: _.strftime("%H:%M:%S.%f"))
+ super(SQLiteTable, self).__init__(*args, **kwargs)
+
+ def sql_schema(self):
+ return str(";\n".join(self.table))
+
+ def _execute_create(self):
+ with self.pd_sql.run_transaction() as conn:
+ for stmt in self.table:
+ conn.execute(stmt)
+
+ def insert_statement(self):
+ names = list(map(text_type, self.frame.columns))
+ wld = '?' # wildcard char
+ escape = _get_valid_sqlite_name
+
+ if self.index is not None:
+ [names.insert(0, idx) for idx in self.index[::-1]]
+
+ bracketed_names = [escape(column) for column in names]
+ col_names = ','.join(bracketed_names)
+ wildcards = ','.join([wld] * len(names))
+ insert_statement = \
+ u'INSERT INTO {table} ({columns}) VALUES ({wld})'.format(
+ table=escape(self.name), columns=col_names, wld=wildcards)
+ return insert_statement
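+
+    # Shape of the generated statement (editorial sketch): for a table
+    # named 'tbl' with columns 'a' and 'b' and a single index level named
+    # 'index', insert_statement() returns
+    #
+    #   INSERT INTO "tbl" ("index","a","b") VALUES (?,?,?)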
+
+ def _execute_insert(self, conn, keys, data_iter):
+ data_list = list(data_iter)
+ conn.executemany(self.insert_statement(), data_list)
+
+ def _create_table_setup(self):
+ """
+ Return a list of SQL statements that creates a table reflecting the
+ structure of a DataFrame. The first entry will be a CREATE TABLE
+ statement while the rest will be CREATE INDEX statements.
+ """
+ column_names_and_types = self._get_column_names_and_types(
+ self._sql_type_name
+ )
+
+ pat = re.compile(r'\s+')
+ column_names = [col_name for col_name, _, _ in column_names_and_types]
+ if any(map(pat.search, column_names)):
+ warnings.warn(_SAFE_NAMES_WARNING, stacklevel=6)
+
+ escape = _get_valid_sqlite_name
+
+ create_tbl_stmts = [escape(cname) + ' ' + ctype
+ for cname, ctype, _ in column_names_and_types]
+
+ if self.keys is not None and len(self.keys):
+ if not is_list_like(self.keys):
+ keys = [self.keys]
+ else:
+ keys = self.keys
+ cnames_br = ", ".join(escape(c) for c in keys)
+ create_tbl_stmts.append(
+ "CONSTRAINT {tbl}_pk PRIMARY KEY ({cnames_br})".format(
+ tbl=self.name, cnames_br=cnames_br))
+
+ create_stmts = ["CREATE TABLE " + escape(self.name) + " (\n" +
+ ',\n '.join(create_tbl_stmts) + "\n)"]
+
+ ix_cols = [cname for cname, _, is_index in column_names_and_types
+ if is_index]
+ if len(ix_cols):
+ cnames = "_".join(ix_cols)
+ cnames_br = ",".join(escape(c) for c in ix_cols)
+ create_stmts.append(
+ "CREATE INDEX " + escape("ix_" + self.name + "_" + cnames) +
+ "ON " + escape(self.name) + " (" + cnames_br + ")")
+
+ return create_stmts
+
+ def _sql_type_name(self, col):
+ dtype = self.dtype or {}
+ if col.name in dtype:
+ return dtype[col.name]
+
+ # Infer type of column, while ignoring missing values.
+ # Needed for inserting typed data containing NULLs, GH 8778.
+ col_type = lib.infer_dtype(col, skipna=True)
+
+ if col_type == 'timedelta64':
+ warnings.warn("the 'timedelta' type is not supported, and will be "
+ "written as integer values (ns frequency) to the "
+ "database.", UserWarning, stacklevel=8)
+ col_type = "integer"
+
+ elif col_type == "datetime64":
+ col_type = "datetime"
+
+ elif col_type == "empty":
+ col_type = "string"
+
+ elif col_type == "complex":
+ raise ValueError('Complex datatypes not supported')
+
+ if col_type not in _SQL_TYPES:
+ col_type = "string"
+
+ return _SQL_TYPES[col_type]
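+
+    # Mapping sketch (editorial note): with no explicit ``dtype`` override,
+    # a float64 column maps to 'REAL', an int64 column to 'INTEGER', a
+    # datetime64 column to 'TIMESTAMP', and anything unrecognised falls
+    # back to 'TEXT'.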
+
+
+class SQLiteDatabase(PandasSQL):
+ """
+ Version of SQLDatabase to support SQLite connections (fallback without
+ SQLAlchemy). This should only be used internally.
+
+ Parameters
+ ----------
+ con : sqlite connection object
+
+ """
+
+ def __init__(self, con, is_cursor=False):
+ self.is_cursor = is_cursor
+ self.con = con
+
+ @contextmanager
+ def run_transaction(self):
+ cur = self.con.cursor()
+ try:
+ yield cur
+ self.con.commit()
+ except Exception:
+ self.con.rollback()
+ raise
+ finally:
+ cur.close()
+
+ def execute(self, *args, **kwargs):
+ if self.is_cursor:
+ cur = self.con
+ else:
+ cur = self.con.cursor()
+ try:
+ if kwargs:
+ cur.execute(*args, **kwargs)
+ else:
+ cur.execute(*args)
+ return cur
+ except Exception as exc:
+ try:
+ self.con.rollback()
+ except Exception: # pragma: no cover
+ ex = DatabaseError(
+ "Execution failed on sql: {sql}\n{exc}\nunable "
+ "to rollback".format(sql=args[0], exc=exc))
+ raise_with_traceback(ex)
+
+ ex = DatabaseError(
+ "Execution failed on sql '{sql}': {exc}".format(
+ sql=args[0], exc=exc))
+ raise_with_traceback(ex)
+
+ @staticmethod
+ def _query_iterator(cursor, chunksize, columns, index_col=None,
+ coerce_float=True, parse_dates=None):
+ """Return generator through chunked result set"""
+
+ while True:
+ data = cursor.fetchmany(chunksize)
+ if type(data) == tuple:
+ data = list(data)
+ if not data:
+ cursor.close()
+ break
+ else:
+ yield _wrap_result(data, columns, index_col=index_col,
+ coerce_float=coerce_float,
+ parse_dates=parse_dates)
+
+ def read_query(self, sql, index_col=None, coerce_float=True, params=None,
+ parse_dates=None, chunksize=None):
+
+ args = _convert_params(sql, params)
+ cursor = self.execute(*args)
+ columns = [col_desc[0] for col_desc in cursor.description]
+
+ if chunksize is not None:
+ return self._query_iterator(cursor, chunksize, columns,
+ index_col=index_col,
+ coerce_float=coerce_float,
+ parse_dates=parse_dates)
+ else:
+ data = self._fetchall_as_list(cursor)
+ cursor.close()
+
+ frame = _wrap_result(data, columns, index_col=index_col,
+ coerce_float=coerce_float,
+ parse_dates=parse_dates)
+ return frame
+
+ def _fetchall_as_list(self, cur):
+ result = cur.fetchall()
+ if not isinstance(result, list):
+ result = list(result)
+ return result
+
+ def to_sql(self, frame, name, if_exists='fail', index=True,
+ index_label=None, schema=None, chunksize=None, dtype=None,
+ method=None):
+ """
+ Write records stored in a DataFrame to a SQL database.
+
+ Parameters
+ ----------
+        frame : DataFrame
+        name : string
+            Name of SQL table.
+        if_exists : {'fail', 'replace', 'append'}, default 'fail'
+            fail: If table exists, raise a ValueError.
+            replace: If table exists, drop it, recreate it, and insert data.
+            append: If table exists, insert data. Create if it does not exist.
+ index : boolean, default True
+ Write DataFrame index as a column
+ index_label : string or sequence, default None
+ Column label for index column(s). If None is given (default) and
+ `index` is True, then the index names are used.
+ A sequence should be given if the DataFrame uses MultiIndex.
+ schema : string, default None
+ Ignored parameter included for compatibility with SQLAlchemy
+ version of ``to_sql``.
+ chunksize : int, default None
+ If not None, then rows will be written in batches of this
+ size at a time. If None, all rows will be written at once.
+ dtype : single type or dict of column name to SQL type, default None
+            Optionally specify the datatype for columns. The SQL type should
+ be a string. If all columns are of the same type, one single value
+ can be used.
+ method : {None, 'multi', callable}, default None
+ Controls the SQL insertion clause used:
+
+ * None : Uses standard SQL ``INSERT`` clause (one per row).
+ * 'multi': Pass multiple values in a single ``INSERT`` clause.
+ * callable with signature ``(pd_table, conn, keys, data_iter)``.
+
+ Details and a sample callable implementation can be found in the
+ section :ref:`insert method <io.sql.method>`.
+
+ .. versionadded:: 0.24.0
+ """
+ if dtype and not is_dict_like(dtype):
+ dtype = {col_name: dtype for col_name in frame}
+
+ if dtype is not None:
+ for col, my_type in dtype.items():
+ if not isinstance(my_type, str):
+ raise ValueError('{column} ({type!s}) not a string'.format(
+ column=col, type=my_type))
+
+ table = SQLiteTable(name, self, frame=frame, index=index,
+ if_exists=if_exists, index_label=index_label,
+ dtype=dtype)
+ table.create()
+ table.insert(chunksize, method)
+
+ def has_table(self, name, schema=None):
+ # TODO(wesm): unused?
+ # escape = _get_valid_sqlite_name
+ # esc_name = escape(name)
+
+ wld = '?'
+ query = ("SELECT name FROM sqlite_master "
+ "WHERE type='table' AND name={wld};").format(wld=wld)
+
+ return len(self.execute(query, [name, ]).fetchall()) > 0
+
+ def get_table(self, table_name, schema=None):
+ return None # not supported in fallback mode
+
+ def drop_table(self, name, schema=None):
+ drop_sql = "DROP TABLE {name}".format(
+ name=_get_valid_sqlite_name(name))
+ self.execute(drop_sql)
+
+ def _create_sql_schema(self, frame, table_name, keys=None, dtype=None):
+ table = SQLiteTable(table_name, self, frame=frame, index=False,
+ keys=keys, dtype=dtype)
+ return str(table.sql_schema())
+
+
+def get_schema(frame, name, keys=None, con=None, dtype=None):
+ """
+ Get the SQL db table schema for the given frame.
+
+ Parameters
+ ----------
+ frame : DataFrame
+ name : string
+ name of SQL table
+    keys : string or sequence, default: None
+        columns to use as a primary key
+    con : SQLAlchemy connectable or DBAPI2 connection, default: None
+        Using SQLAlchemy makes it possible to use any DB supported by that
+        library. If a DBAPI2 object, only sqlite3 is supported.
+    dtype : dict of column name to SQL type, default None
+        Optionally specify the datatype for columns. The SQL type should
+        be a SQLAlchemy type, or a string for sqlite3 fallback connection.
+
+ """
+
+ pandas_sql = pandasSQL_builder(con=con)
+ return pandas_sql._create_sql_schema(frame, name, keys=keys, dtype=dtype)
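+
+# Illustrative sketch (editorial note, not part of the vendored pandas
+# source): generate the CREATE TABLE statement for a small frame using the
+# sqlite3 fallback.
+#
+# >>> import sqlite3
+# >>> import pandas as pd
+# >>> con = sqlite3.connect(':memory:')
+# >>> print(get_schema(pd.DataFrame({'a': [1]}), 'tbl', con=con))
+# CREATE TABLE "tbl" (
+# "a" INTEGER
+# )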
diff --git a/contrib/python/pandas/py2/pandas/io/stata.py b/contrib/python/pandas/py2/pandas/io/stata.py
new file mode 100644
index 00000000000..1b0660171ec
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/io/stata.py
@@ -0,0 +1,2988 @@
+"""
+Module contains tools for processing Stata files into DataFrames
+
+The StataReader below was originally written by Joe Presbrey as part of PyDTA.
+It has been extended and improved by Skipper Seabold from the Statsmodels
+project, who also developed the StataWriter; it was finally added to pandas in
+a once again improved version.
+
+You can find more information at http://presbrey.mit.edu/PyDTA and
+http://www.statsmodels.org/devel/
+"""
+
+from collections import OrderedDict
+import datetime
+import os
+import struct
+import sys
+import warnings
+
+from dateutil.relativedelta import relativedelta
+import numpy as np
+
+from pandas._libs.lib import infer_dtype
+from pandas._libs.tslibs import NaT, Timestamp
+from pandas._libs.writers import max_len_string_array
+from pandas.compat import (
+ BytesIO, ResourceWarning, lmap, lrange, lzip, range, string_types,
+ text_type, zip)
+from pandas.util._decorators import Appender, deprecate_kwarg
+
+from pandas.core.dtypes.common import (
+ ensure_object, is_categorical_dtype, is_datetime64_dtype)
+
+from pandas import DatetimeIndex, compat, isna, to_datetime, to_timedelta
+from pandas.core.arrays import Categorical
+from pandas.core.base import StringMixin
+from pandas.core.frame import DataFrame
+from pandas.core.series import Series
+
+from pandas.io.common import (
+ BaseIterator, _stringify_path, get_filepath_or_buffer)
+
+_version_error = ("Version of given Stata file is not 104, 105, 108, "
+ "111 (Stata 7SE), 113 (Stata 8/9), 114 (Stata 10/11), "
+ "115 (Stata 12), 117 (Stata 13), or 118 (Stata 14)")
+
+_statafile_processing_params1 = """\
+convert_dates : boolean, defaults to True
+ Convert date variables to DataFrame time values.
+convert_categoricals : boolean, defaults to True
+ Read value labels and convert columns to Categorical/Factor variables."""
+
+_encoding_params = """\
+encoding : string, None or encoding
+ Encoding used to parse the files. None defaults to latin-1."""
+
+_statafile_processing_params2 = """\
+index_col : string, optional, default: None
+ Column to set as index.
+convert_missing : boolean, defaults to False
+ Flag indicating whether to convert missing values to their Stata
+ representations. If False, missing values are replaced with nan.
+ If True, columns containing missing values are returned with
+ object data types and missing values are represented by
+ StataMissingValue objects.
+preserve_dtypes : boolean, defaults to True
+ Preserve Stata datatypes. If False, numeric data are upcast to pandas
+ default types for foreign data (float64 or int64).
+columns : list or None
+ Columns to retain. Columns will be returned in the given order. None
+ returns all columns.
+order_categoricals : boolean, defaults to True
+ Flag indicating whether converted categorical data are ordered."""
+
+_chunksize_params = """\
+chunksize : int, default None
+    Return StataReader object for iteration, returning chunks with the
+    given number of lines.
+
+_iterator_params = """\
+iterator : boolean, default False
+ Return StataReader object."""
+
+_read_stata_doc = """
+Read Stata file into DataFrame.
+
+Parameters
+----------
+filepath_or_buffer : string or file-like object
+    Path to .dta file or object implementing a binary read() function.
+%s
+%s
+%s
+%s
+%s
+
+Returns
+-------
+DataFrame or StataReader
+
+See Also
+--------
+pandas.io.stata.StataReader : Low-level reader for Stata data files.
+pandas.DataFrame.to_stata: Export Stata data files.
+
+Examples
+--------
+Read a Stata dta file:
+
+>>> df = pd.read_stata('filename.dta')
+
+Read a Stata dta file in 10,000 line chunks:
+
+>>> itr = pd.read_stata('filename.dta', chunksize=10000)
+>>> for chunk in itr:
+... do_something(chunk)
+""" % (_statafile_processing_params1, _encoding_params,
+ _statafile_processing_params2, _chunksize_params,
+ _iterator_params)
+
+_data_method_doc = """\
+Reads observations from Stata file, converting them into a dataframe
+
+.. deprecated::
+ This is a legacy method. Use `read` in new code.
+
+Parameters
+----------
+%s
+%s
+
+Returns
+-------
+DataFrame
+""" % (_statafile_processing_params1, _statafile_processing_params2)
+
+_read_method_doc = """\
+Reads observations from Stata file, converting them into a dataframe
+
+Parameters
+----------
+nrows : int
+ Number of lines to read from data file, if None read whole file.
+%s
+%s
+
+Returns
+-------
+DataFrame
+""" % (_statafile_processing_params1, _statafile_processing_params2)
+
+
+_stata_reader_doc = """\
+Class for reading Stata dta files.
+
+Parameters
+----------
+path_or_buf : path (string), buffer or path object
+ string, path object (pathlib.Path or py._path.local.LocalPath) or object
+    implementing a binary read() function.
+
+ .. versionadded:: 0.23.0 support for pathlib, py.path.
+%s
+%s
+%s
+%s
+""" % (_statafile_processing_params1, _statafile_processing_params2,
+ _encoding_params, _chunksize_params)
+
+
+@Appender(_read_stata_doc)
+@deprecate_kwarg(old_arg_name='encoding', new_arg_name=None)
+@deprecate_kwarg(old_arg_name='index', new_arg_name='index_col')
+def read_stata(filepath_or_buffer, convert_dates=True,
+ convert_categoricals=True, encoding=None, index_col=None,
+ convert_missing=False, preserve_dtypes=True, columns=None,
+ order_categoricals=True, chunksize=None, iterator=False):
+
+ reader = StataReader(filepath_or_buffer,
+ convert_dates=convert_dates,
+ convert_categoricals=convert_categoricals,
+ index_col=index_col, convert_missing=convert_missing,
+ preserve_dtypes=preserve_dtypes,
+ columns=columns,
+ order_categoricals=order_categoricals,
+ chunksize=chunksize)
+
+ if iterator or chunksize:
+ data = reader
+ else:
+ try:
+ data = reader.read()
+ finally:
+ reader.close()
+ return data
+
+
+_date_formats = ["%tc", "%tC", "%td", "%d", "%tw", "%tm", "%tq", "%th", "%ty"]
+
+
+stata_epoch = datetime.datetime(1960, 1, 1)
+
+
+def _stata_elapsed_date_to_datetime_vec(dates, fmt):
+ """
+ Convert from SIF to datetime. http://www.stata.com/help.cgi?datetime
+
+ Parameters
+ ----------
+ dates : Series
+ The Stata Internal Format date to convert to datetime according to fmt
+ fmt : str
+        The format to convert to. Can be tc, td, tw, tm, tq, th, ty
+
+ Returns
+ -------
+ converted : Series
+ The converted dates
+
+ Examples
+ --------
+ >>> dates = pd.Series([52])
+    >>> _stata_elapsed_date_to_datetime_vec(dates, "%tw")
+ 0 1961-01-01
+ dtype: datetime64[ns]
+
+ Notes
+ -----
+ datetime/c - tc
+ milliseconds since 01jan1960 00:00:00.000, assuming 86,400 s/day
+ datetime/C - tC - NOT IMPLEMENTED
+ milliseconds since 01jan1960 00:00:00.000, adjusted for leap seconds
+ date - td
+ days since 01jan1960 (01jan1960 = 0)
+ weekly date - tw
+ weeks since 1960w1
+ This assumes 52 weeks in a year, then adds 7 * remainder of the weeks.
+ The datetime value is the start of the week in terms of days in the
+ year, not ISO calendar weeks.
+ monthly date - tm
+ months since 1960m1
+ quarterly date - tq
+ quarters since 1960q1
+ half-yearly date - th
+        half-years since 1960h1
+    yearly date - ty
+ years since 0000
+
+ If you don't have pandas with datetime support, then you can't do
+ milliseconds accurately.
+ """
+ MIN_YEAR, MAX_YEAR = Timestamp.min.year, Timestamp.max.year
+ MAX_DAY_DELTA = (Timestamp.max - datetime.datetime(1960, 1, 1)).days
+ MIN_DAY_DELTA = (Timestamp.min - datetime.datetime(1960, 1, 1)).days
+ MIN_MS_DELTA = MIN_DAY_DELTA * 24 * 3600 * 1000
+ MAX_MS_DELTA = MAX_DAY_DELTA * 24 * 3600 * 1000
+
+ def convert_year_month_safe(year, month):
+ """
+ Convert year and month to datetimes, using pandas vectorized versions
+ when the date range falls within the range supported by pandas.
+ Otherwise it falls back to a slower but more robust method
+ using datetime.
+ """
+ if year.max() < MAX_YEAR and year.min() > MIN_YEAR:
+ return to_datetime(100 * year + month, format='%Y%m')
+ else:
+ index = getattr(year, 'index', None)
+ return Series(
+ [datetime.datetime(y, m, 1) for y, m in zip(year, month)],
+ index=index)
+
+ def convert_year_days_safe(year, days):
+ """
+ Converts year (e.g. 1999) and days since the start of the year to a
+ datetime or datetime64 Series
+ """
+ if year.max() < (MAX_YEAR - 1) and year.min() > MIN_YEAR:
+ return (to_datetime(year, format='%Y') +
+ to_timedelta(days, unit='d'))
+ else:
+ index = getattr(year, 'index', None)
+ value = [datetime.datetime(y, 1, 1) + relativedelta(days=int(d))
+ for y, d in zip(year, days)]
+ return Series(value, index=index)
+
+ def convert_delta_safe(base, deltas, unit):
+ """
+ Convert base dates and deltas to datetimes, using pandas vectorized
+ versions if the deltas satisfy restrictions required to be expressed
+ as dates in pandas.
+ """
+ index = getattr(deltas, 'index', None)
+ if unit == 'd':
+ if deltas.max() > MAX_DAY_DELTA or deltas.min() < MIN_DAY_DELTA:
+ values = [base + relativedelta(days=int(d)) for d in deltas]
+ return Series(values, index=index)
+ elif unit == 'ms':
+ if deltas.max() > MAX_MS_DELTA or deltas.min() < MIN_MS_DELTA:
+ values = [base + relativedelta(microseconds=(int(d) * 1000))
+ for d in deltas]
+ return Series(values, index=index)
+ else:
+ raise ValueError('format not understood')
+ base = to_datetime(base)
+ deltas = to_timedelta(deltas, unit=unit)
+ return base + deltas
+
+ # TODO: If/when pandas supports more than datetime64[ns], this should be
+ # improved to use correct range, e.g. datetime[Y] for yearly
+ bad_locs = np.isnan(dates)
+ has_bad_values = False
+ if bad_locs.any():
+ has_bad_values = True
+ data_col = Series(dates)
+ data_col[bad_locs] = 1.0 # Replace with NaT
+ dates = dates.astype(np.int64)
+
+ if fmt.startswith(("%tc", "tc")): # Delta ms relative to base
+ base = stata_epoch
+ ms = dates
+ conv_dates = convert_delta_safe(base, ms, 'ms')
+ elif fmt.startswith(("%tC", "tC")):
+
+ warnings.warn("Encountered %tC format. Leaving in Stata "
+ "Internal Format.")
+ conv_dates = Series(dates, dtype=np.object)
+ if has_bad_values:
+ conv_dates[bad_locs] = NaT
+ return conv_dates
+ # Delta days relative to base
+ elif fmt.startswith(("%td", "td", "%d", "d")):
+ base = stata_epoch
+ days = dates
+ conv_dates = convert_delta_safe(base, days, 'd')
+ # does not count leap days - 7 days is a week.
+ # 52nd week may have more than 7 days
+ elif fmt.startswith(("%tw", "tw")):
+ year = stata_epoch.year + dates // 52
+ days = (dates % 52) * 7
+ conv_dates = convert_year_days_safe(year, days)
+ elif fmt.startswith(("%tm", "tm")): # Delta months relative to base
+ year = stata_epoch.year + dates // 12
+ month = (dates % 12) + 1
+ conv_dates = convert_year_month_safe(year, month)
+ elif fmt.startswith(("%tq", "tq")): # Delta quarters relative to base
+ year = stata_epoch.year + dates // 4
+ month = (dates % 4) * 3 + 1
+ conv_dates = convert_year_month_safe(year, month)
+ elif fmt.startswith(("%th", "th")): # Delta half-years relative to base
+ year = stata_epoch.year + dates // 2
+ month = (dates % 2) * 6 + 1
+ conv_dates = convert_year_month_safe(year, month)
+ elif fmt.startswith(("%ty", "ty")): # Years -- not delta
+ year = dates
+ month = np.ones_like(dates)
+ conv_dates = convert_year_month_safe(year, month)
+ else:
+ raise ValueError("Date fmt {fmt} not understood".format(fmt=fmt))
+
+ if has_bad_values: # Restore NaT for bad values
+ conv_dates[bad_locs] = NaT
+
+ return conv_dates
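+
+# Illustrative sketch (editorial note, assumes ``import pandas as pd``):
+# 12 elapsed months after 1960m1 is January 1961.
+#
+# >>> _stata_elapsed_date_to_datetime_vec(pd.Series([12]), "%tm")
+# 0   1961-01-01
+# dtype: datetime64[ns]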
+
+
+def _datetime_to_stata_elapsed_vec(dates, fmt):
+ """
+ Convert from datetime to SIF. http://www.stata.com/help.cgi?datetime
+
+ Parameters
+ ----------
+ dates : Series
+ Series or array containing datetime.datetime or datetime64[ns] to
+ convert to the Stata Internal Format given by fmt
+ fmt : str
+        The format to convert to. Can be tc, td, tw, tm, tq, th, ty
+ """
+ index = dates.index
+ NS_PER_DAY = 24 * 3600 * 1000 * 1000 * 1000
+ US_PER_DAY = NS_PER_DAY / 1000
+
+ def parse_dates_safe(dates, delta=False, year=False, days=False):
+ d = {}
+ if is_datetime64_dtype(dates.values):
+ if delta:
+ delta = dates - stata_epoch
+ d['delta'] = delta.values.astype(
+ np.int64) // 1000 # microseconds
+ if days or year:
+ dates = DatetimeIndex(dates)
+ d['year'], d['month'] = dates.year, dates.month
+ if days:
+ days = (dates.astype(np.int64) -
+ to_datetime(d['year'], format='%Y').astype(np.int64))
+ d['days'] = days // NS_PER_DAY
+
+ elif infer_dtype(dates, skipna=False) == 'datetime':
+ if delta:
+ delta = dates.values - stata_epoch
+ f = lambda x: \
+ US_PER_DAY * x.days + 1000000 * x.seconds + x.microseconds
+ v = np.vectorize(f)
+ d['delta'] = v(delta)
+ if year:
+ year_month = dates.apply(lambda x: 100 * x.year + x.month)
+ d['year'] = year_month.values // 100
+ d['month'] = (year_month.values - d['year'] * 100)
+ if days:
+ f = lambda x: (x - datetime.datetime(x.year, 1, 1)).days
+ v = np.vectorize(f)
+ d['days'] = v(dates)
+ else:
+ raise ValueError('Columns containing dates must contain either '
+ 'datetime64, datetime.datetime or null values.')
+
+ return DataFrame(d, index=index)
+
+ bad_loc = isna(dates)
+ index = dates.index
+ if bad_loc.any():
+ dates = Series(dates)
+ if is_datetime64_dtype(dates):
+ dates[bad_loc] = to_datetime(stata_epoch)
+ else:
+ dates[bad_loc] = stata_epoch
+
+ if fmt in ["%tc", "tc"]:
+ d = parse_dates_safe(dates, delta=True)
+ conv_dates = d.delta / 1000
+ elif fmt in ["%tC", "tC"]:
+ warnings.warn("Stata Internal Format tC not supported.")
+ conv_dates = dates
+ elif fmt in ["%td", "td"]:
+ d = parse_dates_safe(dates, delta=True)
+ conv_dates = d.delta // US_PER_DAY
+ elif fmt in ["%tw", "tw"]:
+ d = parse_dates_safe(dates, year=True, days=True)
+ conv_dates = (52 * (d.year - stata_epoch.year) + d.days // 7)
+ elif fmt in ["%tm", "tm"]:
+ d = parse_dates_safe(dates, year=True)
+ conv_dates = (12 * (d.year - stata_epoch.year) + d.month - 1)
+ elif fmt in ["%tq", "tq"]:
+ d = parse_dates_safe(dates, year=True)
+ conv_dates = 4 * (d.year - stata_epoch.year) + (d.month - 1) // 3
+ elif fmt in ["%th", "th"]:
+ d = parse_dates_safe(dates, year=True)
+ conv_dates = (2 * (d.year - stata_epoch.year) +
+ (d.month > 6).astype(np.int))
+ elif fmt in ["%ty", "ty"]:
+ d = parse_dates_safe(dates, year=True)
+ conv_dates = d.year
+ else:
+ raise ValueError(
+ "Format {fmt} is not a known Stata date format".format(fmt=fmt))
+
+ conv_dates = Series(conv_dates, dtype=np.float64)
+ missing_value = struct.unpack('<d', b'\x00\x00\x00\x00\x00\x00\xe0\x7f')[0]
+ conv_dates[bad_loc] = missing_value
+
+ return Series(conv_dates, index=index)
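+
+# Illustrative sketch (editorial note): the reverse of the example above,
+# 1961-01-01 maps back to 12 elapsed months.
+#
+# >>> s = pd.Series(pd.to_datetime(['1961-01-01']))
+# >>> _datetime_to_stata_elapsed_vec(s, "%tm")
+# 0    12.0
+# dtype: float64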
+
+
+excessive_string_length_error = """
+Fixed width strings in Stata .dta files are limited to 244 (or fewer)
+characters. Column '%s' does not satisfy this restriction. Use the
+'version=117' parameter to write the newer (Stata 13 and later) format.
+"""
+
+
+class PossiblePrecisionLoss(Warning):
+ pass
+
+
+precision_loss_doc = """
+Column converted from %s to %s, and some data are outside of the lossless
+conversion range. This may result in a loss of precision in the saved data.
+"""
+
+
+class ValueLabelTypeMismatch(Warning):
+ pass
+
+
+value_label_mismatch_doc = """
+Stata value labels (pandas categories) must be strings. Column {0} contains
+non-string labels which will be converted to strings. Please check that the
+Stata data file created has not lost information due to duplicate labels.
+"""
+
+
+class InvalidColumnName(Warning):
+ pass
+
+
+invalid_name_doc = """
+Not all pandas column names were valid Stata variable names.
+The following replacements have been made:
+
+ {0}
+
+If this is not what you expect, please make sure you have Stata-compliant
+column names in your DataFrame (strings only, max 32 characters, only
+alphanumerics and underscores, no Stata reserved words)
+"""
+
+
+def _cast_to_stata_types(data):
+ """Checks the dtypes of the columns of a pandas DataFrame for
+ compatibility with the data types and ranges supported by Stata, and
+ converts if necessary.
+
+ Parameters
+ ----------
+ data : DataFrame
+ The DataFrame to check and convert
+
+ Notes
+ -----
+ Numeric columns in Stata must be one of int8, int16, int32, float32 or
+ float64, with some additional value restrictions. int8 and int16 columns
+ are checked for violations of the value restrictions and upcast if needed.
+ int64 data is not usable in Stata, and so it is downcast to int32 whenever
+    the values are in the int32 range, and cast to float64 when larger than
+ this range. If the int64 values are outside of the range of those
+ perfectly representable as float64 values, a warning is raised.
+
+ bool columns are cast to int8. uint columns are converted to int of the
+ same size if there is no loss in precision, otherwise are upcast to a
+    larger type. uint64 is currently not supported since it is converted to
+ object in a DataFrame.
+ """
+ ws = ''
+ # original, if small, if large
+ conversion_data = ((np.bool, np.int8, np.int8),
+ (np.uint8, np.int8, np.int16),
+ (np.uint16, np.int16, np.int32),
+ (np.uint32, np.int32, np.int64))
+
+ float32_max = struct.unpack('<f', b'\xff\xff\xff\x7e')[0]
+ float64_max = struct.unpack('<d', b'\xff\xff\xff\xff\xff\xff\xdf\x7f')[0]
+
+ for col in data:
+ dtype = data[col].dtype
+ # Cast from unsupported types to supported types
+ for c_data in conversion_data:
+ if dtype == c_data[0]:
+ if data[col].max() <= np.iinfo(c_data[1]).max:
+ dtype = c_data[1]
+ else:
+ dtype = c_data[2]
+ if c_data[2] == np.float64: # Warn if necessary
+ if data[col].max() >= 2 ** 53:
+ ws = precision_loss_doc % ('uint64', 'float64')
+
+ data[col] = data[col].astype(dtype)
+
+ # Check values and upcast if necessary
+ if dtype == np.int8:
+ if data[col].max() > 100 or data[col].min() < -127:
+ data[col] = data[col].astype(np.int16)
+ elif dtype == np.int16:
+ if data[col].max() > 32740 or data[col].min() < -32767:
+ data[col] = data[col].astype(np.int32)
+ elif dtype == np.int64:
+ if (data[col].max() <= 2147483620 and
+ data[col].min() >= -2147483647):
+ data[col] = data[col].astype(np.int32)
+ else:
+ data[col] = data[col].astype(np.float64)
+ if data[col].max() >= 2 ** 53 or data[col].min() <= -2 ** 53:
+ ws = precision_loss_doc % ('int64', 'float64')
+ elif dtype in (np.float32, np.float64):
+ value = data[col].max()
+ if np.isinf(value):
+ raise ValueError('Column {col} has a maximum value of '
+ 'infinity which is outside the range '
+ 'supported by Stata.'.format(col=col))
+ if dtype == np.float32 and value > float32_max:
+ data[col] = data[col].astype(np.float64)
+ elif dtype == np.float64:
+ if value > float64_max:
+ raise ValueError('Column {col} has a maximum value '
+ '({val}) outside the range supported by '
+ 'Stata ({float64_max})'
+ .format(col=col, val=value,
+ float64_max=float64_max))
+
+ if ws:
+ warnings.warn(ws, PossiblePrecisionLoss)
+
+ return data
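+
+# Illustrative sketch (editorial note, assumes ``import pandas as pd``):
+# bool columns become int8 and in-range int64 columns are downcast to int32.
+#
+# >>> df = pd.DataFrame({'flag': [True, False],
+# ...                    'n': np.array([1, 2], dtype='int64')})
+# >>> _cast_to_stata_types(df).dtypes
+# flag     int8
+# n       int32
+# dtype: object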
+
+
+class StataValueLabel(object):
+ """
+ Parse a categorical column and prepare formatted output
+
+    Parameters
+    ----------
+    catarray : Categorical
+        Categorical Series whose categories are encoded as Stata value
+        labels
+
+ Methods
+ -------
+ generate_value_label
+
+ """
+
+ def __init__(self, catarray):
+
+ self.labname = catarray.name
+
+ categories = catarray.cat.categories
+ self.value_labels = list(zip(np.arange(len(categories)), categories))
+ self.value_labels.sort(key=lambda x: x[0])
+ self.text_len = np.int32(0)
+ self.off = []
+ self.val = []
+ self.txt = []
+ self.n = 0
+
+ # Compute lengths and setup lists of offsets and labels
+ for vl in self.value_labels:
+ category = vl[1]
+ if not isinstance(category, string_types):
+ category = str(category)
+ warnings.warn(value_label_mismatch_doc.format(catarray.name),
+ ValueLabelTypeMismatch)
+
+ self.off.append(self.text_len)
+ self.text_len += len(category) + 1 # +1 for the padding
+ self.val.append(vl[0])
+ self.txt.append(category)
+ self.n += 1
+
+ if self.text_len > 32000:
+ raise ValueError('Stata value labels for a single variable must '
+ 'have a combined length less than 32,000 '
+ 'characters.')
+
+ # Ensure int32
+ self.off = np.array(self.off, dtype=np.int32)
+ self.val = np.array(self.val, dtype=np.int32)
+
+ # Total length
+ self.len = 4 + 4 + 4 * self.n + 4 * self.n + self.text_len
+
+ def _encode(self, s):
+ """
+ Python 3 compatibility shim
+ """
+ if compat.PY3:
+ return s.encode(self._encoding)
+ else:
+ return s
+
+ def generate_value_label(self, byteorder, encoding):
+ """
+ Parameters
+ ----------
+ byteorder : str
+ Byte order of the output
+ encoding : str
+ File encoding
+
+ Returns
+ -------
+ value_label : bytes
+ Bytes containing the formatted value label
+ """
+
+ self._encoding = encoding
+ bio = BytesIO()
+ null_string = '\x00'
+ null_byte = b'\x00'
+
+ # len
+ bio.write(struct.pack(byteorder + 'i', self.len))
+
+ # labname
+ labname = self._encode(_pad_bytes(self.labname[:32], 33))
+ bio.write(labname)
+
+ # padding - 3 bytes
+ for i in range(3):
+ bio.write(struct.pack('c', null_byte))
+
+ # value_label_table
+ # n - int32
+ bio.write(struct.pack(byteorder + 'i', self.n))
+
+ # textlen - int32
+ bio.write(struct.pack(byteorder + 'i', self.text_len))
+
+ # off - int32 array (n elements)
+ for offset in self.off:
+ bio.write(struct.pack(byteorder + 'i', offset))
+
+ # val - int32 array (n elements)
+ for value in self.val:
+ bio.write(struct.pack(byteorder + 'i', value))
+
+ # txt - Text labels, null terminated
+ for text in self.txt:
+ bio.write(self._encode(text + null_string))
+
+ bio.seek(0)
+ return bio.read()
+
+
+class StataMissingValue(StringMixin):
+ """
+ An observation's missing value.
+
+ Parameters
+    ----------
+ value : int8, int16, int32, float32 or float64
+ The Stata missing value code
+
+ Attributes
+ ----------
+ string : string
+ String representation of the Stata missing value
+ value : int8, int16, int32, float32 or float64
+ The original encoded missing value
+
+ Notes
+ -----
+ More information: <http://www.stata.com/help.cgi?missing>
+
+    Integer missing values map the codes '.', '.a', ..., '.z' to the ranges
+ 101 ... 127 (for int8), 32741 ... 32767 (for int16) and 2147483621 ...
+ 2147483647 (for int32). Missing values for floating point data types are
+ more complex but the pattern is simple to discern from the following table.
+
+ np.float32 missing values (float in Stata)
+ 0000007f .
+ 0008007f .a
+ 0010007f .b
+ ...
+ 00c0007f .x
+ 00c8007f .y
+ 00d0007f .z
+
+ np.float64 missing values (double in Stata)
+ 000000000000e07f .
+ 000000000001e07f .a
+ 000000000002e07f .b
+ ...
+ 000000000018e07f .x
+ 000000000019e07f .y
+ 00000000001ae07f .z
+ """
+
+ # Construct a dictionary of missing values
+ MISSING_VALUES = {}
+ bases = (101, 32741, 2147483621)
+ for b in bases:
+ # Conversion to long to avoid hash issues on 32 bit platforms #8968
+ MISSING_VALUES[compat.long(b)] = '.'
+ for i in range(1, 27):
+ MISSING_VALUES[compat.long(i + b)] = '.' + chr(96 + i)
+
+ float32_base = b'\x00\x00\x00\x7f'
+ increment = struct.unpack('<i', b'\x00\x08\x00\x00')[0]
+ for i in range(27):
+ value = struct.unpack('<f', float32_base)[0]
+ MISSING_VALUES[value] = '.'
+ if i > 0:
+ MISSING_VALUES[value] += chr(96 + i)
+ int_value = struct.unpack('<i', struct.pack('<f', value))[
+ 0] + increment
+ float32_base = struct.pack('<i', int_value)
+
+ float64_base = b'\x00\x00\x00\x00\x00\x00\xe0\x7f'
+ increment = struct.unpack('q', b'\x00\x00\x00\x00\x00\x01\x00\x00')[0]
+ for i in range(27):
+ value = struct.unpack('<d', float64_base)[0]
+ MISSING_VALUES[value] = '.'
+ if i > 0:
+ MISSING_VALUES[value] += chr(96 + i)
+ int_value = struct.unpack('q', struct.pack('<d', value))[0] + increment
+ float64_base = struct.pack('q', int_value)
+
+ BASE_MISSING_VALUES = {'int8': 101,
+ 'int16': 32741,
+ 'int32': 2147483621,
+ 'float32': struct.unpack('<f', float32_base)[0],
+ 'float64': struct.unpack('<d', float64_base)[0]}
+
+ def __init__(self, value):
+ self._value = value
+ # Conversion to long to avoid hash issues on 32 bit platforms #8968
+ value = compat.long(value) if value < 2147483648 else float(value)
+ self._str = self.MISSING_VALUES[value]
+
+ string = property(lambda self: self._str,
+ doc="The Stata representation of the missing value: "
+ "'.', '.a'..'.z'")
+ value = property(lambda self: self._value,
+ doc='The binary representation of the missing value.')
+
+ def __unicode__(self):
+ return self.string
+
+ def __repr__(self):
+ # not perfect :-/
+ return "{cls}({obj})".format(cls=self.__class__, obj=self)
+
+ def __eq__(self, other):
+ return (isinstance(other, self.__class__) and
+ self.string == other.string and self.value == other.value)
+
+ @classmethod
+ def get_base_missing_value(cls, dtype):
+ if dtype == np.int8:
+ value = cls.BASE_MISSING_VALUES['int8']
+ elif dtype == np.int16:
+ value = cls.BASE_MISSING_VALUES['int16']
+ elif dtype == np.int32:
+ value = cls.BASE_MISSING_VALUES['int32']
+ elif dtype == np.float32:
+ value = cls.BASE_MISSING_VALUES['float32']
+ elif dtype == np.float64:
+ value = cls.BASE_MISSING_VALUES['float64']
+ else:
+ raise ValueError('Unsupported dtype')
+ return value
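+
+    # Illustrative sketch (editorial note): integer code 101 is the generic
+    # int8 missing value '.', 102 is '.a', and so on.
+    #
+    # >>> StataMissingValue(101).string
+    # '.'
+    # >>> StataMissingValue(102).string
+    # '.a'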
+
+
+class StataParser(object):
+
+ def __init__(self):
+
+ # type code.
+ # --------------------
+ # str1 1 = 0x01
+ # str2 2 = 0x02
+ # ...
+ # str244 244 = 0xf4
+ # byte 251 = 0xfb (sic)
+ # int 252 = 0xfc
+ # long 253 = 0xfd
+ # float 254 = 0xfe
+ # double 255 = 0xff
+ # --------------------
+ # NOTE: the byte type seems to be reserved for categorical variables
+ # with a label, but the underlying variable is -127 to 100
+ # we're going to drop the label and cast to int
+ self.DTYPE_MAP = \
+ dict(
+ lzip(range(1, 245), ['a' + str(i) for i in range(1, 245)]) +
+ [
+ (251, np.int8),
+ (252, np.int16),
+ (253, np.int32),
+ (254, np.float32),
+ (255, np.float64)
+ ]
+ )
+ self.DTYPE_MAP_XML = \
+ dict(
+ [
+ (32768, np.uint8), # Keys to GSO
+ (65526, np.float64),
+ (65527, np.float32),
+ (65528, np.int32),
+ (65529, np.int16),
+ (65530, np.int8)
+ ]
+ )
+ self.TYPE_MAP = lrange(251) + list('bhlfd')
+ self.TYPE_MAP_XML = \
+ dict(
+ [
+ # Not really a Q, unclear how to handle byteswap
+ (32768, 'Q'),
+
+ (65526, 'd'),
+ (65527, 'f'),
+ (65528, 'l'),
+ (65529, 'h'),
+ (65530, 'b')
+ ]
+ )
+ # NOTE: technically, some of these are wrong. there are more numbers
+ # that can be represented. it's the 27 ABOVE and BELOW the max listed
+ # numeric data type in [U] 12.2.2 of the 11.2 manual
+ float32_min = b'\xff\xff\xff\xfe'
+ float32_max = b'\xff\xff\xff\x7e'
+ float64_min = b'\xff\xff\xff\xff\xff\xff\xef\xff'
+ float64_max = b'\xff\xff\xff\xff\xff\xff\xdf\x7f'
+ self.VALID_RANGE = {
+ 'b': (-127, 100),
+ 'h': (-32767, 32740),
+ 'l': (-2147483647, 2147483620),
+ 'f': (np.float32(struct.unpack('<f', float32_min)[0]),
+ np.float32(struct.unpack('<f', float32_max)[0])),
+ 'd': (np.float64(struct.unpack('<d', float64_min)[0]),
+ np.float64(struct.unpack('<d', float64_max)[0]))
+ }
+
+ self.OLD_TYPE_MAPPING = {
+ 98: 251, # byte
+ 105: 252, # int
+ 108: 253, # long
+ 102: 254 # float
+ # don't know old code for double
+ }
+
+ # These missing values are the generic '.' in Stata, and are used
+ # to replace nans
+ self.MISSING_VALUES = {
+ 'b': 101,
+ 'h': 32741,
+ 'l': 2147483621,
+ 'f': np.float32(struct.unpack('<f', b'\x00\x00\x00\x7f')[0]),
+ 'd': np.float64(
+ struct.unpack('<d', b'\x00\x00\x00\x00\x00\x00\xe0\x7f')[0])
+ }
+ self.NUMPY_TYPE_MAP = {
+ 'b': 'i1',
+ 'h': 'i2',
+ 'l': 'i4',
+ 'f': 'f4',
+ 'd': 'f8',
+ 'Q': 'u8'
+ }
+
+ # Reserved words cannot be used as variable names
+ self.RESERVED_WORDS = ('aggregate', 'array', 'boolean', 'break',
+ 'byte', 'case', 'catch', 'class', 'colvector',
+ 'complex', 'const', 'continue', 'default',
+ 'delegate', 'delete', 'do', 'double', 'else',
+ 'eltypedef', 'end', 'enum', 'explicit',
+ 'export', 'external', 'float', 'for', 'friend',
+ 'function', 'global', 'goto', 'if', 'inline',
+ 'int', 'local', 'long', 'NULL', 'pragma',
+ 'protected', 'quad', 'rowvector', 'short',
+ 'typedef', 'typename', 'virtual')
+
+
+class StataReader(StataParser, BaseIterator):
+ __doc__ = _stata_reader_doc
+
+ @deprecate_kwarg(old_arg_name='encoding', new_arg_name=None)
+ @deprecate_kwarg(old_arg_name='index', new_arg_name='index_col')
+ def __init__(self, path_or_buf, convert_dates=True,
+ convert_categoricals=True, index_col=None,
+ convert_missing=False, preserve_dtypes=True,
+ columns=None, order_categoricals=True,
+ encoding=None, chunksize=None):
+ super(StataReader, self).__init__()
+ self.col_sizes = ()
+
+ # Arguments to the reader (can be temporarily overridden in
+ # calls to read).
+ self._convert_dates = convert_dates
+ self._convert_categoricals = convert_categoricals
+ self._index_col = index_col
+ self._convert_missing = convert_missing
+ self._preserve_dtypes = preserve_dtypes
+ self._columns = columns
+ self._order_categoricals = order_categoricals
+ self._encoding = None
+ self._chunksize = chunksize
+
+ # State variables for the file
+ self._has_string_data = False
+ self._missing_values = False
+ self._can_read_value_labels = False
+ self._column_selector_set = False
+ self._value_labels_read = False
+ self._data_read = False
+ self._dtype = None
+ self._lines_read = 0
+
+ self._native_byteorder = _set_endianness(sys.byteorder)
+ path_or_buf = _stringify_path(path_or_buf)
+ if isinstance(path_or_buf, str):
+ path_or_buf, encoding, _, should_close = get_filepath_or_buffer(
+ path_or_buf)
+
+ if isinstance(path_or_buf, (str, text_type, bytes)):
+ self.path_or_buf = open(path_or_buf, 'rb')
+ else:
+ # Copy to BytesIO, and ensure no encoding
+ contents = path_or_buf.read()
+ self.path_or_buf = BytesIO(contents)
+
+ self._read_header()
+ self._setup_dtype()
+
+ def __enter__(self):
+ """ enter context manager """
+ return self
+
+ def __exit__(self, exc_type, exc_value, traceback):
+ """ exit context manager """
+ self.close()
+
+ def close(self):
+ """ close the handle if its open """
+ try:
+ self.path_or_buf.close()
+ except IOError:
+ pass
+
+ def _set_encoding(self):
+ """
+ Set string encoding which depends on file version
+ """
+ if self.format_version < 118:
+ self._encoding = 'latin-1'
+ else:
+ self._encoding = 'utf-8'
+
+ def _read_header(self):
+ first_char = self.path_or_buf.read(1)
+ if struct.unpack('c', first_char)[0] == b'<':
+ self._read_new_header(first_char)
+ else:
+ self._read_old_header(first_char)
+
+ self.has_string_data = len([x for x in self.typlist
+ if type(x) is int]) > 0
+
+ # calculate size of a data record
+ self.col_sizes = lmap(lambda x: self._calcsize(x), self.typlist)
+
+ def _read_new_header(self, first_char):
+ # The first part of the header is common to 117 and 118.
+ self.path_or_buf.read(27) # stata_dta><header><release>
+ self.format_version = int(self.path_or_buf.read(3))
+ if self.format_version not in [117, 118]:
+ raise ValueError(_version_error)
+ self._set_encoding()
+ self.path_or_buf.read(21) # </release><byteorder>
+ self.byteorder = self.path_or_buf.read(3) == b'MSF' and '>' or '<'
+ self.path_or_buf.read(15) # </byteorder><K>
+ self.nvar = struct.unpack(self.byteorder + 'H',
+ self.path_or_buf.read(2))[0]
+ self.path_or_buf.read(7) # </K><N>
+
+ self.nobs = self._get_nobs()
+ self.path_or_buf.read(11) # </N><label>
+ self.data_label = self._get_data_label()
+ self.path_or_buf.read(19) # </label><timestamp>
+ self.time_stamp = self._get_time_stamp()
+ self.path_or_buf.read(26) # </timestamp></header><map>
+ self.path_or_buf.read(8) # 0x0000000000000000
+ self.path_or_buf.read(8) # position of <map>
+
+ self._seek_vartypes = struct.unpack(
+ self.byteorder + 'q', self.path_or_buf.read(8))[0] + 16
+ self._seek_varnames = struct.unpack(
+ self.byteorder + 'q', self.path_or_buf.read(8))[0] + 10
+ self._seek_sortlist = struct.unpack(
+ self.byteorder + 'q', self.path_or_buf.read(8))[0] + 10
+ self._seek_formats = struct.unpack(
+ self.byteorder + 'q', self.path_or_buf.read(8))[0] + 9
+ self._seek_value_label_names = struct.unpack(
+ self.byteorder + 'q', self.path_or_buf.read(8))[0] + 19
+
+ # Requires version-specific treatment
+ self._seek_variable_labels = self._get_seek_variable_labels()
+
+ self.path_or_buf.read(8) # <characteristics>
+ self.data_location = struct.unpack(
+ self.byteorder + 'q', self.path_or_buf.read(8))[0] + 6
+ self.seek_strls = struct.unpack(
+ self.byteorder + 'q', self.path_or_buf.read(8))[0] + 7
+ self.seek_value_labels = struct.unpack(
+ self.byteorder + 'q', self.path_or_buf.read(8))[0] + 14
+
+ self.typlist, self.dtyplist = self._get_dtypes(self._seek_vartypes)
+
+ self.path_or_buf.seek(self._seek_varnames)
+ self.varlist = self._get_varlist()
+
+ self.path_or_buf.seek(self._seek_sortlist)
+ self.srtlist = struct.unpack(
+ self.byteorder + ('h' * (self.nvar + 1)),
+ self.path_or_buf.read(2 * (self.nvar + 1))
+ )[:-1]
+
+ self.path_or_buf.seek(self._seek_formats)
+ self.fmtlist = self._get_fmtlist()
+
+ self.path_or_buf.seek(self._seek_value_label_names)
+ self.lbllist = self._get_lbllist()
+
+ self.path_or_buf.seek(self._seek_variable_labels)
+ self._variable_labels = self._get_variable_labels()
+
+ # Get data type information, works for versions 117-118.
+ def _get_dtypes(self, seek_vartypes):
+
+ self.path_or_buf.seek(seek_vartypes)
+ raw_typlist = [struct.unpack(self.byteorder + 'H',
+ self.path_or_buf.read(2))[0]
+ for i in range(self.nvar)]
+
+ def f(typ):
+ if typ <= 2045:
+ return typ
+ try:
+ return self.TYPE_MAP_XML[typ]
+ except KeyError:
+ raise ValueError("cannot convert stata types [{0}]".
+ format(typ))
+
+ typlist = [f(x) for x in raw_typlist]
+
+ def f(typ):
+ if typ <= 2045:
+ return str(typ)
+ try:
+ return self.DTYPE_MAP_XML[typ]
+ except KeyError:
+ raise ValueError("cannot convert stata dtype [{0}]"
+ .format(typ))
+
+ dtyplist = [f(x) for x in raw_typlist]
+
+ return typlist, dtyplist
+
+ def _get_varlist(self):
+ if self.format_version == 117:
+ b = 33
+ elif self.format_version == 118:
+ b = 129
+
+ return [self._null_terminate(self.path_or_buf.read(b))
+ for i in range(self.nvar)]
+
+ # Returns the format list
+ def _get_fmtlist(self):
+ if self.format_version == 118:
+ b = 57
+ elif self.format_version > 113:
+ b = 49
+ elif self.format_version > 104:
+ b = 12
+ else:
+ b = 7
+
+ return [self._null_terminate(self.path_or_buf.read(b))
+ for i in range(self.nvar)]
+
+ # Returns the label list
+ def _get_lbllist(self):
+ if self.format_version >= 118:
+ b = 129
+ elif self.format_version > 108:
+ b = 33
+ else:
+ b = 9
+ return [self._null_terminate(self.path_or_buf.read(b))
+ for i in range(self.nvar)]
+
+ def _get_variable_labels(self):
+ if self.format_version == 118:
+ vlblist = [self._decode(self.path_or_buf.read(321))
+ for i in range(self.nvar)]
+ elif self.format_version > 105:
+ vlblist = [self._null_terminate(self.path_or_buf.read(81))
+ for i in range(self.nvar)]
+ else:
+ vlblist = [self._null_terminate(self.path_or_buf.read(32))
+ for i in range(self.nvar)]
+ return vlblist
+
+ def _get_nobs(self):
+ if self.format_version == 118:
+ return struct.unpack(self.byteorder + 'Q',
+ self.path_or_buf.read(8))[0]
+ else:
+ return struct.unpack(self.byteorder + 'I',
+ self.path_or_buf.read(4))[0]
+
+ def _get_data_label(self):
+ if self.format_version == 118:
+ strlen = struct.unpack(self.byteorder + 'H',
+ self.path_or_buf.read(2))[0]
+ return self._decode(self.path_or_buf.read(strlen))
+ elif self.format_version == 117:
+ strlen = struct.unpack('b', self.path_or_buf.read(1))[0]
+ return self._null_terminate(self.path_or_buf.read(strlen))
+ elif self.format_version > 105:
+ return self._null_terminate(self.path_or_buf.read(81))
+ else:
+ return self._null_terminate(self.path_or_buf.read(32))
+
+ def _get_time_stamp(self):
+ if self.format_version == 118:
+ strlen = struct.unpack('b', self.path_or_buf.read(1))[0]
+ return self.path_or_buf.read(strlen).decode("utf-8")
+ elif self.format_version == 117:
+ strlen = struct.unpack('b', self.path_or_buf.read(1))[0]
+ return self._null_terminate(self.path_or_buf.read(strlen))
+ elif self.format_version > 104:
+ return self._null_terminate(self.path_or_buf.read(18))
+ else:
+ raise ValueError()
+
+ def _get_seek_variable_labels(self):
+ if self.format_version == 117:
+            self.path_or_buf.read(8)  # <variable_labels>, throw away
+ # Stata 117 data files do not follow the described format. This is
+ # a work around that uses the previous label, 33 bytes for each
+ # variable, 20 for the closing tag and 17 for the opening tag
+ return self._seek_value_label_names + (33 * self.nvar) + 20 + 17
+ elif self.format_version == 118:
+ return struct.unpack(self.byteorder + 'q',
+ self.path_or_buf.read(8))[0] + 17
+ else:
+ raise ValueError()
+
+ def _read_old_header(self, first_char):
+ self.format_version = struct.unpack('b', first_char)[0]
+ if self.format_version not in [104, 105, 108, 111, 113, 114, 115]:
+ raise ValueError(_version_error)
+ self._set_encoding()
+ self.byteorder = struct.unpack('b', self.path_or_buf.read(1))[
+ 0] == 0x1 and '>' or '<'
+ self.filetype = struct.unpack('b', self.path_or_buf.read(1))[0]
+ self.path_or_buf.read(1) # unused
+
+ self.nvar = struct.unpack(self.byteorder + 'H',
+ self.path_or_buf.read(2))[0]
+ self.nobs = self._get_nobs()
+
+ self.data_label = self._get_data_label()
+
+ self.time_stamp = self._get_time_stamp()
+
+ # descriptors
+ if self.format_version > 108:
+ typlist = [ord(self.path_or_buf.read(1))
+ for i in range(self.nvar)]
+ else:
+ buf = self.path_or_buf.read(self.nvar)
+ typlistb = np.frombuffer(buf, dtype=np.uint8)
+ typlist = []
+ for tp in typlistb:
+ if tp in self.OLD_TYPE_MAPPING:
+ typlist.append(self.OLD_TYPE_MAPPING[tp])
+ else:
+ typlist.append(tp - 127) # py2 string, py3 bytes
+
+ try:
+ self.typlist = [self.TYPE_MAP[typ] for typ in typlist]
+ except ValueError:
+ raise ValueError("cannot convert stata types [{0}]"
+ .format(','.join(str(x) for x in typlist)))
+ try:
+ self.dtyplist = [self.DTYPE_MAP[typ] for typ in typlist]
+ except ValueError:
+ raise ValueError("cannot convert stata dtypes [{0}]"
+ .format(','.join(str(x) for x in typlist)))
+
+ if self.format_version > 108:
+ self.varlist = [self._null_terminate(self.path_or_buf.read(33))
+ for i in range(self.nvar)]
+ else:
+ self.varlist = [self._null_terminate(self.path_or_buf.read(9))
+ for i in range(self.nvar)]
+ self.srtlist = struct.unpack(
+ self.byteorder + ('h' * (self.nvar + 1)),
+ self.path_or_buf.read(2 * (self.nvar + 1))
+ )[:-1]
+
+ self.fmtlist = self._get_fmtlist()
+
+ self.lbllist = self._get_lbllist()
+
+ self._variable_labels = self._get_variable_labels()
+
+ # ignore expansion fields (Format 105 and later)
+ # Each field starts with a one-byte type followed by its length
+ # (4 bytes for formats newer than 108, otherwise 2 bytes). Skip that
+ # many bytes and repeat until a field of type 0 is read.
+
+ if self.format_version > 104:
+ while True:
+ data_type = struct.unpack(self.byteorder + 'b',
+ self.path_or_buf.read(1))[0]
+ if self.format_version > 108:
+ data_len = struct.unpack(self.byteorder + 'i',
+ self.path_or_buf.read(4))[0]
+ else:
+ data_len = struct.unpack(self.byteorder + 'h',
+ self.path_or_buf.read(2))[0]
+ if data_type == 0:
+ break
+ self.path_or_buf.read(data_len)
+
+ # necessary data to continue parsing
+ self.data_location = self.path_or_buf.tell()
+
+ def _setup_dtype(self):
+ """Map between numpy and state dtypes"""
+ if self._dtype is not None:
+ return self._dtype
+
+ dtype = [] # Convert struct data types to numpy data type
+ for i, typ in enumerate(self.typlist):
+ if typ in self.NUMPY_TYPE_MAP:
+ dtype.append(('s' + str(i), self.byteorder +
+ self.NUMPY_TYPE_MAP[typ]))
+ else:
+ dtype.append(('s' + str(i), 'S' + str(typ)))
+ dtype = np.dtype(dtype)
+ self._dtype = dtype
+
+ return self._dtype
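+
+ # Illustrative example (editorial, assuming NUMPY_TYPE_MAP maps 'd' to
+ # 'f8'): with typlist == ['d', 5] and byteorder '<', the resulting dtype
+ # is np.dtype([('s0', '<f8'), ('s1', 'S5')]).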
+
+ def _calcsize(self, fmt):
+ return (type(fmt) is int and fmt or
+ struct.calcsize(self.byteorder + fmt))
+
+ def _decode(self, s):
+ s = s.partition(b"\0")[0]
+ return s.decode('utf-8')
+
+ def _null_terminate(self, s):
+ # have bytes not strings, so must decode
+ s = s.partition(b"\0")[0]
+ return s.decode(self._encoding)
+
+ def _read_value_labels(self):
+ if self._value_labels_read:
+ # Don't read twice
+ return
+ if self.format_version <= 108:
+ # Value labels are not supported in version 108 and earlier.
+ self._value_labels_read = True
+ self.value_label_dict = dict()
+ return
+
+ if self.format_version >= 117:
+ self.path_or_buf.seek(self.seek_value_labels)
+ else:
+ offset = self.nobs * self._dtype.itemsize
+ self.path_or_buf.seek(self.data_location + offset)
+
+ self._value_labels_read = True
+ self.value_label_dict = dict()
+
+ while True:
+ if self.format_version >= 117:
+ if self.path_or_buf.read(5) == b'</val': # else these 5 bytes are b'<lbl>'
+ break # end of value label table
+
+ slength = self.path_or_buf.read(4)
+ if not slength:
+ break # end of value label table (format < 117)
+ if self.format_version <= 117:
+ labname = self._null_terminate(self.path_or_buf.read(33))
+ else:
+ labname = self._decode(self.path_or_buf.read(129))
+ self.path_or_buf.read(3) # padding
+
+ n = struct.unpack(self.byteorder + 'I',
+ self.path_or_buf.read(4))[0]
+ txtlen = struct.unpack(self.byteorder + 'I',
+ self.path_or_buf.read(4))[0]
+ off = np.frombuffer(self.path_or_buf.read(4 * n),
+ dtype=self.byteorder + "i4",
+ count=n)
+ val = np.frombuffer(self.path_or_buf.read(4 * n),
+ dtype=self.byteorder + "i4",
+ count=n)
+ ii = np.argsort(off)
+ off = off[ii]
+ val = val[ii]
+ txt = self.path_or_buf.read(txtlen)
+ self.value_label_dict[labname] = dict()
+ for i in range(n):
+ end = off[i + 1] if i < n - 1 else txtlen
+ if self.format_version <= 117:
+ self.value_label_dict[labname][val[i]] = (
+ self._null_terminate(txt[off[i]:end]))
+ else:
+ self.value_label_dict[labname][val[i]] = (
+ self._decode(txt[off[i]:end]))
+ if self.format_version >= 117:
+ self.path_or_buf.read(6) # </lbl>
+ self._value_labels_read = True
+
+ def _read_strls(self):
+ self.path_or_buf.seek(self.seek_strls)
+ # Wrap v_o in a string to allow uint64 values as keys on 32bit OS
+ self.GSO = {'0': ''}
+ while True:
+ if self.path_or_buf.read(3) != b'GSO':
+ break
+
+ if self.format_version == 117:
+ v_o = struct.unpack(self.byteorder + 'Q',
+ self.path_or_buf.read(8))[0]
+ else:
+ buf = self.path_or_buf.read(12)
+ # Only tested with a little-endian file on a little-endian machine.
+ if self.byteorder == '<':
+ buf = buf[0:2] + buf[4:10]
+ else:
+ buf = buf[0:2] + buf[6:]
+ v_o = struct.unpack('Q', buf)[0]
+ typ = struct.unpack('B', self.path_or_buf.read(1))[0]
+ length = struct.unpack(self.byteorder + 'I',
+ self.path_or_buf.read(4))[0]
+ va = self.path_or_buf.read(length)
+ if typ == 130:
+ va = va[0:-1].decode(self._encoding)
+ # Wrap v_o in a string to allow uint64 values as keys on 32bit OS
+ self.GSO[str(v_o)] = va
+
+ # legacy
+ @Appender(_data_method_doc)
+ def data(self, **kwargs):
+
+ warnings.warn("'data' is deprecated, use 'read' instead")
+
+ if self._data_read:
+ raise Exception("Data has already been read.")
+ self._data_read = True
+
+ return self.read(None, **kwargs)
+
+ def __next__(self):
+ return self.read(nrows=self._chunksize or 1)
+
+ def get_chunk(self, size=None):
+ """
+ Reads lines from a Stata file and returns them as a DataFrame
+
+ Parameters
+ ----------
+ size : int, defaults to None
+ Number of lines to read. If None, reads whole file.
+
+ Returns
+ -------
+ DataFrame
+ """
+ if size is None:
+ size = self._chunksize
+ return self.read(nrows=size)
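+
+ # Typical incremental use via the public API (illustrative sketch,
+ # assumes a local file 'example.dta' exists):
+ #
+ # import pandas as pd
+ # reader = pd.read_stata('example.dta', chunksize=1000)
+ # for chunk in reader: # each chunk is a DataFrame of <= 1000 rows
+ # process(chunk) # user-supplied callback, hypothetical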
+
+ @Appender(_read_method_doc)
+ @deprecate_kwarg(old_arg_name='index', new_arg_name='index_col')
+ def read(self, nrows=None, convert_dates=None,
+ convert_categoricals=None, index_col=None,
+ convert_missing=None, preserve_dtypes=None,
+ columns=None, order_categoricals=None):
+ # Handle empty file or chunk. If reading incrementally raise
+ # StopIteration. If reading the whole thing return an empty
+ # data frame.
+ if (self.nobs == 0) and (nrows is None):
+ self._can_read_value_labels = True
+ self._data_read = True
+ self.close()
+ return DataFrame(columns=self.varlist)
+
+ # Handle options
+ if convert_dates is None:
+ convert_dates = self._convert_dates
+ if convert_categoricals is None:
+ convert_categoricals = self._convert_categoricals
+ if convert_missing is None:
+ convert_missing = self._convert_missing
+ if preserve_dtypes is None:
+ preserve_dtypes = self._preserve_dtypes
+ if columns is None:
+ columns = self._columns
+ if order_categoricals is None:
+ order_categoricals = self._order_categoricals
+ if index_col is None:
+ index_col = self._index_col
+
+ if nrows is None:
+ nrows = self.nobs
+
+ if (self.format_version >= 117) and (not self._value_labels_read):
+ self._can_read_value_labels = True
+ self._read_strls()
+
+ # Read data
+ dtype = self._dtype
+ max_read_len = (self.nobs - self._lines_read) * dtype.itemsize
+ read_len = nrows * dtype.itemsize
+ read_len = min(read_len, max_read_len)
+ if read_len <= 0:
+ # Iterator has finished, should never be here unless
+ # we are reading the file incrementally
+ if convert_categoricals:
+ self._read_value_labels()
+ self.close()
+ raise StopIteration
+ offset = self._lines_read * dtype.itemsize
+ self.path_or_buf.seek(self.data_location + offset)
+ read_lines = min(nrows, self.nobs - self._lines_read)
+ data = np.frombuffer(self.path_or_buf.read(read_len), dtype=dtype,
+ count=read_lines)
+
+ self._lines_read += read_lines
+ if self._lines_read == self.nobs:
+ self._can_read_value_labels = True
+ self._data_read = True
+ # if necessary, swap the byte order to native here
+ if self.byteorder != self._native_byteorder:
+ data = data.byteswap().newbyteorder()
+
+ if convert_categoricals:
+ self._read_value_labels()
+
+ if len(data) == 0:
+ data = DataFrame(columns=self.varlist)
+ else:
+ data = DataFrame.from_records(data)
+ data.columns = self.varlist
+
+ # If index is not specified, use actual row number rather than
+ # restarting at 0 for each chunk.
+ if index_col is None:
+ ix = np.arange(self._lines_read - read_lines, self._lines_read)
+ data = data.set_index(ix)
+
+ if columns is not None:
+ try:
+ data = self._do_select_columns(data, columns)
+ except ValueError:
+ self.close()
+ raise
+
+ # Decode strings
+ for col, typ in zip(data, self.typlist):
+ if type(typ) is int:
+ data[col] = data[col].apply(
+ self._null_terminate, convert_dtype=True)
+
+ data = self._insert_strls(data)
+
+ cols_ = np.where(self.dtyplist)[0]
+
+ # Convert columns (if needed) to match input type
+ ix = data.index
+ requires_type_conversion = False
+ data_formatted = []
+ for i in cols_:
+ if self.dtyplist[i] is not None:
+ col = data.columns[i]
+ dtype = data[col].dtype
+ if dtype != np.dtype(object) and dtype != self.dtyplist[i]:
+ requires_type_conversion = True
+ data_formatted.append(
+ (col, Series(data[col], ix, self.dtyplist[i])))
+ else:
+ data_formatted.append((col, data[col]))
+ if requires_type_conversion:
+ data = DataFrame.from_dict(OrderedDict(data_formatted))
+ del data_formatted
+
+ self._do_convert_missing(data, convert_missing)
+
+ if convert_dates:
+ cols = np.where(lmap(lambda x: any(x.startswith(fmt)
+ for fmt in _date_formats),
+ self.fmtlist))[0]
+ for i in cols:
+ col = data.columns[i]
+ try:
+ data[col] = _stata_elapsed_date_to_datetime_vec(
+ data[col],
+ self.fmtlist[i])
+ except ValueError:
+ self.close()
+ raise
+
+ if convert_categoricals and self.format_version > 108:
+ data = self._do_convert_categoricals(data,
+ self.value_label_dict,
+ self.lbllist,
+ order_categoricals)
+
+ if not preserve_dtypes:
+ retyped_data = []
+ convert = False
+ for col in data:
+ dtype = data[col].dtype
+ if dtype in (np.float16, np.float32):
+ dtype = np.float64
+ convert = True
+ elif dtype in (np.int8, np.int16, np.int32):
+ dtype = np.int64
+ convert = True
+ retyped_data.append((col, data[col].astype(dtype)))
+ if convert:
+ data = DataFrame.from_dict(OrderedDict(retyped_data))
+
+ if index_col is not None:
+ data = data.set_index(data.pop(index_col))
+
+ return data
+
+ def _do_convert_missing(self, data, convert_missing):
+ # Check for missing values, and replace if found
+
+ for i, colname in enumerate(data):
+ fmt = self.typlist[i]
+ if fmt not in self.VALID_RANGE:
+ continue
+
+ nmin, nmax = self.VALID_RANGE[fmt]
+ series = data[colname]
+ missing = np.logical_or(series < nmin, series > nmax)
+
+ if not missing.any():
+ continue
+
+ if convert_missing: # Replacement follows Stata notation
+
+ missing_loc = np.argwhere(missing._ndarray_values)
+ umissing, umissing_loc = np.unique(series[missing],
+ return_inverse=True)
+ replacement = Series(series, dtype=np.object)
+ for j, um in enumerate(umissing):
+ missing_value = StataMissingValue(um)
+
+ loc = missing_loc[umissing_loc == j]
+ replacement.iloc[loc] = missing_value
+ else: # All replacements are identical
+ dtype = series.dtype
+ if dtype not in (np.float32, np.float64):
+ dtype = np.float64
+ replacement = Series(series, dtype=dtype)
+ replacement[missing] = np.nan
+
+ data[colname] = replacement
+
+ def _insert_strls(self, data):
+ if not hasattr(self, 'GSO') or len(self.GSO) == 0:
+ return data
+ for i, typ in enumerate(self.typlist):
+ if typ != 'Q':
+ continue
+ # Wrap v_o in a string to allow uint64 values as keys on 32bit OS
+ data.iloc[:, i] = [self.GSO[str(k)] for k in data.iloc[:, i]]
+ return data
+
+ def _do_select_columns(self, data, columns):
+
+ if not self._column_selector_set:
+ column_set = set(columns)
+ if len(column_set) != len(columns):
+ raise ValueError('columns contains duplicate entries')
+ unmatched = column_set.difference(data.columns)
+ if unmatched:
+ raise ValueError('The following columns were not found in the '
+ 'Stata data set: ' +
+ ', '.join(list(unmatched)))
+ # Copy information for retained columns for later processing
+ dtyplist = []
+ typlist = []
+ fmtlist = []
+ lbllist = []
+ for col in columns:
+ i = data.columns.get_loc(col)
+ dtyplist.append(self.dtyplist[i])
+ typlist.append(self.typlist[i])
+ fmtlist.append(self.fmtlist[i])
+ lbllist.append(self.lbllist[i])
+
+ self.dtyplist = dtyplist
+ self.typlist = typlist
+ self.fmtlist = fmtlist
+ self.lbllist = lbllist
+ self._column_selector_set = True
+
+ return data[columns]
+
+ def _do_convert_categoricals(self, data, value_label_dict, lbllist,
+ order_categoricals):
+ """
+ Converts categorical columns to Categorical type.
+ """
+ value_labels = list(compat.iterkeys(value_label_dict))
+ cat_converted_data = []
+ for col, label in zip(data, lbllist):
+ if label in value_labels:
+ # Explicit call with ordered=True
+ cat_data = Categorical(data[col], ordered=order_categoricals)
+ categories = []
+ for category in cat_data.categories:
+ if category in value_label_dict[label]:
+ categories.append(value_label_dict[label][category])
+ else:
+ categories.append(category) # Partially labeled
+ try:
+ cat_data.categories = categories
+ except ValueError:
+ vc = Series(categories).value_counts()
+ repeats = list(vc.index[vc > 1])
+ repeats = ('\n' + '-' * 80 + '\n').join(repeats)
+ raise ValueError('Value labels for column {col} are not '
+ 'unique. The repeated labels are:\n'
+ '{repeats}'
+ .format(col=col, repeats=repeats))
+ # TODO: is the next line needed above in the data(...) method?
+ cat_data = Series(cat_data, index=data.index)
+ cat_converted_data.append((col, cat_data))
+ else:
+ cat_converted_data.append((col, data[col]))
+ data = DataFrame.from_dict(OrderedDict(cat_converted_data))
+ return data
+
+ def data_label(self):
+ """Returns data label of Stata file"""
+ return self.data_label
+
+ def variable_labels(self):
+ """Returns variable labels as a dict, associating each variable name
+ with its corresponding label
+ """
+ return dict(zip(self.varlist, self._variable_labels))
+
+ def value_labels(self):
+ """Returns a dict, associating each variable name a dict, associating
+ each value its corresponding label
+ """
+ if not self._value_labels_read:
+ self._read_value_labels()
+
+ return self.value_label_dict
+
+
+def _open_file_binary_write(fname):
+ """
+ Open a binary file or no-op if file-like
+
+ Parameters
+ ----------
+ fname : string path, path object or buffer
+
+ Returns
+ -------
+ file : file-like object
+ File object supporting write
+ own : bool
+ True if the file was created, otherwise False
+ """
+ if hasattr(fname, 'write'):
+ # if 'b' not in fname.mode:
+ return fname, False
+ return open(fname, "wb"), True
+
+
+def _set_endianness(endianness):
+ if endianness.lower() in ["<", "little"]:
+ return "<"
+ elif endianness.lower() in [">", "big"]:
+ return ">"
+ else: # pragma : no cover
+ raise ValueError(
+ "Endianness {endian} not understood".format(endian=endianness))
+
+
+def _pad_bytes(name, length):
+ """
+ Takes a character string and pads it with null bytes until it is `length` characters long
+ """
+ return name + "\x00" * (length - len(name))
+
+
+def _convert_datetime_to_stata_type(fmt):
+ """
+ Converts from one of the stata date formats to a type in TYPE_MAP
+ """
+ if fmt in ["tc", "%tc", "td", "%td", "tw", "%tw", "tm", "%tm", "tq",
+ "%tq", "th", "%th", "ty", "%ty"]:
+ return np.float64 # Stata expects doubles for SIFs
+ else:
+ raise NotImplementedError(
+ "Format {fmt} not implemented".format(fmt=fmt))
+
+
+def _maybe_convert_to_int_keys(convert_dates, varlist):
+ new_dict = {}
+ for key in convert_dates:
+ if not convert_dates[key].startswith("%"): # make sure proper fmts
+ convert_dates[key] = "%" + convert_dates[key]
+ if key in varlist:
+ new_dict.update({varlist.index(key): convert_dates[key]})
+ else:
+ if not isinstance(key, int):
+ raise ValueError("convert_dates key must be a "
+ "column or an integer")
+ new_dict.update({key: convert_dates[key]})
+ return new_dict
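+
+# Illustrative example (editorial addition):
+# _maybe_convert_to_int_keys({'when': 'td'}, ['x', 'when']) -> {1: '%td'}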
+
+
+def _dtype_to_stata_type(dtype, column):
+ """
+ Converts numpy dtypes to Stata type codes (one byte per variable in the dta file).
+ See TYPE_MAP and comments for an explanation. This is also explained in
+ the dta spec.
+ 1 - 244 are strings of this length
+ Pandas Stata
+ 251 - for int8 byte
+ 252 - for int16 int
+ 253 - for int32 long
+ 254 - for float32 float
+ 255 - for double double
+
+ If there are dates to convert, then dtype will already have the correct
+ type inserted.
+ """
+ # TODO: expand to handle datetime to integer conversion
+ if dtype.type == np.object_: # try to coerce it to the biggest string
+ # not memory efficient, what else could we
+ # do?
+ itemsize = max_len_string_array(ensure_object(column.values))
+ return max(itemsize, 1)
+ elif dtype == np.float64:
+ return 255
+ elif dtype == np.float32:
+ return 254
+ elif dtype == np.int32:
+ return 253
+ elif dtype == np.int16:
+ return 252
+ elif dtype == np.int8:
+ return 251
+ else: # pragma : no cover
+ raise NotImplementedError(
+ "Data type {dtype} not supported.".format(dtype=dtype))
+
+
+def _dtype_to_default_stata_fmt(dtype, column, dta_version=114,
+ force_strl=False):
+ """
+ Maps numpy dtype to stata's default format for this type. Not terribly
+ important since users can change this in Stata. Semantics are
+
+ object -> "%DDs" where DD is the length of the string. If not a string,
+ raise ValueError
+ float64 -> "%10.0g"
+ float32 -> "%9.0g"
+ int64 -> "%9.0g"
+ int32 -> "%12.0g"
+ int16 -> "%8.0g"
+ int8 -> "%8.0g"
+ strl -> "%9s"
+ """
+ # TODO: Refactor to combine type with format
+ # TODO: expand this to handle a default datetime format?
+ if dta_version < 117:
+ max_str_len = 244
+ else:
+ max_str_len = 2045
+ if force_strl:
+ return '%9s'
+ if dtype.type == np.object_:
+ inferred_dtype = infer_dtype(column, skipna=True)
+ if not (inferred_dtype in ('string', 'unicode') or
+ len(column) == 0):
+ raise ValueError('Column `{col}` cannot be exported.\n\nOnly '
+ 'string-like object arrays containing all '
+ 'strings or a mix of strings and None can be '
+ 'exported. Object arrays containing only null '
+ 'values are prohibited. Other object types '
+ 'cannot be exported and must first be converted '
+ 'to one of the supported '
+ 'types.'.format(col=column.name))
+ itemsize = max_len_string_array(ensure_object(column.values))
+ if itemsize > max_str_len:
+ if dta_version >= 117:
+ return '%9s'
+ else:
+ raise ValueError(excessive_string_length_error % column.name)
+ return "%" + str(max(itemsize, 1)) + "s"
+ elif dtype == np.float64:
+ return "%10.0g"
+ elif dtype == np.float32:
+ return "%9.0g"
+ elif dtype == np.int32:
+ return "%12.0g"
+ elif dtype == np.int8 or dtype == np.int16:
+ return "%8.0g"
+ else: # pragma : no cover
+ raise NotImplementedError(
+ "Data type {dtype} not supported.".format(dtype=dtype))
+
+
+class StataWriter(StataParser):
+ """
+ A class for writing Stata binary dta files
+
+ Parameters
+ ----------
+ fname : path (string), buffer or path object
+ string, path object (pathlib.Path or py._path.local.LocalPath) or
+ object implementing a binary write() function. If using a buffer
+ then the buffer will not be automatically closed after the file
+ is written.
+
+ .. versionadded:: 0.23.0 support for pathlib, py.path.
+
+ data : DataFrame
+ Input to save
+ convert_dates : dict
+ Dictionary mapping columns containing datetime types to stata internal
+ format to use when writing the dates. Options are 'tc', 'td', 'tm',
+ 'tw', 'th', 'tq', 'ty'. Column can be either an integer or a name.
+ Datetime columns that do not have a conversion type specified will be
+ converted to 'tc'. Raises NotImplementedError if a datetime column has
+ timezone information
+ write_index : bool
+ Write the index to Stata dataset.
+ encoding : str
+ Default is latin-1. Only latin-1 and ascii are supported.
+ byteorder : str
+ Can be ">", "<", "little", or "big". default is `sys.byteorder`
+ time_stamp : datetime
+ A datetime to use as file creation date. Default is the current time
+ data_label : str
+ A label for the data set. Must be 80 characters or smaller.
+ variable_labels : dict
+ Dictionary containing columns as keys and variable labels as values.
+ Each label must be 80 characters or smaller.
+
+ .. versionadded:: 0.19.0
+
+ Returns
+ -------
+ writer : StataWriter instance
+ The StataWriter instance has a write_file method, which will
+ write the file to the given `fname`.
+
+ Raises
+ ------
+ NotImplementedError
+ * If datetimes contain timezone information
+ ValueError
+ * Columns listed in convert_dates are neither datetime64[ns]
+ nor datetime.datetime
+ * Column dtype is not representable in Stata
+ * Column listed in convert_dates is not in DataFrame
+ * Categorical label contains more than 32,000 characters
+
+ Examples
+ --------
+ >>> data = pd.DataFrame([[1.0, 1]], columns=['a', 'b'])
+ >>> writer = StataWriter('./data_file.dta', data)
+ >>> writer.write_file()
+
+ Or with dates
+ >>> from datetime import datetime
+ >>> data = pd.DataFrame([[datetime(2000,1,1)]], columns=['date'])
+ >>> writer = StataWriter('./date_data_file.dta', data, {'date' : 'tw'})
+ >>> writer.write_file()
+ """
+
+ _max_string_length = 244
+
+ @deprecate_kwarg(old_arg_name='encoding', new_arg_name=None)
+ def __init__(self, fname, data, convert_dates=None, write_index=True,
+ encoding="latin-1", byteorder=None, time_stamp=None,
+ data_label=None, variable_labels=None):
+ super(StataWriter, self).__init__()
+ self._convert_dates = {} if convert_dates is None else convert_dates
+ self._write_index = write_index
+ self._encoding = 'latin-1'
+ self._time_stamp = time_stamp
+ self._data_label = data_label
+ self._variable_labels = variable_labels
+ self._own_file = True
+ # attach nobs, nvars, data, varlist, typlist
+ self._prepare_pandas(data)
+
+ if byteorder is None:
+ byteorder = sys.byteorder
+ self._byteorder = _set_endianness(byteorder)
+ self._fname = _stringify_path(fname)
+ self.type_converters = {253: np.int32, 252: np.int16, 251: np.int8}
+ self._converted_names = {}
+
+ def _write(self, to_write):
+ """
+ Helper to call encode before writing to file for Python 3 compat.
+ """
+ if compat.PY3:
+ self._file.write(to_write.encode(self._encoding or
+ self._default_encoding))
+ else:
+ self._file.write(to_write)
+
+ def _prepare_categoricals(self, data):
+ """Check for categorical columns, retain categorical information for
+ Stata file and convert categorical data to int"""
+
+ is_cat = [is_categorical_dtype(data[col]) for col in data]
+ self._is_col_cat = is_cat
+ self._value_labels = []
+ if not any(is_cat):
+ return data
+
+ get_base_missing_value = StataMissingValue.get_base_missing_value
+ data_formatted = []
+ for col, col_is_cat in zip(data, is_cat):
+ if col_is_cat:
+ self._value_labels.append(StataValueLabel(data[col]))
+ dtype = data[col].cat.codes.dtype
+ if dtype == np.int64:
+ raise ValueError('It is not possible to export '
+ 'int64-based categorical data to Stata.')
+ values = data[col].cat.codes.values.copy()
+
+ # Upcast if needed so that correct missing values can be set
+ if values.max() >= get_base_missing_value(dtype):
+ if dtype == np.int8:
+ dtype = np.int16
+ elif dtype == np.int16:
+ dtype = np.int32
+ else:
+ dtype = np.float64
+ values = np.array(values, dtype=dtype)
+
+ # Replace missing values with Stata missing value for type
+ values[values == -1] = get_base_missing_value(dtype)
+ data_formatted.append((col, values))
+ else:
+ data_formatted.append((col, data[col]))
+ return DataFrame.from_dict(OrderedDict(data_formatted))
+
+ def _replace_nans(self, data):
+ # return data
+ """Checks floating point data columns for nans, and replaces these with
+ the generic Stata missing value (.)"""
+ for c in data:
+ dtype = data[c].dtype
+ if dtype in (np.float32, np.float64):
+ if dtype == np.float32:
+ replacement = self.MISSING_VALUES['f']
+ else:
+ replacement = self.MISSING_VALUES['d']
+ data[c] = data[c].fillna(replacement)
+
+ return data
+
+ def _update_strl_names(self):
+ """No-op, forward compatibility"""
+ pass
+
+ def _check_column_names(self, data):
+ """
+ Checks column names to ensure that they are valid Stata column names.
+ This includes checks for:
+ * Non-string names
+ * Stata keywords
+ * Variables that start with numbers
+ * Variables with names that are too long
+
+ When an illegal variable name is detected, it is converted, and if
+ dates are exported, the variable name is propagated to the date
+ conversion dictionary
+ """
+ converted_names = {}
+ columns = list(data.columns)
+ original_columns = columns[:]
+
+ duplicate_var_id = 0
+ for j, name in enumerate(columns):
+ orig_name = name
+ if not isinstance(name, string_types):
+ name = text_type(name)
+
+ for c in name:
+ if ((c < 'A' or c > 'Z') and (c < 'a' or c > 'z') and
+ (c < '0' or c > '9') and c != '_'):
+ name = name.replace(c, '_')
+
+ # Variable name must not be a reserved word
+ if name in self.RESERVED_WORDS:
+ name = '_' + name
+
+ # Variable name may not start with a number
+ if name[0] >= '0' and name[0] <= '9':
+ name = '_' + name
+
+ name = name[:min(len(name), 32)]
+
+ if not name == orig_name:
+ # check for duplicates
+ while columns.count(name) > 0:
+ # prepend ascending number to avoid duplicates
+ name = '_' + str(duplicate_var_id) + name
+ name = name[:min(len(name), 32)]
+ duplicate_var_id += 1
+ converted_names[orig_name] = name
+
+ columns[j] = name
+
+ data.columns = columns
+
+ # Check date conversion, and fix key if needed
+ if self._convert_dates:
+ for c, o in zip(columns, original_columns):
+ if c != o:
+ self._convert_dates[c] = self._convert_dates[o]
+ del self._convert_dates[o]
+
+ if converted_names:
+ conversion_warning = []
+ for orig_name, name in converted_names.items():
+ # need to possibly encode the orig name if it is unicode
+ try:
+ orig_name = orig_name.encode('utf-8')
+ except (UnicodeDecodeError, AttributeError):
+ pass
+ msg = '{0} -> {1}'.format(orig_name, name)
+ conversion_warning.append(msg)
+
+ ws = invalid_name_doc.format('\n '.join(conversion_warning))
+ warnings.warn(ws, InvalidColumnName)
+
+ self._converted_names = converted_names
+ self._update_strl_names()
+
+ return data
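+
+ # Illustrative example (editorial addition): a column named '1st col'
+ # becomes '_1st_col' -- the space is replaced with '_' and a leading
+ # underscore is added because the name starts with a digit.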
+
+ def _set_formats_and_types(self, data, dtypes):
+ self.typlist = []
+ self.fmtlist = []
+ for col, dtype in dtypes.iteritems():
+ self.fmtlist.append(_dtype_to_default_stata_fmt(dtype, data[col]))
+ self.typlist.append(_dtype_to_stata_type(dtype, data[col]))
+
+ def _prepare_pandas(self, data):
+ # NOTE: we might need a different API / class for pandas objects so
+ # we can set different semantics - handle this with a PR to pandas.io
+
+ data = data.copy()
+
+ if self._write_index:
+ data = data.reset_index()
+
+ # Ensure column names are strings
+ data = self._check_column_names(data)
+
+ # Check columns for compatibility with stata, upcast if necessary
+ # Raise if outside the supported range
+ data = _cast_to_stata_types(data)
+
+ # Replace NaNs with Stata missing values
+ data = self._replace_nans(data)
+
+ # Convert categoricals to int data, and strip labels
+ data = self._prepare_categoricals(data)
+
+ self.nobs, self.nvar = data.shape
+ self.data = data
+ self.varlist = data.columns.tolist()
+
+ dtypes = data.dtypes
+
+ # Ensure all date columns are converted
+ for col in data:
+ if col in self._convert_dates:
+ continue
+ if is_datetime64_dtype(data[col]):
+ self._convert_dates[col] = 'tc'
+
+ self._convert_dates = _maybe_convert_to_int_keys(self._convert_dates,
+ self.varlist)
+ for key in self._convert_dates:
+ new_type = _convert_datetime_to_stata_type(
+ self._convert_dates[key]
+ )
+ dtypes[key] = np.dtype(new_type)
+
+ self._set_formats_and_types(data, dtypes)
+
+ # set the given format for the datetime cols
+ if self._convert_dates is not None:
+ for key in self._convert_dates:
+ self.fmtlist[key] = self._convert_dates[key]
+
+ def write_file(self):
+ self._file, self._own_file = _open_file_binary_write(self._fname)
+ try:
+ self._write_header(time_stamp=self._time_stamp,
+ data_label=self._data_label)
+ self._write_map()
+ self._write_variable_types()
+ self._write_varnames()
+ self._write_sortlist()
+ self._write_formats()
+ self._write_value_label_names()
+ self._write_variable_labels()
+ self._write_expansion_fields()
+ self._write_characteristics()
+ self._prepare_data()
+ self._write_data()
+ self._write_strls()
+ self._write_value_labels()
+ self._write_file_close_tag()
+ self._write_map()
+ except Exception as exc:
+ self._close()
+ try:
+ if self._own_file:
+ os.unlink(self._fname)
+ except Exception:
+ warnings.warn('This save was not successful but {0} could not '
+ 'be deleted. This file is not '
+ 'valid.'.format(self._fname), ResourceWarning)
+ raise exc
+ else:
+ self._close()
+
+ def _close(self):
+ """
+ Close the file if it was created by the writer.
+
+ If a buffer or file-like object was passed in, for example a GzipFile,
+ then leave this file open for the caller to close. In either case,
+ attempt to flush the file contents to ensure they are written to disk
+ (if supported)
+ """
+ # Some file-like objects might not support flush
+ try:
+ self._file.flush()
+ except AttributeError:
+ pass
+ if self._own_file:
+ self._file.close()
+
+ def _write_map(self):
+ """No-op, future compatibility"""
+ pass
+
+ def _write_file_close_tag(self):
+ """No-op, future compatibility"""
+ pass
+
+ def _write_characteristics(self):
+ """No-op, future compatibility"""
+ pass
+
+ def _write_strls(self):
+ """No-op, future compatibility"""
+ pass
+
+ def _write_expansion_fields(self):
+ """Write 5 zeros for expansion fields"""
+ self._write(_pad_bytes("", 5))
+
+ def _write_value_labels(self):
+ for vl in self._value_labels:
+ self._file.write(vl.generate_value_label(self._byteorder,
+ self._encoding))
+
+ def _write_header(self, data_label=None, time_stamp=None):
+ byteorder = self._byteorder
+ # ds_format - just use 114
+ self._file.write(struct.pack("b", 114))
+ # byteorder
+ self._write(byteorder == ">" and "\x01" or "\x02")
+ # filetype
+ self._write("\x01")
+ # unused
+ self._write("\x00")
+ # number of vars, 2 bytes
+ self._file.write(struct.pack(byteorder + "h", self.nvar)[:2])
+ # number of obs, 4 bytes
+ self._file.write(struct.pack(byteorder + "i", self.nobs)[:4])
+ # data label 81 bytes, char, null terminated
+ if data_label is None:
+ self._file.write(self._null_terminate(_pad_bytes("", 80)))
+ else:
+ self._file.write(
+ self._null_terminate(_pad_bytes(data_label[:80], 80))
+ )
+ # time stamp, 18 bytes, char, null terminated
+ # format dd Mon yyyy hh:mm
+ if time_stamp is None:
+ time_stamp = datetime.datetime.now()
+ elif not isinstance(time_stamp, datetime.datetime):
+ raise ValueError("time_stamp should be datetime type")
+ # GH #13856
+ # Avoid locale-specific month conversion
+ months = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug',
+ 'Sep', 'Oct', 'Nov', 'Dec']
+ month_lookup = {i + 1: month for i, month in enumerate(months)}
+ ts = (time_stamp.strftime("%d ") +
+ month_lookup[time_stamp.month] +
+ time_stamp.strftime(" %Y %H:%M"))
+ self._file.write(self._null_terminate(ts))
+
+ def _write_variable_types(self):
+ for typ in self.typlist:
+ self._file.write(struct.pack('B', typ))
+
+ def _write_varnames(self):
+ # varlist names are checked by _check_column_names
+ # varlist, requires null terminated
+ for name in self.varlist:
+ name = self._null_terminate(name, True)
+ name = _pad_bytes(name[:32], 33)
+ self._write(name)
+
+ def _write_sortlist(self):
+ # srtlist, 2*(nvar+1), int array, encoded by byteorder
+ srtlist = _pad_bytes("", 2 * (self.nvar + 1))
+ self._write(srtlist)
+
+ def _write_formats(self):
+ # fmtlist, 49*nvar, char array
+ for fmt in self.fmtlist:
+ self._write(_pad_bytes(fmt, 49))
+
+ def _write_value_label_names(self):
+ # lbllist, 33*nvar, char array
+ for i in range(self.nvar):
+ # Use variable name when categorical
+ if self._is_col_cat[i]:
+ name = self.varlist[i]
+ name = self._null_terminate(name, True)
+ name = _pad_bytes(name[:32], 33)
+ self._write(name)
+ else: # Default is empty label
+ self._write(_pad_bytes("", 33))
+
+ def _write_variable_labels(self):
+ # Missing labels are 80 blank characters plus null termination
+ blank = _pad_bytes('', 81)
+
+ if self._variable_labels is None:
+ for i in range(self.nvar):
+ self._write(blank)
+ return
+
+ for col in self.data:
+ if col in self._variable_labels:
+ label = self._variable_labels[col]
+ if len(label) > 80:
+ raise ValueError('Variable labels must be 80 characters '
+ 'or fewer')
+ is_latin1 = all(ord(c) < 256 for c in label)
+ if not is_latin1:
+ raise ValueError('Variable labels must contain only '
+ 'characters that can be encoded in '
+ 'Latin-1')
+ self._write(_pad_bytes(label, 81))
+ else:
+ self._write(blank)
+
+ def _convert_strls(self, data):
+ """No-op, future compatibility"""
+ return data
+
+ def _prepare_data(self):
+ data = self.data
+ typlist = self.typlist
+ convert_dates = self._convert_dates
+ # 1. Convert dates
+ if self._convert_dates is not None:
+ for i, col in enumerate(data):
+ if i in convert_dates:
+ data[col] = _datetime_to_stata_elapsed_vec(data[col],
+ self.fmtlist[i])
+ # 2. Convert strls
+ data = self._convert_strls(data)
+
+ # 3. Convert bad string data to '' and pad to correct length
+ dtypes = []
+ data_cols = []
+ has_strings = False
+ native_byteorder = self._byteorder == _set_endianness(sys.byteorder)
+ for i, col in enumerate(data):
+ typ = typlist[i]
+ if typ <= self._max_string_length:
+ has_strings = True
+ data[col] = data[col].fillna('').apply(_pad_bytes, args=(typ,))
+ stype = 'S{type}'.format(type=typ)
+ dtypes.append(('c' + str(i), stype))
+ string = data[col].str.encode(self._encoding)
+ data_cols.append(string.values.astype(stype))
+ else:
+ values = data[col].values
+ dtype = data[col].dtype
+ if not native_byteorder:
+ dtype = dtype.newbyteorder(self._byteorder)
+ dtypes.append(('c' + str(i), dtype))
+ data_cols.append(values)
+ dtypes = np.dtype(dtypes)
+
+ if has_strings or not native_byteorder:
+ self.data = np.fromiter(zip(*data_cols), dtype=dtypes)
+ else:
+ self.data = data.to_records(index=False)
+
+ def _write_data(self):
+ data = self.data
+ self._file.write(data.tobytes())
+
+ def _null_terminate(self, s, as_string=False):
+ null_byte = '\x00'
+ if compat.PY3 and not as_string:
+ s += null_byte
+ return s.encode(self._encoding)
+ else:
+ s += null_byte
+ return s
+
+
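+# Illustrative round-trip sketch (editorial addition, not part of the
+# upstream pandas source). It only uses the public DataFrame.to_stata and
+# read_stata entry points; the file name below is arbitrary.
+def _example_stata_round_trip(path='round_trip.dta'):
+ import pandas as pd
+ df = pd.DataFrame({'x': [1.5, 2.5], 'label': ['a', 'b']})
+ df.to_stata(path, write_index=False)
+ return pd.read_stata(path)
+
+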
+def _dtype_to_stata_type_117(dtype, column, force_strl):
+ """
+ Converts numpy dtypes to Stata 117 type codes (two-byte values in the dta file).
+ See TYPE_MAP and comments for an explanation. This is also explained in
+ the dta spec.
+ 1 - 2045 are strings of this length
+ Pandas Stata
+ 32768 - for object strL
+ 65526 - for float64 double
+ 65527 - for float32 float
+ 65528 - for int32 long
+ 65529 - for int16 int
+ 65530 - for int8 byte
+
+ If there are dates to convert, then dtype will already have the correct
+ type inserted.
+ """
+ # TODO: expand to handle datetime to integer conversion
+ if force_strl:
+ return 32768
+ if dtype.type == np.object_: # try to coerce it to the biggest string
+ # not memory efficient, what else could we
+ # do?
+ itemsize = max_len_string_array(ensure_object(column.values))
+ itemsize = max(itemsize, 1)
+ if itemsize <= 2045:
+ return itemsize
+ return 32768
+ elif dtype == np.float64:
+ return 65526
+ elif dtype == np.float32:
+ return 65527
+ elif dtype == np.int32:
+ return 65528
+ elif dtype == np.int16:
+ return 65529
+ elif dtype == np.int8:
+ return 65530
+ else: # pragma : no cover
+ raise NotImplementedError("Data type %s not supported." % dtype)
+
+
+def _bytes(s, encoding):
+ if compat.PY3:
+ return bytes(s, encoding)
+ else:
+ return bytes(s.encode(encoding))
+
+
+def _pad_bytes_new(name, length):
+ """
+ Takes a bytes instance and pads it with null bytes until it is `length` characters long.
+ """
+ if isinstance(name, string_types):
+ name = _bytes(name, 'utf-8')
+ return name + b'\x00' * (length - len(name))
+
+
+class StataStrLWriter(object):
+ """
+ Converter for Stata StrLs
+
+ Stata StrLs map 8-byte values to strings, which are stored using a
+ dictionary-like format where each string is keyed by two values (v, o).
+
+ Parameters
+ ----------
+ df : DataFrame
+ DataFrame to convert
+ columns : list
+ List of column names to convert to StrL
+ version : int, optional
+ dta version. Currently supports 117, 118 and 119
+ byteorder : str, optional
+ Can be ">", "<", "little", or "big". default is `sys.byteorder`
+
+ Notes
+ -----
+ Supports creation of the StrL block of a dta file for dta versions
+ 117, 118 and 119. These differ in how the GSO is stored. 118 and
+ 119 store the GSO lookup value as a uint32 and a uint64, while 117
+ uses two uint32s. 118 and 119 also encode all strings as unicode
+ which is required by the format. 117 uses 'latin-1', a fixed-width
+ encoding that extends the 7-bit ASCII table with an additional 128
+ characters.
+ """
+
+ def __init__(self, df, columns, version=117, byteorder=None):
+ if version not in (117, 118, 119):
+ raise ValueError('Only dta versions 117, 118 and 119 supported')
+ self._dta_ver = version
+
+ self.df = df
+ self.columns = columns
+ self._gso_table = OrderedDict((('', (0, 0)),))
+ if byteorder is None:
+ byteorder = sys.byteorder
+ self._byteorder = _set_endianness(byteorder)
+
+ gso_v_type = 'I' # uint32
+ gso_o_type = 'Q' # uint64
+ self._encoding = 'utf-8'
+ if version == 117:
+ o_size = 4
+ gso_o_type = 'I' # 117 used uint32
+ self._encoding = 'latin-1'
+ elif version == 118:
+ o_size = 6
+ else: # version == 119
+ o_size = 5
+ self._o_offset = 2 ** (8 * (8 - o_size))
+ self._gso_o_type = gso_o_type
+ self._gso_v_type = gso_v_type
+
+ def _convert_key(self, key):
+ v, o = key
+ return v + self._o_offset * o
+
+ def generate_table(self):
+ """
+ Generates the GSO lookup table for the DataFrame
+
+ Returns
+ -------
+ gso_table : OrderedDict
+ Ordered dictionary using the string found as keys
+ and their lookup position (v,o) as values
+ gso_df : DataFrame
+ DataFrame where strl columns have been converted to
+ (v,o) values
+
+ Notes
+ -----
+ Modifies the DataFrame in-place.
+
+ The DataFrame returned encodes the (v,o) values as uint64s. The
+ encoding depends on the dta version, and can be expressed as
+
+ enc = v + o * 2 ** ((8 - o_size) * 8)
+
+ so that v is stored in the lower bits and o is in the upper
+ bits. o_size is
+
+ * 117: 4
+ * 118: 6
+ * 119: 5
+ """
+
+ gso_table = self._gso_table
+ gso_df = self.df
+ columns = list(gso_df.columns)
+ selected = gso_df[self.columns]
+ col_index = [(col, columns.index(col)) for col in self.columns]
+ keys = np.empty(selected.shape, dtype=np.uint64)
+ for o, (idx, row) in enumerate(selected.iterrows()):
+ for j, (col, v) in enumerate(col_index):
+ val = row[col]
+ # Allow columns with mixed str and None (GH 23633)
+ val = '' if val is None else val
+ key = gso_table.get(val, None)
+ if key is None:
+ # Stata uses 1-based (v, o) indices
+ key = (v + 1, o + 1)
+ gso_table[val] = key
+ keys[o, j] = self._convert_key(key)
+ for i, col in enumerate(self.columns):
+ gso_df[col] = keys[:, i]
+
+ return gso_table, gso_df
+
+ def _encode(self, s):
+ """
+ Python 3 compatibility shim
+ """
+ if compat.PY3:
+ return s.encode(self._encoding)
+ else:
+ if isinstance(s, text_type):
+ return s.encode(self._encoding)
+ return s
+
+ def generate_blob(self, gso_table):
+ """
+ Generates the binary blob of GSOs that is written to the dta file.
+
+ Parameters
+ ----------
+ gso_table : OrderedDict
+ Ordered dictionary (str, vo)
+
+ Returns
+ -------
+ gso : bytes
+ Binary content of dta file to be placed between strl tags
+
+ Notes
+ -----
+ Output format depends on dta version. 117 uses two uint32s to
+ express v and o while 118+ uses a uint32 for v and a uint64 for o.
+ """
+ # Format information
+ # Length includes null term
+ # 117
+ # GSOvvvvooootllllxxxxxxxxxxxxxxx...x
+ # 3 u4 u4 u1 u4 string + null term
+ #
+ # 118, 119
+ # GSOvvvvooooooootllllxxxxxxxxxxxxxxx...x
+ # 3 u4 u8 u1 u4 string + null term
+
+ bio = BytesIO()
+ gso = _bytes('GSO', 'ascii')
+ gso_type = struct.pack(self._byteorder + 'B', 130)
+ null = struct.pack(self._byteorder + 'B', 0)
+ v_type = self._byteorder + self._gso_v_type
+ o_type = self._byteorder + self._gso_o_type
+ len_type = self._byteorder + 'I'
+ for strl, vo in gso_table.items():
+ if vo == (0, 0):
+ continue
+ v, o = vo
+
+ # GSO
+ bio.write(gso)
+
+ # vvvv
+ bio.write(struct.pack(v_type, v))
+
+ # oooo / oooooooo
+ bio.write(struct.pack(o_type, o))
+
+ # t
+ bio.write(gso_type)
+
+ # llll
+ utf8_string = _bytes(strl, 'utf-8')
+ bio.write(struct.pack(len_type, len(utf8_string) + 1))
+
+ # xxx...xxx
+ bio.write(utf8_string)
+ bio.write(null)
+
+ bio.seek(0)
+ return bio.read()
+
+
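+# Illustrative sketch (editorial addition, not part of the upstream
+# source): the (v, o) -> uint64 key packing used by StataStrLWriter above,
+# where o_size is the number of bytes reserved for o in the data-section
+# reference (4, 6 and 5 for dta versions 117, 118 and 119), so v occupies
+# the remaining low-order bytes.
+def _example_strl_key(v, o, o_size=4):
+ offset = 2 ** (8 * (8 - o_size))
+ return v + offset * o
+
+# e.g. _example_strl_key(1, 1, o_size=4) == 0x100000001 under dta 117
+
+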
+class StataWriter117(StataWriter):
+ """
+ A class for writing Stata binary dta files in Stata 13 format (117)
+
+ .. versionadded:: 0.23.0
+
+ Parameters
+ ----------
+ fname : path (string), buffer or path object
+ string, path object (pathlib.Path or py._path.local.LocalPath) or
+ object implementing a binary write() function. If using a buffer
+ then the buffer will not be automatically closed after the file
+ is written.
+ data : DataFrame
+ Input to save
+ convert_dates : dict
+ Dictionary mapping columns containing datetime types to stata internal
+ format to use when writing the dates. Options are 'tc', 'td', 'tm',
+ 'tw', 'th', 'tq', 'ty'. Column can be either an integer or a name.
+ Datetime columns that do not have a conversion type specified will be
+ converted to 'tc'. Raises NotImplementedError if a datetime column has
+ timezone information
+ write_index : bool
+ Write the index to Stata dataset.
+ encoding : str
+ Default is latin-1. Only latin-1 and ascii are supported.
+ byteorder : str
+ Can be ">", "<", "little", or "big". default is `sys.byteorder`
+ time_stamp : datetime
+ A datetime to use as file creation date. Default is the current time
+ data_label : str
+ A label for the data set. Must be 80 characters or smaller.
+ variable_labels : dict
+ Dictionary containing columns as keys and variable labels as values.
+ Each label must be 80 characters or smaller.
+ convert_strl : list
+ List of column names to convert to Stata StrL format. Columns with
+ more than 2045 characters are automatically written as StrL.
+ Smaller columns can be converted by including the column name. Using
+ StrLs can reduce output file size when strings are longer than 8
+ characters, and either frequently repeated or sparse.
+
+ Returns
+ -------
+ writer : StataWriter117 instance
+ The StataWriter117 instance has a write_file method, which will
+ write the file to the given `fname`.
+
+ Raises
+ ------
+ NotImplementedError
+ * If datetimes contain timezone information
+ ValueError
+ * Columns listed in convert_dates are neither datetime64[ns]
+ nor datetime.datetime
+ * Column dtype is not representable in Stata
+ * Column listed in convert_dates is not in DataFrame
+ * Categorical label contains more than 32,000 characters
+
+ Examples
+ --------
+ >>> from pandas.io.stata import StataWriter117
+ >>> data = pd.DataFrame([[1.0, 1, 'a']], columns=['a', 'b', 'c'])
+ >>> writer = StataWriter117('./data_file.dta', data)
+ >>> writer.write_file()
+
+ Or with long strings stored in strl format
+
+ >>> data = pd.DataFrame([['A relatively long string'], [''], ['']],
+ ... columns=['strls'])
+ >>> writer = StataWriter117('./data_file_with_long_strings.dta', data,
+ ... convert_strl=['strls'])
+ >>> writer.write_file()
+ """
+
+ _max_string_length = 2045
+
+ @deprecate_kwarg(old_arg_name='encoding', new_arg_name=None)
+ def __init__(self, fname, data, convert_dates=None, write_index=True,
+ encoding="latin-1", byteorder=None, time_stamp=None,
+ data_label=None, variable_labels=None, convert_strl=None):
+ # Shallow copy since convert_strl might be modified later
+ self._convert_strl = [] if convert_strl is None else convert_strl[:]
+
+ super(StataWriter117, self).__init__(fname, data, convert_dates,
+ write_index, byteorder=byteorder,
+ time_stamp=time_stamp,
+ data_label=data_label,
+ variable_labels=variable_labels)
+ self._map = None
+ self._strl_blob = None
+
+ @staticmethod
+ def _tag(val, tag):
+ """Surround val with <tag></tag>"""
+ if isinstance(val, str) and compat.PY3:
+ val = _bytes(val, 'utf-8')
+ return (_bytes('<' + tag + '>', 'utf-8') + val +
+ _bytes('</' + tag + '>', 'utf-8'))
+
+ def _update_map(self, tag):
+ """Update map location for tag with file position"""
+ self._map[tag] = self._file.tell()
+
+ def _write_header(self, data_label=None, time_stamp=None):
+ """Write the file header"""
+ byteorder = self._byteorder
+ self._file.write(_bytes('<stata_dta>', 'utf-8'))
+ bio = BytesIO()
+ # ds_format - 117
+ bio.write(self._tag(_bytes('117', 'utf-8'), 'release'))
+ # byteorder
+ bio.write(self._tag(byteorder == ">" and "MSF" or "LSF", 'byteorder'))
+ # number of vars, 2 bytes
+ assert self.nvar < 2 ** 16
+ bio.write(self._tag(struct.pack(byteorder + "H", self.nvar), 'K'))
+ # number of obs, 4 bytes
+ bio.write(self._tag(struct.pack(byteorder + "I", self.nobs), 'N'))
+ # data label 81 bytes, char, null terminated
+ label = data_label[:80] if data_label is not None else ''
+ label_len = struct.pack(byteorder + "B", len(label))
+ label = label_len + _bytes(label, 'utf-8')
+ bio.write(self._tag(label, 'label'))
+ # time stamp, 18 bytes, char, null terminated
+ # format dd Mon yyyy hh:mm
+ if time_stamp is None:
+ time_stamp = datetime.datetime.now()
+ elif not isinstance(time_stamp, datetime.datetime):
+ raise ValueError("time_stamp should be datetime type")
+ # Avoid locale-specific month conversion
+ months = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug',
+ 'Sep', 'Oct', 'Nov', 'Dec']
+ month_lookup = {i + 1: month for i, month in enumerate(months)}
+ ts = (time_stamp.strftime("%d ") +
+ month_lookup[time_stamp.month] +
+ time_stamp.strftime(" %Y %H:%M"))
+ # '\x11' added due to inspection of Stata file
+ ts = b'\x11' + _bytes(ts, 'utf8')
+ bio.write(self._tag(ts, 'timestamp'))
+ bio.seek(0)
+ self._file.write(self._tag(bio.read(), 'header'))
+
+ def _write_map(self):
+ """Called twice during file write. The first populates the values in
+ the map with 0s. The second call writes the final map locations when
+ all blocks have been written."""
+ if self._map is None:
+ self._map = OrderedDict((('stata_data', 0),
+ ('map', self._file.tell()),
+ ('variable_types', 0),
+ ('varnames', 0),
+ ('sortlist', 0),
+ ('formats', 0),
+ ('value_label_names', 0),
+ ('variable_labels', 0),
+ ('characteristics', 0),
+ ('data', 0),
+ ('strls', 0),
+ ('value_labels', 0),
+ ('stata_data_close', 0),
+ ('end-of-file', 0)))
+ # Move to start of map
+ self._file.seek(self._map['map'])
+ bio = BytesIO()
+ for val in self._map.values():
+ bio.write(struct.pack(self._byteorder + 'Q', val))
+ bio.seek(0)
+ self._file.write(self._tag(bio.read(), 'map'))
+
+ def _write_variable_types(self):
+ self._update_map('variable_types')
+ bio = BytesIO()
+ for typ in self.typlist:
+ bio.write(struct.pack(self._byteorder + 'H', typ))
+ bio.seek(0)
+ self._file.write(self._tag(bio.read(), 'variable_types'))
+
+ def _write_varnames(self):
+ self._update_map('varnames')
+ bio = BytesIO()
+ for name in self.varlist:
+ name = self._null_terminate(name, True)
+ name = _pad_bytes_new(name[:32], 33)
+ bio.write(name)
+ bio.seek(0)
+ self._file.write(self._tag(bio.read(), 'varnames'))
+
+ def _write_sortlist(self):
+ self._update_map('sortlist')
+ self._file.write(self._tag(b'\x00\x00' * (self.nvar + 1), 'sortlist'))
+
+ def _write_formats(self):
+ self._update_map('formats')
+ bio = BytesIO()
+ for fmt in self.fmtlist:
+ bio.write(_pad_bytes_new(fmt, 49))
+ bio.seek(0)
+ self._file.write(self._tag(bio.read(), 'formats'))
+
+ def _write_value_label_names(self):
+ self._update_map('value_label_names')
+ bio = BytesIO()
+ for i in range(self.nvar):
+ # Use variable name when categorical
+ name = '' # default name
+ if self._is_col_cat[i]:
+ name = self.varlist[i]
+ name = self._null_terminate(name, True)
+ name = _pad_bytes_new(name[:32], 33)
+ bio.write(name)
+ bio.seek(0)
+ self._file.write(self._tag(bio.read(), 'value_label_names'))
+
+ def _write_variable_labels(self):
+ # Missing labels are 80 blank characters plus null termination
+ self._update_map('variable_labels')
+ bio = BytesIO()
+ blank = _pad_bytes_new('', 81)
+
+ if self._variable_labels is None:
+ for _ in range(self.nvar):
+ bio.write(blank)
+ bio.seek(0)
+ self._file.write(self._tag(bio.read(), 'variable_labels'))
+ return
+
+ for col in self.data:
+ if col in self._variable_labels:
+ label = self._variable_labels[col]
+ if len(label) > 80:
+ raise ValueError('Variable labels must be 80 characters '
+ 'or fewer')
+ is_latin1 = all(ord(c) < 256 for c in label)
+ if not is_latin1:
+ raise ValueError('Variable labels must contain only '
+ 'characters that can be encoded in '
+ 'Latin-1')
+ bio.write(_pad_bytes_new(label, 81))
+ else:
+ bio.write(blank)
+ bio.seek(0)
+ self._file.write(self._tag(bio.read(), 'variable_labels'))
+
+ def _write_characteristics(self):
+ self._update_map('characteristics')
+ self._file.write(self._tag(b'', 'characteristics'))
+
+ def _write_data(self):
+ self._update_map('data')
+ data = self.data
+ self._file.write(b'<data>')
+ self._file.write(data.tobytes())
+ self._file.write(b'</data>')
+
+ def _write_strls(self):
+ self._update_map('strls')
+ strls = b''
+ if self._strl_blob is not None:
+ strls = self._strl_blob
+ self._file.write(self._tag(strls, 'strls'))
+
+ def _write_expansion_fields(self):
+ """No-op in dta 117+"""
+ pass
+
+ def _write_value_labels(self):
+ self._update_map('value_labels')
+ bio = BytesIO()
+ for vl in self._value_labels:
+ lab = vl.generate_value_label(self._byteorder, self._encoding)
+ lab = self._tag(lab, 'lbl')
+ bio.write(lab)
+ bio.seek(0)
+ self._file.write(self._tag(bio.read(), 'value_labels'))
+
+ def _write_file_close_tag(self):
+ self._update_map('stata_data_close')
+ self._file.write(_bytes('</stata_dta>', 'utf-8'))
+ self._update_map('end-of-file')
+
+ def _update_strl_names(self):
+ """Update column names for conversion to strl if they might have been
+ changed to comply with Stata naming rules"""
+ # Update convert_strl if names changed
+ for orig, new in self._converted_names.items():
+ if orig in self._convert_strl:
+ idx = self._convert_strl.index(orig)
+ self._convert_strl[idx] = new
+
+ def _convert_strls(self, data):
+ """Convert columns to StrLs if either very large or in the
+ convert_strl variable"""
+ convert_cols = [
+ col for i, col in enumerate(data)
+ if self.typlist[i] == 32768 or col in self._convert_strl]
+
+ if convert_cols:
+ ssw = StataStrLWriter(data, convert_cols)
+ tab, new_data = ssw.generate_table()
+ data = new_data
+ self._strl_blob = ssw.generate_blob(tab)
+ return data
+
+ def _set_formats_and_types(self, data, dtypes):
+ self.typlist = []
+ self.fmtlist = []
+ for col, dtype in dtypes.iteritems():
+ force_strl = col in self._convert_strl
+ fmt = _dtype_to_default_stata_fmt(dtype, data[col],
+ dta_version=117,
+ force_strl=force_strl)
+ self.fmtlist.append(fmt)
+ self.typlist.append(_dtype_to_stata_type_117(dtype, data[col],
+ force_strl))
diff --git a/contrib/python/pandas/py2/pandas/plotting/__init__.py b/contrib/python/pandas/py2/pandas/plotting/__init__.py
new file mode 100644
index 00000000000..ff5351bb6c6
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/plotting/__init__.py
@@ -0,0 +1,20 @@
+"""
+Plotting api
+"""
+
+# flake8: noqa
+
+from pandas.plotting._misc import (scatter_matrix, radviz,
+ andrews_curves, bootstrap_plot,
+ parallel_coordinates, lag_plot,
+ autocorrelation_plot)
+from pandas.plotting._core import boxplot
+from pandas.plotting._style import plot_params
+from pandas.plotting._tools import table
+try:
+ from pandas.plotting._converter import (
+ register as register_matplotlib_converters)
+ from pandas.plotting._converter import (
+ deregister as deregister_matplotlib_converters)
+except ImportError:
+ pass
diff --git a/contrib/python/pandas/py2/pandas/plotting/_compat.py b/contrib/python/pandas/py2/pandas/plotting/_compat.py
new file mode 100644
index 00000000000..48900c088a1
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/plotting/_compat.py
@@ -0,0 +1,25 @@
+# being a bit too dynamic
+# pylint: disable=E1101
+from __future__ import division
+
+from distutils.version import LooseVersion
+import operator
+
+
+def _mpl_version(version, op):
+ def inner():
+ try:
+ import matplotlib as mpl
+ except ImportError:
+ return False
+ return (op(LooseVersion(mpl.__version__), LooseVersion(version)) and
+ str(mpl.__version__)[0] != '0')
+
+ return inner
+
+
+_mpl_ge_2_0_1 = _mpl_version('2.0.1', operator.ge)
+_mpl_ge_2_1_0 = _mpl_version('2.1.0', operator.ge)
+_mpl_ge_2_2_0 = _mpl_version('2.2.0', operator.ge)
+_mpl_ge_2_2_2 = _mpl_version('2.2.2', operator.ge)
+_mpl_ge_3_0_0 = _mpl_version('3.0.0', operator.ge)
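+
+# Editorial usage note: each name above is a zero-argument callable, e.g.
+# _mpl_ge_2_0_1() returns True only when matplotlib can be imported and
+# its version compares >= 2.0.1 (and does not start with '0').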
diff --git a/contrib/python/pandas/py2/pandas/plotting/_converter.py b/contrib/python/pandas/py2/pandas/plotting/_converter.py
new file mode 100644
index 00000000000..aaa7aa04acf
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/plotting/_converter.py
@@ -0,0 +1,1154 @@
+import datetime as pydt
+from datetime import datetime, timedelta
+import warnings
+
+from dateutil.relativedelta import relativedelta
+import matplotlib.dates as dates
+from matplotlib.ticker import AutoLocator, Formatter, Locator
+from matplotlib.transforms import nonsingular
+import matplotlib.units as units
+import numpy as np
+
+from pandas._libs import lib, tslibs
+from pandas._libs.tslibs import resolution
+from pandas._libs.tslibs.frequencies import FreqGroup, get_freq
+import pandas.compat as compat
+from pandas.compat import lrange
+
+from pandas.core.dtypes.common import (
+ is_datetime64_ns_dtype, is_float, is_float_dtype, is_integer,
+ is_integer_dtype, is_nested_list_like)
+from pandas.core.dtypes.generic import ABCSeries
+
+import pandas.core.common as com
+from pandas.core.index import Index
+from pandas.core.indexes.datetimes import date_range
+from pandas.core.indexes.period import Period, PeriodIndex, period_range
+import pandas.core.tools.datetimes as tools
+
+# constants
+HOURS_PER_DAY = 24.
+MIN_PER_HOUR = 60.
+SEC_PER_MIN = 60.
+
+SEC_PER_HOUR = SEC_PER_MIN * MIN_PER_HOUR
+SEC_PER_DAY = SEC_PER_HOUR * HOURS_PER_DAY
+
+MUSEC_PER_DAY = 1e6 * SEC_PER_DAY
+
+_WARN = True # Global for whether pandas has registered the units explicitly
+_mpl_units = {} # Cache for units overwritten by us
+
+
+def get_pairs():
+ pairs = [
+ (tslibs.Timestamp, DatetimeConverter),
+ (Period, PeriodConverter),
+ (pydt.datetime, DatetimeConverter),
+ (pydt.date, DatetimeConverter),
+ (pydt.time, TimeConverter),
+ (np.datetime64, DatetimeConverter),
+ ]
+ return pairs
+
+
+def register(explicit=True):
+ """
+ Register Pandas Formatters and Converters with matplotlib
+
+ This function modifies the global ``matplotlib.units.registry``
+ dictionary. Pandas adds custom converters for
+
+ * pd.Timestamp
+ * pd.Period
+ * np.datetime64
+ * datetime.datetime
+ * datetime.date
+ * datetime.time
+
+ See Also
+ --------
+ deregister_matplotlib_converters
+ """
+ # Renamed in pandas.plotting.__init__
+ global _WARN
+
+ if explicit:
+ _WARN = False
+
+ pairs = get_pairs()
+ for type_, cls in pairs:
+ converter = cls()
+ if type_ in units.registry:
+ previous = units.registry[type_]
+ _mpl_units[type_] = previous
+ units.registry[type_] = converter
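+
+# Typical usage via the public alias exposed in pandas.plotting
+# (illustrative sketch, editorial addition):
+#
+# import matplotlib.pyplot as plt
+# import pandas as pd
+# from pandas.plotting import register_matplotlib_converters
+# register_matplotlib_converters()
+# s = pd.Series(range(3), index=pd.date_range('2000-01-01', periods=3))
+# plt.plot(s.index, s.values) # datetimes handled by DatetimeConverter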
+
+
+def deregister():
+ """
+ Remove pandas' formatters and converters
+
+ Removes the custom converters added by :func:`register`. This
+ attempts to set the state of the registry back to the state before
+ pandas registered its own units. Converters for pandas' own types like
+ Timestamp and Period are removed completely. Converters for types
+ pandas overwrites, like ``datetime.datetime``, are restored to their
+ original value.
+
+ See Also
+ --------
+ register_matplotlib_converters
+ """
+ # Renamed in pandas.plotting.__init__
+ for type_, cls in get_pairs():
+ # We use type to catch our classes directly, no inheritance
+ if type(units.registry.get(type_)) is cls:
+ units.registry.pop(type_)
+
+ # restore the old keys
+ for unit, formatter in _mpl_units.items():
+ if type(formatter) not in {DatetimeConverter, PeriodConverter,
+ TimeConverter}:
+ # make it idempotent by excluding ours.
+ units.registry[unit] = formatter
+
+
+def _check_implicitly_registered():
+ global _WARN
+
+ if _WARN:
+ msg = ("Using an implicitly registered datetime converter for a "
+ "matplotlib plotting method. The converter was registered "
+ "by pandas on import. Future versions of pandas will require "
+ "you to explicitly register matplotlib converters.\n\n"
+ "To register the converters:\n\t"
+ ">>> from pandas.plotting import register_matplotlib_converters"
+ "\n\t"
+ ">>> register_matplotlib_converters()")
+ warnings.warn(msg, FutureWarning)
+ _WARN = False
+
+
+def _to_ordinalf(tm):
+ tot_sec = (tm.hour * 3600 + tm.minute * 60 + tm.second +
+ float(tm.microsecond / 1e6))
+ return tot_sec
+
+
+def time2num(d):
+ if isinstance(d, compat.string_types):
+ parsed = tools.to_datetime(d)
+ if not isinstance(parsed, datetime):
+ raise ValueError('Could not parse time {d}'.format(d=d))
+ return _to_ordinalf(parsed.time())
+ if isinstance(d, pydt.time):
+ return _to_ordinalf(d)
+ return d
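+
+# Illustrative example (editorial): time2num(pydt.time(12, 30)) == 45000.0,
+# i.e. the number of seconds since midnight as a float.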
+
+
+class TimeConverter(units.ConversionInterface):
+
+ @staticmethod
+ def convert(value, unit, axis):
+ valid_types = (str, pydt.time)
+ if (isinstance(value, valid_types) or is_integer(value) or
+ is_float(value)):
+ return time2num(value)
+ if isinstance(value, Index):
+ return value.map(time2num)
+ if isinstance(value, (list, tuple, np.ndarray, Index)):
+ return [time2num(x) for x in value]
+ return value
+
+ @staticmethod
+ def axisinfo(unit, axis):
+ if unit != 'time':
+ return None
+
+ majloc = AutoLocator()
+ majfmt = TimeFormatter(majloc)
+ return units.AxisInfo(majloc=majloc, majfmt=majfmt, label='time')
+
+ @staticmethod
+ def default_units(x, axis):
+ return 'time'
+
+
+# time formatter
+class TimeFormatter(Formatter):
+
+ def __init__(self, locs):
+ self.locs = locs
+
+ def __call__(self, x, pos=0):
+ """
+ Return the time of day as a formatted string.
+
+ Parameters
+ ----------
+ x : float
+ The time of day specified as seconds since 00:00 (midnight),
+ with up to microsecond precision.
+ pos
+ Unused
+
+ Returns
+ -------
+ str
+ A string in HH:MM:SS.mmmuuu format. Microseconds,
+ milliseconds and seconds are only displayed if non-zero.
+ """
+ fmt = '%H:%M:%S.%f'
+ s = int(x)
+ msus = int(round((x - s) * 1e6))
+ ms = msus // 1000
+ us = msus % 1000
+ m, s = divmod(s, 60)
+ h, m = divmod(m, 60)
+ _, h = divmod(h, 24)
+ if us != 0:
+ return pydt.time(h, m, s, msus).strftime(fmt)
+ elif ms != 0:
+ return pydt.time(h, m, s, msus).strftime(fmt)[:-3]
+ elif s != 0:
+ return pydt.time(h, m, s).strftime('%H:%M:%S')
+
+ return pydt.time(h, m).strftime('%H:%M')
+
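+# Formatting sketch (hypothetical tick value): 3723.5 seconds past midnight
+# is 01:02:03.5, so the formatter renders it with millisecond precision;
+# values whose sub-second or second parts are zero fall back to coarser
+# formats.
+#
+#     >>> TimeFormatter(AutoLocator())(3723.5)
+#     '01:02:03.500'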
+
+# Period Conversion
+
+
+class PeriodConverter(dates.DateConverter):
+
+ @staticmethod
+ def convert(values, units, axis):
+ if is_nested_list_like(values):
+ values = [PeriodConverter._convert_1d(v, units, axis)
+ for v in values]
+ else:
+ values = PeriodConverter._convert_1d(values, units, axis)
+ return values
+
+ @staticmethod
+ def _convert_1d(values, units, axis):
+ if not hasattr(axis, 'freq'):
+ raise TypeError('Axis must have `freq` set to convert to Periods')
+ valid_types = (compat.string_types, datetime,
+ Period, pydt.date, pydt.time, np.datetime64)
+ if (isinstance(values, valid_types) or is_integer(values) or
+ is_float(values)):
+ return get_datevalue(values, axis.freq)
+ elif isinstance(values, PeriodIndex):
+ return values.asfreq(axis.freq)._ndarray_values
+ elif isinstance(values, Index):
+ return values.map(lambda x: get_datevalue(x, axis.freq))
+ elif lib.infer_dtype(values, skipna=False) == 'period':
+ # https://github.com/pandas-dev/pandas/issues/24304
+ # convert ndarray[period] -> PeriodIndex
+ return PeriodIndex(values, freq=axis.freq)._ndarray_values
+ elif isinstance(values, (list, tuple, np.ndarray, Index)):
+ return [get_datevalue(x, axis.freq) for x in values]
+ return values
+
+
+def get_datevalue(date, freq):
+ if isinstance(date, Period):
+ return date.asfreq(freq).ordinal
+ elif isinstance(date, (compat.string_types, datetime,
+ pydt.date, pydt.time, np.datetime64)):
+ return Period(date, freq).ordinal
+ elif (is_integer(date) or is_float(date) or
+ (isinstance(date, (np.ndarray, Index)) and (date.size == 1))):
+ return date
+ elif date is None:
+ return None
+ raise ValueError("Unrecognizable date '{date}'".format(date=date))
+
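+# Conversion sketch (the ordinal counts daily periods since the Period
+# epoch, 1970-01-01, so treat the value below as indicative):
+#
+#     >>> get_datevalue('2000-01-01', 'D')  # Period('2000-01-01', 'D').ordinal
+#     10957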
+
+def _dt_to_float_ordinal(dt):
+ """
+ Convert :mod:`datetime` to the Gregorian date as UTC float days,
+ preserving hours, minutes, seconds and microseconds. Return value
+ is a :func:`float`.
+ """
+ if (isinstance(dt, (np.ndarray, Index, ABCSeries)
+ ) and is_datetime64_ns_dtype(dt)):
+ base = dates.epoch2num(dt.asi8 / 1.0E9)
+ else:
+ base = dates.date2num(dt)
+ return base
+
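+# Ordinal sketch (the matplotlib releases this module targets count days
+# from 0001-01-01, which is ordinal 1, so 1970-01-01 maps to 719163.0;
+# illustrative only):
+#
+#     >>> _dt_to_float_ordinal(datetime(1970, 1, 1, 12))
+#     719163.5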
+
+# Datetime Conversion
+class DatetimeConverter(dates.DateConverter):
+
+ @staticmethod
+ def convert(values, unit, axis):
+ # values might be a 1-d array, or a list-like of arrays.
+ _check_implicitly_registered()
+ if is_nested_list_like(values):
+ values = [DatetimeConverter._convert_1d(v, unit, axis)
+ for v in values]
+ else:
+ values = DatetimeConverter._convert_1d(values, unit, axis)
+ return values
+
+ @staticmethod
+ def _convert_1d(values, unit, axis):
+ def try_parse(values):
+ try:
+ return _dt_to_float_ordinal(tools.to_datetime(values))
+ except Exception:
+ return values
+
+ if isinstance(values, (datetime, pydt.date)):
+ return _dt_to_float_ordinal(values)
+ elif isinstance(values, np.datetime64):
+ return _dt_to_float_ordinal(tslibs.Timestamp(values))
+ elif isinstance(values, pydt.time):
+ return dates.date2num(values)
+ elif (is_integer(values) or is_float(values)):
+ return values
+ elif isinstance(values, compat.string_types):
+ return try_parse(values)
+ elif isinstance(values, (list, tuple, np.ndarray, Index, ABCSeries)):
+ if isinstance(values, ABCSeries):
+ # https://github.com/matplotlib/matplotlib/issues/11391
+ # Series was skipped. Convert to DatetimeIndex to get asi8
+ values = Index(values)
+ if isinstance(values, Index):
+ values = values.values
+ if not isinstance(values, np.ndarray):
+ values = com.asarray_tuplesafe(values)
+
+ if is_integer_dtype(values) or is_float_dtype(values):
+ return values
+
+ try:
+ values = tools.to_datetime(values)
+ if isinstance(values, Index):
+ values = _dt_to_float_ordinal(values)
+ else:
+ values = [_dt_to_float_ordinal(x) for x in values]
+ except Exception:
+ values = _dt_to_float_ordinal(values)
+
+ return values
+
+ @staticmethod
+ def axisinfo(unit, axis):
+ """
+ Return the :class:`~matplotlib.units.AxisInfo` for *unit*.
+
+ *unit* is a tzinfo instance or None.
+ The *axis* argument is required but not used.
+ """
+ tz = unit
+
+ majloc = PandasAutoDateLocator(tz=tz)
+ majfmt = PandasAutoDateFormatter(majloc, tz=tz)
+ datemin = pydt.date(2000, 1, 1)
+ datemax = pydt.date(2010, 1, 1)
+
+ return units.AxisInfo(majloc=majloc, majfmt=majfmt, label='',
+ default_limits=(datemin, datemax))
+
+
+class PandasAutoDateFormatter(dates.AutoDateFormatter):
+
+ def __init__(self, locator, tz=None, defaultfmt='%Y-%m-%d'):
+ dates.AutoDateFormatter.__init__(self, locator, tz, defaultfmt)
+ # matplotlib.dates._UTC lacks the _utcoffset attribute that pandas expects
+ if self._tz is dates.UTC:
+ self._tz._utcoffset = self._tz.utcoffset(None)
+
+
+class PandasAutoDateLocator(dates.AutoDateLocator):
+
+ def get_locator(self, dmin, dmax):
+ 'Pick the best locator based on a distance.'
+ _check_implicitly_registered()
+ delta = relativedelta(dmax, dmin)
+
+ num_days = (delta.years * 12.0 + delta.months) * 31.0 + delta.days
+ num_sec = (delta.hours * 60.0 + delta.minutes) * 60.0 + delta.seconds
+ tot_sec = num_days * 86400. + num_sec
+
+ if abs(tot_sec) < self.minticks:
+ self._freq = -1
+ locator = MilliSecondLocator(self.tz)
+ locator.set_axis(self.axis)
+
+ locator.set_view_interval(*self.axis.get_view_interval())
+ locator.set_data_interval(*self.axis.get_data_interval())
+ return locator
+
+ return dates.AutoDateLocator.get_locator(self, dmin, dmax)
+
+ def _get_unit(self):
+ return MilliSecondLocator.get_unit_generic(self._freq)
+
+
+class MilliSecondLocator(dates.DateLocator):
+
+ UNIT = 1. / (24 * 3600 * 1000)
+
+ def __init__(self, tz):
+ dates.DateLocator.__init__(self, tz)
+ self._interval = 1.
+
+ def _get_unit(self):
+ return self.get_unit_generic(-1)
+
+ @staticmethod
+ def get_unit_generic(freq):
+ unit = dates.RRuleLocator.get_unit_generic(freq)
+ if unit < 0:
+ return MilliSecondLocator.UNIT
+ return unit
+
+ def __call__(self):
+ # if no data have been set, this will tank with a ValueError
+ _check_implicitly_registered()
+ try:
+ dmin, dmax = self.viewlim_to_dt()
+ except ValueError:
+ return []
+
+ if dmin > dmax:
+ dmax, dmin = dmin, dmax
+ # We need to cap at the endpoints of valid datetime
+
+ # TODO(wesm) unused?
+ # delta = relativedelta(dmax, dmin)
+ # try:
+ # start = dmin - delta
+ # except ValueError:
+ # start = _from_ordinal(1.0)
+
+ # try:
+ # stop = dmax + delta
+ # except ValueError:
+ # # The magic number!
+ # stop = _from_ordinal(3652059.9999999)
+
+ nmax, nmin = dates.date2num((dmax, dmin))
+
+ num = (nmax - nmin) * 86400 * 1000
+ max_millis_ticks = 6
+ for interval in [1, 10, 50, 100, 200, 500]:
+ if num <= interval * (max_millis_ticks - 1):
+ self._interval = interval
+ break
+ else:
+ # We went through the whole loop without breaking, default to 1
+ self._interval = 1000.
+
+ estimate = (nmax - nmin) / (self._get_unit() * self._get_interval())
+
+ if estimate > self.MAXTICKS * 2:
+ raise RuntimeError(('MillisecondLocator estimated to generate '
+ '{estimate:d} ticks from {dmin} to {dmax}: '
+ 'exceeds Locator.MAXTICKS '
+ '* 2 ({arg:d}) ').format(
+ estimate=int(estimate), dmin=dmin, dmax=dmax,
+ arg=self.MAXTICKS * 2))
+
+ freq = '%dL' % self._get_interval()
+ tz = self.tz.tzname(None)
+ st = _from_ordinal(dates.date2num(dmin)) # strip tz
+ ed = _from_ordinal(dates.date2num(dmax))
+ all_dates = date_range(start=st, end=ed,
+ freq=freq, tz=tz).astype(object)
+
+ try:
+ if len(all_dates) > 0:
+ locs = self.raise_if_exceeds(dates.date2num(all_dates))
+ return locs
+ except Exception: # pragma: no cover
+ pass
+
+ lims = dates.date2num([dmin, dmax])
+ return lims
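+
+ # Interval-selection sketch (hypothetical view span): a 300 ms window gives
+ # num = 300, and the first interval satisfying
+ # num <= interval * (max_millis_ticks - 1) is 100 (300 <= 500), so ticks
+ # are laid out every 100 milliseconds.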
+
+ def _get_interval(self):
+ return self._interval
+
+ def autoscale(self):
+ """
+ Set the view limits to include the data range.
+ """
+ dmin, dmax = self.datalim_to_dt()
+ if dmin > dmax:
+ dmax, dmin = dmin, dmax
+
+ # We need to cap at the endpoints of valid datetime
+
+ # TODO(wesm): unused?
+
+ # delta = relativedelta(dmax, dmin)
+ # try:
+ # start = dmin - delta
+ # except ValueError:
+ # start = _from_ordinal(1.0)
+
+ # try:
+ # stop = dmax + delta
+ # except ValueError:
+ # # The magic number!
+ # stop = _from_ordinal(3652059.9999999)
+
+ dmin, dmax = self.datalim_to_dt()
+
+ vmin = dates.date2num(dmin)
+ vmax = dates.date2num(dmax)
+
+ return self.nonsingular(vmin, vmax)
+
+
+def _from_ordinal(x, tz=None):
+ ix = int(x)
+ dt = datetime.fromordinal(ix)
+ remainder = float(x) - ix
+ hour, remainder = divmod(24 * remainder, 1)
+ minute, remainder = divmod(60 * remainder, 1)
+ second, remainder = divmod(60 * remainder, 1)
+ microsecond = int(1e6 * remainder)
+ if microsecond < 10:
+ microsecond = 0 # compensate for rounding errors
+ dt = datetime(dt.year, dt.month, dt.day, int(hour), int(minute),
+ int(second), microsecond)
+ if tz is not None:
+ dt = dt.astimezone(tz)
+
+ if microsecond > 999990: # compensate for rounding errors
+ dt += timedelta(microseconds=1e6 - microsecond)
+
+ return dt
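+
+# Inverse-mapping sketch (proleptic Gregorian ordinal 719163 is 1970-01-01,
+# and the fractional part becomes the time of day; illustrative only):
+#
+#     >>> _from_ordinal(719163.5)
+#     datetime.datetime(1970, 1, 1, 12, 0)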
+
+# Fixed frequency dynamic tick locators and formatters
+
+# -------------------------------------------------------------------------
+# --- Locators ---
+# -------------------------------------------------------------------------
+
+
+def _get_default_annual_spacing(nyears):
+ """
+ Returns a default spacing between consecutive ticks for annual data.
+ """
+ if nyears < 11:
+ (min_spacing, maj_spacing) = (1, 1)
+ elif nyears < 20:
+ (min_spacing, maj_spacing) = (1, 2)
+ elif nyears < 50:
+ (min_spacing, maj_spacing) = (1, 5)
+ elif nyears < 100:
+ (min_spacing, maj_spacing) = (5, 10)
+ elif nyears < 200:
+ (min_spacing, maj_spacing) = (5, 25)
+ elif nyears < 600:
+ (min_spacing, maj_spacing) = (10, 50)
+ else:
+ factor = nyears // 1000 + 1
+ (min_spacing, maj_spacing) = (factor * 20, factor * 100)
+ return (min_spacing, maj_spacing)
+
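+# Spacing sketch (values read directly from the table above):
+#
+#     >>> _get_default_annual_spacing(30)    # 20 <= nyears < 50
+#     (1, 5)
+#     >>> _get_default_annual_spacing(150)   # 100 <= nyears < 200
+#     (5, 25)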
+
+def period_break(dates, period):
+ """
+ Returns the indices where the given period changes.
+
+ Parameters
+ ----------
+ dates : PeriodIndex
+ Array of intervals to monitor.
+ period : string
+ Name of the period to monitor.
+ """
+ current = getattr(dates, period)
+ previous = getattr(dates - 1 * dates.freq, period)
+ return np.nonzero(current - previous)[0]
+
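+# Break-detection sketch (hypothetical index): for daily periods spanning a
+# month boundary, the break lands on the first period of the new month.
+#
+#     >>> idx = period_range('2000-01-30', '2000-02-02', freq='D')
+#     >>> period_break(idx, 'month')
+#     array([2])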
+
+def has_level_label(label_flags, vmin):
+ """
+ Returns true if the ``label_flags`` indicate there is at least one label
+ for this level.
+
+ if the minimum view limit is not an exact integer, then the first tick
+ label won't be shown, so we must adjust for that.
+ """
+ if label_flags.size == 0 or (label_flags.size == 1 and
+ label_flags[0] == 0 and
+ vmin % 1 > 0.0):
+ return False
+ else:
+ return True
+
+
+def _daily_finder(vmin, vmax, freq):
+ periodsperday = -1
+
+ if freq >= FreqGroup.FR_HR:
+ if freq == FreqGroup.FR_NS:
+ periodsperday = 24 * 60 * 60 * 1000000000
+ elif freq == FreqGroup.FR_US:
+ periodsperday = 24 * 60 * 60 * 1000000
+ elif freq == FreqGroup.FR_MS:
+ periodsperday = 24 * 60 * 60 * 1000
+ elif freq == FreqGroup.FR_SEC:
+ periodsperday = 24 * 60 * 60
+ elif freq == FreqGroup.FR_MIN:
+ periodsperday = 24 * 60
+ elif freq == FreqGroup.FR_HR:
+ periodsperday = 24
+ else: # pragma: no cover
+ raise ValueError("unexpected frequency: {freq}".format(freq=freq))
+ periodsperyear = 365 * periodsperday
+ periodspermonth = 28 * periodsperday
+
+ elif freq == FreqGroup.FR_BUS:
+ periodsperyear = 261
+ periodspermonth = 19
+ elif freq == FreqGroup.FR_DAY:
+ periodsperyear = 365
+ periodspermonth = 28
+ elif resolution.get_freq_group(freq) == FreqGroup.FR_WK:
+ periodsperyear = 52
+ periodspermonth = 3
+ else: # pragma: no cover
+ raise ValueError("unexpected frequency")
+
+ # save this for later usage
+ vmin_orig = vmin
+
+ (vmin, vmax) = (Period(ordinal=int(vmin), freq=freq),
+ Period(ordinal=int(vmax), freq=freq))
+ span = vmax.ordinal - vmin.ordinal + 1
+ dates_ = period_range(start=vmin, end=vmax, freq=freq)
+ # Initialize the output
+ info = np.zeros(span,
+ dtype=[('val', np.int64), ('maj', bool),
+ ('min', bool), ('fmt', '|S20')])
+ info['val'][:] = dates_._ndarray_values
+ info['fmt'][:] = ''
+ info['maj'][[0, -1]] = True
+ # .. and set some shortcuts
+ info_maj = info['maj']
+ info_min = info['min']
+ info_fmt = info['fmt']
+
+ def first_label(label_flags):
+ if (label_flags[0] == 0) and (label_flags.size > 1) and \
+ ((vmin_orig % 1) > 0.0):
+ return label_flags[1]
+ else:
+ return label_flags[0]
+
+ # Case 1. Less than a month
+ if span <= periodspermonth:
+ day_start = period_break(dates_, 'day')
+ month_start = period_break(dates_, 'month')
+
+ def _hour_finder(label_interval, force_year_start):
+ _hour = dates_.hour
+ _prev_hour = (dates_ - 1 * dates_.freq).hour
+ hour_start = (_hour - _prev_hour) != 0
+ info_maj[day_start] = True
+ info_min[hour_start & (_hour % label_interval == 0)] = True
+ year_start = period_break(dates_, 'year')
+ info_fmt[hour_start & (_hour % label_interval == 0)] = '%H:%M'
+ info_fmt[day_start] = '%H:%M\n%d-%b'
+ info_fmt[year_start] = '%H:%M\n%d-%b\n%Y'
+ if force_year_start and not has_level_label(year_start, vmin_orig):
+ info_fmt[first_label(day_start)] = '%H:%M\n%d-%b\n%Y'
+
+ def _minute_finder(label_interval):
+ hour_start = period_break(dates_, 'hour')
+ _minute = dates_.minute
+ _prev_minute = (dates_ - 1 * dates_.freq).minute
+ minute_start = (_minute - _prev_minute) != 0
+ info_maj[hour_start] = True
+ info_min[minute_start & (_minute % label_interval == 0)] = True
+ year_start = period_break(dates_, 'year')
+ info_fmt = info['fmt']
+ info_fmt[minute_start & (_minute % label_interval == 0)] = '%H:%M'
+ info_fmt[day_start] = '%H:%M\n%d-%b'
+ info_fmt[year_start] = '%H:%M\n%d-%b\n%Y'
+
+ def _second_finder(label_interval):
+ minute_start = period_break(dates_, 'minute')
+ _second = dates_.second
+ _prev_second = (dates_ - 1 * dates_.freq).second
+ second_start = (_second - _prev_second) != 0
+ info['maj'][minute_start] = True
+ info['min'][second_start & (_second % label_interval == 0)] = True
+ year_start = period_break(dates_, 'year')
+ info_fmt = info['fmt']
+ info_fmt[second_start & (_second %
+ label_interval == 0)] = '%H:%M:%S'
+ info_fmt[day_start] = '%H:%M:%S\n%d-%b'
+ info_fmt[year_start] = '%H:%M:%S\n%d-%b\n%Y'
+
+ if span < periodsperday / 12000.0:
+ _second_finder(1)
+ elif span < periodsperday / 6000.0:
+ _second_finder(2)
+ elif span < periodsperday / 2400.0:
+ _second_finder(5)
+ elif span < periodsperday / 1200.0:
+ _second_finder(10)
+ elif span < periodsperday / 800.0:
+ _second_finder(15)
+ elif span < periodsperday / 400.0:
+ _second_finder(30)
+ elif span < periodsperday / 150.0:
+ _minute_finder(1)
+ elif span < periodsperday / 70.0:
+ _minute_finder(2)
+ elif span < periodsperday / 24.0:
+ _minute_finder(5)
+ elif span < periodsperday / 12.0:
+ _minute_finder(15)
+ elif span < periodsperday / 6.0:
+ _minute_finder(30)
+ elif span < periodsperday / 2.5:
+ _hour_finder(1, False)
+ elif span < periodsperday / 1.5:
+ _hour_finder(2, False)
+ elif span < periodsperday * 1.25:
+ _hour_finder(3, False)
+ elif span < periodsperday * 2.5:
+ _hour_finder(6, True)
+ elif span < periodsperday * 4:
+ _hour_finder(12, True)
+ else:
+ info_maj[month_start] = True
+ info_min[day_start] = True
+ year_start = period_break(dates_, 'year')
+ info_fmt = info['fmt']
+ info_fmt[day_start] = '%d'
+ info_fmt[month_start] = '%d\n%b'
+ info_fmt[year_start] = '%d\n%b\n%Y'
+ if not has_level_label(year_start, vmin_orig):
+ if not has_level_label(month_start, vmin_orig):
+ info_fmt[first_label(day_start)] = '%d\n%b\n%Y'
+ else:
+ info_fmt[first_label(month_start)] = '%d\n%b\n%Y'
+
+ # Case 2. Less than three months
+ elif span <= periodsperyear // 4:
+ month_start = period_break(dates_, 'month')
+ info_maj[month_start] = True
+ if freq < FreqGroup.FR_HR:
+ info['min'] = True
+ else:
+ day_start = period_break(dates_, 'day')
+ info['min'][day_start] = True
+ week_start = period_break(dates_, 'week')
+ year_start = period_break(dates_, 'year')
+ info_fmt[week_start] = '%d'
+ info_fmt[month_start] = '\n\n%b'
+ info_fmt[year_start] = '\n\n%b\n%Y'
+ if not has_level_label(year_start, vmin_orig):
+ if not has_level_label(month_start, vmin_orig):
+ info_fmt[first_label(week_start)] = '\n\n%b\n%Y'
+ else:
+ info_fmt[first_label(month_start)] = '\n\n%b\n%Y'
+ # Case 3. Less than 14 months ...............
+ elif span <= 1.15 * periodsperyear:
+ year_start = period_break(dates_, 'year')
+ month_start = period_break(dates_, 'month')
+ week_start = period_break(dates_, 'week')
+ info_maj[month_start] = True
+ info_min[week_start] = True
+ info_min[year_start] = False
+ info_min[month_start] = False
+ info_fmt[month_start] = '%b'
+ info_fmt[year_start] = '%b\n%Y'
+ if not has_level_label(year_start, vmin_orig):
+ info_fmt[first_label(month_start)] = '%b\n%Y'
+ # Case 4. Less than 2.5 years ...............
+ elif span <= 2.5 * periodsperyear:
+ year_start = period_break(dates_, 'year')
+ quarter_start = period_break(dates_, 'quarter')
+ month_start = period_break(dates_, 'month')
+ info_maj[quarter_start] = True
+ info_min[month_start] = True
+ info_fmt[quarter_start] = '%b'
+ info_fmt[year_start] = '%b\n%Y'
+ # Case 5. Less than 4 years .................
+ elif span <= 4 * periodsperyear:
+ year_start = period_break(dates_, 'year')
+ month_start = period_break(dates_, 'month')
+ info_maj[year_start] = True
+ info_min[month_start] = True
+ info_min[year_start] = False
+
+ month_break = dates_[month_start].month
+ jan_or_jul = month_start[(month_break == 1) | (month_break == 7)]
+ info_fmt[jan_or_jul] = '%b'
+ info_fmt[year_start] = '%b\n%Y'
+ # Case 6. Less than 11 years ................
+ elif span <= 11 * periodsperyear:
+ year_start = period_break(dates_, 'year')
+ quarter_start = period_break(dates_, 'quarter')
+ info_maj[year_start] = True
+ info_min[quarter_start] = True
+ info_min[year_start] = False
+ info_fmt[year_start] = '%Y'
+ # Case 7. More than 11 years ................
+ else:
+ year_start = period_break(dates_, 'year')
+ year_break = dates_[year_start].year
+ nyears = span / periodsperyear
+ (min_anndef, maj_anndef) = _get_default_annual_spacing(nyears)
+ major_idx = year_start[(year_break % maj_anndef == 0)]
+ info_maj[major_idx] = True
+ minor_idx = year_start[(year_break % min_anndef == 0)]
+ info_min[minor_idx] = True
+ info_fmt[major_idx] = '%Y'
+
+ return info
+
+
+def _monthly_finder(vmin, vmax, freq):
+ periodsperyear = 12
+
+ vmin_orig = vmin
+ (vmin, vmax) = (int(vmin), int(vmax))
+ span = vmax - vmin + 1
+
+ # Initialize the output
+ info = np.zeros(span,
+ dtype=[('val', int), ('maj', bool), ('min', bool),
+ ('fmt', '|S8')])
+ info['val'] = np.arange(vmin, vmax + 1)
+ dates_ = info['val']
+ info['fmt'] = ''
+ year_start = (dates_ % 12 == 0).nonzero()[0]
+ info_maj = info['maj']
+ info_fmt = info['fmt']
+
+ if span <= 1.15 * periodsperyear:
+ info_maj[year_start] = True
+ info['min'] = True
+
+ info_fmt[:] = '%b'
+ info_fmt[year_start] = '%b\n%Y'
+
+ if not has_level_label(year_start, vmin_orig):
+ if dates_.size > 1:
+ idx = 1
+ else:
+ idx = 0
+ info_fmt[idx] = '%b\n%Y'
+
+ elif span <= 2.5 * periodsperyear:
+ quarter_start = (dates_ % 3 == 0).nonzero()
+ info_maj[year_start] = True
+ # TODO: Check the following : is it really info['fmt'] ?
+ info['fmt'][quarter_start] = True
+ info['min'] = True
+
+ info_fmt[quarter_start] = '%b'
+ info_fmt[year_start] = '%b\n%Y'
+
+ elif span <= 4 * periodsperyear:
+ info_maj[year_start] = True
+ info['min'] = True
+
+ jan_or_jul = (dates_ % 12 == 0) | (dates_ % 12 == 6)
+ info_fmt[jan_or_jul] = '%b'
+ info_fmt[year_start] = '%b\n%Y'
+
+ elif span <= 11 * periodsperyear:
+ quarter_start = (dates_ % 3 == 0).nonzero()
+ info_maj[year_start] = True
+ info['min'][quarter_start] = True
+
+ info_fmt[year_start] = '%Y'
+
+ else:
+ nyears = span / periodsperyear
+ (min_anndef, maj_anndef) = _get_default_annual_spacing(nyears)
+ years = dates_[year_start] // 12 + 1
+ major_idx = year_start[(years % maj_anndef == 0)]
+ info_maj[major_idx] = True
+ info['min'][year_start[(years % min_anndef == 0)]] = True
+
+ info_fmt[major_idx] = '%Y'
+
+ return info
+
+
+def _quarterly_finder(vmin, vmax, freq):
+ periodsperyear = 4
+ vmin_orig = vmin
+ (vmin, vmax) = (int(vmin), int(vmax))
+ span = vmax - vmin + 1
+
+ info = np.zeros(span,
+ dtype=[('val', int), ('maj', bool), ('min', bool),
+ ('fmt', '|S8')])
+ info['val'] = np.arange(vmin, vmax + 1)
+ info['fmt'] = ''
+ dates_ = info['val']
+ info_maj = info['maj']
+ info_fmt = info['fmt']
+ year_start = (dates_ % 4 == 0).nonzero()[0]
+
+ if span <= 3.5 * periodsperyear:
+ info_maj[year_start] = True
+ info['min'] = True
+
+ info_fmt[:] = 'Q%q'
+ info_fmt[year_start] = 'Q%q\n%F'
+ if not has_level_label(year_start, vmin_orig):
+ if dates_.size > 1:
+ idx = 1
+ else:
+ idx = 0
+ info_fmt[idx] = 'Q%q\n%F'
+
+ elif span <= 11 * periodsperyear:
+ info_maj[year_start] = True
+ info['min'] = True
+ info_fmt[year_start] = '%F'
+
+ else:
+ years = dates_[year_start] // 4 + 1
+ nyears = span / periodsperyear
+ (min_anndef, maj_anndef) = _get_default_annual_spacing(nyears)
+ major_idx = year_start[(years % maj_anndef == 0)]
+ info_maj[major_idx] = True
+ info['min'][year_start[(years % min_anndef == 0)]] = True
+ info_fmt[major_idx] = '%F'
+
+ return info
+
+
+def _annual_finder(vmin, vmax, freq):
+ (vmin, vmax) = (int(vmin), int(vmax + 1))
+ span = vmax - vmin + 1
+
+ info = np.zeros(span,
+ dtype=[('val', int), ('maj', bool), ('min', bool),
+ ('fmt', '|S8')])
+ info['val'] = np.arange(vmin, vmax + 1)
+ info['fmt'] = ''
+ dates_ = info['val']
+
+ (min_anndef, maj_anndef) = _get_default_annual_spacing(span)
+ major_idx = dates_ % maj_anndef == 0
+ info['maj'][major_idx] = True
+ info['min'][(dates_ % min_anndef == 0)] = True
+ info['fmt'][major_idx] = '%Y'
+
+ return info
+
+
+def get_finder(freq):
+ if isinstance(freq, compat.string_types):
+ freq = get_freq(freq)
+ fgroup = resolution.get_freq_group(freq)
+
+ if fgroup == FreqGroup.FR_ANN:
+ return _annual_finder
+ elif fgroup == FreqGroup.FR_QTR:
+ return _quarterly_finder
+ elif freq == FreqGroup.FR_MTH:
+ return _monthly_finder
+ elif ((freq >= FreqGroup.FR_BUS) or fgroup == FreqGroup.FR_WK):
+ return _daily_finder
+ else: # pragma: no cover
+ errmsg = "Unsupported frequency: {freq}".format(freq=freq)
+ raise NotImplementedError(errmsg)
+
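+# Dispatch sketch (frequency strings are resolved through get_freq first;
+# purely illustrative):
+#
+#     >>> get_finder('A') is _annual_finder
+#     True
+#     >>> get_finder('Q') is _quarterly_finder
+#     True
+#     >>> get_finder('M') is _monthly_finder
+#     True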
+
+class TimeSeries_DateLocator(Locator):
+ """
+ Locates the ticks along an axis controlled by a :class:`Series`.
+
+ Parameters
+ ----------
+ freq : {int, string}
+ Valid frequency specifier.
+ minor_locator : {False, True}, optional
+ Whether the locator is for minor ticks (True) or not.
+ dynamic_mode : {True, False}, optional
+ Whether the locator should work in dynamic mode.
+ base : {int}, optional
+ quarter : {int}, optional
+ month : {int}, optional
+ day : {int}, optional
+ """
+
+ def __init__(self, freq, minor_locator=False, dynamic_mode=True,
+ base=1, quarter=1, month=1, day=1, plot_obj=None):
+ if isinstance(freq, compat.string_types):
+ freq = get_freq(freq)
+ self.freq = freq
+ self.base = base
+ (self.quarter, self.month, self.day) = (quarter, month, day)
+ self.isminor = minor_locator
+ self.isdynamic = dynamic_mode
+ self.offset = 0
+ self.plot_obj = plot_obj
+ self.finder = get_finder(freq)
+
+ def _get_default_locs(self, vmin, vmax):
+ "Returns the default locations of ticks."
+
+ if self.plot_obj.date_axis_info is None:
+ self.plot_obj.date_axis_info = self.finder(vmin, vmax, self.freq)
+
+ locator = self.plot_obj.date_axis_info
+
+ if self.isminor:
+ return np.compress(locator['min'], locator['val'])
+ return np.compress(locator['maj'], locator['val'])
+
+ def __call__(self):
+ 'Return the locations of the ticks.'
+ # axis calls Locator.set_axis inside set_m<xxxx>_formatter
+ _check_implicitly_registered()
+
+ vi = tuple(self.axis.get_view_interval())
+ if vi != self.plot_obj.view_interval:
+ self.plot_obj.date_axis_info = None
+ self.plot_obj.view_interval = vi
+ vmin, vmax = vi
+ if vmax < vmin:
+ vmin, vmax = vmax, vmin
+ if self.isdynamic:
+ locs = self._get_default_locs(vmin, vmax)
+ else: # pragma: no cover
+ base = self.base
+ (d, m) = divmod(vmin, base)
+ vmin = (d + 1) * base
+ locs = lrange(vmin, vmax + 1, base)
+ return locs
+
+ def autoscale(self):
+ """
+ Sets the view limits to the nearest multiples of base that contain the
+ data.
+ """
+ # requires matplotlib >= 0.98.0
+ (vmin, vmax) = self.axis.get_data_interval()
+
+ locs = self._get_default_locs(vmin, vmax)
+ (vmin, vmax) = locs[[0, -1]]
+ if vmin == vmax:
+ vmin -= 1
+ vmax += 1
+ return nonsingular(vmin, vmax)
+
+# -------------------------------------------------------------------------
+# --- Formatter ---
+# -------------------------------------------------------------------------
+
+
+class TimeSeries_DateFormatter(Formatter):
+ """
+ Formats the ticks along an axis controlled by a :class:`PeriodIndex`.
+
+ Parameters
+ ----------
+ freq : {int, string}
+ Valid frequency specifier.
+ minor_locator : {False, True}
+ Whether the current formatter should apply to minor ticks (True) or
+ major ticks (False).
+ dynamic_mode : {True, False}
+ Whether the formatter works in dynamic mode or not.
+ """
+
+ def __init__(self, freq, minor_locator=False, dynamic_mode=True,
+ plot_obj=None):
+ if isinstance(freq, compat.string_types):
+ freq = get_freq(freq)
+ self.format = None
+ self.freq = freq
+ self.locs = []
+ self.formatdict = None
+ self.isminor = minor_locator
+ self.isdynamic = dynamic_mode
+ self.offset = 0
+ self.plot_obj = plot_obj
+ self.finder = get_finder(freq)
+
+ def _set_default_format(self, vmin, vmax):
+ "Returns the default ticks spacing."
+
+ if self.plot_obj.date_axis_info is None:
+ self.plot_obj.date_axis_info = self.finder(vmin, vmax, self.freq)
+ info = self.plot_obj.date_axis_info
+
+ if self.isminor:
+ format = np.compress(info['min'] & np.logical_not(info['maj']),
+ info)
+ else:
+ format = np.compress(info['maj'], info)
+ self.formatdict = {x: f for (x, _, _, f) in format}
+ return self.formatdict
+
+ def set_locs(self, locs):
+ 'Sets the locations of the ticks'
+ # don't actually use the locs. This is just needed to work with
+ # matplotlib. Force to use vmin, vmax
+ _check_implicitly_registered()
+
+ self.locs = locs
+
+ (vmin, vmax) = vi = tuple(self.axis.get_view_interval())
+ if vi != self.plot_obj.view_interval:
+ self.plot_obj.date_axis_info = None
+ self.plot_obj.view_interval = vi
+ if vmax < vmin:
+ (vmin, vmax) = (vmax, vmin)
+ self._set_default_format(vmin, vmax)
+
+ def __call__(self, x, pos=0):
+ _check_implicitly_registered()
+
+ if self.formatdict is None:
+ return ''
+ else:
+ fmt = self.formatdict.pop(x, '')
+ return Period(ordinal=int(x), freq=self.freq).strftime(fmt)
+
+
+class TimeSeries_TimedeltaFormatter(Formatter):
+ """
+ Formats the ticks along an axis controlled by a :class:`TimedeltaIndex`.
+ """
+
+ @staticmethod
+ def format_timedelta_ticks(x, pos, n_decimals):
+ """
+ Convert seconds to 'D days HH:MM:SS.F'
+ """
+ s, ns = divmod(x, 1e9)
+ m, s = divmod(s, 60)
+ h, m = divmod(m, 60)
+ d, h = divmod(h, 24)
+ decimals = int(ns * 10**(n_decimals - 9))
+ s = r'{:02d}:{:02d}:{:02d}'.format(int(h), int(m), int(s))
+ if n_decimals > 0:
+ s += '.{{:0{:0d}d}}'.format(n_decimals).format(decimals)
+ if d != 0:
+ s = '{:d} days '.format(int(d)) + s
+ return s
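+
+ # Worked example (x is in nanoseconds; values chosen for illustration):
+ # 90061000000500 ns is 1 day, 1 h, 1 min, 1.0000005 s, so with
+ # n_decimals=3 the tick is rendered as '1 days 01:01:01.000'.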
+
+ def __call__(self, x, pos=0):
+ _check_implicitly_registered()
+ (vmin, vmax) = tuple(self.axis.get_view_interval())
+ n_decimals = int(np.ceil(np.log10(100 * 1e9 / (vmax - vmin))))
+ if n_decimals > 9:
+ n_decimals = 9
+ return self.format_timedelta_ticks(x, pos, n_decimals)
diff --git a/contrib/python/pandas/py2/pandas/plotting/_core.py b/contrib/python/pandas/py2/pandas/plotting/_core.py
new file mode 100644
index 00000000000..8e47510680a
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/plotting/_core.py
@@ -0,0 +1,3605 @@
+# being a bit too dynamic
+# pylint: disable=E1101
+from __future__ import division
+
+from collections import namedtuple
+from distutils.version import LooseVersion
+import re
+import warnings
+
+import numpy as np
+
+import pandas.compat as compat
+from pandas.compat import lrange, map, range, string_types, zip
+from pandas.errors import AbstractMethodError
+from pandas.util._decorators import Appender, cache_readonly
+
+from pandas.core.dtypes.common import (
+ is_hashable, is_integer, is_iterator, is_list_like, is_number)
+from pandas.core.dtypes.generic import (
+ ABCDataFrame, ABCIndexClass, ABCMultiIndex, ABCPeriodIndex, ABCSeries)
+from pandas.core.dtypes.missing import isna, notna, remove_na_arraylike
+
+from pandas.core.base import PandasObject
+import pandas.core.common as com
+from pandas.core.config import get_option
+from pandas.core.generic import _shared_doc_kwargs, _shared_docs
+
+from pandas.io.formats.printing import pprint_thing
+from pandas.plotting._compat import _mpl_ge_3_0_0
+from pandas.plotting._style import _get_standard_colors, plot_params
+from pandas.plotting._tools import (
+ _flatten, _get_all_lines, _get_xlim, _handle_shared_axes, _set_ticks_props,
+ _subplots, format_date_labels, table)
+
+try:
+ from pandas.plotting import _converter
+except ImportError:
+ _HAS_MPL = False
+else:
+ _HAS_MPL = True
+ if get_option('plotting.matplotlib.register_converters'):
+ _converter.register(explicit=False)
+
+
+def _raise_if_no_mpl():
+ # TODO(mpl_converter): remove once converter is explicit
+ if not _HAS_MPL:
+ raise ImportError("matplotlib is required for plotting.")
+
+
+def _get_standard_kind(kind):
+ return {'density': 'kde'}.get(kind, kind)
+
+
+def _gca(rc=None):
+ import matplotlib.pyplot as plt
+ with plt.rc_context(rc):
+ return plt.gca()
+
+
+def _gcf():
+ import matplotlib.pyplot as plt
+ return plt.gcf()
+
+
+class MPLPlot(object):
+ """
+ Base class for assembling a pandas plot using matplotlib
+
+ Parameters
+ ----------
+ data : Series or DataFrame
+ The data to be plotted.
+
+ """
+
+ @property
+ def _kind(self):
+ """Specify kind str. Must be overridden in child class"""
+ raise NotImplementedError
+
+ _layout_type = 'vertical'
+ _default_rot = 0
+ orientation = None
+ _pop_attributes = ['label', 'style', 'logy', 'logx', 'loglog',
+ 'mark_right', 'stacked']
+ _attr_defaults = {'logy': False, 'logx': False, 'loglog': False,
+ 'mark_right': True, 'stacked': False}
+
+ def __init__(self, data, kind=None, by=None, subplots=False, sharex=None,
+ sharey=False, use_index=True,
+ figsize=None, grid=None, legend=True, rot=None,
+ ax=None, fig=None, title=None, xlim=None, ylim=None,
+ xticks=None, yticks=None,
+ sort_columns=False, fontsize=None,
+ secondary_y=False, colormap=None,
+ table=False, layout=None, **kwds):
+
+ _raise_if_no_mpl()
+ _converter._WARN = False
+ self.data = data
+ self.by = by
+
+ self.kind = kind
+
+ self.sort_columns = sort_columns
+
+ self.subplots = subplots
+
+ if sharex is None:
+ if ax is None:
+ self.sharex = True
+ else:
+ # if we get an axis, the users should do the visibility
+ # setting...
+ self.sharex = False
+ else:
+ self.sharex = sharex
+
+ self.sharey = sharey
+ self.figsize = figsize
+ self.layout = layout
+
+ self.xticks = xticks
+ self.yticks = yticks
+ self.xlim = xlim
+ self.ylim = ylim
+ self.title = title
+ self.use_index = use_index
+
+ self.fontsize = fontsize
+
+ if rot is not None:
+ self.rot = rot
+ # need to know for format_date_labels since it's rotated to 30 by
+ # default
+ self._rot_set = True
+ else:
+ self._rot_set = False
+ self.rot = self._default_rot
+
+ if grid is None:
+ grid = False if secondary_y else self.plt.rcParams['axes.grid']
+
+ self.grid = grid
+ self.legend = legend
+ self.legend_handles = []
+ self.legend_labels = []
+
+ for attr in self._pop_attributes:
+ value = kwds.pop(attr, self._attr_defaults.get(attr, None))
+ setattr(self, attr, value)
+
+ self.ax = ax
+ self.fig = fig
+ self.axes = None
+
+ # parse errorbar input if given
+ xerr = kwds.pop('xerr', None)
+ yerr = kwds.pop('yerr', None)
+ self.errors = {kw: self._parse_errorbars(kw, err)
+ for kw, err in zip(['xerr', 'yerr'], [xerr, yerr])}
+
+ if not isinstance(secondary_y, (bool, tuple, list,
+ np.ndarray, ABCIndexClass)):
+ secondary_y = [secondary_y]
+ self.secondary_y = secondary_y
+
+ # ugly TypeError if user passes matplotlib's `cmap` name.
+ # Probably better to accept either.
+ if 'cmap' in kwds and colormap:
+ raise TypeError("Only specify one of `cmap` and `colormap`.")
+ elif 'cmap' in kwds:
+ self.colormap = kwds.pop('cmap')
+ else:
+ self.colormap = colormap
+
+ self.table = table
+
+ self.kwds = kwds
+
+ self._validate_color_args()
+
+ def _validate_color_args(self):
+ if 'color' not in self.kwds and 'colors' in self.kwds:
+ warnings.warn(("'colors' is being deprecated. Please use 'color'"
+ "instead of 'colors'"))
+ colors = self.kwds.pop('colors')
+ self.kwds['color'] = colors
+
+ if ('color' in self.kwds and self.nseries == 1 and
+ not is_list_like(self.kwds['color'])):
+ # support series.plot(color='green')
+ self.kwds['color'] = [self.kwds['color']]
+
+ if ('color' in self.kwds and isinstance(self.kwds['color'], tuple) and
+ self.nseries == 1 and len(self.kwds['color']) in (3, 4)):
+ # support RGB and RGBA tuples in series plot
+ self.kwds['color'] = [self.kwds['color']]
+
+ if ('color' in self.kwds or 'colors' in self.kwds) and \
+ self.colormap is not None:
+ warnings.warn("'color' and 'colormap' cannot be used "
+ "simultaneously. Using 'color'")
+
+ if 'color' in self.kwds and self.style is not None:
+ if is_list_like(self.style):
+ styles = self.style
+ else:
+ styles = [self.style]
+ # need only a single match
+ for s in styles:
+ if re.match('^[a-z]+?', s) is not None:
+ raise ValueError(
+ "Cannot pass 'style' string with a color "
+ "symbol and 'color' keyword argument. Please"
+ " use one or the other or pass 'style' "
+ "without a color symbol")
+
+ def _iter_data(self, data=None, keep_index=False, fillna=None):
+ if data is None:
+ data = self.data
+ if fillna is not None:
+ data = data.fillna(fillna)
+
+ # TODO: unused?
+ # if self.sort_columns:
+ # columns = com.try_sort(data.columns)
+ # else:
+ # columns = data.columns
+
+ for col, values in data.iteritems():
+ if keep_index is True:
+ yield col, values
+ else:
+ yield col, values.values
+
+ @property
+ def nseries(self):
+ if self.data.ndim == 1:
+ return 1
+ else:
+ return self.data.shape[1]
+
+ def draw(self):
+ self.plt.draw_if_interactive()
+
+ def generate(self):
+ self._args_adjust()
+ self._compute_plot_data()
+ self._setup_subplots()
+ self._make_plot()
+ self._add_table()
+ self._make_legend()
+ self._adorn_subplots()
+
+ for ax in self.axes:
+ self._post_plot_logic_common(ax, self.data)
+ self._post_plot_logic(ax, self.data)
+
+ def _args_adjust(self):
+ pass
+
+ def _has_plotted_object(self, ax):
+ """check whether ax has data"""
+ return (len(ax.lines) != 0 or
+ len(ax.artists) != 0 or
+ len(ax.containers) != 0)
+
+ def _maybe_right_yaxis(self, ax, axes_num):
+ if not self.on_right(axes_num):
+ # secondary axes may be passed via ax kw
+ return self._get_ax_layer(ax)
+
+ if hasattr(ax, 'right_ax'):
+ # if it has a right_ax property, ``ax`` must be the left axes
+ return ax.right_ax
+ elif hasattr(ax, 'left_ax'):
+ # if it has a left_ax property, ``ax`` must be the right axes
+ return ax
+ else:
+ # otherwise, create twin axes
+ orig_ax, new_ax = ax, ax.twinx()
+ # TODO: use Matplotlib public API when available
+ new_ax._get_lines = orig_ax._get_lines
+ new_ax._get_patches_for_fill = orig_ax._get_patches_for_fill
+ orig_ax.right_ax, new_ax.left_ax = new_ax, orig_ax
+
+ if not self._has_plotted_object(orig_ax): # no data on left y
+ orig_ax.get_yaxis().set_visible(False)
+
+ if self.logy or self.loglog:
+ new_ax.set_yscale('log')
+ return new_ax
+
+ def _setup_subplots(self):
+ if self.subplots:
+ fig, axes = _subplots(naxes=self.nseries,
+ sharex=self.sharex, sharey=self.sharey,
+ figsize=self.figsize, ax=self.ax,
+ layout=self.layout,
+ layout_type=self._layout_type)
+ else:
+ if self.ax is None:
+ fig = self.plt.figure(figsize=self.figsize)
+ axes = fig.add_subplot(111)
+ else:
+ fig = self.ax.get_figure()
+ if self.figsize is not None:
+ fig.set_size_inches(self.figsize)
+ axes = self.ax
+
+ axes = _flatten(axes)
+
+ if self.logx or self.loglog:
+ [a.set_xscale('log') for a in axes]
+ if self.logy or self.loglog:
+ [a.set_yscale('log') for a in axes]
+
+ self.fig = fig
+ self.axes = axes
+
+ @property
+ def result(self):
+ """
+ Return result axes
+ """
+ if self.subplots:
+ if self.layout is not None and not is_list_like(self.ax):
+ return self.axes.reshape(*self.layout)
+ else:
+ return self.axes
+ else:
+ sec_true = isinstance(self.secondary_y, bool) and self.secondary_y
+ all_sec = (is_list_like(self.secondary_y) and
+ len(self.secondary_y) == self.nseries)
+ if (sec_true or all_sec):
+ # if all data is plotted on secondary, return right axes
+ return self._get_ax_layer(self.axes[0], primary=False)
+ else:
+ return self.axes[0]
+
+ def _compute_plot_data(self):
+ data = self.data
+
+ if isinstance(data, ABCSeries):
+ label = self.label
+ if label is None and data.name is None:
+ label = 'None'
+ data = data.to_frame(name=label)
+
+ # GH16953, _convert is needed as fallback, for ``Series``
+ # with ``dtype == object``
+ data = data._convert(datetime=True, timedelta=True)
+ numeric_data = data.select_dtypes(include=[np.number,
+ "datetime",
+ "datetimetz",
+ "timedelta"])
+
+ try:
+ is_empty = numeric_data.empty
+ except AttributeError:
+ is_empty = not len(numeric_data)
+
+ # no empty frames or series allowed
+ if is_empty:
+ raise TypeError('Empty {0!r}: no numeric data to '
+ 'plot'.format(numeric_data.__class__.__name__))
+
+ self.data = numeric_data
+
+ def _make_plot(self):
+ raise AbstractMethodError(self)
+
+ def _add_table(self):
+ if self.table is False:
+ return
+ elif self.table is True:
+ data = self.data.transpose()
+ else:
+ data = self.table
+ ax = self._get_ax(0)
+ table(ax, data)
+
+ def _post_plot_logic_common(self, ax, data):
+ """Common post process for each axes"""
+
+ def get_label(i):
+ try:
+ return pprint_thing(data.index[i])
+ except Exception:
+ return ''
+
+ if self.orientation == 'vertical' or self.orientation is None:
+ if self._need_to_set_index:
+ xticklabels = [get_label(x) for x in ax.get_xticks()]
+ ax.set_xticklabels(xticklabels)
+ self._apply_axis_properties(ax.xaxis, rot=self.rot,
+ fontsize=self.fontsize)
+ self._apply_axis_properties(ax.yaxis, fontsize=self.fontsize)
+
+ if hasattr(ax, 'right_ax'):
+ self._apply_axis_properties(ax.right_ax.yaxis,
+ fontsize=self.fontsize)
+
+ elif self.orientation == 'horizontal':
+ if self._need_to_set_index:
+ yticklabels = [get_label(y) for y in ax.get_yticks()]
+ ax.set_yticklabels(yticklabels)
+ self._apply_axis_properties(ax.yaxis, rot=self.rot,
+ fontsize=self.fontsize)
+ self._apply_axis_properties(ax.xaxis, fontsize=self.fontsize)
+
+ if hasattr(ax, 'right_ax'):
+ self._apply_axis_properties(ax.right_ax.yaxis,
+ fontsize=self.fontsize)
+ else: # pragma no cover
+ raise ValueError
+
+ def _post_plot_logic(self, ax, data):
+ """Post process for each axes. Overridden in child classes"""
+ pass
+
+ def _adorn_subplots(self):
+ """Common post process unrelated to data"""
+ if len(self.axes) > 0:
+ all_axes = self._get_subplots()
+ nrows, ncols = self._get_axes_layout()
+ _handle_shared_axes(axarr=all_axes, nplots=len(all_axes),
+ naxes=nrows * ncols, nrows=nrows,
+ ncols=ncols, sharex=self.sharex,
+ sharey=self.sharey)
+
+ for ax in self.axes:
+ if self.yticks is not None:
+ ax.set_yticks(self.yticks)
+
+ if self.xticks is not None:
+ ax.set_xticks(self.xticks)
+
+ if self.ylim is not None:
+ ax.set_ylim(self.ylim)
+
+ if self.xlim is not None:
+ ax.set_xlim(self.xlim)
+
+ ax.grid(self.grid)
+
+ if self.title:
+ if self.subplots:
+ if is_list_like(self.title):
+ if len(self.title) != self.nseries:
+ msg = ('The length of `title` must equal the number '
+ 'of columns if using `title` of type `list` '
+ 'and `subplots=True`.\n'
+ 'length of title = {}\n'
+ 'number of columns = {}').format(
+ len(self.title), self.nseries)
+ raise ValueError(msg)
+
+ for (ax, title) in zip(self.axes, self.title):
+ ax.set_title(title)
+ else:
+ self.fig.suptitle(self.title)
+ else:
+ if is_list_like(self.title):
+ msg = ('Using `title` of type `list` is not supported '
+ 'unless `subplots=True` is passed')
+ raise ValueError(msg)
+ self.axes[0].set_title(self.title)
+
+ def _apply_axis_properties(self, axis, rot=None, fontsize=None):
+ labels = axis.get_majorticklabels() + axis.get_minorticklabels()
+ for label in labels:
+ if rot is not None:
+ label.set_rotation(rot)
+ if fontsize is not None:
+ label.set_fontsize(fontsize)
+
+ @property
+ def legend_title(self):
+ if not isinstance(self.data.columns, ABCMultiIndex):
+ name = self.data.columns.name
+ if name is not None:
+ name = pprint_thing(name)
+ return name
+ else:
+ stringified = map(pprint_thing,
+ self.data.columns.names)
+ return ','.join(stringified)
+
+ def _add_legend_handle(self, handle, label, index=None):
+ if label is not None:
+ if self.mark_right and index is not None:
+ if self.on_right(index):
+ label = label + ' (right)'
+ self.legend_handles.append(handle)
+ self.legend_labels.append(label)
+
+ def _make_legend(self):
+ ax, leg = self._get_ax_legend(self.axes[0])
+
+ handles = []
+ labels = []
+ title = ''
+
+ if not self.subplots:
+ if leg is not None:
+ title = leg.get_title().get_text()
+ handles = leg.legendHandles
+ labels = [x.get_text() for x in leg.get_texts()]
+
+ if self.legend:
+ if self.legend == 'reverse':
+ self.legend_handles = reversed(self.legend_handles)
+ self.legend_labels = reversed(self.legend_labels)
+
+ handles += self.legend_handles
+ labels += self.legend_labels
+ if self.legend_title is not None:
+ title = self.legend_title
+
+ if len(handles) > 0:
+ ax.legend(handles, labels, loc='best', title=title)
+
+ elif self.subplots and self.legend:
+ for ax in self.axes:
+ if ax.get_visible():
+ ax.legend(loc='best')
+
+ def _get_ax_legend(self, ax):
+ leg = ax.get_legend()
+ other_ax = (getattr(ax, 'left_ax', None) or
+ getattr(ax, 'right_ax', None))
+ other_leg = None
+ if other_ax is not None:
+ other_leg = other_ax.get_legend()
+ if leg is None and other_leg is not None:
+ leg = other_leg
+ ax = other_ax
+ return ax, leg
+
+ @cache_readonly
+ def plt(self):
+ import matplotlib.pyplot as plt
+ return plt
+
+ _need_to_set_index = False
+
+ def _get_xticks(self, convert_period=False):
+ index = self.data.index
+ is_datetype = index.inferred_type in ('datetime', 'date',
+ 'datetime64', 'time')
+
+ if self.use_index:
+ if convert_period and isinstance(index, ABCPeriodIndex):
+ self.data = self.data.reindex(index=index.sort_values())
+ x = self.data.index.to_timestamp()._mpl_repr()
+ elif index.is_numeric():
+ """
+ Matplotlib supports numeric values or datetime objects as
+ xaxis values. Taking LBYL approach here, by the time
+ matplotlib raises exception when using non numeric/datetime
+ values for xaxis, several actions are already taken by plt.
+ """
+ x = index._mpl_repr()
+ elif is_datetype:
+ self.data = self.data[notna(self.data.index)]
+ self.data = self.data.sort_index()
+ x = self.data.index._mpl_repr()
+ else:
+ self._need_to_set_index = True
+ x = lrange(len(index))
+ else:
+ x = lrange(len(index))
+
+ return x
+
+ @classmethod
+ def _plot(cls, ax, x, y, style=None, is_errorbar=False, **kwds):
+ mask = isna(y)
+ if mask.any():
+ y = np.ma.array(y)
+ y = np.ma.masked_where(mask, y)
+
+ if isinstance(x, ABCIndexClass):
+ x = x._mpl_repr()
+
+ if is_errorbar:
+ if 'xerr' in kwds:
+ kwds['xerr'] = np.array(kwds.get('xerr'))
+ if 'yerr' in kwds:
+ kwds['yerr'] = np.array(kwds.get('yerr'))
+ return ax.errorbar(x, y, **kwds)
+ else:
+ # prevent style kwarg from going to errorbar, where it is
+ # unsupported
+ if style is not None:
+ args = (x, y, style)
+ else:
+ args = (x, y)
+ return ax.plot(*args, **kwds)
+
+ def _get_index_name(self):
+ if isinstance(self.data.index, ABCMultiIndex):
+ name = self.data.index.names
+ if com._any_not_none(*name):
+ name = ','.join(pprint_thing(x) for x in name)
+ else:
+ name = None
+ else:
+ name = self.data.index.name
+ if name is not None:
+ name = pprint_thing(name)
+
+ return name
+
+ @classmethod
+ def _get_ax_layer(cls, ax, primary=True):
+ """get left (primary) or right (secondary) axes"""
+ if primary:
+ return getattr(ax, 'left_ax', ax)
+ else:
+ return getattr(ax, 'right_ax', ax)
+
+ def _get_ax(self, i):
+ # get the twinx ax if appropriate
+ if self.subplots:
+ ax = self.axes[i]
+ ax = self._maybe_right_yaxis(ax, i)
+ self.axes[i] = ax
+ else:
+ ax = self.axes[0]
+ ax = self._maybe_right_yaxis(ax, i)
+
+ ax.get_yaxis().set_visible(True)
+ return ax
+
+ def on_right(self, i):
+ if isinstance(self.secondary_y, bool):
+ return self.secondary_y
+
+ if isinstance(self.secondary_y, (tuple, list,
+ np.ndarray, ABCIndexClass)):
+ return self.data.columns[i] in self.secondary_y
+
+ def _apply_style_colors(self, colors, kwds, col_num, label):
+ """
+ Manage style and color based on column number and its label.
+ Returns tuple of appropriate style and kwds which "color" may be added.
+ """
+ style = None
+ if self.style is not None:
+ if isinstance(self.style, list):
+ try:
+ style = self.style[col_num]
+ except IndexError:
+ pass
+ elif isinstance(self.style, dict):
+ style = self.style.get(label, style)
+ else:
+ style = self.style
+
+ has_color = 'color' in kwds or self.colormap is not None
+ nocolor_style = style is None or re.match('[a-z]+', style) is None
+ if (has_color or self.subplots) and nocolor_style:
+ kwds['color'] = colors[col_num % len(colors)]
+ return style, kwds
+
+ def _get_colors(self, num_colors=None, color_kwds='color'):
+ if num_colors is None:
+ num_colors = self.nseries
+
+ return _get_standard_colors(num_colors=num_colors,
+ colormap=self.colormap,
+ color=self.kwds.get(color_kwds))
+
+ def _parse_errorbars(self, label, err):
+ """
+ Look for error keyword arguments and return the actual errorbar data
+ or return the error DataFrame/dict
+
+ Error bars can be specified in several ways:
+ Series: the user provides a pandas.Series object of the same
+ length as the data
+ ndarray: provides a np.ndarray of the same length as the data
+ DataFrame/dict: error values are paired with keys matching the
+ key in the plotted DataFrame
+ str: the name of the column within the plotted DataFrame
+ """
+
+ if err is None:
+ return None
+
+ def match_labels(data, e):
+ e = e.reindex(data.index)
+ return e
+
+ # key-matched DataFrame
+ if isinstance(err, ABCDataFrame):
+
+ err = match_labels(self.data, err)
+ # key-matched dict
+ elif isinstance(err, dict):
+ pass
+
+ # Series of error values
+ elif isinstance(err, ABCSeries):
+ # broadcast error series across data
+ err = match_labels(self.data, err)
+ err = np.atleast_2d(err)
+ err = np.tile(err, (self.nseries, 1))
+
+ # errors are a column in the dataframe
+ elif isinstance(err, string_types):
+ evalues = self.data[err].values
+ self.data = self.data[self.data.columns.drop(err)]
+ err = np.atleast_2d(evalues)
+ err = np.tile(err, (self.nseries, 1))
+
+ elif is_list_like(err):
+ if is_iterator(err):
+ err = np.atleast_2d(list(err))
+ else:
+ # raw error values
+ err = np.atleast_2d(err)
+
+ err_shape = err.shape
+
+ # asymmetrical error bars
+ if err.ndim == 3:
+ if (err_shape[0] != self.nseries) or \
+ (err_shape[1] != 2) or \
+ (err_shape[2] != len(self.data)):
+ msg = "Asymmetrical error bars should be provided " + \
+ "with the shape (%u, 2, %u)" % \
+ (self.nseries, len(self.data))
+ raise ValueError(msg)
+
+ # broadcast errors to each data series
+ if len(err) == 1:
+ err = np.tile(err, (self.nseries, 1))
+
+ elif is_number(err):
+ err = np.tile([err], (self.nseries, len(self.data)))
+
+ else:
+ msg = "No valid {label} detected".format(label=label)
+ raise ValueError(msg)
+
+ return err
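+
+ # Caller-level sketch (hypothetical column names): DataFrame.plot(yerr='e')
+ # routes the string 'e' through here, pops that column out of self.data,
+ # and tiles it to shape (nseries, len(data)); a Series or ndarray of
+ # matching length is broadcast the same way.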
+
+ def _get_errorbars(self, label=None, index=None, xerr=True, yerr=True):
+ errors = {}
+
+ for kw, flag in zip(['xerr', 'yerr'], [xerr, yerr]):
+ if flag:
+ err = self.errors[kw]
+ # user provided label-matched dataframe of errors
+ if isinstance(err, (ABCDataFrame, dict)):
+ if label is not None and label in err.keys():
+ err = err[label]
+ else:
+ err = None
+ elif index is not None and err is not None:
+ err = err[index]
+
+ if err is not None:
+ errors[kw] = err
+ return errors
+
+ def _get_subplots(self):
+ from matplotlib.axes import Subplot
+ return [ax for ax in self.axes[0].get_figure().get_axes()
+ if isinstance(ax, Subplot)]
+
+ def _get_axes_layout(self):
+ axes = self._get_subplots()
+ x_set = set()
+ y_set = set()
+ for ax in axes:
+ # check axes coordinates to estimate layout
+ points = ax.get_position().get_points()
+ x_set.add(points[0][0])
+ y_set.add(points[0][1])
+ return (len(y_set), len(x_set))
+
+
+class PlanePlot(MPLPlot):
+ """
+ Abstract class for plotting on plane, currently scatter and hexbin.
+ """
+
+ _layout_type = 'single'
+
+ def __init__(self, data, x, y, **kwargs):
+ MPLPlot.__init__(self, data, **kwargs)
+ if x is None or y is None:
+ raise ValueError(self._kind + ' requires an x and y column')
+ if is_integer(x) and not self.data.columns.holds_integer():
+ x = self.data.columns[x]
+ if is_integer(y) and not self.data.columns.holds_integer():
+ y = self.data.columns[y]
+ if len(self.data[x]._get_numeric_data()) == 0:
+ raise ValueError(self._kind + ' requires x column to be numeric')
+ if len(self.data[y]._get_numeric_data()) == 0:
+ raise ValueError(self._kind + ' requires y column to be numeric')
+
+ self.x = x
+ self.y = y
+
+ @property
+ def nseries(self):
+ return 1
+
+ def _post_plot_logic(self, ax, data):
+ x, y = self.x, self.y
+ ax.set_ylabel(pprint_thing(y))
+ ax.set_xlabel(pprint_thing(x))
+
+ def _plot_colorbar(self, ax, **kwds):
+ # Addresses issues #10611 and #10678:
+ # When plotting scatterplots and hexbinplots in IPython
+ # inline backend the colorbar axis height tends not to
+ # exactly match the parent axis height.
+ # The difference is due to small fractional differences
+ # in floating points with similar representation.
+ # To deal with this, this method forces the colorbar
+ # height to take the height of the parent axes.
+ # For a more detailed description of the issue
+ # see the following link:
+ # https://github.com/ipython/ipython/issues/11215
+ img = ax.collections[0]
+ cbar = self.fig.colorbar(img, ax=ax, **kwds)
+
+ if _mpl_ge_3_0_0():
+ # The workaround below is no longer necessary.
+ return
+
+ points = ax.get_position().get_points()
+ cbar_points = cbar.ax.get_position().get_points()
+
+ cbar.ax.set_position([cbar_points[0, 0],
+ points[0, 1],
+ cbar_points[1, 0] - cbar_points[0, 0],
+ points[1, 1] - points[0, 1]])
+ # To see the discrepancy in axis heights uncomment
+ # the following two lines:
+ # print(points[1, 1] - points[0, 1])
+ # print(cbar_points[1, 1] - cbar_points[0, 1])
+
+
+class ScatterPlot(PlanePlot):
+ _kind = 'scatter'
+
+ def __init__(self, data, x, y, s=None, c=None, **kwargs):
+ if s is None:
+ # hide the matplotlib default for size, in case we want to change
+ # the handling of this argument later
+ s = 20
+ super(ScatterPlot, self).__init__(data, x, y, s=s, **kwargs)
+ if is_integer(c) and not self.data.columns.holds_integer():
+ c = self.data.columns[c]
+ self.c = c
+
+ def _make_plot(self):
+ x, y, c, data = self.x, self.y, self.c, self.data
+ ax = self.axes[0]
+
+ c_is_column = is_hashable(c) and c in self.data.columns
+
+ # plot a colorbar only if a colormap is provided or necessary
+ cb = self.kwds.pop('colorbar', self.colormap or c_is_column)
+
+ # pandas uses colormap, matplotlib uses cmap.
+ cmap = self.colormap or 'Greys'
+ cmap = self.plt.cm.get_cmap(cmap)
+ color = self.kwds.pop("color", None)
+ if c is not None and color is not None:
+ raise TypeError('Specify exactly one of `c` and `color`')
+ elif c is None and color is None:
+ c_values = self.plt.rcParams['patch.facecolor']
+ elif color is not None:
+ c_values = color
+ elif c_is_column:
+ c_values = self.data[c].values
+ else:
+ c_values = c
+
+ if self.legend and hasattr(self, 'label'):
+ label = self.label
+ else:
+ label = None
+ scatter = ax.scatter(data[x].values, data[y].values, c=c_values,
+ label=label, cmap=cmap, **self.kwds)
+ if cb:
+ cbar_label = c if c_is_column else ''
+ self._plot_colorbar(ax, label=cbar_label)
+
+ if label is not None:
+ self._add_legend_handle(scatter, label)
+ else:
+ self.legend = False
+
+ errors_x = self._get_errorbars(label=x, index=0, yerr=False)
+ errors_y = self._get_errorbars(label=y, index=0, xerr=False)
+ if len(errors_x) > 0 or len(errors_y) > 0:
+ err_kwds = dict(errors_x, **errors_y)
+ err_kwds['ecolor'] = scatter.get_facecolor()[0]
+ ax.errorbar(data[x].values, data[y].values,
+ linestyle='none', **err_kwds)
+
+
+class HexBinPlot(PlanePlot):
+ _kind = 'hexbin'
+
+ def __init__(self, data, x, y, C=None, **kwargs):
+ super(HexBinPlot, self).__init__(data, x, y, **kwargs)
+ if is_integer(C) and not self.data.columns.holds_integer():
+ C = self.data.columns[C]
+ self.C = C
+
+ def _make_plot(self):
+ x, y, data, C = self.x, self.y, self.data, self.C
+ ax = self.axes[0]
+ # pandas uses colormap, matplotlib uses cmap.
+ cmap = self.colormap or 'BuGn'
+ cmap = self.plt.cm.get_cmap(cmap)
+ cb = self.kwds.pop('colorbar', True)
+
+ if C is None:
+ c_values = None
+ else:
+ c_values = data[C].values
+
+ ax.hexbin(data[x].values, data[y].values, C=c_values, cmap=cmap,
+ **self.kwds)
+ if cb:
+ self._plot_colorbar(ax)
+
+ def _make_legend(self):
+ pass
+
+
+class LinePlot(MPLPlot):
+ _kind = 'line'
+ _default_rot = 0
+ orientation = 'vertical'
+
+ def __init__(self, data, **kwargs):
+ MPLPlot.__init__(self, data, **kwargs)
+ if self.stacked:
+ self.data = self.data.fillna(value=0)
+ self.x_compat = plot_params['x_compat']
+ if 'x_compat' in self.kwds:
+ self.x_compat = bool(self.kwds.pop('x_compat'))
+
+ def _is_ts_plot(self):
+ # this is slightly deceptive
+ return not self.x_compat and self.use_index and self._use_dynamic_x()
+
+ def _use_dynamic_x(self):
+ from pandas.plotting._timeseries import _use_dynamic_x
+ return _use_dynamic_x(self._get_ax(0), self.data)
+
+ def _make_plot(self):
+ if self._is_ts_plot():
+ from pandas.plotting._timeseries import _maybe_convert_index
+ data = _maybe_convert_index(self._get_ax(0), self.data)
+
+ x = data.index # dummy, not used
+ plotf = self._ts_plot
+ it = self._iter_data(data=data, keep_index=True)
+ else:
+ x = self._get_xticks(convert_period=True)
+ plotf = self._plot
+ it = self._iter_data()
+
+ stacking_id = self._get_stacking_id()
+ is_errorbar = com._any_not_none(*self.errors.values())
+
+ colors = self._get_colors()
+ for i, (label, y) in enumerate(it):
+ ax = self._get_ax(i)
+ kwds = self.kwds.copy()
+ style, kwds = self._apply_style_colors(colors, kwds, i, label)
+
+ errors = self._get_errorbars(label=label, index=i)
+ kwds = dict(kwds, **errors)
+
+ label = pprint_thing(label) # .encode('utf-8')
+ kwds['label'] = label
+
+ newlines = plotf(ax, x, y, style=style, column_num=i,
+ stacking_id=stacking_id,
+ is_errorbar=is_errorbar,
+ **kwds)
+ self._add_legend_handle(newlines[0], label, index=i)
+
+ lines = _get_all_lines(ax)
+ left, right = _get_xlim(lines)
+ ax.set_xlim(left, right)
+
+ @classmethod
+ def _plot(cls, ax, x, y, style=None, column_num=None,
+ stacking_id=None, **kwds):
+ # column_num is used to get the target column from plotf in line and
+ # area plots
+ if column_num == 0:
+ cls._initialize_stacker(ax, stacking_id, len(y))
+ y_values = cls._get_stacked_values(ax, stacking_id, y, kwds['label'])
+ lines = MPLPlot._plot(ax, x, y_values, style=style, **kwds)
+ cls._update_stacker(ax, stacking_id, y)
+ return lines
+
+ @classmethod
+ def _ts_plot(cls, ax, x, data, style=None, **kwds):
+ from pandas.plotting._timeseries import (_maybe_resample,
+ _decorate_axes,
+ format_dateaxis)
+ # accept x to be consistent with normal plot func,
+ # x is not passed to tsplot as it uses data.index as x coordinate
+ # column_num must be in kwds for stacking purpose
+ freq, data = _maybe_resample(data, ax, kwds)
+
+ # Set ax with freq info
+ _decorate_axes(ax, freq, kwds)
+ # digging deeper
+ if hasattr(ax, 'left_ax'):
+ _decorate_axes(ax.left_ax, freq, kwds)
+ if hasattr(ax, 'right_ax'):
+ _decorate_axes(ax.right_ax, freq, kwds)
+ ax._plot_data.append((data, cls._kind, kwds))
+
+ lines = cls._plot(ax, data.index, data.values, style=style, **kwds)
+ # set date formatter, locators and rescale limits
+ format_dateaxis(ax, ax.freq, data.index)
+ return lines
+
+ def _get_stacking_id(self):
+ if self.stacked:
+ return id(self.data)
+ else:
+ return None
+
+ @classmethod
+ def _initialize_stacker(cls, ax, stacking_id, n):
+ if stacking_id is None:
+ return
+ if not hasattr(ax, '_stacker_pos_prior'):
+ ax._stacker_pos_prior = {}
+ if not hasattr(ax, '_stacker_neg_prior'):
+ ax._stacker_neg_prior = {}
+ ax._stacker_pos_prior[stacking_id] = np.zeros(n)
+ ax._stacker_neg_prior[stacking_id] = np.zeros(n)
+
+ @classmethod
+ def _get_stacked_values(cls, ax, stacking_id, values, label):
+ if stacking_id is None:
+ return values
+ if not hasattr(ax, '_stacker_pos_prior'):
+ # stacker may not be initialized for subplots
+ cls._initialize_stacker(ax, stacking_id, len(values))
+
+ if (values >= 0).all():
+ return ax._stacker_pos_prior[stacking_id] + values
+ elif (values <= 0).all():
+ return ax._stacker_neg_prior[stacking_id] + values
+
+ raise ValueError('When stacked is True, each column must be either '
+                         'all positive or all negative. '
+                         '{0} contains both positive and negative values.'
+ .format(label))
+
+ @classmethod
+ def _update_stacker(cls, ax, stacking_id, values):
+ if stacking_id is None:
+ return
+ if (values >= 0).all():
+ ax._stacker_pos_prior[stacking_id] += values
+ elif (values <= 0).all():
+ ax._stacker_neg_prior[stacking_id] += values
+
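+    # Illustrative note on the stacking bookkeeping above (comment only, a
+    # sketch of the intended behaviour): with stacked=True and two columns
+    # y0 = [1, 2] and y1 = [3, 4], _get_stacked_values returns [1, 2] and
+    # then [4, 6], while _stacker_pos_prior[stacking_id] grows from [0, 0]
+    # to [1, 2] to [4, 6] as _update_stacker is called after each column.
+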
+ def _post_plot_logic(self, ax, data):
+ condition = (not self._use_dynamic_x() and
+ data.index.is_all_dates and
+ not self.subplots or
+ (self.subplots and self.sharex))
+
+ index_name = self._get_index_name()
+
+ if condition:
+ # irregular TS rotated 30 deg. by default
+ # probably a better place to check / set this.
+ if not self._rot_set:
+ self.rot = 30
+ format_date_labels(ax, rot=self.rot)
+
+ if index_name is not None and self.use_index:
+ ax.set_xlabel(index_name)
+
+
+class AreaPlot(LinePlot):
+ _kind = 'area'
+
+ def __init__(self, data, **kwargs):
+ kwargs.setdefault('stacked', True)
+ data = data.fillna(value=0)
+ LinePlot.__init__(self, data, **kwargs)
+
+ if not self.stacked:
+ # use smaller alpha to distinguish overlap
+ self.kwds.setdefault('alpha', 0.5)
+
+ if self.logy or self.loglog:
+            raise ValueError("Log-y scales are not supported in area plots")
+
+ @classmethod
+ def _plot(cls, ax, x, y, style=None, column_num=None,
+ stacking_id=None, is_errorbar=False, **kwds):
+
+ if column_num == 0:
+ cls._initialize_stacker(ax, stacking_id, len(y))
+ y_values = cls._get_stacked_values(ax, stacking_id, y, kwds['label'])
+
+ # need to remove label, because subplots uses mpl legend as it is
+ line_kwds = kwds.copy()
+ line_kwds.pop('label')
+ lines = MPLPlot._plot(ax, x, y_values, style=style, **line_kwds)
+
+ # get data from the line to get coordinates for fill_between
+ xdata, y_values = lines[0].get_data(orig=False)
+
+ # unable to use ``_get_stacked_values`` here to get starting point
+ if stacking_id is None:
+ start = np.zeros(len(y))
+ elif (y >= 0).all():
+ start = ax._stacker_pos_prior[stacking_id]
+ elif (y <= 0).all():
+ start = ax._stacker_neg_prior[stacking_id]
+ else:
+ start = np.zeros(len(y))
+
+ if 'color' not in kwds:
+ kwds['color'] = lines[0].get_color()
+
+ rect = ax.fill_between(xdata, start, y_values, **kwds)
+ cls._update_stacker(ax, stacking_id, y)
+
+ # LinePlot expects list of artists
+ res = [rect]
+ return res
+
+ def _post_plot_logic(self, ax, data):
+ LinePlot._post_plot_logic(self, ax, data)
+
+ if self.ylim is None:
+ if (data >= 0).all().all():
+ ax.set_ylim(0, None)
+ elif (data <= 0).all().all():
+ ax.set_ylim(None, 0)
+
+
+class BarPlot(MPLPlot):
+ _kind = 'bar'
+ _default_rot = 90
+ orientation = 'vertical'
+
+ def __init__(self, data, **kwargs):
+ # we have to treat a series differently than a
+ # 1-column DataFrame w.r.t. color handling
+ self._is_series = isinstance(data, ABCSeries)
+ self.bar_width = kwargs.pop('width', 0.5)
+ pos = kwargs.pop('position', 0.5)
+ kwargs.setdefault('align', 'center')
+ self.tick_pos = np.arange(len(data))
+
+ self.bottom = kwargs.pop('bottom', 0)
+ self.left = kwargs.pop('left', 0)
+
+ self.log = kwargs.pop('log', False)
+ MPLPlot.__init__(self, data, **kwargs)
+
+ if self.stacked or self.subplots:
+ self.tickoffset = self.bar_width * pos
+ if kwargs['align'] == 'edge':
+ self.lim_offset = self.bar_width / 2
+ else:
+ self.lim_offset = 0
+ else:
+ if kwargs['align'] == 'edge':
+ w = self.bar_width / self.nseries
+ self.tickoffset = self.bar_width * (pos - 0.5) + w * 0.5
+ self.lim_offset = w * 0.5
+ else:
+ self.tickoffset = self.bar_width * pos
+ self.lim_offset = 0
+
+ self.ax_pos = self.tick_pos - self.tickoffset
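+        # Illustrative note (a worked example using the defaults above):
+        # with width=0.5, position=0.5 and align='center', a non-stacked
+        # frame with nseries=2 gets w = width / nseries = 0.25 in _make_plot,
+        # so the two bars of row i are centred at ax_pos[i] + 0.5 * w and
+        # ax_pos[i] + 1.5 * w, i.e. symmetrically around tick_pos[i].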
+
+ def _args_adjust(self):
+ if is_list_like(self.bottom):
+ self.bottom = np.array(self.bottom)
+ if is_list_like(self.left):
+ self.left = np.array(self.left)
+
+ @classmethod
+ def _plot(cls, ax, x, y, w, start=0, log=False, **kwds):
+ return ax.bar(x, y, w, bottom=start, log=log, **kwds)
+
+ @property
+ def _start_base(self):
+ return self.bottom
+
+ def _make_plot(self):
+ import matplotlib as mpl
+
+ colors = self._get_colors()
+ ncolors = len(colors)
+
+ pos_prior = neg_prior = np.zeros(len(self.data))
+ K = self.nseries
+
+ for i, (label, y) in enumerate(self._iter_data(fillna=0)):
+ ax = self._get_ax(i)
+ kwds = self.kwds.copy()
+ if self._is_series:
+ kwds['color'] = colors
+ else:
+ kwds['color'] = colors[i % ncolors]
+
+ errors = self._get_errorbars(label=label, index=i)
+ kwds = dict(kwds, **errors)
+
+ label = pprint_thing(label)
+
+ if (('yerr' in kwds) or ('xerr' in kwds)) \
+ and (kwds.get('ecolor') is None):
+ kwds['ecolor'] = mpl.rcParams['xtick.color']
+
+ start = 0
+ if self.log and (y >= 1).all():
+ start = 1
+ start = start + self._start_base
+
+ if self.subplots:
+ w = self.bar_width / 2
+ rect = self._plot(ax, self.ax_pos + w, y, self.bar_width,
+ start=start, label=label,
+ log=self.log, **kwds)
+ ax.set_title(label)
+ elif self.stacked:
+ mask = y > 0
+ start = np.where(mask, pos_prior, neg_prior) + self._start_base
+ w = self.bar_width / 2
+ rect = self._plot(ax, self.ax_pos + w, y, self.bar_width,
+ start=start, label=label,
+ log=self.log, **kwds)
+ pos_prior = pos_prior + np.where(mask, y, 0)
+ neg_prior = neg_prior + np.where(mask, 0, y)
+ else:
+ w = self.bar_width / K
+ rect = self._plot(ax, self.ax_pos + (i + 0.5) * w, y, w,
+ start=start, label=label,
+ log=self.log, **kwds)
+ self._add_legend_handle(rect, label, index=i)
+
+ def _post_plot_logic(self, ax, data):
+ if self.use_index:
+ str_index = [pprint_thing(key) for key in data.index]
+ else:
+ str_index = [pprint_thing(key) for key in range(data.shape[0])]
+ name = self._get_index_name()
+
+ s_edge = self.ax_pos[0] - 0.25 + self.lim_offset
+ e_edge = self.ax_pos[-1] + 0.25 + self.bar_width + self.lim_offset
+
+ self._decorate_ticks(ax, name, str_index, s_edge, e_edge)
+
+ def _decorate_ticks(self, ax, name, ticklabels, start_edge, end_edge):
+ ax.set_xlim((start_edge, end_edge))
+ ax.set_xticks(self.tick_pos)
+ ax.set_xticklabels(ticklabels)
+ if name is not None and self.use_index:
+ ax.set_xlabel(name)
+
+
+class BarhPlot(BarPlot):
+ _kind = 'barh'
+ _default_rot = 0
+ orientation = 'horizontal'
+
+ @property
+ def _start_base(self):
+ return self.left
+
+ @classmethod
+ def _plot(cls, ax, x, y, w, start=0, log=False, **kwds):
+ return ax.barh(x, y, w, left=start, log=log, **kwds)
+
+ def _decorate_ticks(self, ax, name, ticklabels, start_edge, end_edge):
+ # horizontal bars
+ ax.set_ylim((start_edge, end_edge))
+ ax.set_yticks(self.tick_pos)
+ ax.set_yticklabels(ticklabels)
+ if name is not None and self.use_index:
+ ax.set_ylabel(name)
+
+
+class HistPlot(LinePlot):
+ _kind = 'hist'
+
+ def __init__(self, data, bins=10, bottom=0, **kwargs):
+ self.bins = bins # use mpl default
+ self.bottom = bottom
+ # Do not call LinePlot.__init__ which may fill nan
+ MPLPlot.__init__(self, data, **kwargs)
+
+ def _args_adjust(self):
+ if is_integer(self.bins):
+ # create common bin edge
+ values = (self.data._convert(datetime=True)._get_numeric_data())
+ values = np.ravel(values)
+ values = values[~isna(values)]
+
+ hist, self.bins = np.histogram(
+ values, bins=self.bins,
+ range=self.kwds.get('range', None),
+ weights=self.kwds.get('weights', None))
+
+ if is_list_like(self.bottom):
+ self.bottom = np.array(self.bottom)
+
+ @classmethod
+ def _plot(cls, ax, y, style=None, bins=None, bottom=0, column_num=0,
+ stacking_id=None, **kwds):
+ if column_num == 0:
+ cls._initialize_stacker(ax, stacking_id, len(bins) - 1)
+ y = y[~isna(y)]
+
+ base = np.zeros(len(bins) - 1)
+ bottom = bottom + \
+ cls._get_stacked_values(ax, stacking_id, base, kwds['label'])
+ # ignore style
+ n, bins, patches = ax.hist(y, bins=bins, bottom=bottom, **kwds)
+ cls._update_stacker(ax, stacking_id, n)
+ return patches
+
+ def _make_plot(self):
+ colors = self._get_colors()
+ stacking_id = self._get_stacking_id()
+
+ for i, (label, y) in enumerate(self._iter_data()):
+ ax = self._get_ax(i)
+
+ kwds = self.kwds.copy()
+
+ label = pprint_thing(label)
+ kwds['label'] = label
+
+ style, kwds = self._apply_style_colors(colors, kwds, i, label)
+ if style is not None:
+ kwds['style'] = style
+
+ kwds = self._make_plot_keywords(kwds, y)
+ artists = self._plot(ax, y, column_num=i,
+ stacking_id=stacking_id, **kwds)
+ self._add_legend_handle(artists[0], label, index=i)
+
+ def _make_plot_keywords(self, kwds, y):
+        """merge HistPlot/KdePlot properties into the passed kwds"""
+ # y is required for KdePlot
+ kwds['bottom'] = self.bottom
+ kwds['bins'] = self.bins
+ return kwds
+
+ def _post_plot_logic(self, ax, data):
+ if self.orientation == 'horizontal':
+ ax.set_xlabel('Frequency')
+ else:
+ ax.set_ylabel('Frequency')
+
+ @property
+ def orientation(self):
+ if self.kwds.get('orientation', None) == 'horizontal':
+ return 'horizontal'
+ else:
+ return 'vertical'
+
+
+_kde_docstring = """
+ Generate Kernel Density Estimate plot using Gaussian kernels.
+
+ In statistics, `kernel density estimation`_ (KDE) is a non-parametric
+ way to estimate the probability density function (PDF) of a random
+ variable. This function uses Gaussian kernels and includes automatic
+ bandwidth determination.
+
+ .. _kernel density estimation:
+ https://en.wikipedia.org/wiki/Kernel_density_estimation
+
+ Parameters
+ ----------
+ bw_method : str, scalar or callable, optional
+ The method used to calculate the estimator bandwidth. This can be
+ 'scott', 'silverman', a scalar constant or a callable.
+ If None (default), 'scott' is used.
+ See :class:`scipy.stats.gaussian_kde` for more information.
+ ind : NumPy array or integer, optional
+ Evaluation points for the estimated PDF. If None (default),
+ 1000 equally spaced points are used. If `ind` is a NumPy array, the
+ KDE is evaluated at the points passed. If `ind` is an integer,
+ `ind` number of equally spaced points are used.
+ **kwds : optional
+ Additional keyword arguments are documented in
+ :meth:`pandas.%(this-datatype)s.plot`.
+
+ Returns
+ -------
+ axes : matplotlib.axes.Axes or numpy.ndarray of them
+
+ See Also
+ --------
+ scipy.stats.gaussian_kde : Representation of a kernel-density
+ estimate using Gaussian kernels. This is the function used
+ internally to estimate the PDF.
+ %(sibling-datatype)s.plot.kde : Generate a KDE plot for a
+ %(sibling-datatype)s.
+
+ Examples
+ --------
+ %(examples)s
+ """
+
+
+class KdePlot(HistPlot):
+ _kind = 'kde'
+ orientation = 'vertical'
+
+ def __init__(self, data, bw_method=None, ind=None, **kwargs):
+ MPLPlot.__init__(self, data, **kwargs)
+ self.bw_method = bw_method
+ self.ind = ind
+
+ def _args_adjust(self):
+ pass
+
+ def _get_ind(self, y):
+ if self.ind is None:
+            # np.nanmax() and np.nanmin() ignore missing values
+ sample_range = np.nanmax(y) - np.nanmin(y)
+ ind = np.linspace(np.nanmin(y) - 0.5 * sample_range,
+ np.nanmax(y) + 0.5 * sample_range, 1000)
+ elif is_integer(self.ind):
+ sample_range = np.nanmax(y) - np.nanmin(y)
+ ind = np.linspace(np.nanmin(y) - 0.5 * sample_range,
+ np.nanmax(y) + 0.5 * sample_range, self.ind)
+ else:
+ ind = self.ind
+ return ind
+
+ @classmethod
+ def _plot(cls, ax, y, style=None, bw_method=None, ind=None,
+ column_num=None, stacking_id=None, **kwds):
+ from scipy.stats import gaussian_kde
+ from scipy import __version__ as spv
+
+ y = remove_na_arraylike(y)
+
+ if LooseVersion(spv) >= '0.11.0':
+ gkde = gaussian_kde(y, bw_method=bw_method)
+ else:
+ gkde = gaussian_kde(y)
+ if bw_method is not None:
+ msg = ('bw_method was added in Scipy 0.11.0.' +
+ ' Scipy version in use is {spv}.'.format(spv=spv))
+ warnings.warn(msg)
+
+ y = gkde.evaluate(ind)
+ lines = MPLPlot._plot(ax, ind, y, style=style, **kwds)
+ return lines
+
+ def _make_plot_keywords(self, kwds, y):
+ kwds['bw_method'] = self.bw_method
+ kwds['ind'] = self._get_ind(y)
+ return kwds
+
+ def _post_plot_logic(self, ax, data):
+ ax.set_ylabel('Density')
+
+
+class PiePlot(MPLPlot):
+ _kind = 'pie'
+ _layout_type = 'horizontal'
+
+ def __init__(self, data, kind=None, **kwargs):
+ data = data.fillna(value=0)
+ if (data < 0).any().any():
+ raise ValueError("{0} doesn't allow negative values".format(kind))
+ MPLPlot.__init__(self, data, kind=kind, **kwargs)
+
+ def _args_adjust(self):
+ self.grid = False
+ self.logy = False
+ self.logx = False
+ self.loglog = False
+
+ def _validate_color_args(self):
+ pass
+
+ def _make_plot(self):
+ colors = self._get_colors(
+ num_colors=len(self.data), color_kwds='colors')
+ self.kwds.setdefault('colors', colors)
+
+ for i, (label, y) in enumerate(self._iter_data()):
+ ax = self._get_ax(i)
+ if label is not None:
+ label = pprint_thing(label)
+ ax.set_ylabel(label)
+
+ kwds = self.kwds.copy()
+
+ def blank_labeler(label, value):
+ if value == 0:
+ return ''
+ else:
+ return label
+
+ idx = [pprint_thing(v) for v in self.data.index]
+ labels = kwds.pop('labels', idx)
+ # labels is used for each wedge's labels
+ # Blank out labels for values of 0 so they don't overlap
+ # with nonzero wedges
+ if labels is not None:
+ blabels = [blank_labeler(l, value) for
+ l, value in zip(labels, y)]
+ else:
+ blabels = None
+ results = ax.pie(y, labels=blabels, **kwds)
+
+ if kwds.get('autopct', None) is not None:
+ patches, texts, autotexts = results
+ else:
+ patches, texts = results
+ autotexts = []
+
+ if self.fontsize is not None:
+ for t in texts + autotexts:
+ t.set_fontsize(self.fontsize)
+
+ # leglabels is used for legend labels
+ leglabels = labels if labels is not None else idx
+ for p, l in zip(patches, leglabels):
+ self._add_legend_handle(p, l)
+
+
+class BoxPlot(LinePlot):
+ _kind = 'box'
+ _layout_type = 'horizontal'
+
+ _valid_return_types = (None, 'axes', 'dict', 'both')
+ # namedtuple to hold results
+ BP = namedtuple("Boxplot", ['ax', 'lines'])
+
+ def __init__(self, data, return_type='axes', **kwargs):
+ # Do not call LinePlot.__init__ which may fill nan
+ if return_type not in self._valid_return_types:
+ raise ValueError(
+ "return_type must be {None, 'axes', 'dict', 'both'}")
+
+ self.return_type = return_type
+ MPLPlot.__init__(self, data, **kwargs)
+
+ def _args_adjust(self):
+ if self.subplots:
+            # Disable label ax sharing. Otherwise, all subplots show the
+            # last column label
+ if self.orientation == 'vertical':
+ self.sharex = False
+ else:
+ self.sharey = False
+
+ @classmethod
+ def _plot(cls, ax, y, column_num=None, return_type='axes', **kwds):
+ if y.ndim == 2:
+ y = [remove_na_arraylike(v) for v in y]
+ # Boxplot fails with empty arrays, so need to add a NaN
+ # if any cols are empty
+ # GH 8181
+ y = [v if v.size > 0 else np.array([np.nan]) for v in y]
+ else:
+ y = remove_na_arraylike(y)
+ bp = ax.boxplot(y, **kwds)
+
+ if return_type == 'dict':
+ return bp, bp
+ elif return_type == 'both':
+ return cls.BP(ax=ax, lines=bp), bp
+ else:
+ return ax, bp
+
+ def _validate_color_args(self):
+ if 'color' in self.kwds:
+ if self.colormap is not None:
+ warnings.warn("'color' and 'colormap' cannot be used "
+ "simultaneously. Using 'color'")
+ self.color = self.kwds.pop('color')
+
+ if isinstance(self.color, dict):
+ valid_keys = ['boxes', 'whiskers', 'medians', 'caps']
+ for key, values in compat.iteritems(self.color):
+ if key not in valid_keys:
+                        raise ValueError("color dict contains invalid "
+                                         "key '{0}'. "
+                                         "The key must be one of {1}"
+                                         .format(key, valid_keys))
+ else:
+ self.color = None
+
+ # get standard colors for default
+ colors = _get_standard_colors(num_colors=3,
+ colormap=self.colormap,
+ color=None)
+ # use 2 colors by default, for box/whisker and median
+        # flier colors aren't needed here
+        # because they can be specified by the ``sym`` kw
+ self._boxes_c = colors[0]
+ self._whiskers_c = colors[0]
+ self._medians_c = colors[2]
+ self._caps_c = 'k' # mpl default
+
+ def _get_colors(self, num_colors=None, color_kwds='color'):
+ pass
+
+ def maybe_color_bp(self, bp):
+ if isinstance(self.color, dict):
+ boxes = self.color.get('boxes', self._boxes_c)
+ whiskers = self.color.get('whiskers', self._whiskers_c)
+ medians = self.color.get('medians', self._medians_c)
+ caps = self.color.get('caps', self._caps_c)
+ else:
+ # Other types are forwarded to matplotlib
+ # If None, use default colors
+ boxes = self.color or self._boxes_c
+ whiskers = self.color or self._whiskers_c
+ medians = self.color or self._medians_c
+ caps = self.color or self._caps_c
+
+ from matplotlib.artist import setp
+ setp(bp['boxes'], color=boxes, alpha=1)
+ setp(bp['whiskers'], color=whiskers, alpha=1)
+ setp(bp['medians'], color=medians, alpha=1)
+ setp(bp['caps'], color=caps, alpha=1)
+
+ def _make_plot(self):
+ if self.subplots:
+ from pandas.core.series import Series
+ self._return_obj = Series()
+
+ for i, (label, y) in enumerate(self._iter_data()):
+ ax = self._get_ax(i)
+ kwds = self.kwds.copy()
+
+ ret, bp = self._plot(ax, y, column_num=i,
+ return_type=self.return_type, **kwds)
+ self.maybe_color_bp(bp)
+ self._return_obj[label] = ret
+
+ label = [pprint_thing(label)]
+ self._set_ticklabels(ax, label)
+ else:
+ y = self.data.values.T
+ ax = self._get_ax(0)
+ kwds = self.kwds.copy()
+
+ ret, bp = self._plot(ax, y, column_num=0,
+ return_type=self.return_type, **kwds)
+ self.maybe_color_bp(bp)
+ self._return_obj = ret
+
+ labels = [l for l, _ in self._iter_data()]
+ labels = [pprint_thing(l) for l in labels]
+ if not self.use_index:
+ labels = [pprint_thing(key) for key in range(len(labels))]
+ self._set_ticklabels(ax, labels)
+
+ def _set_ticklabels(self, ax, labels):
+ if self.orientation == 'vertical':
+ ax.set_xticklabels(labels)
+ else:
+ ax.set_yticklabels(labels)
+
+ def _make_legend(self):
+ pass
+
+ def _post_plot_logic(self, ax, data):
+ pass
+
+ @property
+ def orientation(self):
+ if self.kwds.get('vert', True):
+ return 'vertical'
+ else:
+ return 'horizontal'
+
+ @property
+ def result(self):
+ if self.return_type is None:
+ return super(BoxPlot, self).result
+ else:
+ return self._return_obj
+
+
+# kinds supported by both dataframe and series
+_common_kinds = ['line', 'bar', 'barh',
+ 'kde', 'density', 'area', 'hist', 'box']
+# kinds supported by dataframe
+_dataframe_kinds = ['scatter', 'hexbin']
+# kinds supported only by series or dataframe single column
+_series_kinds = ['pie']
+_all_kinds = _common_kinds + _dataframe_kinds + _series_kinds
+
+_klasses = [LinePlot, BarPlot, BarhPlot, KdePlot, HistPlot, BoxPlot,
+ ScatterPlot, HexBinPlot, AreaPlot, PiePlot]
+
+_plot_klass = {klass._kind: klass for klass in _klasses}
+
+
+def _plot(data, x=None, y=None, subplots=False,
+ ax=None, kind='line', **kwds):
+ kind = _get_standard_kind(kind.lower().strip())
+ if kind in _all_kinds:
+ klass = _plot_klass[kind]
+ else:
+ raise ValueError("%r is not a valid plot kind" % kind)
+
+ if kind in _dataframe_kinds:
+ if isinstance(data, ABCDataFrame):
+ plot_obj = klass(data, x=x, y=y, subplots=subplots, ax=ax,
+ kind=kind, **kwds)
+ else:
+ raise ValueError("plot kind %r can only be used for data frames"
+ % kind)
+
+ elif kind in _series_kinds:
+ if isinstance(data, ABCDataFrame):
+ if y is None and subplots is False:
+ msg = "{0} requires either y column or 'subplots=True'"
+ raise ValueError(msg.format(kind))
+ elif y is not None:
+ if is_integer(y) and not data.columns.holds_integer():
+ y = data.columns[y]
+                # the selected column is converted to a Series here;
+                # copy so the original data is not modified
+ data = data[y].copy()
+ data.index.name = y
+ plot_obj = klass(data, subplots=subplots, ax=ax, kind=kind, **kwds)
+ else:
+ if isinstance(data, ABCDataFrame):
+ data_cols = data.columns
+ if x is not None:
+ if is_integer(x) and not data.columns.holds_integer():
+ x = data_cols[x]
+ elif not isinstance(data[x], ABCSeries):
+ raise ValueError("x must be a label or position")
+ data = data.set_index(x)
+
+ if y is not None:
+ # check if we have y as int or list of ints
+ int_ylist = is_list_like(y) and all(is_integer(c) for c in y)
+ int_y_arg = is_integer(y) or int_ylist
+ if int_y_arg and not data.columns.holds_integer():
+ y = data_cols[y]
+
+ label_kw = kwds['label'] if 'label' in kwds else False
+ for kw in ['xerr', 'yerr']:
+ if (kw in kwds) and \
+ (isinstance(kwds[kw], string_types) or
+ is_integer(kwds[kw])):
+ try:
+ kwds[kw] = data[kwds[kw]]
+ except (IndexError, KeyError, TypeError):
+ pass
+
+ # don't overwrite
+ data = data[y].copy()
+
+ if isinstance(data, ABCSeries):
+ label_name = label_kw or y
+ data.name = label_name
+ else:
+ match = is_list_like(label_kw) and len(label_kw) == len(y)
+ if label_kw and not match:
+ raise ValueError(
+ "label should be list-like and same length as y"
+ )
+ label_name = label_kw or data.columns
+ data.columns = label_name
+
+ plot_obj = klass(data, subplots=subplots, ax=ax, kind=kind, **kwds)
+
+ plot_obj.generate()
+ plot_obj.draw()
+ return plot_obj.result
+
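+# Hedged usage sketch (not executed): how ``_plot`` above is reached through
+# the public entry points defined later in this module. The frame ``df`` is
+# hypothetical and assumes ``import pandas as pd`` plus a working matplotlib.
+#
+#     df = pd.DataFrame({'a': [1, 2, 3], 'b': [3, 1, 2]})
+#     df.plot(kind='scatter', x='a', y='b')   # dispatched to ScatterPlot
+#     df.plot(kind='hexbin', x='a', y='b')    # dispatched to HexBinPlot
+#     df['a'].plot(kind='kde')                # Series path, KdePlot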
+
+df_kind = """- 'scatter' : scatter plot
+ - 'hexbin' : hexbin plot"""
+series_kind = ""
+
+df_coord = """x : label or position, default None
+ y : label, position or list of label, positions, default None
+ Allows plotting of one column versus another"""
+series_coord = ""
+
+df_unique = """stacked : boolean, default False in line and
+ bar plots, and True in area plot. If True, create stacked plot.
+ sort_columns : boolean, default False
+ Sort column names to determine plot ordering
+ secondary_y : boolean or sequence, default False
+ Whether to plot on the secondary y-axis
+ If a list/tuple, which columns to plot on secondary y-axis"""
+series_unique = """label : label argument to provide to plot
+ secondary_y : boolean or sequence of ints, default False
+ If True then y-axis will be on the right"""
+
+df_ax = """ax : matplotlib axes object, default None
+ subplots : boolean, default False
+ Make separate subplots for each column
+ sharex : boolean, default True if ax is None else False
+ In case subplots=True, share x axis and set some x axis labels to
+ invisible; defaults to True if ax is None otherwise False if an ax
+        is passed in; be aware that passing in both an ax and sharex=True
+        will alter all x axis labels for all axes in a figure!
+ sharey : boolean, default False
+ In case subplots=True, share y axis and set some y axis labels to
+ invisible
+ layout : tuple (optional)
+ (rows, columns) for the layout of subplots"""
+series_ax = """ax : matplotlib axes object
+ If not passed, uses gca()"""
+
+df_note = """- If `kind` = 'scatter' and the argument `c` is the name of a dataframe
+ column, the values of that column are used to color each point.
+ - If `kind` = 'hexbin', you can control the size of the bins with the
+ `gridsize` argument. By default, a histogram of the counts around each
+ `(x, y)` point is computed. You can specify alternative aggregations
+ by passing values to the `C` and `reduce_C_function` arguments.
+ `C` specifies the value at each `(x, y)` point and `reduce_C_function`
+ is a function of one argument that reduces all the values in a bin to
+ a single number (e.g. `mean`, `max`, `sum`, `std`)."""
+series_note = ""
+
+_shared_doc_df_kwargs = dict(klass='DataFrame', klass_obj='df',
+ klass_kind=df_kind, klass_coord=df_coord,
+ klass_ax=df_ax, klass_unique=df_unique,
+ klass_note=df_note)
+_shared_doc_series_kwargs = dict(klass='Series', klass_obj='s',
+ klass_kind=series_kind,
+ klass_coord=series_coord, klass_ax=series_ax,
+ klass_unique=series_unique,
+ klass_note=series_note)
+
+_shared_docs['plot'] = """
+ Make plots of %(klass)s using matplotlib / pylab.
+
+ *New in version 0.17.0:* Each plot kind has a corresponding method on the
+ ``%(klass)s.plot`` accessor:
+ ``%(klass_obj)s.plot(kind='line')`` is equivalent to
+ ``%(klass_obj)s.plot.line()``.
+
+ Parameters
+ ----------
+ data : %(klass)s
+ %(klass_coord)s
+ kind : str
+ - 'line' : line plot (default)
+ - 'bar' : vertical bar plot
+ - 'barh' : horizontal bar plot
+ - 'hist' : histogram
+ - 'box' : boxplot
+ - 'kde' : Kernel Density Estimation plot
+ - 'density' : same as 'kde'
+ - 'area' : area plot
+ - 'pie' : pie plot
+ %(klass_kind)s
+ %(klass_ax)s
+ figsize : a tuple (width, height) in inches
+ use_index : boolean, default True
+ Use index as ticks for x axis
+ title : string or list
+ Title to use for the plot. If a string is passed, print the string at
+ the top of the figure. If a list is passed and `subplots` is True,
+ print each item in the list above the corresponding subplot.
+ grid : boolean, default None (matlab style default)
+ Axis grid lines
+ legend : False/True/'reverse'
+ Place legend on axis subplots
+ style : list or dict
+ matplotlib line style per column
+ logx : boolean, default False
+ Use log scaling on x axis
+ logy : boolean, default False
+ Use log scaling on y axis
+ loglog : boolean, default False
+ Use log scaling on both x and y axes
+ xticks : sequence
+ Values to use for the xticks
+ yticks : sequence
+ Values to use for the yticks
+ xlim : 2-tuple/list
+ ylim : 2-tuple/list
+ rot : int, default None
+ Rotation for ticks (xticks for vertical, yticks for horizontal plots)
+ fontsize : int, default None
+ Font size for xticks and yticks
+ colormap : str or matplotlib colormap object, default None
+ Colormap to select colors from. If string, load colormap with that name
+ from matplotlib.
+ colorbar : boolean, optional
+ If True, plot colorbar (only relevant for 'scatter' and 'hexbin' plots)
+ position : float
+ Specify relative alignments for bar plot layout.
+ From 0 (left/bottom-end) to 1 (right/top-end). Default is 0.5 (center)
+ table : boolean, Series or DataFrame, default False
+ If True, draw a table using the data in the DataFrame and the data will
+ be transposed to meet matplotlib's default layout.
+ If a Series or DataFrame is passed, use passed data to draw a table.
+ yerr : DataFrame, Series, array-like, dict and str
+ See :ref:`Plotting with Error Bars <visualization.errorbars>` for
+ detail.
+ xerr : same types as yerr.
+ %(klass_unique)s
+ mark_right : boolean, default True
+ When using a secondary_y axis, automatically mark the column
+ labels with "(right)" in the legend
+ `**kwds` : keywords
+ Options to pass to matplotlib plotting method
+
+ Returns
+ -------
+ axes : :class:`matplotlib.axes.Axes` or numpy.ndarray of them
+
+ Notes
+ -----
+
+ - See matplotlib documentation online for more on this subject
+ - If `kind` = 'bar' or 'barh', you can specify relative alignments
+ for bar plot layout by `position` keyword.
+ From 0 (left/bottom-end) to 1 (right/top-end). Default is 0.5 (center)
+ %(klass_note)s
+ """
+
+
+@Appender(_shared_docs['plot'] % _shared_doc_df_kwargs)
+def plot_frame(data, x=None, y=None, kind='line', ax=None,
+ subplots=False, sharex=None, sharey=False, layout=None,
+ figsize=None, use_index=True, title=None, grid=None,
+ legend=True, style=None, logx=False, logy=False, loglog=False,
+ xticks=None, yticks=None, xlim=None, ylim=None,
+ rot=None, fontsize=None, colormap=None, table=False,
+ yerr=None, xerr=None,
+ secondary_y=False, sort_columns=False,
+ **kwds):
+ return _plot(data, kind=kind, x=x, y=y, ax=ax,
+ subplots=subplots, sharex=sharex, sharey=sharey,
+ layout=layout, figsize=figsize, use_index=use_index,
+ title=title, grid=grid, legend=legend,
+ style=style, logx=logx, logy=logy, loglog=loglog,
+ xticks=xticks, yticks=yticks, xlim=xlim, ylim=ylim,
+ rot=rot, fontsize=fontsize, colormap=colormap, table=table,
+ yerr=yerr, xerr=xerr,
+ secondary_y=secondary_y, sort_columns=sort_columns,
+ **kwds)
+
+
+@Appender(_shared_docs['plot'] % _shared_doc_series_kwargs)
+def plot_series(data, kind='line', ax=None, # Series unique
+ figsize=None, use_index=True, title=None, grid=None,
+ legend=False, style=None, logx=False, logy=False, loglog=False,
+ xticks=None, yticks=None, xlim=None, ylim=None,
+ rot=None, fontsize=None, colormap=None, table=False,
+ yerr=None, xerr=None,
+ label=None, secondary_y=False, # Series unique
+ **kwds):
+
+ import matplotlib.pyplot as plt
+ if ax is None and len(plt.get_fignums()) > 0:
+ ax = _gca()
+ ax = MPLPlot._get_ax_layer(ax)
+ return _plot(data, kind=kind, ax=ax,
+ figsize=figsize, use_index=use_index, title=title,
+ grid=grid, legend=legend,
+ style=style, logx=logx, logy=logy, loglog=loglog,
+ xticks=xticks, yticks=yticks, xlim=xlim, ylim=ylim,
+ rot=rot, fontsize=fontsize, colormap=colormap, table=table,
+ yerr=yerr, xerr=xerr,
+ label=label, secondary_y=secondary_y,
+ **kwds)
+
+
+_shared_docs['boxplot'] = """
+ Make a box plot from DataFrame columns.
+
+ Make a box-and-whisker plot from DataFrame columns, optionally grouped
+ by some other columns. A box plot is a method for graphically depicting
+ groups of numerical data through their quartiles.
+ The box extends from the Q1 to Q3 quartile values of the data,
+ with a line at the median (Q2). The whiskers extend from the edges
+ of box to show the range of the data. The position of the whiskers
+ is set by default to `1.5 * IQR (IQR = Q3 - Q1)` from the edges of the box.
+ Outlier points are those past the end of the whiskers.
+
+ For further details see
+ Wikipedia's entry for `boxplot <https://en.wikipedia.org/wiki/Box_plot>`_.
+
+ Parameters
+ ----------
+ column : str or list of str, optional
+ Column name or list of names, or vector.
+ Can be any valid input to :meth:`pandas.DataFrame.groupby`.
+ by : str or array-like, optional
+ Column in the DataFrame to :meth:`pandas.DataFrame.groupby`.
+ One box-plot will be done per value of columns in `by`.
+ ax : object of class matplotlib.axes.Axes, optional
+ The matplotlib axes to be used by boxplot.
+ fontsize : float or str
+ Tick label font size in points or as a string (e.g., `large`).
+ rot : int or float, default 0
+ The rotation angle of labels (in degrees)
+ with respect to the screen coordinate system.
+ grid : boolean, default True
+ Setting this to True will show the grid.
+ figsize : A tuple (width, height) in inches
+ The size of the figure to create in matplotlib.
+ layout : tuple (rows, columns), optional
+ For example, (3, 5) will display the subplots
+ using 3 columns and 5 rows, starting from the top-left.
+ return_type : {'axes', 'dict', 'both'} or None, default 'axes'
+ The kind of object to return. The default is ``axes``.
+
+ * 'axes' returns the matplotlib axes the boxplot is drawn on.
+ * 'dict' returns a dictionary whose values are the matplotlib
+ Lines of the boxplot.
+ * 'both' returns a namedtuple with the axes and dict.
+ * when grouping with ``by``, a Series mapping columns to
+ ``return_type`` is returned.
+
+ If ``return_type`` is `None`, a NumPy array
+ of axes with the same shape as ``layout`` is returned.
+ **kwds
+ All other plotting keyword arguments to be passed to
+ :func:`matplotlib.pyplot.boxplot`.
+
+ Returns
+ -------
+ result :
+
+ The return type depends on the `return_type` parameter:
+
+ * 'axes' : object of class matplotlib.axes.Axes
+ * 'dict' : dict of matplotlib.lines.Line2D objects
+ * 'both' : a namedtuple with structure (ax, lines)
+
+ For data grouped with ``by``:
+
+ * :class:`~pandas.Series`
+ * :class:`~numpy.array` (for ``return_type = None``)
+
+ See Also
+ --------
+ Series.plot.hist: Make a histogram.
+ matplotlib.pyplot.boxplot : Matplotlib equivalent plot.
+
+ Notes
+ -----
+ Use ``return_type='dict'`` when you want to tweak the appearance
+ of the lines after plotting. In this case a dict containing the Lines
+ making up the boxes, caps, fliers, medians, and whiskers is returned.
+
+ Examples
+ --------
+
+ Boxplots can be created for every column in the dataframe
+ by ``df.boxplot()`` or indicating the columns to be used:
+
+ .. plot::
+ :context: close-figs
+
+ >>> np.random.seed(1234)
+ >>> df = pd.DataFrame(np.random.randn(10,4),
+ ... columns=['Col1', 'Col2', 'Col3', 'Col4'])
+ >>> boxplot = df.boxplot(column=['Col1', 'Col2', 'Col3'])
+
+ Boxplots of variables distributions grouped by the values of a third
+ variable can be created using the option ``by``. For instance:
+
+ .. plot::
+ :context: close-figs
+
+ >>> df = pd.DataFrame(np.random.randn(10, 2),
+ ... columns=['Col1', 'Col2'])
+ >>> df['X'] = pd.Series(['A', 'A', 'A', 'A', 'A',
+ ... 'B', 'B', 'B', 'B', 'B'])
+ >>> boxplot = df.boxplot(by='X')
+
+ A list of strings (i.e. ``['X', 'Y']``) can be passed to boxplot
+ in order to group the data by combination of the variables in the x-axis:
+
+ .. plot::
+ :context: close-figs
+
+ >>> df = pd.DataFrame(np.random.randn(10,3),
+ ... columns=['Col1', 'Col2', 'Col3'])
+ >>> df['X'] = pd.Series(['A', 'A', 'A', 'A', 'A',
+ ... 'B', 'B', 'B', 'B', 'B'])
+ >>> df['Y'] = pd.Series(['A', 'B', 'A', 'B', 'A',
+ ... 'B', 'A', 'B', 'A', 'B'])
+ >>> boxplot = df.boxplot(column=['Col1', 'Col2'], by=['X', 'Y'])
+
+ The layout of boxplot can be adjusted giving a tuple to ``layout``:
+
+ .. plot::
+ :context: close-figs
+
+ >>> boxplot = df.boxplot(column=['Col1', 'Col2'], by='X',
+ ... layout=(2, 1))
+
+ Additional formatting can be done to the boxplot, like suppressing the grid
+ (``grid=False``), rotating the labels in the x-axis (i.e. ``rot=45``)
+ or changing the fontsize (i.e. ``fontsize=15``):
+
+ .. plot::
+ :context: close-figs
+
+ >>> boxplot = df.boxplot(grid=False, rot=45, fontsize=15)
+
+ The parameter ``return_type`` can be used to select the type of element
+ returned by `boxplot`. When ``return_type='axes'`` is selected,
+ the matplotlib axes on which the boxplot is drawn are returned:
+
+ >>> boxplot = df.boxplot(column=['Col1','Col2'], return_type='axes')
+ >>> type(boxplot)
+ <class 'matplotlib.axes._subplots.AxesSubplot'>
+
+ When grouping with ``by``, a Series mapping columns to ``return_type``
+ is returned:
+
+ >>> boxplot = df.boxplot(column=['Col1', 'Col2'], by='X',
+ ... return_type='axes')
+ >>> type(boxplot)
+ <class 'pandas.core.series.Series'>
+
+ If ``return_type`` is `None`, a NumPy array of axes with the same shape
+ as ``layout`` is returned:
+
+ >>> boxplot = df.boxplot(column=['Col1', 'Col2'], by='X',
+ ... return_type=None)
+ >>> type(boxplot)
+ <class 'numpy.ndarray'>
+ """
+
+
+@Appender(_shared_docs['boxplot'] % _shared_doc_kwargs)
+def boxplot(data, column=None, by=None, ax=None, fontsize=None,
+ rot=0, grid=True, figsize=None, layout=None, return_type=None,
+ **kwds):
+
+ # validate return_type:
+ if return_type not in BoxPlot._valid_return_types:
+ raise ValueError("return_type must be {'axes', 'dict', 'both'}")
+
+ if isinstance(data, ABCSeries):
+ data = data.to_frame('x')
+ column = 'x'
+
+ def _get_colors():
+ # num_colors=3 is required as method maybe_color_bp takes the colors
+ # in positions 0 and 2.
+ return _get_standard_colors(color=kwds.get('color'), num_colors=3)
+
+ def maybe_color_bp(bp):
+ if 'color' not in kwds:
+ from matplotlib.artist import setp
+ setp(bp['boxes'], color=colors[0], alpha=1)
+ setp(bp['whiskers'], color=colors[0], alpha=1)
+ setp(bp['medians'], color=colors[2], alpha=1)
+
+ def plot_group(keys, values, ax):
+ keys = [pprint_thing(x) for x in keys]
+ values = [np.asarray(remove_na_arraylike(v)) for v in values]
+ bp = ax.boxplot(values, **kwds)
+ if fontsize is not None:
+ ax.tick_params(axis='both', labelsize=fontsize)
+ if kwds.get('vert', 1):
+ ax.set_xticklabels(keys, rotation=rot)
+ else:
+ ax.set_yticklabels(keys, rotation=rot)
+ maybe_color_bp(bp)
+
+ # Return axes in multiplot case, maybe revisit later # 985
+ if return_type == 'dict':
+ return bp
+ elif return_type == 'both':
+ return BoxPlot.BP(ax=ax, lines=bp)
+ else:
+ return ax
+
+ colors = _get_colors()
+ if column is None:
+ columns = None
+ else:
+ if isinstance(column, (list, tuple)):
+ columns = column
+ else:
+ columns = [column]
+
+ if by is not None:
+ # Prefer array return type for 2-D plots to match the subplot layout
+ # https://github.com/pandas-dev/pandas/pull/12216#issuecomment-241175580
+ result = _grouped_plot_by_column(plot_group, data, columns=columns,
+ by=by, grid=grid, figsize=figsize,
+ ax=ax, layout=layout,
+ return_type=return_type)
+ else:
+ if return_type is None:
+ return_type = 'axes'
+ if layout is not None:
+ raise ValueError("The 'layout' keyword is not supported when "
+ "'by' is None")
+
+ if ax is None:
+ rc = {'figure.figsize': figsize} if figsize is not None else {}
+ ax = _gca(rc)
+ data = data._get_numeric_data()
+ if columns is None:
+ columns = data.columns
+ else:
+ data = data[columns]
+
+ result = plot_group(columns, data.values.T, ax)
+ ax.grid(grid)
+
+ return result
+
+
+@Appender(_shared_docs['boxplot'] % _shared_doc_kwargs)
+def boxplot_frame(self, column=None, by=None, ax=None, fontsize=None, rot=0,
+ grid=True, figsize=None, layout=None,
+ return_type=None, **kwds):
+ import matplotlib.pyplot as plt
+ _converter._WARN = False
+ ax = boxplot(self, column=column, by=by, ax=ax, fontsize=fontsize,
+ grid=grid, rot=rot, figsize=figsize, layout=layout,
+ return_type=return_type, **kwds)
+ plt.draw_if_interactive()
+ return ax
+
+
+def scatter_plot(data, x, y, by=None, ax=None, figsize=None, grid=False,
+ **kwargs):
+ """
+ Make a scatter plot from two DataFrame columns
+
+ Parameters
+ ----------
+ data : DataFrame
+ x : Column name for the x-axis values
+ y : Column name for the y-axis values
+ ax : Matplotlib axis object
+ figsize : A tuple (width, height) in inches
+ grid : Setting this to True will show the grid
+ kwargs : other plotting keyword arguments
+ To be passed to scatter function
+
+ Returns
+ -------
+ fig : matplotlib.Figure
+ """
+ import matplotlib.pyplot as plt
+
+ kwargs.setdefault('edgecolors', 'none')
+
+ def plot_group(group, ax):
+ xvals = group[x].values
+ yvals = group[y].values
+ ax.scatter(xvals, yvals, **kwargs)
+ ax.grid(grid)
+
+ if by is not None:
+ fig = _grouped_plot(plot_group, data, by=by, figsize=figsize, ax=ax)
+ else:
+ if ax is None:
+ fig = plt.figure()
+ ax = fig.add_subplot(111)
+ else:
+ fig = ax.get_figure()
+ plot_group(data, ax)
+ ax.set_ylabel(pprint_thing(y))
+ ax.set_xlabel(pprint_thing(x))
+
+ ax.grid(grid)
+
+ return fig
+
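+# Hedged usage sketch for ``scatter_plot`` above (column names are
+# illustrative only; assumes matplotlib is installed):
+#
+#     df = pd.DataFrame({'height': [1.2, 1.5, 1.7], 'weight': [40, 55, 70]})
+#     fig = scatter_plot(df, x='height', y='weight', grid=True)
+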
+
+def hist_frame(data, column=None, by=None, grid=True, xlabelsize=None,
+ xrot=None, ylabelsize=None, yrot=None, ax=None, sharex=False,
+ sharey=False, figsize=None, layout=None, bins=10, **kwds):
+ """
+    Make a histogram of the DataFrame's columns.
+
+ A `histogram`_ is a representation of the distribution of data.
+ This function calls :meth:`matplotlib.pyplot.hist`, on each series in
+ the DataFrame, resulting in one histogram per column.
+
+ .. _histogram: https://en.wikipedia.org/wiki/Histogram
+
+ Parameters
+ ----------
+ data : DataFrame
+ The pandas object holding the data.
+ column : string or sequence
+ If passed, will be used to limit data to a subset of columns.
+ by : object, optional
+ If passed, then used to form histograms for separate groups.
+ grid : boolean, default True
+ Whether to show axis grid lines.
+ xlabelsize : int, default None
+ If specified changes the x-axis label size.
+ xrot : float, default None
+ Rotation of x axis labels. For example, a value of 90 displays the
+ x labels rotated 90 degrees clockwise.
+ ylabelsize : int, default None
+ If specified changes the y-axis label size.
+ yrot : float, default None
+ Rotation of y axis labels. For example, a value of 90 displays the
+ y labels rotated 90 degrees clockwise.
+ ax : Matplotlib axes object, default None
+ The axes to plot the histogram on.
+ sharex : boolean, default True if ax is None else False
+ In case subplots=True, share x axis and set some x axis labels to
+ invisible; defaults to True if ax is None otherwise False if an ax
+ is passed in.
+ Note that passing in both an ax and sharex=True will alter all x axis
+ labels for all subplots in a figure.
+ sharey : boolean, default False
+ In case subplots=True, share y axis and set some y axis labels to
+ invisible.
+ figsize : tuple
+ The size in inches of the figure to create. Uses the value in
+ `matplotlib.rcParams` by default.
+ layout : tuple, optional
+ Tuple of (rows, columns) for the layout of the histograms.
+ bins : integer or sequence, default 10
+ Number of histogram bins to be used. If an integer is given, bins + 1
+ bin edges are calculated and returned. If bins is a sequence, gives
+ bin edges, including left edge of first bin and right edge of last
+ bin. In this case, bins is returned unmodified.
+ **kwds
+ All other plotting keyword arguments to be passed to
+ :meth:`matplotlib.pyplot.hist`.
+
+ Returns
+ -------
+ axes : matplotlib.AxesSubplot or numpy.ndarray of them
+
+ See Also
+ --------
+ matplotlib.pyplot.hist : Plot a histogram using matplotlib.
+
+ Examples
+ --------
+
+ .. plot::
+ :context: close-figs
+
+ This example draws a histogram based on the length and width of
+ some animals, displayed in three bins
+
+ >>> df = pd.DataFrame({
+ ... 'length': [1.5, 0.5, 1.2, 0.9, 3],
+ ... 'width': [0.7, 0.2, 0.15, 0.2, 1.1]
+ ... }, index= ['pig', 'rabbit', 'duck', 'chicken', 'horse'])
+ >>> hist = df.hist(bins=3)
+ """
+ _raise_if_no_mpl()
+ _converter._WARN = False
+ if by is not None:
+ axes = grouped_hist(data, column=column, by=by, ax=ax, grid=grid,
+ figsize=figsize, sharex=sharex, sharey=sharey,
+ layout=layout, bins=bins, xlabelsize=xlabelsize,
+ xrot=xrot, ylabelsize=ylabelsize,
+ yrot=yrot, **kwds)
+ return axes
+
+ if column is not None:
+ if not isinstance(column, (list, np.ndarray, ABCIndexClass)):
+ column = [column]
+ data = data[column]
+ data = data._get_numeric_data()
+ naxes = len(data.columns)
+
+ fig, axes = _subplots(naxes=naxes, ax=ax, squeeze=False,
+ sharex=sharex, sharey=sharey, figsize=figsize,
+ layout=layout)
+ _axes = _flatten(axes)
+
+ for i, col in enumerate(com.try_sort(data.columns)):
+ ax = _axes[i]
+ ax.hist(data[col].dropna().values, bins=bins, **kwds)
+ ax.set_title(col)
+ ax.grid(grid)
+
+ _set_ticks_props(axes, xlabelsize=xlabelsize, xrot=xrot,
+ ylabelsize=ylabelsize, yrot=yrot)
+ fig.subplots_adjust(wspace=0.3, hspace=0.3)
+
+ return axes
+
+
+def hist_series(self, by=None, ax=None, grid=True, xlabelsize=None,
+ xrot=None, ylabelsize=None, yrot=None, figsize=None,
+ bins=10, **kwds):
+ """
+ Draw histogram of the input series using matplotlib.
+
+ Parameters
+ ----------
+ by : object, optional
+ If passed, then used to form histograms for separate groups
+ ax : matplotlib axis object
+ If not passed, uses gca()
+ grid : boolean, default True
+ Whether to show axis grid lines
+ xlabelsize : int, default None
+ If specified changes the x-axis label size
+ xrot : float, default None
+ rotation of x axis labels
+ ylabelsize : int, default None
+ If specified changes the y-axis label size
+ yrot : float, default None
+ rotation of y axis labels
+ figsize : tuple, default None
+ figure size in inches by default
+ bins : integer or sequence, default 10
+ Number of histogram bins to be used. If an integer is given, bins + 1
+ bin edges are calculated and returned. If bins is a sequence, gives
+ bin edges, including left edge of first bin and right edge of last
+ bin. In this case, bins is returned unmodified.
+ `**kwds` : keywords
+ To be passed to the actual plotting function
+
+ See Also
+ --------
+ matplotlib.axes.Axes.hist : Plot a histogram using matplotlib.
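+
+    Examples
+    --------
+    A minimal, illustrative call (assumes matplotlib is installed):
+
+    .. plot::
+        :context: close-figs
+
+        >>> s = pd.Series([1, 2, 2, 3, 3, 3])
+        >>> ax = s.hist(bins=3)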
+ """
+ import matplotlib.pyplot as plt
+
+ if by is None:
+ if kwds.get('layout', None) is not None:
+ raise ValueError("The 'layout' keyword is not supported when "
+ "'by' is None")
+ # hack until the plotting interface is a bit more unified
+ fig = kwds.pop('figure', plt.gcf() if plt.get_fignums() else
+ plt.figure(figsize=figsize))
+ if (figsize is not None and tuple(figsize) !=
+ tuple(fig.get_size_inches())):
+ fig.set_size_inches(*figsize, forward=True)
+ if ax is None:
+ ax = fig.gca()
+ elif ax.get_figure() != fig:
+ raise AssertionError('passed axis not bound to passed figure')
+ values = self.dropna().values
+
+ ax.hist(values, bins=bins, **kwds)
+ ax.grid(grid)
+ axes = np.array([ax])
+
+ _set_ticks_props(axes, xlabelsize=xlabelsize, xrot=xrot,
+ ylabelsize=ylabelsize, yrot=yrot)
+
+ else:
+ if 'figure' in kwds:
+ raise ValueError("Cannot pass 'figure' when using the "
+ "'by' argument, since a new 'Figure' instance "
+ "will be created")
+ axes = grouped_hist(self, by=by, ax=ax, grid=grid, figsize=figsize,
+ bins=bins, xlabelsize=xlabelsize, xrot=xrot,
+ ylabelsize=ylabelsize, yrot=yrot, **kwds)
+
+ if hasattr(axes, 'ndim'):
+ if axes.ndim == 1 and len(axes) == 1:
+ return axes[0]
+ return axes
+
+
+def grouped_hist(data, column=None, by=None, ax=None, bins=50, figsize=None,
+ layout=None, sharex=False, sharey=False, rot=90, grid=True,
+ xlabelsize=None, xrot=None, ylabelsize=None, yrot=None,
+ **kwargs):
+ """
+ Grouped histogram
+
+ Parameters
+ ----------
+ data : Series/DataFrame
+ column : object, optional
+ by : object, optional
+ ax : axes, optional
+ bins : int, default 50
+ figsize : tuple, optional
+ layout : optional
+ sharex : boolean, default False
+ sharey : boolean, default False
+ rot : int, default 90
+ grid : bool, default True
+ kwargs : dict, keyword arguments passed to matplotlib.Axes.hist
+
+ Returns
+ -------
+ axes : collection of Matplotlib Axes
+ """
+ _raise_if_no_mpl()
+ _converter._WARN = False
+
+ def plot_group(group, ax):
+ ax.hist(group.dropna().values, bins=bins, **kwargs)
+
+ xrot = xrot or rot
+
+ fig, axes = _grouped_plot(plot_group, data, column=column,
+ by=by, sharex=sharex, sharey=sharey, ax=ax,
+ figsize=figsize, layout=layout, rot=rot)
+
+ _set_ticks_props(axes, xlabelsize=xlabelsize, xrot=xrot,
+ ylabelsize=ylabelsize, yrot=yrot)
+
+ fig.subplots_adjust(bottom=0.15, top=0.9, left=0.1, right=0.9,
+ hspace=0.5, wspace=0.3)
+ return axes
+
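+# Hedged sketch of calling ``grouped_hist`` above (column/group names are
+# hypothetical; assumes numpy and matplotlib are available):
+#
+#     df = pd.DataFrame({'value': np.random.randn(20),
+#                        'group': ['a', 'b'] * 10})
+#     axes = grouped_hist(df, column='value', by='group', bins=10)
+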
+
+def boxplot_frame_groupby(grouped, subplots=True, column=None, fontsize=None,
+ rot=0, grid=True, ax=None, figsize=None,
+ layout=None, sharex=False, sharey=True, **kwds):
+ """
+ Make box plots from DataFrameGroupBy data.
+
+ Parameters
+ ----------
+ grouped : Grouped DataFrame
+ subplots :
+ * ``False`` - no subplots will be used
+ * ``True`` - create a subplot for each group
+ column : column name or list of names, or vector
+ Can be any valid input to groupby
+ fontsize : int or string
+ rot : label rotation angle
+ grid : Setting this to True will show the grid
+ ax : Matplotlib axis object, default None
+ figsize : A tuple (width, height) in inches
+ layout : tuple (optional)
+ (rows, columns) for the layout of the plot
+ sharex : bool, default False
+ Whether x-axes will be shared among subplots
+
+ .. versionadded:: 0.23.1
+ sharey : bool, default True
+ Whether y-axes will be shared among subplots
+
+ .. versionadded:: 0.23.1
+ `**kwds` : Keyword Arguments
+ All other plotting keyword arguments to be passed to
+ matplotlib's boxplot function
+
+ Returns
+ -------
+    dict of key/value = group key/DataFrame.boxplot return value,
+    or DataFrame.boxplot return value in case subplots=False
+
+ Examples
+ --------
+ >>> import itertools
+ >>> tuples = [t for t in itertools.product(range(1000), range(4))]
+ >>> index = pd.MultiIndex.from_tuples(tuples, names=['lvl0', 'lvl1'])
+ >>> data = np.random.randn(len(index),4)
+ >>> df = pd.DataFrame(data, columns=list('ABCD'), index=index)
+ >>>
+ >>> grouped = df.groupby(level='lvl1')
+ >>> boxplot_frame_groupby(grouped)
+ >>>
+ >>> grouped = df.unstack(level='lvl1').groupby(level=0, axis=1)
+ >>> boxplot_frame_groupby(grouped, subplots=False)
+ """
+ _raise_if_no_mpl()
+ _converter._WARN = False
+ if subplots is True:
+ naxes = len(grouped)
+ fig, axes = _subplots(naxes=naxes, squeeze=False,
+ ax=ax, sharex=sharex, sharey=sharey,
+ figsize=figsize, layout=layout)
+ axes = _flatten(axes)
+
+ from pandas.core.series import Series
+ ret = Series()
+ for (key, group), ax in zip(grouped, axes):
+ d = group.boxplot(ax=ax, column=column, fontsize=fontsize,
+ rot=rot, grid=grid, **kwds)
+ ax.set_title(pprint_thing(key))
+ ret.loc[key] = d
+ fig.subplots_adjust(bottom=0.15, top=0.9, left=0.1,
+ right=0.9, wspace=0.2)
+ else:
+ from pandas.core.reshape.concat import concat
+ keys, frames = zip(*grouped)
+ if grouped.axis == 0:
+ df = concat(frames, keys=keys, axis=1)
+ else:
+ if len(frames) > 1:
+ df = frames[0].join(frames[1::])
+ else:
+ df = frames[0]
+ ret = df.boxplot(column=column, fontsize=fontsize, rot=rot,
+ grid=grid, ax=ax, figsize=figsize,
+ layout=layout, **kwds)
+ return ret
+
+
+def _grouped_plot(plotf, data, column=None, by=None, numeric_only=True,
+ figsize=None, sharex=True, sharey=True, layout=None,
+ rot=0, ax=None, **kwargs):
+
+ if figsize == 'default':
+ # allowed to specify mpl default with 'default'
+        warnings.warn("figsize='default' is deprecated. Specify figure "
+                      "size by tuple instead", FutureWarning, stacklevel=4)
+ figsize = None
+
+ grouped = data.groupby(by)
+ if column is not None:
+ grouped = grouped[column]
+
+ naxes = len(grouped)
+ fig, axes = _subplots(naxes=naxes, figsize=figsize,
+ sharex=sharex, sharey=sharey, ax=ax,
+ layout=layout)
+
+ _axes = _flatten(axes)
+
+ for i, (key, group) in enumerate(grouped):
+ ax = _axes[i]
+ if numeric_only and isinstance(group, ABCDataFrame):
+ group = group._get_numeric_data()
+ plotf(group, ax, **kwargs)
+ ax.set_title(pprint_thing(key))
+
+ return fig, axes
+
+
+def _grouped_plot_by_column(plotf, data, columns=None, by=None,
+ numeric_only=True, grid=False,
+ figsize=None, ax=None, layout=None,
+ return_type=None, **kwargs):
+ grouped = data.groupby(by)
+ if columns is None:
+ if not isinstance(by, (list, tuple)):
+ by = [by]
+ columns = data._get_numeric_data().columns.difference(by)
+ naxes = len(columns)
+ fig, axes = _subplots(naxes=naxes, sharex=True, sharey=True,
+ figsize=figsize, ax=ax, layout=layout)
+
+ _axes = _flatten(axes)
+
+ ax_values = []
+
+ for i, col in enumerate(columns):
+ ax = _axes[i]
+ gp_col = grouped[col]
+ keys, values = zip(*gp_col)
+ re_plotf = plotf(keys, values, ax, **kwargs)
+ ax.set_title(col)
+ ax.set_xlabel(pprint_thing(by))
+ ax_values.append(re_plotf)
+ ax.grid(grid)
+
+ from pandas.core.series import Series
+ result = Series(ax_values, index=columns)
+
+ # Return axes in multiplot case, maybe revisit later # 985
+ if return_type is None:
+ result = axes
+
+ byline = by[0] if len(by) == 1 else by
+ fig.suptitle('Boxplot grouped by {byline}'.format(byline=byline))
+ fig.subplots_adjust(bottom=0.15, top=0.9, left=0.1, right=0.9, wspace=0.2)
+
+ return result
+
+
+class BasePlotMethods(PandasObject):
+
+ def __init__(self, data):
+ self._parent = data # can be Series or DataFrame
+
+ def __call__(self, *args, **kwargs):
+ raise NotImplementedError
+
+
+class SeriesPlotMethods(BasePlotMethods):
+ """
+ Series plotting accessor and method.
+
+ Examples
+ --------
+ >>> s.plot.line()
+ >>> s.plot.bar()
+ >>> s.plot.hist()
+
+ Plotting methods can also be accessed by calling the accessor as a method
+ with the ``kind`` argument:
+ ``s.plot(kind='line')`` is equivalent to ``s.plot.line()``
+ """
+
+ def __call__(self, kind='line', ax=None,
+ figsize=None, use_index=True, title=None, grid=None,
+ legend=False, style=None, logx=False, logy=False,
+ loglog=False, xticks=None, yticks=None,
+ xlim=None, ylim=None,
+ rot=None, fontsize=None, colormap=None, table=False,
+ yerr=None, xerr=None,
+ label=None, secondary_y=False, **kwds):
+ return plot_series(self._parent, kind=kind, ax=ax, figsize=figsize,
+ use_index=use_index, title=title, grid=grid,
+ legend=legend, style=style, logx=logx, logy=logy,
+ loglog=loglog, xticks=xticks, yticks=yticks,
+ xlim=xlim, ylim=ylim, rot=rot, fontsize=fontsize,
+ colormap=colormap, table=table, yerr=yerr,
+ xerr=xerr, label=label, secondary_y=secondary_y,
+ **kwds)
+ __call__.__doc__ = plot_series.__doc__
+
+ def line(self, **kwds):
+ """
+ Line plot.
+
+ Parameters
+ ----------
+ `**kwds` : optional
+ Additional keyword arguments are documented in
+ :meth:`pandas.Series.plot`.
+
+ Returns
+ -------
+ axes : :class:`matplotlib.axes.Axes` or numpy.ndarray of them
+
+ Examples
+ --------
+
+ .. plot::
+ :context: close-figs
+
+ >>> s = pd.Series([1, 3, 2])
+ >>> s.plot.line()
+ """
+ return self(kind='line', **kwds)
+
+ def bar(self, **kwds):
+ """
+ Vertical bar plot.
+
+ Parameters
+ ----------
+ `**kwds` : optional
+ Additional keyword arguments are documented in
+ :meth:`pandas.Series.plot`.
+
+ Returns
+ -------
+ axes : :class:`matplotlib.axes.Axes` or numpy.ndarray of them
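+
+        Examples
+        --------
+        A minimal, illustrative example:
+
+        .. plot::
+            :context: close-figs
+
+            >>> s = pd.Series([1, 3, 2], index=['a', 'b', 'c'])
+            >>> ax = s.plot.bar()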
+ """
+ return self(kind='bar', **kwds)
+
+ def barh(self, **kwds):
+ """
+ Horizontal bar plot.
+
+ Parameters
+ ----------
+ `**kwds` : optional
+ Additional keyword arguments are documented in
+ :meth:`pandas.Series.plot`.
+
+ Returns
+ -------
+ axes : :class:`matplotlib.axes.Axes` or numpy.ndarray of them
+ """
+ return self(kind='barh', **kwds)
+
+ def box(self, **kwds):
+ """
+ Boxplot.
+
+ Parameters
+ ----------
+ `**kwds` : optional
+ Additional keyword arguments are documented in
+ :meth:`pandas.Series.plot`.
+
+ Returns
+ -------
+ axes : :class:`matplotlib.axes.Axes` or numpy.ndarray of them
+ """
+ return self(kind='box', **kwds)
+
+ def hist(self, bins=10, **kwds):
+ """
+ Histogram.
+
+ Parameters
+ ----------
+ bins : integer, default 10
+ Number of histogram bins to be used
+ `**kwds` : optional
+ Additional keyword arguments are documented in
+ :meth:`pandas.Series.plot`.
+
+ Returns
+ -------
+ axes : :class:`matplotlib.axes.Axes` or numpy.ndarray of them
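+
+        Examples
+        --------
+        A minimal, illustrative example:
+
+        .. plot::
+            :context: close-figs
+
+            >>> s = pd.Series([1, 2, 2, 3, 3, 3, 4])
+            >>> ax = s.plot.hist(bins=4)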
+ """
+ return self(kind='hist', bins=bins, **kwds)
+
+ @Appender(_kde_docstring % {
+ 'this-datatype': 'Series',
+ 'sibling-datatype': 'DataFrame',
+ 'examples': """
+ Given a Series of points randomly sampled from an unknown
+ distribution, estimate its PDF using KDE with automatic
+ bandwidth determination and plot the results, evaluating them at
+ 1000 equally spaced points (default):
+
+ .. plot::
+ :context: close-figs
+
+ >>> s = pd.Series([1, 2, 2.5, 3, 3.5, 4, 5])
+ >>> ax = s.plot.kde()
+
+ A scalar bandwidth can be specified. Using a small bandwidth value can
+ lead to over-fitting, while using a large bandwidth value may result
+ in under-fitting:
+
+ .. plot::
+ :context: close-figs
+
+ >>> ax = s.plot.kde(bw_method=0.3)
+
+ .. plot::
+ :context: close-figs
+
+ >>> ax = s.plot.kde(bw_method=3)
+
+ Finally, the `ind` parameter determines the evaluation points for the
+ plot of the estimated PDF:
+
+ .. plot::
+ :context: close-figs
+
+ >>> ax = s.plot.kde(ind=[1, 2, 3, 4, 5])
+ """.strip()
+ })
+ def kde(self, bw_method=None, ind=None, **kwds):
+ return self(kind='kde', bw_method=bw_method, ind=ind, **kwds)
+
+ density = kde
+
+ def area(self, **kwds):
+ """
+ Area plot.
+
+ Parameters
+ ----------
+ `**kwds` : optional
+ Additional keyword arguments are documented in
+ :meth:`pandas.Series.plot`.
+
+ Returns
+ -------
+ axes : :class:`matplotlib.axes.Axes` or numpy.ndarray of them
+ """
+ return self(kind='area', **kwds)
+
+ def pie(self, **kwds):
+ """
+ Pie chart.
+
+ Parameters
+ ----------
+ `**kwds` : optional
+ Additional keyword arguments are documented in
+ :meth:`pandas.Series.plot`.
+
+ Returns
+ -------
+ axes : :class:`matplotlib.axes.Axes` or numpy.ndarray of them
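+
+        Examples
+        --------
+        An illustrative example; pie plots require non-negative values:
+
+        .. plot::
+            :context: close-figs
+
+            >>> s = pd.Series([3, 2, 5], index=['a', 'b', 'c'], name='share')
+            >>> ax = s.plot.pie()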
+ """
+ return self(kind='pie', **kwds)
+
+
+class FramePlotMethods(BasePlotMethods):
+ """DataFrame plotting accessor and method
+
+ Examples
+ --------
+ >>> df.plot.line()
+ >>> df.plot.scatter('x', 'y')
+ >>> df.plot.hexbin()
+
+ These plotting methods can also be accessed by calling the accessor as a
+ method with the ``kind`` argument:
+ ``df.plot(kind='line')`` is equivalent to ``df.plot.line()``
+ """
+
+ def __call__(self, x=None, y=None, kind='line', ax=None,
+ subplots=False, sharex=None, sharey=False, layout=None,
+ figsize=None, use_index=True, title=None, grid=None,
+ legend=True, style=None, logx=False, logy=False, loglog=False,
+ xticks=None, yticks=None, xlim=None, ylim=None,
+ rot=None, fontsize=None, colormap=None, table=False,
+ yerr=None, xerr=None,
+ secondary_y=False, sort_columns=False, **kwds):
+ return plot_frame(self._parent, kind=kind, x=x, y=y, ax=ax,
+ subplots=subplots, sharex=sharex, sharey=sharey,
+ layout=layout, figsize=figsize, use_index=use_index,
+ title=title, grid=grid, legend=legend, style=style,
+ logx=logx, logy=logy, loglog=loglog, xticks=xticks,
+ yticks=yticks, xlim=xlim, ylim=ylim, rot=rot,
+ fontsize=fontsize, colormap=colormap, table=table,
+ yerr=yerr, xerr=xerr, secondary_y=secondary_y,
+ sort_columns=sort_columns, **kwds)
+ __call__.__doc__ = plot_frame.__doc__
+
+ def line(self, x=None, y=None, **kwds):
+ """
+ Plot DataFrame columns as lines.
+
+ This function is useful to plot lines using DataFrame's values
+ as coordinates.
+
+ Parameters
+ ----------
+ x : int or str, optional
+ Columns to use for the horizontal axis.
+ Either the location or the label of the columns to be used.
+ By default, it will use the DataFrame indices.
+ y : int, str, or list of them, optional
+ The values to be plotted.
+ Either the location or the label of the columns to be used.
+ By default, it will use the remaining DataFrame numeric columns.
+ **kwds
+ Keyword arguments to pass on to :meth:`pandas.DataFrame.plot`.
+
+ Returns
+ -------
+ axes : :class:`matplotlib.axes.Axes` or :class:`numpy.ndarray`
+ Returns an ndarray when ``subplots=True``.
+
+ See Also
+ --------
+ matplotlib.pyplot.plot : Plot y versus x as lines and/or markers.
+
+ Examples
+ --------
+
+ .. plot::
+ :context: close-figs
+
+ The following example shows the populations for some animals
+ over the years.
+
+ >>> df = pd.DataFrame({
+ ... 'pig': [20, 18, 489, 675, 1776],
+ ... 'horse': [4, 25, 281, 600, 1900]
+ ... }, index=[1990, 1997, 2003, 2009, 2014])
+ >>> lines = df.plot.line()
+
+ .. plot::
+ :context: close-figs
+
+ An example with subplots, so an array of axes is returned.
+
+ >>> axes = df.plot.line(subplots=True)
+ >>> type(axes)
+ <class 'numpy.ndarray'>
+
+ .. plot::
+ :context: close-figs
+
+ The following example shows the relationship between both
+ populations.
+
+ >>> lines = df.plot.line(x='pig', y='horse')
+ """
+ return self(kind='line', x=x, y=y, **kwds)
+
+ def bar(self, x=None, y=None, **kwds):
+ """
+ Vertical bar plot.
+
+ A bar plot is a plot that presents categorical data with
+ rectangular bars with lengths proportional to the values that they
+ represent. A bar plot shows comparisons among discrete categories. One
+ axis of the plot shows the specific categories being compared, and the
+ other axis represents a measured value.
+
+ Parameters
+ ----------
+ x : label or position, optional
+ Allows plotting of one column versus another. If not specified,
+ the index of the DataFrame is used.
+ y : label or position, optional
+ Allows plotting of one column versus another. If not specified,
+ all numerical columns are used.
+ **kwds
+ Additional keyword arguments are documented in
+ :meth:`pandas.DataFrame.plot`.
+
+ Returns
+ -------
+ axes : matplotlib.axes.Axes or np.ndarray of them
+ An ndarray is returned with one :class:`matplotlib.axes.Axes`
+ per column when ``subplots=True``.
+
+ See Also
+ --------
+ pandas.DataFrame.plot.barh : Horizontal bar plot.
+ pandas.DataFrame.plot : Make plots of a DataFrame.
+ matplotlib.pyplot.bar : Make a bar plot with matplotlib.
+
+ Examples
+ --------
+ Basic plot.
+
+ .. plot::
+ :context: close-figs
+
+ >>> df = pd.DataFrame({'lab':['A', 'B', 'C'], 'val':[10, 30, 20]})
+ >>> ax = df.plot.bar(x='lab', y='val', rot=0)
+
+ Plot a whole dataframe to a bar plot. Each column is assigned a
+ distinct color, and each row is nested in a group along the
+ horizontal axis.
+
+ .. plot::
+ :context: close-figs
+
+ >>> speed = [0.1, 17.5, 40, 48, 52, 69, 88]
+ >>> lifespan = [2, 8, 70, 1.5, 25, 12, 28]
+ >>> index = ['snail', 'pig', 'elephant',
+ ... 'rabbit', 'giraffe', 'coyote', 'horse']
+ >>> df = pd.DataFrame({'speed': speed,
+ ... 'lifespan': lifespan}, index=index)
+ >>> ax = df.plot.bar(rot=0)
+
+ Instead of nesting, the figure can be split by column with
+ ``subplots=True``. In this case, a :class:`numpy.ndarray` of
+ :class:`matplotlib.axes.Axes` are returned.
+
+ .. plot::
+ :context: close-figs
+
+ >>> axes = df.plot.bar(rot=0, subplots=True)
+ >>> axes[1].legend(loc=2) # doctest: +SKIP
+
+ Plot a single column.
+
+ .. plot::
+ :context: close-figs
+
+ >>> ax = df.plot.bar(y='speed', rot=0)
+
+ Plot only selected categories for the DataFrame.
+
+ .. plot::
+ :context: close-figs
+
+ >>> ax = df.plot.bar(x='lifespan', rot=0)
+ """
+ return self(kind='bar', x=x, y=y, **kwds)
+
+ def barh(self, x=None, y=None, **kwds):
+ """
+ Make a horizontal bar plot.
+
+ A horizontal bar plot is a plot that presents quantitative data with
+ rectangular bars with lengths proportional to the values that they
+ represent. A bar plot shows comparisons among discrete categories. One
+ axis of the plot shows the specific categories being compared, and the
+ other axis represents a measured value.
+
+ Parameters
+ ----------
+ x : label or position, default DataFrame.index
+ Column to be used for categories.
+ y : label or position, default all numeric columns in the DataFrame
+ Columns to be plotted from the DataFrame.
+ **kwds
+ Keyword arguments to pass on to :meth:`pandas.DataFrame.plot`.
+
+ Returns
+ -------
+ axes : :class:`matplotlib.axes.Axes` or numpy.ndarray of them.
+
+ See Also
+ --------
+ pandas.DataFrame.plot.bar: Vertical bar plot.
+ pandas.DataFrame.plot : Make plots of DataFrame using matplotlib.
+ matplotlib.axes.Axes.bar : Plot a vertical bar plot using matplotlib.
+
+ Examples
+ --------
+ Basic example
+
+ .. plot::
+ :context: close-figs
+
+ >>> df = pd.DataFrame({'lab':['A', 'B', 'C'], 'val':[10, 30, 20]})
+ >>> ax = df.plot.barh(x='lab', y='val')
+
+ Plot a whole DataFrame to a horizontal bar plot
+
+ .. plot::
+ :context: close-figs
+
+ >>> speed = [0.1, 17.5, 40, 48, 52, 69, 88]
+ >>> lifespan = [2, 8, 70, 1.5, 25, 12, 28]
+ >>> index = ['snail', 'pig', 'elephant',
+ ... 'rabbit', 'giraffe', 'coyote', 'horse']
+ >>> df = pd.DataFrame({'speed': speed,
+ ... 'lifespan': lifespan}, index=index)
+ >>> ax = df.plot.barh()
+
+ Plot a column of the DataFrame to a horizontal bar plot
+
+ .. plot::
+ :context: close-figs
+
+ >>> speed = [0.1, 17.5, 40, 48, 52, 69, 88]
+ >>> lifespan = [2, 8, 70, 1.5, 25, 12, 28]
+ >>> index = ['snail', 'pig', 'elephant',
+ ... 'rabbit', 'giraffe', 'coyote', 'horse']
+ >>> df = pd.DataFrame({'speed': speed,
+ ... 'lifespan': lifespan}, index=index)
+ >>> ax = df.plot.barh(y='speed')
+
+ Plot DataFrame versus the desired column
+
+ .. plot::
+ :context: close-figs
+
+ >>> speed = [0.1, 17.5, 40, 48, 52, 69, 88]
+ >>> lifespan = [2, 8, 70, 1.5, 25, 12, 28]
+ >>> index = ['snail', 'pig', 'elephant',
+ ... 'rabbit', 'giraffe', 'coyote', 'horse']
+ >>> df = pd.DataFrame({'speed': speed,
+ ... 'lifespan': lifespan}, index=index)
+ >>> ax = df.plot.barh(x='lifespan')
+ """
+ return self(kind='barh', x=x, y=y, **kwds)
+
+ def box(self, by=None, **kwds):
+ r"""
+ Make a box plot of the DataFrame columns.
+
+ A box plot is a method for graphically depicting groups of numerical
+ data through their quartiles.
+ The box extends from the Q1 to Q3 quartile values of the data,
+ with a line at the median (Q2). The whiskers extend from the edges
+ of box to show the range of the data. The position of the whiskers
+ is set by default to 1.5*IQR (IQR = Q3 - Q1) from the edges of the
+ box. Outlier points are those past the end of the whiskers.
+
+ For further details see Wikipedia's
+ entry for `boxplot <https://en.wikipedia.org/wiki/Box_plot>`__.
+
+ A consideration when using this chart is that the box and the whiskers
+ can overlap, which is very common when plotting small sets of data.
+
+ Parameters
+ ----------
+ by : string or sequence
+ Column in the DataFrame to group by.
+ **kwds : optional
+ Additional keywords are documented in
+ :meth:`pandas.DataFrame.plot`.
+
+ Returns
+ -------
+ axes : :class:`matplotlib.axes.Axes` or numpy.ndarray of them
+
+ See Also
+ --------
+ pandas.DataFrame.boxplot: Another method to draw a box plot.
+ pandas.Series.plot.box: Draw a box plot from a Series object.
+ matplotlib.pyplot.boxplot: Draw a box plot in matplotlib.
+
+ Examples
+ --------
+ Draw a box plot from a DataFrame with four columns of randomly
+ generated data.
+
+ .. plot::
+ :context: close-figs
+
+ >>> data = np.random.randn(25, 4)
+ >>> df = pd.DataFrame(data, columns=list('ABCD'))
+ >>> ax = df.plot.box()
+ """
+ return self(kind='box', by=by, **kwds)
+
+ def hist(self, by=None, bins=10, **kwds):
+ """
+ Draw one histogram of the DataFrame's columns.
+
+ A histogram is a representation of the distribution of data.
+ This function groups the values of all given Series in the DataFrame
+ into bins and draws all bins in one :class:`matplotlib.axes.Axes`.
+ This is useful when the DataFrame's Series are in a similar scale.
+
+ Parameters
+ ----------
+ by : str or sequence, optional
+ Column in the DataFrame to group by.
+ bins : int, default 10
+ Number of histogram bins to be used.
+ **kwds
+ Additional keyword arguments are documented in
+ :meth:`pandas.DataFrame.plot`.
+
+ Returns
+ -------
+ axes : matplotlib.AxesSubplot
+ Histogram drawn onto a single Axes.
+
+ See Also
+ --------
+ DataFrame.hist : Draw histograms per DataFrame's Series.
+ Series.hist : Draw a histogram with Series' data.
+
+ Examples
+ --------
+ When we roll a die 6000 times, we expect to get each value around 1000
+ times. But when we roll two dice and sum the results, the distribution
+ is going to be quite different. A histogram illustrates those
+ distributions.
+
+ .. plot::
+ :context: close-figs
+
+ >>> df = pd.DataFrame(
+ ... np.random.randint(1, 7, 6000),
+ ... columns = ['one'])
+ >>> df['two'] = df['one'] + np.random.randint(1, 7, 6000)
+ >>> ax = df.plot.hist(bins=12, alpha=0.5)
+ """
+ return self(kind='hist', by=by, bins=bins, **kwds)
+
+ @Appender(_kde_docstring % {
+ 'this-datatype': 'DataFrame',
+ 'sibling-datatype': 'Series',
+ 'examples': """
+ Given several Series of points randomly sampled from unknown
+ distributions, estimate their PDFs using KDE with automatic
+ bandwidth determination and plot the results, evaluating them at
+ 1000 equally spaced points (default):
+
+ .. plot::
+ :context: close-figs
+
+ >>> df = pd.DataFrame({
+ ... 'x': [1, 2, 2.5, 3, 3.5, 4, 5],
+ ... 'y': [4, 4, 4.5, 5, 5.5, 6, 6],
+ ... })
+ >>> ax = df.plot.kde()
+
+ A scalar bandwidth can be specified. Using a small bandwidth value can
+ lead to over-fitting, while using a large bandwidth value may result
+ in under-fitting:
+
+ .. plot::
+ :context: close-figs
+
+ >>> ax = df.plot.kde(bw_method=0.3)
+
+ .. plot::
+ :context: close-figs
+
+ >>> ax = df.plot.kde(bw_method=3)
+
+ Finally, the `ind` parameter determines the evaluation points for the
+ plot of the estimated PDF:
+
+ .. plot::
+ :context: close-figs
+
+ >>> ax = df.plot.kde(ind=[1, 2, 3, 4, 5, 6])
+ """.strip()
+ })
+ def kde(self, bw_method=None, ind=None, **kwds):
+ return self(kind='kde', bw_method=bw_method, ind=ind, **kwds)
+
+ density = kde
+
+ def area(self, x=None, y=None, **kwds):
+ """
+ Draw a stacked area plot.
+
+ An area plot displays quantitative data visually.
+ This function wraps the matplotlib area function.
+
+ Parameters
+ ----------
+ x : label or position, optional
+ Coordinates for the X axis. By default uses the index.
+ y : label or position, optional
+ Column to plot. By default uses all columns.
+ stacked : bool, default True
+ Area plots are stacked by default. Set to False to create an
+ unstacked plot.
+ **kwds : optional
+ Additional keyword arguments are documented in
+ :meth:`pandas.DataFrame.plot`.
+
+ Returns
+ -------
+ matplotlib.axes.Axes or numpy.ndarray
+ Area plot, or array of area plots if ``subplots`` is True.
+
+ See Also
+ --------
+ DataFrame.plot : Make plots of DataFrame using matplotlib / pylab.
+
+ Examples
+ --------
+ Draw an area plot based on basic business metrics:
+
+ .. plot::
+ :context: close-figs
+
+ >>> df = pd.DataFrame({
+ ... 'sales': [3, 2, 3, 9, 10, 6],
+ ... 'signups': [5, 5, 6, 12, 14, 13],
+ ... 'visits': [20, 42, 28, 62, 81, 50],
+ ... }, index=pd.date_range(start='2018/01/01', end='2018/07/01',
+ ... freq='M'))
+ >>> ax = df.plot.area()
+
+ Area plots are stacked by default. To produce an unstacked plot,
+ pass ``stacked=False``:
+
+ .. plot::
+ :context: close-figs
+
+ >>> ax = df.plot.area(stacked=False)
+
+ Draw an area plot for a single column:
+
+ .. plot::
+ :context: close-figs
+
+ >>> ax = df.plot.area(y='sales')
+
+ Draw with a different `x`:
+
+ .. plot::
+ :context: close-figs
+
+ >>> df = pd.DataFrame({
+ ... 'sales': [3, 2, 3],
+ ... 'visits': [20, 42, 28],
+ ... 'day': [1, 2, 3],
+ ... })
+ >>> ax = df.plot.area(x='day')
+ """
+ return self(kind='area', x=x, y=y, **kwds)
+
+ def pie(self, y=None, **kwds):
+ """
+ Generate a pie plot.
+
+ A pie plot is a proportional representation of the numerical data in a
+ column. This function wraps :meth:`matplotlib.pyplot.pie` for the
+ specified column. If no column reference is passed and
+ ``subplots=True`` a pie plot is drawn for each numerical column
+ independently.
+
+ Parameters
+ ----------
+ y : int or label, optional
+ Label or position of the column to plot.
+ If not provided, ``subplots=True`` argument must be passed.
+ **kwds
+ Keyword arguments to pass on to :meth:`pandas.DataFrame.plot`.
+
+ Returns
+ -------
+ axes : matplotlib.axes.Axes or np.ndarray of them.
+ A NumPy array is returned when `subplots` is True.
+
+ See Also
+ --------
+ Series.plot.pie : Generate a pie plot for a Series.
+ DataFrame.plot : Make plots of a DataFrame.
+
+ Examples
+ --------
+ In the example below we have a DataFrame with information about
+ planets' masses and radii. We pass the 'mass' column to the
+ pie function to get a pie plot.
+
+ .. plot::
+ :context: close-figs
+
+ >>> df = pd.DataFrame({'mass': [0.330, 4.87 , 5.97],
+ ... 'radius': [2439.7, 6051.8, 6378.1]},
+ ... index=['Mercury', 'Venus', 'Earth'])
+ >>> plot = df.plot.pie(y='mass', figsize=(5, 5))
+
+ .. plot::
+ :context: close-figs
+
+ >>> plot = df.plot.pie(subplots=True, figsize=(6, 3))
+ """
+ return self(kind='pie', y=y, **kwds)
+
+ def scatter(self, x, y, s=None, c=None, **kwds):
+ """
+ Create a scatter plot with varying marker point size and color.
+
+ The coordinates of each point are defined by two dataframe columns and
+ filled circles are used to represent each point. This kind of plot is
+ useful to see complex correlations between two variables. Points could
+ be for instance natural 2D coordinates like longitude and latitude in
+ a map or, in general, any pair of metrics that can be plotted against
+ each other.
+
+ Parameters
+ ----------
+ x : int or str
+ The column name or column position to be used as horizontal
+ coordinates for each point.
+ y : int or str
+ The column name or column position to be used as vertical
+ coordinates for each point.
+ s : scalar or array_like, optional
+ The size of each point. Possible values are:
+
+ - A single scalar so all points have the same size.
+
+ - A sequence of scalars, which will be used for the point sizes in a
+ repeating (cyclic) fashion. For instance, when passing [2, 14], the
+ point sizes will alternate between 2 and 14.
+
+ c : str, int or array_like, optional
+ The color of each point. Possible values are:
+
+ - A single color string referred to by name, RGB or RGBA code,
+ for instance 'red' or '#a98d19'.
+
+ - A sequence of color strings referred to by name, RGB or RGBA
+ code, which will be used for the point colors in a repeating
+ (cyclic) fashion. For instance, passing ['green', 'yellow'] fills
+ the points with green and yellow alternately.
+
+ - A column name or position whose values will be used to color the
+ marker points according to a colormap.
+
+ **kwds
+ Keyword arguments to pass on to :meth:`pandas.DataFrame.plot`.
+
+ Returns
+ -------
+ axes : :class:`matplotlib.axes.Axes` or numpy.ndarray of them
+
+ See Also
+ --------
+ matplotlib.pyplot.scatter : Scatter plot using multiple input data
+ formats.
+
+ Examples
+ --------
+ Let's see how to draw a scatter plot using coordinates from the values
+ in a DataFrame's columns.
+
+ .. plot::
+ :context: close-figs
+
+ >>> df = pd.DataFrame([[5.1, 3.5, 0], [4.9, 3.0, 0], [7.0, 3.2, 1],
+ ... [6.4, 3.2, 1], [5.9, 3.0, 2]],
+ ... columns=['length', 'width', 'species'])
+ >>> ax1 = df.plot.scatter(x='length',
+ ... y='width',
+ ... c='DarkBlue')
+
+ And now with the color determined by a column as well.
+
+ .. plot::
+ :context: close-figs
+
+ >>> ax2 = df.plot.scatter(x='length',
+ ... y='width',
+ ... c='species',
+ ... colormap='viridis')
+ """
+ return self(kind='scatter', x=x, y=y, c=c, s=s, **kwds)
+
+ def hexbin(self, x, y, C=None, reduce_C_function=None, gridsize=None,
+ **kwds):
+ """
+ Generate a hexagonal binning plot.
+
+ Generate a hexagonal binning plot of `x` versus `y`. If `C` is `None`
+ (the default), this is a histogram of the number of occurrences
+ of the observations at ``(x[i], y[i])``.
+
+ If `C` is specified, specifies values at given coordinates
+ ``(x[i], y[i])``. These values are accumulated for each hexagonal
+ bin and then reduced according to `reduce_C_function`,
+ which defaults to NumPy's mean function (:meth:`numpy.mean`).
+ (If `C` is specified, it must also be a 1-D sequence
+ of the same length as `x` and `y`, or a column label.)
+
+ Parameters
+ ----------
+ x : int or str
+ The column label or position for x points.
+ y : int or str
+ The column label or position for y points.
+ C : int or str, optional
+ The column label or position for the value of `(x, y)` point.
+ reduce_C_function : callable, default `np.mean`
+ Function of one argument that reduces all the values in a bin to
+ a single number (e.g. `np.mean`, `np.max`, `np.sum`, `np.std`).
+ gridsize : int or tuple of (int, int), default 100
+ The number of hexagons in the x-direction.
+ The corresponding number of hexagons in the y-direction is
+ chosen in a way that the hexagons are approximately regular.
+ Alternatively, gridsize can be a tuple with two elements
+ specifying the number of hexagons in the x-direction and the
+ y-direction.
+ **kwds
+ Additional keyword arguments are documented in
+ :meth:`pandas.DataFrame.plot`.
+
+ Returns
+ -------
+ matplotlib.AxesSubplot
+ The matplotlib ``Axes`` on which the hexbin is plotted.
+
+ See Also
+ --------
+ DataFrame.plot : Make plots of a DataFrame.
+ matplotlib.pyplot.hexbin : Hexagonal binning plot using matplotlib,
+ the matplotlib function that is used under the hood.
+
+ Examples
+ --------
+ The following examples are generated with random data from
+ a normal distribution.
+
+ .. plot::
+ :context: close-figs
+
+ >>> n = 10000
+ >>> df = pd.DataFrame({'x': np.random.randn(n),
+ ... 'y': np.random.randn(n)})
+ >>> ax = df.plot.hexbin(x='x', y='y', gridsize=20)
+
+ The next example uses `C` and `np.sum` as `reduce_C_function`.
+ Note that the `'observations'` values range from 1 to 5 but the resulting
+ plot shows values above 25. This is because the `reduce_C_function`
+ sums the values that fall into each bin.
+
+ .. plot::
+ :context: close-figs
+
+ >>> n = 500
+ >>> df = pd.DataFrame({
+ ... 'coord_x': np.random.uniform(-3, 3, size=n),
+ ... 'coord_y': np.random.uniform(30, 50, size=n),
+ ... 'observations': np.random.randint(1,5, size=n)
+ ... })
+ >>> ax = df.plot.hexbin(x='coord_x',
+ ... y='coord_y',
+ ... C='observations',
+ ... reduce_C_function=np.sum,
+ ... gridsize=10,
+ ... cmap="viridis")
+ """
+ if reduce_C_function is not None:
+ kwds['reduce_C_function'] = reduce_C_function
+ if gridsize is not None:
+ kwds['gridsize'] = gridsize
+ return self(kind='hexbin', x=x, y=y, C=C, **kwds)
diff --git a/contrib/python/pandas/py2/pandas/plotting/_misc.py b/contrib/python/pandas/py2/pandas/plotting/_misc.py
new file mode 100644
index 00000000000..1c69c03025e
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/plotting/_misc.py
@@ -0,0 +1,640 @@
+# being a bit too dynamic
+# pylint: disable=E1101
+from __future__ import division
+
+import numpy as np
+
+from pandas.compat import lmap, lrange, range, zip
+from pandas.util._decorators import deprecate_kwarg
+
+from pandas.core.dtypes.missing import notna
+
+from pandas.io.formats.printing import pprint_thing
+from pandas.plotting._style import _get_standard_colors
+from pandas.plotting._tools import _set_ticks_props, _subplots
+
+
+def scatter_matrix(frame, alpha=0.5, figsize=None, ax=None, grid=False,
+ diagonal='hist', marker='.', density_kwds=None,
+ hist_kwds=None, range_padding=0.05, **kwds):
+ """
+ Draw a matrix of scatter plots.
+
+ Parameters
+ ----------
+ frame : DataFrame
+ alpha : float, optional
+ Amount of transparency applied.
+ figsize : (float, float), optional
+ A tuple (width, height) in inches.
+ ax : Matplotlib axis object, optional
+ grid : bool, optional
+ Setting this to True will show the grid.
+ diagonal : {'hist', 'kde'}
+ Pick between 'kde' and 'hist' for either Kernel Density Estimation
+ or Histogram plot in the diagonal.
+ marker : str, optional
+ Matplotlib marker type, default '.'.
+ hist_kwds : other plotting keyword arguments
+ To be passed to hist function.
+ density_kwds : other plotting keyword arguments
+ To be passed to kernel density estimate plot.
+ range_padding : float, optional
+ Relative extension of axis range in x and y with respect to
+ (x_max - x_min) or (y_max - y_min), default 0.05.
+ kwds : other plotting keyword arguments
+ To be passed to scatter function.
+
+ Examples
+ --------
+ >>> df = pd.DataFrame(np.random.randn(1000, 4), columns=['A','B','C','D'])
+ >>> scatter_matrix(df, alpha=0.2)
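+
+ The diagonal can also show a kernel density estimate instead of a
+ histogram (illustrative sketch; requires scipy):
+
+ >>> scatter_matrix(df, alpha=0.2, diagonal='kde')  # doctest: +SKIP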
+ """
+
+ df = frame._get_numeric_data()
+ n = df.columns.size
+ naxes = n * n
+ fig, axes = _subplots(naxes=naxes, figsize=figsize, ax=ax,
+ squeeze=False)
+
+ # no gaps between subplots
+ fig.subplots_adjust(wspace=0, hspace=0)
+
+ mask = notna(df)
+
+ marker = _get_marker_compat(marker)
+
+ hist_kwds = hist_kwds or {}
+ density_kwds = density_kwds or {}
+
+ # GH 14855
+ kwds.setdefault('edgecolors', 'none')
+
+ boundaries_list = []
+ for a in df.columns:
+ values = df[a].values[mask[a].values]
+ rmin_, rmax_ = np.min(values), np.max(values)
+ rdelta_ext = (rmax_ - rmin_) * range_padding / 2.
+ boundaries_list.append((rmin_ - rdelta_ext, rmax_ + rdelta_ext))
+
+ for i, a in zip(lrange(n), df.columns):
+ for j, b in zip(lrange(n), df.columns):
+ ax = axes[i, j]
+
+ if i == j:
+ values = df[a].values[mask[a].values]
+
+ # Deal with the diagonal by drawing a histogram there.
+ if diagonal == 'hist':
+ ax.hist(values, **hist_kwds)
+
+ elif diagonal in ('kde', 'density'):
+ from scipy.stats import gaussian_kde
+ y = values
+ gkde = gaussian_kde(y)
+ ind = np.linspace(y.min(), y.max(), 1000)
+ ax.plot(ind, gkde.evaluate(ind), **density_kwds)
+
+ ax.set_xlim(boundaries_list[i])
+
+ else:
+ common = (mask[a] & mask[b]).values
+
+ ax.scatter(df[b][common], df[a][common],
+ marker=marker, alpha=alpha, **kwds)
+
+ ax.set_xlim(boundaries_list[j])
+ ax.set_ylim(boundaries_list[i])
+
+ ax.set_xlabel(b)
+ ax.set_ylabel(a)
+
+ if j != 0:
+ ax.yaxis.set_visible(False)
+ if i != n - 1:
+ ax.xaxis.set_visible(False)
+
+ if len(df.columns) > 1:
+ lim1 = boundaries_list[0]
+ locs = axes[0][1].yaxis.get_majorticklocs()
+ locs = locs[(lim1[0] <= locs) & (locs <= lim1[1])]
+ adj = (locs - lim1[0]) / (lim1[1] - lim1[0])
+
+ lim0 = axes[0][0].get_ylim()
+ adj = adj * (lim0[1] - lim0[0]) + lim0[0]
+ axes[0][0].yaxis.set_ticks(adj)
+
+ if np.all(locs == locs.astype(int)):
+ # if all ticks are int
+ locs = locs.astype(int)
+ axes[0][0].yaxis.set_ticklabels(locs)
+
+ _set_ticks_props(axes, xlabelsize=8, xrot=90, ylabelsize=8, yrot=0)
+
+ return axes
+
+
+def _get_marker_compat(marker):
+ import matplotlib.lines as mlines
+ if marker not in mlines.lineMarkers:
+ return 'o'
+ return marker
+
+
+def radviz(frame, class_column, ax=None, color=None, colormap=None, **kwds):
+ """
+ Plot a multidimensional dataset in 2D.
+
+ Each Series in the DataFrame is represented as an evenly distributed
+ slice on a circle. Each data point is rendered in the circle according to
+ the value on each Series. Highly correlated `Series` in the `DataFrame`
+ are placed closer together on the unit circle.
+
+ RadViz allows projecting an N-dimensional data set into a 2D space where
+ the influence of each dimension can be interpreted as a balance between
+ the influence of all dimensions.
+
+ More info available at the `original article
+ <http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.135.889>`_
+ describing RadViz.
+
+ Parameters
+ ----------
+ frame : `DataFrame`
+ Pandas object holding the data.
+ class_column : str
+ Column name containing the name of the data point category.
+ ax : :class:`matplotlib.axes.Axes`, optional
+ A plot instance to which to add the information.
+ color : list[str] or tuple[str], optional
+ Assign a color to each category. Example: ['blue', 'green'].
+ colormap : str or :class:`matplotlib.colors.Colormap`, default None
+ Colormap to select colors from. If string, load colormap with that
+ name from matplotlib.
+ kwds : optional
+ Options to pass to matplotlib scatter plotting method.
+
+ Returns
+ -------
+ axes : :class:`matplotlib.axes.Axes`
+
+ See Also
+ --------
+ pandas.plotting.andrews_curves : Plot clustering visualization.
+
+ Examples
+ --------
+ .. plot::
+ :context: close-figs
+
+ >>> df = pd.DataFrame({
+ ... 'SepalLength': [6.5, 7.7, 5.1, 5.8, 7.6, 5.0, 5.4, 4.6,
+ ... 6.7, 4.6],
+ ... 'SepalWidth': [3.0, 3.8, 3.8, 2.7, 3.0, 2.3, 3.0, 3.2,
+ ... 3.3, 3.6],
+ ... 'PetalLength': [5.5, 6.7, 1.9, 5.1, 6.6, 3.3, 4.5, 1.4,
+ ... 5.7, 1.0],
+ ... 'PetalWidth': [1.8, 2.2, 0.4, 1.9, 2.1, 1.0, 1.5, 0.2,
+ ... 2.1, 0.2],
+ ... 'Category': ['virginica', 'virginica', 'setosa',
+ ... 'virginica', 'virginica', 'versicolor',
+ ... 'versicolor', 'setosa', 'virginica',
+ ... 'setosa']
+ ... })
+ >>> rad_viz = pd.plotting.radviz(df, 'Category') # doctest: +SKIP
+ """
+ import matplotlib.pyplot as plt
+ import matplotlib.patches as patches
+
+ def normalize(series):
+ a = min(series)
+ b = max(series)
+ return (series - a) / (b - a)
+
+ n = len(frame)
+ classes = frame[class_column].drop_duplicates()
+ class_col = frame[class_column]
+ df = frame.drop(class_column, axis=1).apply(normalize)
+
+ if ax is None:
+ ax = plt.gca(xlim=[-1, 1], ylim=[-1, 1])
+
+ to_plot = {}
+ colors = _get_standard_colors(num_colors=len(classes), colormap=colormap,
+ color_type='random', color=color)
+
+ for kls in classes:
+ to_plot[kls] = [[], []]
+
+ m = len(frame.columns) - 1
+ s = np.array([(np.cos(t), np.sin(t))
+ for t in [2.0 * np.pi * (i / float(m))
+ for i in range(m)]])
+
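+ # Each anchor in ``s`` sits evenly spaced on the unit circle, one per
+ # feature column. Every observation below is projected to the weighted
+ # average of those anchors, weighted by its normalized feature values.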
+ for i in range(n):
+ row = df.iloc[i].values
+ row_ = np.repeat(np.expand_dims(row, axis=1), 2, axis=1)
+ y = (s * row_).sum(axis=0) / row.sum()
+ kls = class_col.iat[i]
+ to_plot[kls][0].append(y[0])
+ to_plot[kls][1].append(y[1])
+
+ for i, kls in enumerate(classes):
+ ax.scatter(to_plot[kls][0], to_plot[kls][1], color=colors[i],
+ label=pprint_thing(kls), **kwds)
+ ax.legend()
+
+ ax.add_patch(patches.Circle((0.0, 0.0), radius=1.0, facecolor='none'))
+
+ for xy, name in zip(s, df.columns):
+
+ ax.add_patch(patches.Circle(xy, radius=0.025, facecolor='gray'))
+
+ if xy[0] < 0.0 and xy[1] < 0.0:
+ ax.text(xy[0] - 0.025, xy[1] - 0.025, name,
+ ha='right', va='top', size='small')
+ elif xy[0] < 0.0 and xy[1] >= 0.0:
+ ax.text(xy[0] - 0.025, xy[1] + 0.025, name,
+ ha='right', va='bottom', size='small')
+ elif xy[0] >= 0.0 and xy[1] < 0.0:
+ ax.text(xy[0] + 0.025, xy[1] - 0.025, name,
+ ha='left', va='top', size='small')
+ elif xy[0] >= 0.0 and xy[1] >= 0.0:
+ ax.text(xy[0] + 0.025, xy[1] + 0.025, name,
+ ha='left', va='bottom', size='small')
+
+ ax.axis('equal')
+ return ax
+
+
+@deprecate_kwarg(old_arg_name='data', new_arg_name='frame')
+def andrews_curves(frame, class_column, ax=None, samples=200, color=None,
+ colormap=None, **kwds):
+ """
+ Generate a matplotlib plot of Andrews curves, for visualising clusters of
+ multivariate data.
+
+ Andrews curves have the functional form:
+
+ f(t) = x_1/sqrt(2) + x_2 sin(t) + x_3 cos(t) +
+ x_4 sin(2t) + x_5 cos(2t) + ...
+
+ where the x coefficients correspond to the values of each dimension and t
+ is linearly spaced between -pi and +pi. Each row of frame then corresponds
+ to a single curve.
+
+ Parameters
+ ----------
+ frame : DataFrame
+ Data to be plotted, preferably normalized to (0.0, 1.0).
+ class_column : str
+ Name of the column containing class names.
+ ax : matplotlib axes object, default None
+ samples : int, default 200
+ Number of points to plot in each curve.
+ color : list or tuple, optional
+ Colors to use for the different classes.
+ colormap : str or matplotlib colormap object, default None
+ Colormap to select colors from. If string, load colormap with that name
+ from matplotlib.
+ kwds : keywords
+ Options to pass to matplotlib plotting method.
+
+ Returns
+ -------
+ ax : Matplotlib axis object
+
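+ Examples
+ --------
+ A minimal, illustrative call (the column names below are made up for
+ this sketch):
+
+ >>> df = pd.DataFrame({'a': [0.1, 0.9, 0.5], 'b': [0.4, 0.2, 0.8],
+ ...                    'label': ['x', 'y', 'x']})
+ >>> ax = pd.plotting.andrews_curves(df, 'label')  # doctest: +SKIP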
+ """
+ from math import sqrt, pi
+ import matplotlib.pyplot as plt
+
+ def function(amplitudes):
+ def f(t):
+ x1 = amplitudes[0]
+ result = x1 / sqrt(2.0)
+
+ # Take the rest of the coefficients and resize them
+ # appropriately. Take a copy of amplitudes as otherwise numpy
+ # deletes the element from amplitudes itself.
+ coeffs = np.delete(np.copy(amplitudes), 0)
+ coeffs.resize(int((coeffs.size + 1) / 2), 2)
+
+ # Generate the harmonics and arguments for the sin and cos
+ # functions.
+ harmonics = np.arange(0, coeffs.shape[0]) + 1
+ trig_args = np.outer(harmonics, t)
+
+ result += np.sum(coeffs[:, 0, np.newaxis] * np.sin(trig_args) +
+ coeffs[:, 1, np.newaxis] * np.cos(trig_args),
+ axis=0)
+ return result
+ return f
+
+ n = len(frame)
+ class_col = frame[class_column]
+ classes = frame[class_column].drop_duplicates()
+ df = frame.drop(class_column, axis=1)
+ t = np.linspace(-pi, pi, samples)
+ used_legends = set()
+
+ color_values = _get_standard_colors(num_colors=len(classes),
+ colormap=colormap, color_type='random',
+ color=color)
+ colors = dict(zip(classes, color_values))
+ if ax is None:
+ ax = plt.gca(xlim=(-pi, pi))
+ for i in range(n):
+ row = df.iloc[i].values
+ f = function(row)
+ y = f(t)
+ kls = class_col.iat[i]
+ label = pprint_thing(kls)
+ if label not in used_legends:
+ used_legends.add(label)
+ ax.plot(t, y, color=colors[kls], label=label, **kwds)
+ else:
+ ax.plot(t, y, color=colors[kls], **kwds)
+
+ ax.legend(loc='upper right')
+ ax.grid()
+ return ax
+
+
+def bootstrap_plot(series, fig=None, size=50, samples=500, **kwds):
+ """
+ Bootstrap plot on mean, median and mid-range statistics.
+
+ The bootstrap plot is used to estimate the uncertainty of a statistic
+ by relying on random sampling with replacement [1]_. This function will
+ generate bootstrapping plots for mean, median and mid-range statistics
+ for the given number of samples of the given size.
+
+ .. [1] "Bootstrapping (statistics)" in \
+ https://en.wikipedia.org/wiki/Bootstrapping_%28statistics%29
+
+ Parameters
+ ----------
+ series : pandas.Series
+ Pandas Series from where to get the samplings for the bootstrapping.
+ fig : matplotlib.figure.Figure, default None
+ If given, it will use the `fig` reference for plotting instead of
+ creating a new one with default parameters.
+ size : int, default 50
+ Number of data points to consider during each sampling. It must be
+ less than or equal to the length of the `series`.
+ samples : int, default 500
+ Number of times the bootstrap procedure is performed.
+ **kwds :
+ Options to pass to matplotlib plotting method.
+
+ Returns
+ -------
+ fig : matplotlib.figure.Figure
+ Matplotlib figure
+
+ See Also
+ --------
+ pandas.DataFrame.plot : Basic plotting for DataFrame objects.
+ pandas.Series.plot : Basic plotting for Series objects.
+
+ Examples
+ --------
+
+ .. plot::
+ :context: close-figs
+
+ >>> s = pd.Series(np.random.uniform(size=100))
+ >>> fig = pd.plotting.bootstrap_plot(s) # doctest: +SKIP
+ """
+ import random
+ import matplotlib.pyplot as plt
+
+ # random.sample(ndarray, int) fails on python 3.3, sigh
+ data = list(series.values)
+ samplings = [random.sample(data, size) for _ in range(samples)]
+
+ means = np.array([np.mean(sampling) for sampling in samplings])
+ medians = np.array([np.median(sampling) for sampling in samplings])
+ midranges = np.array([(min(sampling) + max(sampling)) * 0.5
+ for sampling in samplings])
+ if fig is None:
+ fig = plt.figure()
+ x = lrange(samples)
+ axes = []
+ ax1 = fig.add_subplot(2, 3, 1)
+ ax1.set_xlabel("Sample")
+ axes.append(ax1)
+ ax1.plot(x, means, **kwds)
+ ax2 = fig.add_subplot(2, 3, 2)
+ ax2.set_xlabel("Sample")
+ axes.append(ax2)
+ ax2.plot(x, medians, **kwds)
+ ax3 = fig.add_subplot(2, 3, 3)
+ ax3.set_xlabel("Sample")
+ axes.append(ax3)
+ ax3.plot(x, midranges, **kwds)
+ ax4 = fig.add_subplot(2, 3, 4)
+ ax4.set_xlabel("Mean")
+ axes.append(ax4)
+ ax4.hist(means, **kwds)
+ ax5 = fig.add_subplot(2, 3, 5)
+ ax5.set_xlabel("Median")
+ axes.append(ax5)
+ ax5.hist(medians, **kwds)
+ ax6 = fig.add_subplot(2, 3, 6)
+ ax6.set_xlabel("Midrange")
+ axes.append(ax6)
+ ax6.hist(midranges, **kwds)
+ for axis in axes:
+ plt.setp(axis.get_xticklabels(), fontsize=8)
+ plt.setp(axis.get_yticklabels(), fontsize=8)
+ return fig
+
+
+@deprecate_kwarg(old_arg_name='colors', new_arg_name='color')
+@deprecate_kwarg(old_arg_name='data', new_arg_name='frame', stacklevel=3)
+def parallel_coordinates(frame, class_column, cols=None, ax=None, color=None,
+ use_columns=False, xticks=None, colormap=None,
+ axvlines=True, axvlines_kwds=None, sort_labels=False,
+ **kwds):
+ """Parallel coordinates plotting.
+
+ Parameters
+ ----------
+ frame : DataFrame
+ class_column : str
+ Column name containing class names
+ cols : list, optional
+ A list of column names to use
+ ax : matplotlib.axis, optional
+ matplotlib axis object
+ color : list or tuple, optional
+ Colors to use for the different classes
+ use_columns : bool, optional
+ If true, columns will be used as xticks
+ xticks : list or tuple, optional
+ A list of values to use for xticks
+ colormap : str or matplotlib colormap, default None
+ Colormap to use for line colors.
+ axvlines : bool, optional
+ If true, vertical lines will be added at each xtick
+ axvlines_kwds : keywords, optional
+ Options to be passed to axvline method for vertical lines
+ sort_labels : bool, default False
+ Sort class_column labels, useful when assigning colors.
+
+ .. versionadded:: 0.20.0
+
+ kwds : keywords
+ Options to pass to matplotlib plotting method
+
+ Returns
+ -------
+ ax : matplotlib axis object
+
+ Examples
+ --------
+ >>> from matplotlib import pyplot as plt
+ >>> df = pd.read_csv('https://raw.github.com/pandas-dev/pandas/master'
+ ...                  '/pandas/tests/data/iris.csv')
+ >>> pd.plotting.parallel_coordinates(
+ ...     df, 'Name',
+ ...     color=('#556270', '#4ECDC4', '#C7F464'))
+ >>> plt.show()
+ """
+ if axvlines_kwds is None:
+ axvlines_kwds = {'linewidth': 1, 'color': 'black'}
+ import matplotlib.pyplot as plt
+
+ n = len(frame)
+ classes = frame[class_column].drop_duplicates()
+ class_col = frame[class_column]
+
+ if cols is None:
+ df = frame.drop(class_column, axis=1)
+ else:
+ df = frame[cols]
+
+ used_legends = set()
+
+ ncols = len(df.columns)
+
+ # determine values to use for xticks
+ if use_columns is True:
+ if not np.all(np.isreal(list(df.columns))):
+ raise ValueError('Columns must be numeric to be used as xticks')
+ x = df.columns
+ elif xticks is not None:
+ if not np.all(np.isreal(xticks)):
+ raise ValueError('xticks specified must be numeric')
+ elif len(xticks) != ncols:
+ raise ValueError('Length of xticks must match number of columns')
+ x = xticks
+ else:
+ x = lrange(ncols)
+
+ if ax is None:
+ ax = plt.gca()
+
+ color_values = _get_standard_colors(num_colors=len(classes),
+ colormap=colormap, color_type='random',
+ color=color)
+
+ if sort_labels:
+ classes = sorted(classes)
+ color_values = sorted(color_values)
+ colors = dict(zip(classes, color_values))
+
+ for i in range(n):
+ y = df.iloc[i].values
+ kls = class_col.iat[i]
+ label = pprint_thing(kls)
+ if label not in used_legends:
+ used_legends.add(label)
+ ax.plot(x, y, color=colors[kls], label=label, **kwds)
+ else:
+ ax.plot(x, y, color=colors[kls], **kwds)
+
+ if axvlines:
+ for i in x:
+ ax.axvline(i, **axvlines_kwds)
+
+ ax.set_xticks(x)
+ ax.set_xticklabels(df.columns)
+ ax.set_xlim(x[0], x[-1])
+ ax.legend(loc='upper right')
+ ax.grid()
+ return ax
+
+
+def lag_plot(series, lag=1, ax=None, **kwds):
+ """Lag plot for time series.
+
+ Parameters
+ ----------
+ series : Series
+ Time series on which to compute the lag plot.
+ lag : int, default 1
+ Lag of the scatter plot.
+ ax : Matplotlib axis object, optional
+ kwds : Matplotlib scatter method keyword arguments, optional
+
+ Returns
+ -------
+ ax : Matplotlib axis object
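+
+ Examples
+ --------
+ A minimal, illustrative sketch:
+
+ >>> s = pd.Series(np.random.randn(100))
+ >>> ax = pd.plotting.lag_plot(s, lag=1)  # doctest: +SKIP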
+ """
+ import matplotlib.pyplot as plt
+
+ # workaround because `c='b'` is hardcoded in matplotlib's scatter method
+ kwds.setdefault('c', plt.rcParams['patch.facecolor'])
+
+ data = series.values
+ y1 = data[:-lag]
+ y2 = data[lag:]
+ if ax is None:
+ ax = plt.gca()
+ ax.set_xlabel("y(t)")
+ ax.set_ylabel("y(t + {lag})".format(lag=lag))
+ ax.scatter(y1, y2, **kwds)
+ return ax
+
+
+def autocorrelation_plot(series, ax=None, **kwds):
+ """Autocorrelation plot for time series.
+
+ Parameters
+ ----------
+ series : Series
+ Time series on which to compute the autocorrelation.
+ ax : Matplotlib axis object, optional
+ kwds : keywords
+ Options to pass to matplotlib plotting method.
+
+ Returns
+ -------
+ ax : Matplotlib axis object
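+
+ Examples
+ --------
+ A minimal, illustrative sketch:
+
+ >>> s = pd.Series(np.random.randn(1000))
+ >>> ax = pd.plotting.autocorrelation_plot(s)  # doctest: +SKIP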
+ """
+ import matplotlib.pyplot as plt
+ n = len(series)
+ data = np.asarray(series)
+ if ax is None:
+ ax = plt.gca(xlim=(1, n), ylim=(-1.0, 1.0))
+ mean = np.mean(data)
+ c0 = np.sum((data - mean) ** 2) / float(n)
+
+ def r(h):
+ return ((data[:n - h] - mean) *
+ (data[h:] - mean)).sum() / float(n) / c0
+ x = np.arange(n) + 1
+ y = lmap(r, x)
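+ # two-sided 95% and 99% standard normal quantiles; the horizontal
+ # bands below are drawn at +/- z / sqrt(n)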
+ z95 = 1.959963984540054
+ z99 = 2.5758293035489004
+ ax.axhline(y=z99 / np.sqrt(n), linestyle='--', color='grey')
+ ax.axhline(y=z95 / np.sqrt(n), color='grey')
+ ax.axhline(y=0.0, color='black')
+ ax.axhline(y=-z95 / np.sqrt(n), color='grey')
+ ax.axhline(y=-z99 / np.sqrt(n), linestyle='--', color='grey')
+ ax.set_xlabel("Lag")
+ ax.set_ylabel("Autocorrelation")
+ ax.plot(x, y, **kwds)
+ if 'label' in kwds:
+ ax.legend()
+ ax.grid()
+ return ax
diff --git a/contrib/python/pandas/py2/pandas/plotting/_style.py b/contrib/python/pandas/py2/pandas/plotting/_style.py
new file mode 100644
index 00000000000..d9da34e0087
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/plotting/_style.py
@@ -0,0 +1,168 @@
+# being a bit too dynamic
+# pylint: disable=E1101
+from __future__ import division
+
+from contextlib import contextmanager
+import warnings
+
+import numpy as np
+
+import pandas.compat as compat
+from pandas.compat import lmap, lrange
+
+from pandas.core.dtypes.common import is_list_like
+
+
+def _get_standard_colors(num_colors=None, colormap=None, color_type='default',
+ color=None):
+ import matplotlib.pyplot as plt
+
+ if color is None and colormap is not None:
+ if isinstance(colormap, compat.string_types):
+ import matplotlib.cm as cm
+ cmap = colormap
+ colormap = cm.get_cmap(colormap)
+ if colormap is None:
+ raise ValueError("Colormap {0} is not recognized".format(cmap))
+ colors = lmap(colormap, np.linspace(0, 1, num=num_colors))
+ elif color is not None:
+ if colormap is not None:
+ warnings.warn("'color' and 'colormap' cannot be used "
+ "simultaneously. Using 'color'")
+ colors = list(color) if is_list_like(color) else color
+ else:
+ if color_type == 'default':
+ # need to call list() on the result to copy so we don't
+ # modify the global rcParams below
+ try:
+ colors = [c['color']
+ for c in list(plt.rcParams['axes.prop_cycle'])]
+ except KeyError:
+ colors = list(plt.rcParams.get('axes.color_cycle',
+ list('bgrcmyk')))
+ if isinstance(colors, compat.string_types):
+ colors = list(colors)
+
+ colors = colors[0:num_colors]
+ elif color_type == 'random':
+ import pandas.core.common as com
+
+ def random_color(column):
+ """ Returns a random color represented as a list of length 3"""
+ # GH17525 use common._random_state to avoid resetting the seed
+ rs = com.random_state(column)
+ return rs.rand(3).tolist()
+
+ colors = lmap(random_color, lrange(num_colors))
+ else:
+ raise ValueError("color_type must be either 'default' or 'random'")
+
+ if isinstance(colors, compat.string_types):
+ import matplotlib.colors
+ conv = matplotlib.colors.ColorConverter()
+
+ def _maybe_valid_colors(colors):
+ try:
+ [conv.to_rgba(c) for c in colors]
+ return True
+ except ValueError:
+ return False
+
+ # check whether the string can be convertible to single color
+ maybe_single_color = _maybe_valid_colors([colors])
+ # check whether each character can be convertible to colors
+ maybe_color_cycle = _maybe_valid_colors(list(colors))
+ if maybe_single_color and maybe_color_cycle and len(colors) > 1:
+ hex_color = [c['color']
+ for c in list(plt.rcParams['axes.prop_cycle'])]
+ colors = [hex_color[int(colors[1])]]
+ elif maybe_single_color:
+ colors = [colors]
+ else:
+ # ``colors`` is regarded as color cycle.
+ # mpl will raise error any of them is invalid
+ pass
+
+ # Append more colors by cycling if there is not enough color.
+ # Extra colors will be ignored by matplotlib if there are more colors
+ # than needed and nothing needs to be done here.
+ if len(colors) < num_colors:
+ try:
+ multiple = num_colors // len(colors) - 1
+ except ZeroDivisionError:
+ raise ValueError("Invalid color argument: ''")
+ mod = num_colors % len(colors)
+
+ colors += multiple * colors
+ colors += colors[:mod]
+
+ return colors
+
+
+class _Options(dict):
+ """
+ Stores pandas plotting options.
+ Allows for parameter aliasing so you can just use parameter names that are
+ the same as the plot function parameters, but is stored in a canonical
+ format that makes it easy to breakdown into groups later
+ """
+
+ # alias so the names are same as plotting method parameter names
+ _ALIASES = {'x_compat': 'xaxis.compat'}
+ _DEFAULT_KEYS = ['xaxis.compat']
+
+ def __init__(self, deprecated=False):
+ self._deprecated = deprecated
+ # self['xaxis.compat'] = False
+ super(_Options, self).__setitem__('xaxis.compat', False)
+
+ def __getitem__(self, key):
+ key = self._get_canonical_key(key)
+ if key not in self:
+ raise ValueError(
+ '{key} is not a valid pandas plotting option'.format(key=key))
+ return super(_Options, self).__getitem__(key)
+
+ def __setitem__(self, key, value):
+ key = self._get_canonical_key(key)
+ return super(_Options, self).__setitem__(key, value)
+
+ def __delitem__(self, key):
+ key = self._get_canonical_key(key)
+ if key in self._DEFAULT_KEYS:
+ raise ValueError(
+ 'Cannot remove default parameter {key}'.format(key=key))
+ return super(_Options, self).__delitem__(key)
+
+ def __contains__(self, key):
+ key = self._get_canonical_key(key)
+ return super(_Options, self).__contains__(key)
+
+ def reset(self):
+ """
+ Reset the option store to its initial state
+
+ Returns
+ -------
+ None
+ """
+ self.__init__()
+
+ def _get_canonical_key(self, key):
+ return self._ALIASES.get(key, key)
+
+ @contextmanager
+ def use(self, key, value):
+ """
+ Temporarily set a parameter value using the with statement.
+ Aliasing allowed.
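+
+ Examples
+ --------
+ Illustrative only; ``'x_compat'`` is the one option registered by
+ default:
+
+ >>> with plot_params.use('x_compat', True):  # doctest: +SKIP
+ ...     pass  # plots created inside the block see the temporary value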
+ """
+ old_value = self[key]
+ try:
+ self[key] = value
+ yield self
+ finally:
+ self[key] = old_value
+
+
+plot_params = _Options()
diff --git a/contrib/python/pandas/py2/pandas/plotting/_timeseries.py b/contrib/python/pandas/py2/pandas/plotting/_timeseries.py
new file mode 100644
index 00000000000..51b06290059
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/plotting/_timeseries.py
@@ -0,0 +1,353 @@
+# TODO: Use the fact that axis can have units to simplify the process
+
+import functools
+
+from matplotlib import pylab
+import numpy as np
+
+from pandas._libs.tslibs.frequencies import (
+ FreqGroup, get_base_alias, get_freq, is_subperiod, is_superperiod)
+from pandas._libs.tslibs.period import Period
+import pandas.compat as compat
+
+from pandas.core.dtypes.generic import (
+ ABCDatetimeIndex, ABCPeriodIndex, ABCTimedeltaIndex)
+
+from pandas.io.formats.printing import pprint_thing
+from pandas.plotting._converter import (
+ TimeSeries_DateFormatter, TimeSeries_DateLocator,
+ TimeSeries_TimedeltaFormatter)
+import pandas.tseries.frequencies as frequencies
+from pandas.tseries.offsets import DateOffset
+
+# ---------------------------------------------------------------------
+# Plotting functions and monkey patches
+
+
+def tsplot(series, plotf, ax=None, **kwargs):
+ """
+ Plots a Series on the given Matplotlib axes or the current axes.
+
+ Parameters
+ ----------
+ series : Series
+ plotf : callable
+ Plotting function used to draw ``series`` on ``ax``.
+ ax : Axes, optional
+
+ Notes
+ -----
+ Supports the same kwargs as Axes.plot.
+
+ .. deprecated:: 0.23.0
+     Use Series.plot() instead
+ """
+ import warnings
+ warnings.warn("'tsplot' is deprecated and will be removed in a "
+ "future version. Please use Series.plot() instead.",
+ FutureWarning, stacklevel=2)
+
+ # Use the inferred freq if possible; need a test case for inferred
+ if ax is None:
+ import matplotlib.pyplot as plt
+ ax = plt.gca()
+
+ freq, series = _maybe_resample(series, ax, kwargs)
+
+ # Set ax with freq info
+ _decorate_axes(ax, freq, kwargs)
+ ax._plot_data.append((series, plotf, kwargs))
+ lines = plotf(ax, series.index._mpl_repr(), series.values, **kwargs)
+
+ # set date formatter, locators and rescale limits
+ format_dateaxis(ax, ax.freq, series.index)
+ return lines
+
+
+def _maybe_resample(series, ax, kwargs):
+ # resample against axes freq if necessary
+ freq, ax_freq = _get_freq(ax, series)
+
+ if freq is None: # pragma: no cover
+ raise ValueError('Cannot use dynamic axis without frequency info')
+
+ # Convert DatetimeIndex to PeriodIndex
+ if isinstance(series.index, ABCDatetimeIndex):
+ series = series.to_period(freq=freq)
+
+ if ax_freq is not None and freq != ax_freq:
+ if is_superperiod(freq, ax_freq): # upsample input
+ series = series.copy()
+ series.index = series.index.asfreq(ax_freq, how='s')
+ freq = ax_freq
+ elif _is_sup(freq, ax_freq): # one is weekly
+ how = kwargs.pop('how', 'last')
+ series = getattr(series.resample('D'), how)().dropna()
+ series = getattr(series.resample(ax_freq), how)().dropna()
+ freq = ax_freq
+ elif is_subperiod(freq, ax_freq) or _is_sub(freq, ax_freq):
+ _upsample_others(ax, freq, kwargs)
+ else: # pragma: no cover
+ raise ValueError('Incompatible frequency conversion')
+ return freq, series
+
+
+def _is_sub(f1, f2):
+ return ((f1.startswith('W') and is_subperiod('D', f2)) or
+ (f2.startswith('W') and is_subperiod(f1, 'D')))
+
+
+def _is_sup(f1, f2):
+ return ((f1.startswith('W') and is_superperiod('D', f2)) or
+ (f2.startswith('W') and is_superperiod(f1, 'D')))
+
+
+def _upsample_others(ax, freq, kwargs):
+ legend = ax.get_legend()
+ lines, labels = _replot_ax(ax, freq, kwargs)
+ _replot_ax(ax, freq, kwargs)
+
+ other_ax = None
+ if hasattr(ax, 'left_ax'):
+ other_ax = ax.left_ax
+ if hasattr(ax, 'right_ax'):
+ other_ax = ax.right_ax
+
+ if other_ax is not None:
+ rlines, rlabels = _replot_ax(other_ax, freq, kwargs)
+ lines.extend(rlines)
+ labels.extend(rlabels)
+
+ if (legend is not None and kwargs.get('legend', True) and
+ len(lines) > 0):
+ title = legend.get_title().get_text()
+ if title == 'None':
+ title = None
+ ax.legend(lines, labels, loc='best', title=title)
+
+
+def _replot_ax(ax, freq, kwargs):
+ data = getattr(ax, '_plot_data', None)
+
+ # clear current axes and data
+ ax._plot_data = []
+ ax.clear()
+
+ _decorate_axes(ax, freq, kwargs)
+
+ lines = []
+ labels = []
+ if data is not None:
+ for series, plotf, kwds in data:
+ series = series.copy()
+ idx = series.index.asfreq(freq, how='S')
+ series.index = idx
+ ax._plot_data.append((series, plotf, kwds))
+
+ # for tsplot
+ if isinstance(plotf, compat.string_types):
+ from pandas.plotting._core import _plot_klass
+ plotf = _plot_klass[plotf]._plot
+
+ lines.append(plotf(ax, series.index._mpl_repr(),
+ series.values, **kwds)[0])
+ labels.append(pprint_thing(series.name))
+
+ return lines, labels
+
+
+def _decorate_axes(ax, freq, kwargs):
+ """Initialize axes for time-series plotting"""
+ if not hasattr(ax, '_plot_data'):
+ ax._plot_data = []
+
+ ax.freq = freq
+ xaxis = ax.get_xaxis()
+ xaxis.freq = freq
+ if not hasattr(ax, 'legendlabels'):
+ ax.legendlabels = [kwargs.get('label', None)]
+ else:
+ ax.legendlabels.append(kwargs.get('label', None))
+ ax.view_interval = None
+ ax.date_axis_info = None
+
+
+def _get_ax_freq(ax):
+ """
+ Get the freq attribute of the ax object if set.
+ Also checks shared axes (eg when using secondary yaxis, sharex=True
+ or twinx)
+ """
+ ax_freq = getattr(ax, 'freq', None)
+ if ax_freq is None:
+ # check for left/right ax in case of secondary yaxis
+ if hasattr(ax, 'left_ax'):
+ ax_freq = getattr(ax.left_ax, 'freq', None)
+ elif hasattr(ax, 'right_ax'):
+ ax_freq = getattr(ax.right_ax, 'freq', None)
+ if ax_freq is None:
+ # check if a shared ax (sharex/twinx) has already freq set
+ shared_axes = ax.get_shared_x_axes().get_siblings(ax)
+ if len(shared_axes) > 1:
+ for shared_ax in shared_axes:
+ ax_freq = getattr(shared_ax, 'freq', None)
+ if ax_freq is not None:
+ break
+ return ax_freq
+
+
+def _get_freq(ax, series):
+ # get frequency from data
+ freq = getattr(series.index, 'freq', None)
+ if freq is None:
+ freq = getattr(series.index, 'inferred_freq', None)
+
+ ax_freq = _get_ax_freq(ax)
+
+ # use axes freq if no data freq
+ if freq is None:
+ freq = ax_freq
+
+ # get the period frequency
+ if isinstance(freq, DateOffset):
+ freq = freq.rule_code
+ else:
+ freq = get_base_alias(freq)
+
+ freq = frequencies.get_period_alias(freq)
+ return freq, ax_freq
+
+
+def _use_dynamic_x(ax, data):
+ freq = _get_index_freq(data)
+ ax_freq = _get_ax_freq(ax)
+
+ if freq is None: # convert irregular if axes has freq info
+ freq = ax_freq
+ else: # do not use tsplot if irregular was plotted first
+ if (ax_freq is None) and (len(ax.get_lines()) > 0):
+ return False
+
+ if freq is None:
+ return False
+
+ if isinstance(freq, DateOffset):
+ freq = freq.rule_code
+ else:
+ freq = get_base_alias(freq)
+ freq = frequencies.get_period_alias(freq)
+
+ if freq is None:
+ return False
+
+ # hack this for 0.10.1, creating more technical debt...sigh
+ if isinstance(data.index, ABCDatetimeIndex):
+ base = get_freq(freq)
+ x = data.index
+ if (base <= FreqGroup.FR_DAY):
+ return x[:1].is_normalized
+ return Period(x[0], freq).to_timestamp(tz=x.tz) == x[0]
+ return True
+
+
+def _get_index_freq(data):
+ freq = getattr(data.index, 'freq', None)
+ if freq is None:
+ freq = getattr(data.index, 'inferred_freq', None)
+ if freq == 'B':
+ weekdays = np.unique(data.index.dayofweek)
+ if (5 in weekdays) or (6 in weekdays):
+ freq = None
+ return freq
+
+
+def _maybe_convert_index(ax, data):
+ # tsplot converts automatically, but don't want to convert index
+ # over and over for DataFrames
+ if isinstance(data.index, ABCDatetimeIndex):
+ freq = getattr(data.index, 'freq', None)
+
+ if freq is None:
+ freq = getattr(data.index, 'inferred_freq', None)
+ if isinstance(freq, DateOffset):
+ freq = freq.rule_code
+
+ if freq is None:
+ freq = _get_ax_freq(ax)
+
+ if freq is None:
+ raise ValueError('Could not get frequency alias for plotting')
+
+ freq = get_base_alias(freq)
+ freq = frequencies.get_period_alias(freq)
+
+ data = data.to_period(freq=freq)
+ return data
+
+
+# Patch methods for subplot. Only format_dateaxis is currently used.
+# Do we need the rest for convenience?
+
+def format_timedelta_ticks(x, pos, n_decimals):
+ """
+ Convert a tick value in nanoseconds to 'D days HH:MM:SS.F'
+ """
+ s, ns = divmod(x, 1e9)
+ m, s = divmod(s, 60)
+ h, m = divmod(m, 60)
+ d, h = divmod(h, 24)
+ decimals = int(ns * 10**(n_decimals - 9))
+ s = r'{:02d}:{:02d}:{:02d}'.format(int(h), int(m), int(s))
+ if n_decimals > 0:
+ s += '.{{:0{:0d}d}}'.format(n_decimals).format(decimals)
+ if d != 0:
+ s = '{:d} days '.format(int(d)) + s
+ return s
+
+
+def _format_coord(freq, t, y):
+ return "t = {0} y = {1:8f}".format(Period(ordinal=int(t), freq=freq), y)
+
+
+def format_dateaxis(subplot, freq, index):
+ """
+ Pretty-formats the date axis (x-axis).
+
+ Major and minor ticks are automatically set for the frequency of the
+ current underlying series. As the dynamic mode is activated by
+ default, changing the limits of the x axis will intelligently change
+ the positions of the ticks.
+ """
+
+ # handle index specific formatting
+ # Note: DatetimeIndex does not use this
+ # interface. DatetimeIndex uses matplotlib.date directly
+ if isinstance(index, ABCPeriodIndex):
+
+ majlocator = TimeSeries_DateLocator(freq, dynamic_mode=True,
+ minor_locator=False,
+ plot_obj=subplot)
+ minlocator = TimeSeries_DateLocator(freq, dynamic_mode=True,
+ minor_locator=True,
+ plot_obj=subplot)
+ subplot.xaxis.set_major_locator(majlocator)
+ subplot.xaxis.set_minor_locator(minlocator)
+
+ majformatter = TimeSeries_DateFormatter(freq, dynamic_mode=True,
+ minor_locator=False,
+ plot_obj=subplot)
+ minformatter = TimeSeries_DateFormatter(freq, dynamic_mode=True,
+ minor_locator=True,
+ plot_obj=subplot)
+ subplot.xaxis.set_major_formatter(majformatter)
+ subplot.xaxis.set_minor_formatter(minformatter)
+
+ # x and y coord info
+ subplot.format_coord = functools.partial(_format_coord, freq)
+
+ elif isinstance(index, ABCTimedeltaIndex):
+ subplot.xaxis.set_major_formatter(
+ TimeSeries_TimedeltaFormatter())
+ else:
+ raise TypeError('index type not supported')
+
+ pylab.draw_if_interactive()
diff --git a/contrib/python/pandas/py2/pandas/plotting/_tools.py b/contrib/python/pandas/py2/pandas/plotting/_tools.py
new file mode 100644
index 00000000000..4d9e97f11fd
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/plotting/_tools.py
@@ -0,0 +1,382 @@
+# being a bit too dynamic
+# pylint: disable=E1101
+from __future__ import division
+
+from math import ceil
+import warnings
+
+import numpy as np
+
+from pandas.compat import range
+
+from pandas.core.dtypes.common import is_list_like
+from pandas.core.dtypes.generic import ABCDataFrame, ABCIndexClass, ABCSeries
+
+
+def format_date_labels(ax, rot):
+ # mini version of autofmt_xdate
+ try:
+ for label in ax.get_xticklabels():
+ label.set_ha('right')
+ label.set_rotation(rot)
+ fig = ax.get_figure()
+ fig.subplots_adjust(bottom=0.2)
+ except Exception: # pragma: no cover
+ pass
+
+
+def table(ax, data, rowLabels=None, colLabels=None, **kwargs):
+ """
+ Helper function to convert DataFrame and Series to matplotlib.table
+
+ Parameters
+ ----------
+ ax : Matplotlib axes object
+ data : DataFrame or Series
+ data for table contents
+ kwargs : keywords, optional
+ keyword arguments which are passed to matplotlib.table.table.
+ If `rowLabels` or `colLabels` is not specified, data index or column
+ name will be used.
+
+ Returns
+ -------
+ matplotlib table object
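+
+ Examples
+ --------
+ A minimal, illustrative sketch (``loc`` is forwarded to
+ matplotlib.table.table):
+
+ >>> import matplotlib.pyplot as plt
+ >>> fig, ax = plt.subplots(1, 1)
+ >>> df = pd.DataFrame({'A': [1, 2], 'B': [3, 4]})
+ >>> tab = table(ax, df, loc='center')  # doctest: +SKIP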
+ """
+ if isinstance(data, ABCSeries):
+ data = data.to_frame()
+ elif isinstance(data, ABCDataFrame):
+ pass
+ else:
+ raise ValueError('Input data must be DataFrame or Series')
+
+ if rowLabels is None:
+ rowLabels = data.index
+
+ if colLabels is None:
+ colLabels = data.columns
+
+ cellText = data.values
+
+ import matplotlib.table
+ table = matplotlib.table.table(ax, cellText=cellText,
+ rowLabels=rowLabels,
+ colLabels=colLabels, **kwargs)
+ return table
+
+
+def _get_layout(nplots, layout=None, layout_type='box'):
+ if layout is not None:
+ if not isinstance(layout, (tuple, list)) or len(layout) != 2:
+ raise ValueError('Layout must be a tuple of (rows, columns)')
+
+ nrows, ncols = layout
+
+ # Python 2 compat
+ ceil_ = lambda x: int(ceil(x))
+ if nrows == -1 and ncols > 0:
+ layout = nrows, ncols = (ceil_(float(nplots) / ncols), ncols)
+ elif ncols == -1 and nrows > 0:
+ layout = nrows, ncols = (nrows, ceil_(float(nplots) / nrows))
+ elif ncols <= 0 and nrows <= 0:
+ msg = "At least one dimension of layout must be positive"
+ raise ValueError(msg)
+
+ if nrows * ncols < nplots:
+ raise ValueError('Layout of {nrows}x{ncols} must be larger '
+ 'than required size {nplots}'.format(
+ nrows=nrows, ncols=ncols, nplots=nplots))
+
+ return layout
+
+ if layout_type == 'single':
+ return (1, 1)
+ elif layout_type == 'horizontal':
+ return (1, nplots)
+ elif layout_type == 'vertical':
+ return (nplots, 1)
+
+ layouts = {1: (1, 1), 2: (1, 2), 3: (2, 2), 4: (2, 2)}
+ try:
+ return layouts[nplots]
+ except KeyError:
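+ # fall back to the smallest roughly-square grid that can hold nplots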
+ k = 1
+ while k ** 2 < nplots:
+ k += 1
+
+ if (k - 1) * k >= nplots:
+ return k, (k - 1)
+ else:
+ return k, k
+
+# copied from matplotlib/pyplot.py and modified for pandas.plotting
+
+
+def _subplots(naxes=None, sharex=False, sharey=False, squeeze=True,
+ subplot_kw=None, ax=None, layout=None, layout_type='box',
+ **fig_kw):
+ """Create a figure with a set of subplots already made.
+
+ This utility wrapper makes it convenient to create common layouts of
+ subplots, including the enclosing figure object, in a single call.
+
+ Keyword arguments:
+
+ naxes : int
+ Number of required axes. Axes in excess of this number are made
+ invisible. Defaults to nrows * ncols.
+
+ sharex : bool
+ If True, the X axis will be shared amongst all subplots.
+
+ sharey : bool
+ If True, the Y axis will be shared amongst all subplots.
+
+ squeeze : bool
+
+ If True, extra dimensions are squeezed out from the returned axis object:
+ - if only one subplot is constructed (nrows=ncols=1), the resulting
+ single Axis object is returned as a scalar.
+        - for Nx1 or 1xN subplots, the returned object is a 1-d numpy object
+        array of Axis objects.
+        - NxM subplots with N>1 and M>1 are returned as a 2-d array.
+
+ If False, no squeezing is done: the returned axis object is always
+ a 2-d array containing Axis instances, even if it ends up being 1x1.
+
+ subplot_kw : dict
+ Dict with keywords passed to the add_subplot() call used to create each
+      subplot.
+
+ ax : Matplotlib axis object, optional
+
+ layout : tuple
+ Number of rows and columns of the subplot grid.
+ If not specified, calculated from naxes and layout_type
+
+    layout_type : {'box', 'horizontal', 'vertical'}, default 'box'
+ Specify how to layout the subplot grid.
+
+ fig_kw : Other keyword arguments to be passed to the figure() call.
+ Note that all keywords not recognized above will be
+ automatically included here.
+
+ Returns:
+
+ fig, ax : tuple
+ - fig is the Matplotlib Figure object
+ - ax can be either a single axis object or an array of axis objects if
+ more than one subplot was created. The dimensions of the resulting array
+ can be controlled with the squeeze keyword, see above.
+
+ **Examples:**
+
+ x = np.linspace(0, 2*np.pi, 400)
+ y = np.sin(x**2)
+
+ # Just a figure and one subplot
+ f, ax = plt.subplots()
+ ax.plot(x, y)
+ ax.set_title('Simple plot')
+
+ # Two subplots, unpack the output array immediately
+ f, (ax1, ax2) = plt.subplots(1, 2, sharey=True)
+ ax1.plot(x, y)
+ ax1.set_title('Sharing Y axis')
+ ax2.scatter(x, y)
+
+ # Four polar axes
+ plt.subplots(2, 2, subplot_kw=dict(polar=True))
+ """
+ import matplotlib.pyplot as plt
+
+ if subplot_kw is None:
+ subplot_kw = {}
+
+ if ax is None:
+ fig = plt.figure(**fig_kw)
+ else:
+ if is_list_like(ax):
+ ax = _flatten(ax)
+ if layout is not None:
+ warnings.warn("When passing multiple axes, layout keyword is "
+ "ignored", UserWarning)
+ if sharex or sharey:
+ warnings.warn("When passing multiple axes, sharex and sharey "
+ "are ignored. These settings must be specified "
+ "when creating axes", UserWarning,
+ stacklevel=4)
+ if len(ax) == naxes:
+ fig = ax[0].get_figure()
+ return fig, ax
+ else:
+ raise ValueError("The number of passed axes must be {0}, the "
+ "same as the output plot".format(naxes))
+
+ fig = ax.get_figure()
+ # if ax is passed and a number of subplots is 1, return ax as it is
+ if naxes == 1:
+ if squeeze:
+ return fig, ax
+ else:
+ return fig, _flatten(ax)
+ else:
+ warnings.warn("To output multiple subplots, the figure containing "
+ "the passed axes is being cleared", UserWarning,
+ stacklevel=4)
+ fig.clear()
+
+ nrows, ncols = _get_layout(naxes, layout=layout, layout_type=layout_type)
+ nplots = nrows * ncols
+
+ # Create empty object array to hold all axes. It's easiest to make it 1-d
+ # so we can just append subplots upon creation, and then
+ axarr = np.empty(nplots, dtype=object)
+
+ # Create first subplot separately, so we can share it if requested
+ ax0 = fig.add_subplot(nrows, ncols, 1, **subplot_kw)
+
+ if sharex:
+ subplot_kw['sharex'] = ax0
+ if sharey:
+ subplot_kw['sharey'] = ax0
+ axarr[0] = ax0
+
+ # Note off-by-one counting because add_subplot uses the MATLAB 1-based
+ # convention.
+ for i in range(1, nplots):
+ kwds = subplot_kw.copy()
+ # Set sharex and sharey to None for blank/dummy axes, these can
+ # interfere with proper axis limits on the visible axes if
+ # they share axes e.g. issue #7528
+ if i >= naxes:
+ kwds['sharex'] = None
+ kwds['sharey'] = None
+ ax = fig.add_subplot(nrows, ncols, i + 1, **kwds)
+ axarr[i] = ax
+
+ if naxes != nplots:
+ for ax in axarr[naxes:]:
+ ax.set_visible(False)
+
+ _handle_shared_axes(axarr, nplots, naxes, nrows, ncols, sharex, sharey)
+
+ if squeeze:
+ # Reshape the array to have the final desired dimension (nrow,ncol),
+ # though discarding unneeded dimensions that equal 1. If we only have
+ # one subplot, just return it instead of a 1-element array.
+ if nplots == 1:
+ axes = axarr[0]
+ else:
+ axes = axarr.reshape(nrows, ncols).squeeze()
+ else:
+ # returned axis array will be always 2-d, even if nrows=ncols=1
+ axes = axarr.reshape(nrows, ncols)
+
+ return fig, axes
+
+
+def _remove_labels_from_axis(axis):
+ for t in axis.get_majorticklabels():
+ t.set_visible(False)
+
+ try:
+ # set_visible will not be effective if
+        # minor axis has NullLocator and NullFormatter (default)
+ import matplotlib.ticker as ticker
+ if isinstance(axis.get_minor_locator(), ticker.NullLocator):
+ axis.set_minor_locator(ticker.AutoLocator())
+ if isinstance(axis.get_minor_formatter(), ticker.NullFormatter):
+ axis.set_minor_formatter(ticker.FormatStrFormatter(''))
+ for t in axis.get_minorticklabels():
+ t.set_visible(False)
+    except Exception:  # pragma: no cover
+ raise
+ axis.get_label().set_visible(False)
+
+
+def _handle_shared_axes(axarr, nplots, naxes, nrows, ncols, sharex, sharey):
+ if nplots > 1:
+
+ if nrows > 1:
+ try:
+ # first find out the ax layout,
+                # so that we can correctly handle 'gaps'
+ layout = np.zeros((nrows + 1, ncols + 1), dtype=np.bool)
+ for ax in axarr:
+ layout[ax.rowNum, ax.colNum] = ax.get_visible()
+
+ for ax in axarr:
+ # only the last row of subplots should get x labels -> all
+ # other off layout handles the case that the subplot is
+ # the last in the column, because below is no subplot/gap.
+ if not layout[ax.rowNum + 1, ax.colNum]:
+ continue
+ if sharex or len(ax.get_shared_x_axes()
+ .get_siblings(ax)) > 1:
+ _remove_labels_from_axis(ax.xaxis)
+
+ except IndexError:
+                # if gridspec is used, ax.rowNum and ax.colNum may differ
+                # from the layout shape. In this case, use last_row logic
+ for ax in axarr:
+ if ax.is_last_row():
+ continue
+ if sharex or len(ax.get_shared_x_axes()
+ .get_siblings(ax)) > 1:
+ _remove_labels_from_axis(ax.xaxis)
+
+ if ncols > 1:
+ for ax in axarr:
+ # only the first column should get y labels -> set all other to
+ # off as we only have labels in the first column and we always
+ # have a subplot there, we can skip the layout test
+ if ax.is_first_col():
+ continue
+ if sharey or len(ax.get_shared_y_axes().get_siblings(ax)) > 1:
+ _remove_labels_from_axis(ax.yaxis)
+
+
+def _flatten(axes):
+ if not is_list_like(axes):
+ return np.array([axes])
+ elif isinstance(axes, (np.ndarray, ABCIndexClass)):
+ return axes.ravel()
+ return np.array(axes)
+
+
+def _get_all_lines(ax):
+ lines = ax.get_lines()
+
+ if hasattr(ax, 'right_ax'):
+ lines += ax.right_ax.get_lines()
+
+ if hasattr(ax, 'left_ax'):
+ lines += ax.left_ax.get_lines()
+
+ return lines
+
+
+def _get_xlim(lines):
+ left, right = np.inf, -np.inf
+ for l in lines:
+ x = l.get_xdata(orig=False)
+ left = min(np.nanmin(x), left)
+ right = max(np.nanmax(x), right)
+ return left, right
+
+
+def _set_ticks_props(axes, xlabelsize=None, xrot=None,
+ ylabelsize=None, yrot=None):
+ import matplotlib.pyplot as plt
+
+ for ax in _flatten(axes):
+ if xlabelsize is not None:
+ plt.setp(ax.get_xticklabels(), fontsize=xlabelsize)
+ if xrot is not None:
+ plt.setp(ax.get_xticklabels(), rotation=xrot)
+ if ylabelsize is not None:
+ plt.setp(ax.get_yticklabels(), fontsize=ylabelsize)
+ if yrot is not None:
+ plt.setp(ax.get_yticklabels(), rotation=yrot)
+ return axes
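For reference, the grid resolution in _get_layout above is easiest to see with a few concrete calls. A minimal sketch, assuming the vendored module is importable under the path added in this diff (_get_layout is a private helper used by _subplots, so this is illustration only):

    from pandas.plotting._tools import _get_layout

    _get_layout(5)                            # (3, 2): smallest near-square grid for 5 plots
    _get_layout(5, layout=(-1, 2))            # (3, 2): row count inferred from the fixed ncols
    _get_layout(3, layout_type='horizontal')  # (1, 3): a single row of nplots columns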
diff --git a/contrib/python/pandas/py2/pandas/testing.py b/contrib/python/pandas/py2/pandas/testing.py
new file mode 100644
index 00000000000..dbea1ecc736
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/testing.py
@@ -0,0 +1,8 @@
+# flake8: noqa
+
+"""
+Public testing utility functions.
+"""
+
+from pandas.util.testing import (
+ assert_frame_equal, assert_index_equal, assert_series_equal)
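A minimal usage sketch of the helpers re-exported above (nothing beyond this pandas build is assumed):

    import pandas as pd
    from pandas.testing import assert_frame_equal

    left = pd.DataFrame({'a': [1, 2, 3]})
    right = pd.DataFrame({'a': [1.0, 2.0, 3.0]})
    assert_frame_equal(left, right, check_dtype=False)  # passes: values match, dtype difference ignored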
diff --git a/contrib/python/pandas/py2/pandas/tests/io/data/fixed_width_format.txt b/contrib/python/pandas/py2/pandas/tests/io/data/fixed_width_format.txt
new file mode 100644
index 00000000000..bb487d8de7e
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/io/data/fixed_width_format.txt
@@ -0,0 +1,3 @@
+A B C
+1 2 3
+4 5 6
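The fixture above is a small fixed-width table, presumably exercised by the fixed-width reader tests. A hypothetical sketch of how such data is parsed with the public read_fwf reader (the inline string stands in for the file and is not the fixture's exact byte content):

    import pandas as pd
    from io import StringIO

    text = u'A   B   C\n1   2   3\n4   5   6\n'
    df = pd.read_fwf(StringIO(text))
    print(df.columns.tolist())   # ['A', 'B', 'C']
    print(df.shape)              # (2, 3)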
diff --git a/contrib/python/pandas/py2/pandas/tests/io/data/gbq_fake_job.txt b/contrib/python/pandas/py2/pandas/tests/io/data/gbq_fake_job.txt
new file mode 100644
index 00000000000..2a0f09bc66e
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/io/data/gbq_fake_job.txt
@@ -0,0 +1 @@
+{u'status': {u'state': u'DONE'}, u'kind': u'bigquery#job', u'statistics': {u'query': {u'cacheHit': True, u'totalBytesProcessed': u'0'}, u'endTime': u'1377668744674', u'totalBytesProcessed': u'0', u'startTime': u'1377668744466'}, u'jobReference': {u'projectId': u'57288129629', u'jobId': u'bqjob_r5f956972f0190bdf_00000140c374bf42_2'}, u'etag': u'"4PTsVxg68bQkQs1RJ1Ndewqkgg4/oO4VmgFrAku4N6FWci9s7iFIftc"', u'configuration': {u'query': {u'createDisposition': u'CREATE_IF_NEEDED', u'query': u'SELECT * FROM [publicdata:samples.shakespeare]', u'writeDisposition': u'WRITE_TRUNCATE', u'destinationTable': {u'projectId': u'57288129629', u'tableId': u'anonb5ec450da88eeeb78a27784ea482ee75a146d442', u'datasetId': u'_d0b4f5f0d50dc68a3eb0fa6cba66a9a8687d9253'}}}, u'id': u'57288129629:bqjob_r5f956972f0190bdf_00000140c374bf42_2', u'selfLink': u'https://www.googleapis.com/bigquery/v2/projects/57288129629/jobs/bqjob_r5f956972f0190bdf_00000140c374bf42_2'} \ No newline at end of file
diff --git a/contrib/python/pandas/py2/pandas/tests/io/msgpack/test_newspec.py b/contrib/python/pandas/py2/pandas/tests/io/msgpack/test_newspec.py
new file mode 100644
index 00000000000..d92c649c5e1
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/io/msgpack/test_newspec.py
@@ -0,0 +1,92 @@
+# coding: utf-8
+
+from pandas.io.msgpack import ExtType, packb, unpackb
+
+
+def test_str8():
+ header = b'\xd9'
+ data = b'x' * 32
+ b = packb(data.decode(), use_bin_type=True)
+ assert len(b) == len(data) + 2
+ assert b[0:2] == header + b'\x20'
+ assert b[2:] == data
+ assert unpackb(b) == data
+
+ data = b'x' * 255
+ b = packb(data.decode(), use_bin_type=True)
+ assert len(b) == len(data) + 2
+ assert b[0:2] == header + b'\xff'
+ assert b[2:] == data
+ assert unpackb(b) == data
+
+
+def test_bin8():
+ header = b'\xc4'
+ data = b''
+ b = packb(data, use_bin_type=True)
+ assert len(b) == len(data) + 2
+ assert b[0:2] == header + b'\x00'
+ assert b[2:] == data
+ assert unpackb(b) == data
+
+ data = b'x' * 255
+ b = packb(data, use_bin_type=True)
+ assert len(b) == len(data) + 2
+ assert b[0:2] == header + b'\xff'
+ assert b[2:] == data
+ assert unpackb(b) == data
+
+
+def test_bin16():
+ header = b'\xc5'
+ data = b'x' * 256
+ b = packb(data, use_bin_type=True)
+ assert len(b) == len(data) + 3
+ assert b[0:1] == header
+ assert b[1:3] == b'\x01\x00'
+ assert b[3:] == data
+ assert unpackb(b) == data
+
+ data = b'x' * 65535
+ b = packb(data, use_bin_type=True)
+ assert len(b) == len(data) + 3
+ assert b[0:1] == header
+ assert b[1:3] == b'\xff\xff'
+ assert b[3:] == data
+ assert unpackb(b) == data
+
+
+def test_bin32():
+ header = b'\xc6'
+ data = b'x' * 65536
+ b = packb(data, use_bin_type=True)
+ assert len(b) == len(data) + 5
+ assert b[0:1] == header
+ assert b[1:5] == b'\x00\x01\x00\x00'
+ assert b[5:] == data
+ assert unpackb(b) == data
+
+
+def test_ext():
+ def check(ext, packed):
+ assert packb(ext) == packed
+ assert unpackb(packed) == ext
+
+ check(ExtType(0x42, b'Z'), b'\xd4\x42Z') # fixext 1
+ check(ExtType(0x42, b'ZZ'), b'\xd5\x42ZZ') # fixext 2
+ check(ExtType(0x42, b'Z' * 4), b'\xd6\x42' + b'Z' * 4) # fixext 4
+ check(ExtType(0x42, b'Z' * 8), b'\xd7\x42' + b'Z' * 8) # fixext 8
+ check(ExtType(0x42, b'Z' * 16), b'\xd8\x42' + b'Z' * 16) # fixext 16
+ # ext 8
+ check(ExtType(0x42, b''), b'\xc7\x00\x42')
+ check(ExtType(0x42, b'Z' * 255), b'\xc7\xff\x42' + b'Z' * 255)
+ # ext 16
+ check(ExtType(0x42, b'Z' * 256), b'\xc8\x01\x00\x42' + b'Z' * 256)
+ check(ExtType(0x42, b'Z' * 0xffff), b'\xc8\xff\xff\x42' + b'Z' * 0xffff)
+ # ext 32
+ check(
+ ExtType(0x42, b'Z' *
+ 0x10000), b'\xc9\x00\x01\x00\x00\x42' + b'Z' * 0x10000)
+ # needs large memory
+ # check(ExtType(0x42, b'Z'*0xffffffff),
+ # b'\xc9\xff\xff\xff\xff\x42' + b'Z'*0xffffffff)
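The format boundaries checked above round-trip through the vendored bindings; a minimal sketch (assuming pandas.io.msgpack is present in this build):

    from pandas.io.msgpack import ExtType, packb, unpackb

    payload = packb(b'x' * 300, use_bin_type=True)   # 300 > 255 bytes, so bin 16: header b'\xc5'
    assert payload[0:1] == b'\xc5'
    assert unpackb(payload) == b'x' * 300

    ext = ExtType(0x42, b'Z' * 4)                    # fixext 4
    assert unpackb(packb(ext)) == ext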
diff --git a/contrib/python/pandas/py2/pandas/tests/io/parser/data/utf16_ex.txt b/contrib/python/pandas/py2/pandas/tests/io/parser/data/utf16_ex.txt
new file mode 100644
index 00000000000..f0b452a2bd5
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tests/io/parser/data/utf16_ex.txt
Binary files differ
diff --git a/contrib/python/pandas/py2/pandas/tseries/__init__.py b/contrib/python/pandas/py2/pandas/tseries/__init__.py
new file mode 100644
index 00000000000..e69de29bb2d
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tseries/__init__.py
diff --git a/contrib/python/pandas/py2/pandas/tseries/api.py b/contrib/python/pandas/py2/pandas/tseries/api.py
new file mode 100644
index 00000000000..2094791ecdc
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tseries/api.py
@@ -0,0 +1,8 @@
+"""
+Timeseries API
+"""
+
+# flake8: noqa
+
+from pandas.tseries.frequencies import infer_freq
+import pandas.tseries.offsets as offsets
diff --git a/contrib/python/pandas/py2/pandas/tseries/converter.py b/contrib/python/pandas/py2/pandas/tseries/converter.py
new file mode 100644
index 00000000000..05dd7cea1bd
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tseries/converter.py
@@ -0,0 +1,16 @@
+# flake8: noqa
+import warnings
+
+from pandas.plotting._converter import (
+ DatetimeConverter, MilliSecondLocator, PandasAutoDateFormatter,
+ PandasAutoDateLocator, PeriodConverter, TimeConverter, TimeFormatter,
+ TimeSeries_DateFormatter, TimeSeries_DateLocator, get_datevalue,
+ get_finder, time2num)
+
+
+def register():
+ from pandas.plotting._converter import register as register_
+ msg = ("'pandas.tseries.converter.register' has been moved and renamed to "
+ "'pandas.plotting.register_matplotlib_converters'. ")
+ warnings.warn(msg, FutureWarning, stacklevel=2)
+ register_()
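The shim keeps the old import path working while the FutureWarning points callers at the new location; a short sketch of both spellings (only this pandas build is assumed):

    # deprecated path: emits the FutureWarning above, then registers the converters
    import pandas.tseries.converter as converter
    converter.register()

    # preferred spelling named in the warning message
    from pandas.plotting import register_matplotlib_converters
    register_matplotlib_converters()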
diff --git a/contrib/python/pandas/py2/pandas/tseries/frequencies.py b/contrib/python/pandas/py2/pandas/tseries/frequencies.py
new file mode 100644
index 00000000000..c454db3bbdf
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tseries/frequencies.py
@@ -0,0 +1,497 @@
+# -*- coding: utf-8 -*-
+from datetime import timedelta
+import re
+
+import numpy as np
+from pytz import AmbiguousTimeError
+
+from pandas._libs.algos import unique_deltas
+from pandas._libs.tslibs import Timedelta, Timestamp
+from pandas._libs.tslibs.ccalendar import MONTH_ALIASES, int_to_weekday
+from pandas._libs.tslibs.conversion import tz_convert
+from pandas._libs.tslibs.fields import build_field_sarray
+import pandas._libs.tslibs.frequencies as libfreqs
+from pandas._libs.tslibs.offsets import _offset_to_period_map
+import pandas._libs.tslibs.resolution as libresolution
+from pandas._libs.tslibs.resolution import Resolution
+from pandas._libs.tslibs.timezones import UTC
+import pandas.compat as compat
+from pandas.compat import zip
+from pandas.util._decorators import cache_readonly
+
+from pandas.core.dtypes.common import (
+ is_datetime64_dtype, is_period_arraylike, is_timedelta64_dtype)
+from pandas.core.dtypes.generic import ABCSeries
+
+from pandas.core.algorithms import unique
+
+from pandas.tseries.offsets import (
+ DateOffset, Day, Hour, Micro, Milli, Minute, Nano, Second, prefix_mapping)
+
+_ONE_MICRO = 1000
+_ONE_MILLI = (_ONE_MICRO * 1000)
+_ONE_SECOND = (_ONE_MILLI * 1000)
+_ONE_MINUTE = (60 * _ONE_SECOND)
+_ONE_HOUR = (60 * _ONE_MINUTE)
+_ONE_DAY = (24 * _ONE_HOUR)
+
+# ---------------------------------------------------------------------
+# Offset names ("time rules") and related functions
+
+#: cache of previously seen offsets
+_offset_map = {}
+
+
+def get_period_alias(offset_str):
+ """ alias to closest period strings BQ->Q etc"""
+ return _offset_to_period_map.get(offset_str, None)
+
+
+_name_to_offset_map = {'days': Day(1),
+ 'hours': Hour(1),
+ 'minutes': Minute(1),
+ 'seconds': Second(1),
+ 'milliseconds': Milli(1),
+ 'microseconds': Micro(1),
+ 'nanoseconds': Nano(1)}
+
+
+def to_offset(freq):
+ """
+ Return DateOffset object from string or tuple representation
+ or datetime.timedelta object
+
+ Parameters
+ ----------
+ freq : str, tuple, datetime.timedelta, DateOffset or None
+
+ Returns
+ -------
+ delta : DateOffset
+ None if freq is None
+
+ Raises
+ ------
+ ValueError
+ If freq is an invalid frequency
+
+ See Also
+ --------
+ pandas.DateOffset
+
+ Examples
+ --------
+ >>> to_offset('5min')
+ <5 * Minutes>
+
+ >>> to_offset('1D1H')
+ <25 * Hours>
+
+ >>> to_offset(('W', 2))
+ <2 * Weeks: weekday=6>
+
+ >>> to_offset((2, 'B'))
+ <2 * BusinessDays>
+
+ >>> to_offset(datetime.timedelta(days=1))
+ <Day>
+
+ >>> to_offset(Hour())
+ <Hour>
+ """
+ if freq is None:
+ return None
+
+ if isinstance(freq, DateOffset):
+ return freq
+
+ if isinstance(freq, tuple):
+ name = freq[0]
+ stride = freq[1]
+ if isinstance(stride, compat.string_types):
+ name, stride = stride, name
+ name, _ = libfreqs._base_and_stride(name)
+ delta = get_offset(name) * stride
+
+ elif isinstance(freq, timedelta):
+ delta = None
+ freq = Timedelta(freq)
+ try:
+ for name in freq.components._fields:
+ offset = _name_to_offset_map[name]
+ stride = getattr(freq.components, name)
+ if stride != 0:
+ offset = stride * offset
+ if delta is None:
+ delta = offset
+ else:
+ delta = delta + offset
+ except Exception:
+ raise ValueError(libfreqs.INVALID_FREQ_ERR_MSG.format(freq))
+
+ else:
+ delta = None
+ stride_sign = None
+ try:
+ splitted = re.split(libfreqs.opattern, freq)
+ if splitted[-1] != '' and not splitted[-1].isspace():
+ # the last element must be blank
+ raise ValueError('last element must be blank')
+ for sep, stride, name in zip(splitted[0::4], splitted[1::4],
+ splitted[2::4]):
+ if sep != '' and not sep.isspace():
+ raise ValueError('separator must be spaces')
+ prefix = libfreqs._lite_rule_alias.get(name) or name
+ if stride_sign is None:
+ stride_sign = -1 if stride.startswith('-') else 1
+ if not stride:
+ stride = 1
+ if prefix in Resolution._reso_str_bump_map.keys():
+ stride, name = Resolution.get_stride_from_decimal(
+ float(stride), prefix
+ )
+ stride = int(stride)
+ offset = get_offset(name)
+ offset = offset * int(np.fabs(stride) * stride_sign)
+ if delta is None:
+ delta = offset
+ else:
+ delta = delta + offset
+ except Exception:
+ raise ValueError(libfreqs.INVALID_FREQ_ERR_MSG.format(freq))
+
+ if delta is None:
+ raise ValueError(libfreqs.INVALID_FREQ_ERR_MSG.format(freq))
+
+ return delta
+
+
+def get_offset(name):
+ """
+ Return DateOffset object associated with rule name
+
+ Examples
+ --------
+ get_offset('EOM') --> BMonthEnd(1)
+ """
+ if name not in libfreqs._dont_uppercase:
+ name = name.upper()
+ name = libfreqs._lite_rule_alias.get(name, name)
+ name = libfreqs._lite_rule_alias.get(name.lower(), name)
+ else:
+ name = libfreqs._lite_rule_alias.get(name, name)
+
+ if name not in _offset_map:
+ try:
+ split = name.split('-')
+ klass = prefix_mapping[split[0]]
+ # handles case where there's no suffix (and will TypeError if too
+ # many '-')
+ offset = klass._from_name(*split[1:])
+ except (ValueError, TypeError, KeyError):
+ # bad prefix or suffix
+ raise ValueError(libfreqs.INVALID_FREQ_ERR_MSG.format(name))
+ # cache
+ _offset_map[name] = offset
+
+ return _offset_map[name]
+
+
+# ---------------------------------------------------------------------
+# Period codes
+
+
+def infer_freq(index, warn=True):
+ """
+ Infer the most likely frequency given the input index. If the frequency is
+ uncertain, a warning will be printed.
+
+ Parameters
+ ----------
+ index : DatetimeIndex or TimedeltaIndex
+ if passed a Series will use the values of the series (NOT THE INDEX)
+ warn : boolean, default True
+
+ Returns
+ -------
+ freq : string or None
+ None if no discernible frequency
+ TypeError if the index is not datetime-like
+        ValueError if there are fewer than three values.
+ """
+ import pandas as pd
+
+ if isinstance(index, ABCSeries):
+ values = index._values
+ if not (is_datetime64_dtype(values) or
+ is_timedelta64_dtype(values) or
+ values.dtype == object):
+ raise TypeError("cannot infer freq from a non-convertible dtype "
+ "on a Series of {dtype}".format(dtype=index.dtype))
+ index = values
+
+ if is_period_arraylike(index):
+ raise TypeError("PeriodIndex given. Check the `freq` attribute "
+ "instead of using infer_freq.")
+ elif is_timedelta64_dtype(index):
+ # Allow TimedeltaIndex and TimedeltaArray
+ inferer = _TimedeltaFrequencyInferer(index, warn=warn)
+ return inferer.get_freq()
+
+ if isinstance(index, pd.Index) and not isinstance(index, pd.DatetimeIndex):
+ if isinstance(index, (pd.Int64Index, pd.Float64Index)):
+ raise TypeError("cannot infer freq from a non-convertible index "
+ "type {type}".format(type=type(index)))
+ index = index.values
+
+ if not isinstance(index, pd.DatetimeIndex):
+ try:
+ index = pd.DatetimeIndex(index)
+ except AmbiguousTimeError:
+ index = pd.DatetimeIndex(index.asi8)
+
+ inferer = _FrequencyInferer(index, warn=warn)
+ return inferer.get_freq()
+
+
+class _FrequencyInferer(object):
+ """
+ Not sure if I can avoid the state machine here
+ """
+
+ def __init__(self, index, warn=True):
+ self.index = index
+ self.values = index.asi8
+
+ # This moves the values, which are implicitly in UTC, to the
+        # timezone so they are in local time
+ if hasattr(index, 'tz'):
+ if index.tz is not None:
+ self.values = tz_convert(self.values, UTC, index.tz)
+
+ self.warn = warn
+
+ if len(index) < 3:
+ raise ValueError('Need at least 3 dates to infer frequency')
+
+ self.is_monotonic = (self.index._is_monotonic_increasing or
+ self.index._is_monotonic_decreasing)
+
+ @cache_readonly
+ def deltas(self):
+ return unique_deltas(self.values)
+
+ @cache_readonly
+ def deltas_asi8(self):
+ return unique_deltas(self.index.asi8)
+
+ @cache_readonly
+ def is_unique(self):
+ return len(self.deltas) == 1
+
+ @cache_readonly
+ def is_unique_asi8(self):
+ return len(self.deltas_asi8) == 1
+
+ def get_freq(self):
+ """
+ Find the appropriate frequency string to describe the inferred
+ frequency of self.values
+
+ Returns
+ -------
+ freqstr : str or None
+ """
+ if not self.is_monotonic or not self.index._is_unique:
+ return None
+
+ delta = self.deltas[0]
+ if _is_multiple(delta, _ONE_DAY):
+ return self._infer_daily_rule()
+
+ # Business hourly, maybe. 17: one day / 65: one weekend
+ if self.hour_deltas in ([1, 17], [1, 65], [1, 17, 65]):
+ return 'BH'
+ # Possibly intraday frequency. Here we use the
+ # original .asi8 values as the modified values
+ # will not work around DST transitions. See #8772
+ elif not self.is_unique_asi8:
+ return None
+
+ delta = self.deltas_asi8[0]
+ if _is_multiple(delta, _ONE_HOUR):
+ # Hours
+ return _maybe_add_count('H', delta / _ONE_HOUR)
+ elif _is_multiple(delta, _ONE_MINUTE):
+ # Minutes
+ return _maybe_add_count('T', delta / _ONE_MINUTE)
+ elif _is_multiple(delta, _ONE_SECOND):
+ # Seconds
+ return _maybe_add_count('S', delta / _ONE_SECOND)
+ elif _is_multiple(delta, _ONE_MILLI):
+ # Milliseconds
+ return _maybe_add_count('L', delta / _ONE_MILLI)
+ elif _is_multiple(delta, _ONE_MICRO):
+ # Microseconds
+ return _maybe_add_count('U', delta / _ONE_MICRO)
+ else:
+ # Nanoseconds
+ return _maybe_add_count('N', delta)
+
+ @cache_readonly
+ def day_deltas(self):
+ return [x / _ONE_DAY for x in self.deltas]
+
+ @cache_readonly
+ def hour_deltas(self):
+ return [x / _ONE_HOUR for x in self.deltas]
+
+ @cache_readonly
+ def fields(self):
+ return build_field_sarray(self.values)
+
+ @cache_readonly
+ def rep_stamp(self):
+ return Timestamp(self.values[0])
+
+ def month_position_check(self):
+ return libresolution.month_position_check(self.fields,
+ self.index.dayofweek)
+
+ @cache_readonly
+ def mdiffs(self):
+ nmonths = self.fields['Y'] * 12 + self.fields['M']
+ return unique_deltas(nmonths.astype('i8'))
+
+ @cache_readonly
+ def ydiffs(self):
+ return unique_deltas(self.fields['Y'].astype('i8'))
+
+ def _infer_daily_rule(self):
+ annual_rule = self._get_annual_rule()
+ if annual_rule:
+ nyears = self.ydiffs[0]
+ month = MONTH_ALIASES[self.rep_stamp.month]
+ alias = '{prefix}-{month}'.format(prefix=annual_rule, month=month)
+ return _maybe_add_count(alias, nyears)
+
+ quarterly_rule = self._get_quarterly_rule()
+ if quarterly_rule:
+ nquarters = self.mdiffs[0] / 3
+ mod_dict = {0: 12, 2: 11, 1: 10}
+ month = MONTH_ALIASES[mod_dict[self.rep_stamp.month % 3]]
+ alias = '{prefix}-{month}'.format(prefix=quarterly_rule,
+ month=month)
+ return _maybe_add_count(alias, nquarters)
+
+ monthly_rule = self._get_monthly_rule()
+ if monthly_rule:
+ return _maybe_add_count(monthly_rule, self.mdiffs[0])
+
+ if self.is_unique:
+ days = self.deltas[0] / _ONE_DAY
+ if days % 7 == 0:
+ # Weekly
+ day = int_to_weekday[self.rep_stamp.weekday()]
+ return _maybe_add_count(
+ 'W-{day}'.format(day=day), days / 7)
+ else:
+ return _maybe_add_count('D', days)
+
+ if self._is_business_daily():
+ return 'B'
+
+ wom_rule = self._get_wom_rule()
+ if wom_rule:
+ return wom_rule
+
+ def _get_annual_rule(self):
+ if len(self.ydiffs) > 1:
+ return None
+
+ if len(unique(self.fields['M'])) > 1:
+ return None
+
+ pos_check = self.month_position_check()
+ return {'cs': 'AS', 'bs': 'BAS',
+ 'ce': 'A', 'be': 'BA'}.get(pos_check)
+
+ def _get_quarterly_rule(self):
+ if len(self.mdiffs) > 1:
+ return None
+
+ if not self.mdiffs[0] % 3 == 0:
+ return None
+
+ pos_check = self.month_position_check()
+ return {'cs': 'QS', 'bs': 'BQS',
+ 'ce': 'Q', 'be': 'BQ'}.get(pos_check)
+
+ def _get_monthly_rule(self):
+ if len(self.mdiffs) > 1:
+ return None
+ pos_check = self.month_position_check()
+ return {'cs': 'MS', 'bs': 'BMS',
+ 'ce': 'M', 'be': 'BM'}.get(pos_check)
+
+ def _is_business_daily(self):
+ # quick check: cannot be business daily
+ if self.day_deltas != [1, 3]:
+ return False
+
+ # probably business daily, but need to confirm
+ first_weekday = self.index[0].weekday()
+ shifts = np.diff(self.index.asi8)
+ shifts = np.floor_divide(shifts, _ONE_DAY)
+ weekdays = np.mod(first_weekday + np.cumsum(shifts), 7)
+ return np.all(((weekdays == 0) & (shifts == 3)) |
+ ((weekdays > 0) & (weekdays <= 4) & (shifts == 1)))
+
+ def _get_wom_rule(self):
+ # wdiffs = unique(np.diff(self.index.week))
+ # We also need -47, -49, -48 to catch index spanning year boundary
+ # if not lib.ismember(wdiffs, set([4, 5, -47, -49, -48])).all():
+ # return None
+
+ weekdays = unique(self.index.weekday)
+ if len(weekdays) > 1:
+ return None
+
+ week_of_months = unique((self.index.day - 1) // 7)
+ # Only attempt to infer up to WOM-4. See #9425
+ week_of_months = week_of_months[week_of_months < 4]
+ if len(week_of_months) == 0 or len(week_of_months) > 1:
+ return None
+
+ # get which week
+ week = week_of_months[0] + 1
+ wd = int_to_weekday[weekdays[0]]
+
+ return 'WOM-{week}{weekday}'.format(week=week, weekday=wd)
+
+
+class _TimedeltaFrequencyInferer(_FrequencyInferer):
+
+ def _infer_daily_rule(self):
+ if self.is_unique:
+ days = self.deltas[0] / _ONE_DAY
+ if days % 7 == 0:
+ # Weekly
+ wd = int_to_weekday[self.rep_stamp.weekday()]
+ alias = 'W-{weekday}'.format(weekday=wd)
+ return _maybe_add_count(alias, days / 7)
+ else:
+ return _maybe_add_count('D', days)
+
+
+def _is_multiple(us, mult):
+ return us % mult == 0
+
+
+def _maybe_add_count(base, count):
+ if count != 1:
+ assert count == int(count)
+ count = int(count)
+ return '{count}{base}'.format(count=count, base=base)
+ else:
+ return base
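A minimal sketch of the two public entry points defined above (the dates are arbitrary examples):

    import pandas as pd
    from pandas.tseries.frequencies import infer_freq, to_offset

    idx = pd.date_range('2019-01-01', periods=10, freq='B')
    print(infer_freq(idx))                    # 'B', recovered via _FrequencyInferer
    print(to_offset('2W-SUN'))                # <2 * Weeks: weekday=6>
    print(to_offset(pd.Timedelta(hours=25)))  # <25 * Hours>, via the timedelta branch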
diff --git a/contrib/python/pandas/py2/pandas/tseries/holiday.py b/contrib/python/pandas/py2/pandas/tseries/holiday.py
new file mode 100644
index 00000000000..4016114919f
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tseries/holiday.py
@@ -0,0 +1,513 @@
+from datetime import datetime, timedelta
+import warnings
+
+from dateutil.relativedelta import FR, MO, SA, SU, TH, TU, WE # noqa
+import numpy as np
+
+from pandas.compat import add_metaclass
+from pandas.errors import PerformanceWarning
+
+from pandas import DateOffset, Series, Timestamp, date_range
+
+from pandas.tseries.offsets import Day, Easter
+
+
+def next_monday(dt):
+ """
+ If holiday falls on Saturday, use following Monday instead;
+ if holiday falls on Sunday, use Monday instead
+ """
+ if dt.weekday() == 5:
+ return dt + timedelta(2)
+ elif dt.weekday() == 6:
+ return dt + timedelta(1)
+ return dt
+
+
+def next_monday_or_tuesday(dt):
+ """
+    For the second holiday of two adjacent ones!
+ If holiday falls on Saturday, use following Monday instead;
+ if holiday falls on Sunday or Monday, use following Tuesday instead
+ (because Monday is already taken by adjacent holiday on the day before)
+ """
+ dow = dt.weekday()
+ if dow == 5 or dow == 6:
+ return dt + timedelta(2)
+ elif dow == 0:
+ return dt + timedelta(1)
+ return dt
+
+
+def previous_friday(dt):
+ """
+ If holiday falls on Saturday or Sunday, use previous Friday instead.
+ """
+ if dt.weekday() == 5:
+ return dt - timedelta(1)
+ elif dt.weekday() == 6:
+ return dt - timedelta(2)
+ return dt
+
+
+def sunday_to_monday(dt):
+ """
+ If holiday falls on Sunday, use day thereafter (Monday) instead.
+ """
+ if dt.weekday() == 6:
+ return dt + timedelta(1)
+ return dt
+
+
+def weekend_to_monday(dt):
+ """
+ If holiday falls on Sunday or Saturday,
+ use day thereafter (Monday) instead.
+    Needed for holidays such as Christmas observance in Europe.
+ """
+ if dt.weekday() == 6:
+ return dt + timedelta(1)
+ elif dt.weekday() == 5:
+ return dt + timedelta(2)
+ return dt
+
+
+def nearest_workday(dt):
+ """
+ If holiday falls on Saturday, use day before (Friday) instead;
+ if holiday falls on Sunday, use day thereafter (Monday) instead.
+ """
+ if dt.weekday() == 5:
+ return dt - timedelta(1)
+ elif dt.weekday() == 6:
+ return dt + timedelta(1)
+ return dt
+
+
+def next_workday(dt):
+ """
+ returns next weekday used for observances
+ """
+ dt += timedelta(days=1)
+ while dt.weekday() > 4:
+ # Mon-Fri are 0-4
+ dt += timedelta(days=1)
+ return dt
+
+
+def previous_workday(dt):
+ """
+ returns previous weekday used for observances
+ """
+ dt -= timedelta(days=1)
+ while dt.weekday() > 4:
+ # Mon-Fri are 0-4
+ dt -= timedelta(days=1)
+ return dt
+
+
+def before_nearest_workday(dt):
+ """
+ returns previous workday after nearest workday
+ """
+ return previous_workday(nearest_workday(dt))
+
+
+def after_nearest_workday(dt):
+ """
+ returns next workday after nearest workday
+ needed for Boxing day or multiple holidays in a series
+ """
+ return next_workday(nearest_workday(dt))
+
+
+class Holiday(object):
+ """
+ Class that defines a holiday with start/end dates and rules
+ for observance.
+ """
+
+ def __init__(self, name, year=None, month=None, day=None, offset=None,
+ observance=None, start_date=None, end_date=None,
+ days_of_week=None):
+ """
+ Parameters
+ ----------
+ name : str
+            Name of the holiday, defaults to class name
+ offset : array of pandas.tseries.offsets or
+ class from pandas.tseries.offsets
+ computes offset from date
+        observance: function
+            computes the observed date of the holiday, given a pandas Timestamp
+ days_of_week:
+            provide a tuple of days, e.g. (0, 1, 2, 3) for Monday through Thursday
+ Monday=0,..,Sunday=6
+
+ Examples
+ --------
+ >>> from pandas.tseries.holiday import Holiday, nearest_workday
+ >>> from dateutil.relativedelta import MO
+ >>> USMemorialDay = Holiday('MemorialDay', month=5, day=24,
+ offset=pd.DateOffset(weekday=MO(1)))
+ >>> USLaborDay = Holiday('Labor Day', month=9, day=1,
+ offset=pd.DateOffset(weekday=MO(1)))
+ >>> July3rd = Holiday('July 3rd', month=7, day=3,)
+ >>> NewYears = Holiday('New Years Day', month=1, day=1,
+ observance=nearest_workday),
+ >>> July3rd = Holiday('July 3rd', month=7, day=3,
+ days_of_week=(0, 1, 2, 3))
+ """
+ if offset is not None and observance is not None:
+ raise NotImplementedError("Cannot use both offset and observance.")
+
+ self.name = name
+ self.year = year
+ self.month = month
+ self.day = day
+ self.offset = offset
+ self.start_date = Timestamp(
+ start_date) if start_date is not None else start_date
+ self.end_date = Timestamp(
+ end_date) if end_date is not None else end_date
+ self.observance = observance
+ assert (days_of_week is None or type(days_of_week) == tuple)
+ self.days_of_week = days_of_week
+
+ def __repr__(self):
+ info = ''
+ if self.year is not None:
+ info += 'year={year}, '.format(year=self.year)
+ info += 'month={mon}, day={day}, '.format(mon=self.month, day=self.day)
+
+ if self.offset is not None:
+ info += 'offset={offset}'.format(offset=self.offset)
+
+ if self.observance is not None:
+ info += 'observance={obs}'.format(obs=self.observance)
+
+ repr = 'Holiday: {name} ({info})'.format(name=self.name, info=info)
+ return repr
+
+ def dates(self, start_date, end_date, return_name=False):
+ """
+ Calculate holidays observed between start date and end date
+
+ Parameters
+ ----------
+ start_date : starting date, datetime-like, optional
+ end_date : ending date, datetime-like, optional
+ return_name : bool, optional, default=False
+ If True, return a series that has dates and holiday names.
+ False will only return dates.
+ """
+ start_date = Timestamp(start_date)
+ end_date = Timestamp(end_date)
+
+ filter_start_date = start_date
+ filter_end_date = end_date
+
+ if self.year is not None:
+ dt = Timestamp(datetime(self.year, self.month, self.day))
+ if return_name:
+ return Series(self.name, index=[dt])
+ else:
+ return [dt]
+
+ dates = self._reference_dates(start_date, end_date)
+ holiday_dates = self._apply_rule(dates)
+ if self.days_of_week is not None:
+ holiday_dates = holiday_dates[np.in1d(holiday_dates.dayofweek,
+ self.days_of_week)]
+
+ if self.start_date is not None:
+ filter_start_date = max(self.start_date.tz_localize(
+ filter_start_date.tz), filter_start_date)
+ if self.end_date is not None:
+ filter_end_date = min(self.end_date.tz_localize(
+ filter_end_date.tz), filter_end_date)
+ holiday_dates = holiday_dates[(holiday_dates >= filter_start_date) &
+ (holiday_dates <= filter_end_date)]
+ if return_name:
+ return Series(self.name, index=holiday_dates)
+ return holiday_dates
+
+ def _reference_dates(self, start_date, end_date):
+ """
+ Get reference dates for the holiday.
+
+        Return reference dates for the holiday, including the year
+        prior to the start_date and the year following the end_date. This ensures
+ that any offsets to be applied will yield the holidays within
+ the passed in dates.
+ """
+ if self.start_date is not None:
+ start_date = self.start_date.tz_localize(start_date.tz)
+
+ if self.end_date is not None:
+ end_date = self.end_date.tz_localize(start_date.tz)
+
+ year_offset = DateOffset(years=1)
+ reference_start_date = Timestamp(
+ datetime(start_date.year - 1, self.month, self.day))
+
+ reference_end_date = Timestamp(
+ datetime(end_date.year + 1, self.month, self.day))
+ # Don't process unnecessary holidays
+ dates = date_range(start=reference_start_date,
+ end=reference_end_date,
+ freq=year_offset, tz=start_date.tz)
+
+ return dates
+
+ def _apply_rule(self, dates):
+ """
+ Apply the given offset/observance to a DatetimeIndex of dates.
+
+ Parameters
+ ----------
+ dates : DatetimeIndex
+ Dates to apply the given offset/observance rule
+
+ Returns
+ -------
+ Dates with rules applied
+ """
+ if self.observance is not None:
+ return dates.map(lambda d: self.observance(d))
+
+ if self.offset is not None:
+ if not isinstance(self.offset, list):
+ offsets = [self.offset]
+ else:
+ offsets = self.offset
+ for offset in offsets:
+
+ # if we are adding a non-vectorized value
+ # ignore the PerformanceWarnings:
+ with warnings.catch_warnings():
+ warnings.simplefilter("ignore", PerformanceWarning)
+ dates += offset
+ return dates
+
+
+holiday_calendars = {}
+
+
+def register(cls):
+ try:
+ name = cls.name
+ except AttributeError:
+ name = cls.__name__
+ holiday_calendars[name] = cls
+
+
+def get_calendar(name):
+ """
+ Return an instance of a calendar based on its name.
+
+ Parameters
+ ----------
+ name : str
+ Calendar name to return an instance of
+ """
+ return holiday_calendars[name]()
+
+
+class HolidayCalendarMetaClass(type):
+
+ def __new__(cls, clsname, bases, attrs):
+ calendar_class = super(HolidayCalendarMetaClass, cls).__new__(
+ cls, clsname, bases, attrs)
+ register(calendar_class)
+ return calendar_class
+
+
+@add_metaclass(HolidayCalendarMetaClass)
+class AbstractHolidayCalendar(object):
+ """
+ Abstract interface to create holidays following certain rules.
+ """
+ __metaclass__ = HolidayCalendarMetaClass
+ rules = []
+ start_date = Timestamp(datetime(1970, 1, 1))
+ end_date = Timestamp(datetime(2030, 12, 31))
+ _cache = None
+
+ def __init__(self, name=None, rules=None):
+ """
+        Initializes holiday calendar object with a given set of rules. Normally
+ classes just have the rules defined within them.
+
+ Parameters
+ ----------
+ name : str
+ Name of the holiday calendar, defaults to class name
+ rules : array of Holiday objects
+ A set of rules used to create the holidays.
+ """
+ super(AbstractHolidayCalendar, self).__init__()
+ if name is None:
+ name = self.__class__.__name__
+ self.name = name
+
+ if rules is not None:
+ self.rules = rules
+
+ def rule_from_name(self, name):
+ for rule in self.rules:
+ if rule.name == name:
+ return rule
+
+ return None
+
+ def holidays(self, start=None, end=None, return_name=False):
+ """
+        Returns the holidays between start_date and end_date
+
+ Parameters
+ ----------
+ start : starting date, datetime-like, optional
+ end : ending date, datetime-like, optional
+ return_name : bool, optional
+ If True, return a series that has dates and holiday names.
+ False will only return a DatetimeIndex of dates.
+
+ Returns
+ -------
+ DatetimeIndex of holidays
+ """
+ if self.rules is None:
+ raise Exception('Holiday Calendar {name} does not have any '
+ 'rules specified'.format(name=self.name))
+
+ if start is None:
+ start = AbstractHolidayCalendar.start_date
+
+ if end is None:
+ end = AbstractHolidayCalendar.end_date
+
+ start = Timestamp(start)
+ end = Timestamp(end)
+
+ holidays = None
+ # If we don't have a cache or the dates are outside the prior cache, we
+ # get them again
+ if (self._cache is None or start < self._cache[0] or
+ end > self._cache[1]):
+ for rule in self.rules:
+ rule_holidays = rule.dates(start, end, return_name=True)
+
+ if holidays is None:
+ holidays = rule_holidays
+ else:
+ holidays = holidays.append(rule_holidays)
+
+ self._cache = (start, end, holidays.sort_index())
+
+ holidays = self._cache[2]
+ holidays = holidays[start:end]
+
+ if return_name:
+ return holidays
+ else:
+ return holidays.index
+
+ @staticmethod
+ def merge_class(base, other):
+ """
+ Merge holiday calendars together. The base calendar
+        will take precedence over other. The merge will be done
+ based on each holiday's name.
+
+ Parameters
+ ----------
+ base : AbstractHolidayCalendar
+ instance/subclass or array of Holiday objects
+ other : AbstractHolidayCalendar
+ instance/subclass or array of Holiday objects
+ """
+ try:
+ other = other.rules
+ except AttributeError:
+ pass
+
+ if not isinstance(other, list):
+ other = [other]
+ other_holidays = {holiday.name: holiday for holiday in other}
+
+ try:
+ base = base.rules
+ except AttributeError:
+ pass
+
+ if not isinstance(base, list):
+ base = [base]
+ base_holidays = {holiday.name: holiday for holiday in base}
+
+ other_holidays.update(base_holidays)
+ return list(other_holidays.values())
+
+ def merge(self, other, inplace=False):
+ """
+ Merge holiday calendars together. The caller's class
+ rules take precedence. The merge will be done
+ based on each holiday's name.
+
+ Parameters
+ ----------
+ other : holiday calendar
+ inplace : bool (default=False)
+            If True, set the calendar's rules to the merged holidays, else return array of Holidays
+ """
+ holidays = self.merge_class(self, other)
+ if inplace:
+ self.rules = holidays
+ else:
+ return holidays
+
+
+USMemorialDay = Holiday('MemorialDay', month=5, day=31,
+ offset=DateOffset(weekday=MO(-1)))
+USLaborDay = Holiday('Labor Day', month=9, day=1,
+ offset=DateOffset(weekday=MO(1)))
+USColumbusDay = Holiday('Columbus Day', month=10, day=1,
+ offset=DateOffset(weekday=MO(2)))
+USThanksgivingDay = Holiday('Thanksgiving', month=11, day=1,
+ offset=DateOffset(weekday=TH(4)))
+USMartinLutherKingJr = Holiday('Dr. Martin Luther King Jr.',
+ start_date=datetime(1986, 1, 1), month=1, day=1,
+ offset=DateOffset(weekday=MO(3)))
+USPresidentsDay = Holiday('President''s Day', month=2, day=1,
+ offset=DateOffset(weekday=MO(3)))
+GoodFriday = Holiday("Good Friday", month=1, day=1, offset=[Easter(), Day(-2)])
+
+EasterMonday = Holiday("Easter Monday", month=1, day=1,
+ offset=[Easter(), Day(1)])
+
+
+class USFederalHolidayCalendar(AbstractHolidayCalendar):
+ """
+ US Federal Government Holiday Calendar based on rules specified by:
+ https://www.opm.gov/policy-data-oversight/
+ snow-dismissal-procedures/federal-holidays/
+ """
+ rules = [
+ Holiday('New Years Day', month=1, day=1, observance=nearest_workday),
+ USMartinLutherKingJr,
+ USPresidentsDay,
+ USMemorialDay,
+ Holiday('July 4th', month=7, day=4, observance=nearest_workday),
+ USLaborDay,
+ USColumbusDay,
+ Holiday('Veterans Day', month=11, day=11, observance=nearest_workday),
+ USThanksgivingDay,
+ Holiday('Christmas', month=12, day=25, observance=nearest_workday)
+ ]
+
+
+def HolidayCalendarFactory(name, base, other,
+ base_class=AbstractHolidayCalendar):
+ rules = AbstractHolidayCalendar.merge_class(base, other)
+ calendar_class = type(name, (base_class,), {"rules": rules, "name": name})
+ return calendar_class
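A minimal sketch of the calendar machinery defined above (the dates and the extra rule are arbitrary examples, not part of the original change):

    from pandas.tseries.holiday import (
        Holiday, HolidayCalendarFactory, USFederalHolidayCalendar, nearest_workday)

    cal = USFederalHolidayCalendar()
    print(cal.holidays(start='2019-01-01', end='2019-12-31'))  # DatetimeIndex of observed holidays

    # extend the federal calendar with one extra rule via the factory
    boxing_day = Holiday('Boxing Day', month=12, day=26, observance=nearest_workday)
    DecemberCal = HolidayCalendarFactory('DecemberCal', USFederalHolidayCalendar, boxing_day)
    print(DecemberCal().holidays('2019-12-01', '2019-12-31', return_name=True))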
diff --git a/contrib/python/pandas/py2/pandas/tseries/offsets.py b/contrib/python/pandas/py2/pandas/tseries/offsets.py
new file mode 100644
index 00000000000..f208ce37a3b
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tseries/offsets.py
@@ -0,0 +1,2514 @@
+# -*- coding: utf-8 -*-
+from datetime import date, datetime, timedelta
+import functools
+import operator
+
+from dateutil.easter import easter
+import numpy as np
+
+from pandas._libs.tslibs import (
+ NaT, OutOfBoundsDatetime, Timedelta, Timestamp, ccalendar, conversion,
+ delta_to_nanoseconds, frequencies as libfrequencies, normalize_date,
+ offsets as liboffsets, timezones)
+from pandas._libs.tslibs.offsets import (
+ ApplyTypeError, BaseOffset, _get_calendar, _is_normalized, _to_dt64,
+ apply_index_wraps, as_datetime, roll_yearday, shift_month)
+import pandas.compat as compat
+from pandas.compat import range
+from pandas.errors import AbstractMethodError
+from pandas.util._decorators import cache_readonly
+
+from pandas.core.dtypes.generic import ABCPeriod
+
+from pandas.core.tools.datetimes import to_datetime
+
+__all__ = ['Day', 'BusinessDay', 'BDay', 'CustomBusinessDay', 'CDay',
+ 'CBMonthEnd', 'CBMonthBegin',
+ 'MonthBegin', 'BMonthBegin', 'MonthEnd', 'BMonthEnd',
+ 'SemiMonthEnd', 'SemiMonthBegin',
+ 'BusinessHour', 'CustomBusinessHour',
+ 'YearBegin', 'BYearBegin', 'YearEnd', 'BYearEnd',
+ 'QuarterBegin', 'BQuarterBegin', 'QuarterEnd', 'BQuarterEnd',
+ 'LastWeekOfMonth', 'FY5253Quarter', 'FY5253',
+ 'Week', 'WeekOfMonth', 'Easter',
+ 'Hour', 'Minute', 'Second', 'Milli', 'Micro', 'Nano',
+ 'DateOffset']
+
+# convert to/from datetime/timestamp to allow invalid Timestamp ranges to
+# pass thru
+
+
+def as_timestamp(obj):
+ if isinstance(obj, Timestamp):
+ return obj
+ try:
+ return Timestamp(obj)
+ except (OutOfBoundsDatetime):
+ pass
+ return obj
+
+
+def apply_wraps(func):
+ @functools.wraps(func)
+ def wrapper(self, other):
+ if other is NaT:
+ return NaT
+ elif isinstance(other, (timedelta, Tick, DateOffset)):
+ # timedelta path
+ return func(self, other)
+ elif isinstance(other, (np.datetime64, datetime, date)):
+ other = as_timestamp(other)
+
+ tz = getattr(other, 'tzinfo', None)
+ nano = getattr(other, 'nanosecond', 0)
+
+ try:
+ if self._adjust_dst and isinstance(other, Timestamp):
+ other = other.tz_localize(None)
+
+ result = func(self, other)
+
+ if self._adjust_dst:
+ result = conversion.localize_pydatetime(result, tz)
+
+ result = Timestamp(result)
+ if self.normalize:
+ result = result.normalize()
+
+ # nanosecond may be deleted depending on offset process
+ if not self.normalize and nano != 0:
+ if not isinstance(self, Nano) and result.nanosecond != nano:
+ if result.tz is not None:
+ # convert to UTC
+ value = conversion.tz_convert_single(
+ result.value, timezones.UTC, result.tz)
+ else:
+ value = result.value
+ result = Timestamp(value + nano)
+
+ if tz is not None and result.tzinfo is None:
+ result = conversion.localize_pydatetime(result, tz)
+
+ except OutOfBoundsDatetime:
+ result = func(self, as_datetime(other))
+
+ if self.normalize:
+ # normalize_date returns normal datetime
+ result = normalize_date(result)
+
+ if tz is not None and result.tzinfo is None:
+ result = conversion.localize_pydatetime(result, tz)
+
+ return result
+ return wrapper
+
+
+# ---------------------------------------------------------------------
+# DateOffset
+
+
+class DateOffset(BaseOffset):
+ """
+ Standard kind of date increment used for a date range.
+
+ Works exactly like relativedelta in terms of the keyword args you
+ pass in, use of the keyword n is discouraged-- you would be better
+ off specifying n in the keywords you use, but regardless it is
+ there for you. n is needed for DateOffset subclasses.
+
+    DateOffsets work as follows.  Each offset specifies a set of dates
+    that conform to the DateOffset.  For example, Bday defines this
+ set to be the set of dates that are weekdays (M-F). To test if a
+ date is in the set of a DateOffset dateOffset we can use the
+ onOffset method: dateOffset.onOffset(date).
+
+ If a date is not on a valid date, the rollback and rollforward
+ methods can be used to roll the date to the nearest valid date
+ before/after the date.
+
+ DateOffsets can be created to move dates forward a given number of
+ valid dates. For example, Bday(2) can be added to a date to move
+ it two business days forward. If the date does not start on a
+ valid date, first it is moved to a valid date. Thus pseudo code
+ is:
+
+ def __add__(date):
+ date = rollback(date) # does nothing if date is valid
+ return date + <n number of periods>
+
+ When a date offset is created for a negative number of periods,
+ the date is first rolled forward. The pseudo code is:
+
+ def __add__(date):
+        date = rollforward(date) # does nothing if date is valid
+ return date + <n number of periods>
+
+ Zero presents a problem. Should it roll forward or back? We
+ arbitrarily have it rollforward:
+
+ date + BDay(0) == BDay.rollforward(date)
+
+ Since 0 is a bit weird, we suggest avoiding its use.
+
+ Parameters
+ ----------
+ n : int, default 1
+ The number of time periods the offset represents.
+ normalize : bool, default False
+ Whether to round the result of a DateOffset addition down to the
+ previous midnight.
+ **kwds
+        Temporal parameters that add to or replace the offset value.
+
+ Parameters that **add** to the offset (like Timedelta):
+
+ - years
+ - months
+ - weeks
+ - days
+ - hours
+ - minutes
+ - seconds
+ - microseconds
+ - nanoseconds
+
+ Parameters that **replace** the offset value:
+
+ - year
+ - month
+ - day
+ - weekday
+ - hour
+ - minute
+ - second
+ - microsecond
+ - nanosecond
+
+ See Also
+ --------
+ dateutil.relativedelta.relativedelta
+
+ Examples
+ --------
+ >>> ts = pd.Timestamp('2017-01-01 09:10:11')
+ >>> ts + DateOffset(months=3)
+ Timestamp('2017-04-01 09:10:11')
+
+ >>> ts = pd.Timestamp('2017-01-01 09:10:11')
+ >>> ts + DateOffset(month=3)
+ Timestamp('2017-03-01 09:10:11')
+ """
+ _params = cache_readonly(BaseOffset._params.fget)
+ _use_relativedelta = False
+ _adjust_dst = False
+ _attributes = frozenset(['n', 'normalize'] +
+ list(liboffsets.relativedelta_kwds))
+
+ # default for prior pickles
+ normalize = False
+
+ def __init__(self, n=1, normalize=False, **kwds):
+ BaseOffset.__init__(self, n, normalize)
+
+ off, use_rd = liboffsets._determine_offset(kwds)
+ object.__setattr__(self, "_offset", off)
+ object.__setattr__(self, "_use_relativedelta", use_rd)
+ for key in kwds:
+ val = kwds[key]
+ object.__setattr__(self, key, val)
+
+ @apply_wraps
+ def apply(self, other):
+ if self._use_relativedelta:
+ other = as_datetime(other)
+
+ if len(self.kwds) > 0:
+ tzinfo = getattr(other, 'tzinfo', None)
+ if tzinfo is not None and self._use_relativedelta:
+ # perform calculation in UTC
+ other = other.replace(tzinfo=None)
+
+ if self.n > 0:
+ for i in range(self.n):
+ other = other + self._offset
+ else:
+ for i in range(-self.n):
+ other = other - self._offset
+
+ if tzinfo is not None and self._use_relativedelta:
+ # bring tz back from UTC calculation
+ other = conversion.localize_pydatetime(other, tzinfo)
+
+ return as_timestamp(other)
+ else:
+ return other + timedelta(self.n)
+
+ @apply_index_wraps
+ def apply_index(self, i):
+ """
+ Vectorized apply of DateOffset to DatetimeIndex,
+        raises NotImplementedError for offsets without a
+ vectorized implementation.
+
+ Parameters
+ ----------
+ i : DatetimeIndex
+
+ Returns
+ -------
+ y : DatetimeIndex
+ """
+
+ if type(self) is not DateOffset:
+ raise NotImplementedError("DateOffset subclass {name} "
+ "does not have a vectorized "
+ "implementation".format(
+ name=self.__class__.__name__))
+ kwds = self.kwds
+ relativedelta_fast = {'years', 'months', 'weeks', 'days', 'hours',
+ 'minutes', 'seconds', 'microseconds'}
+ # relativedelta/_offset path only valid for base DateOffset
+ if (self._use_relativedelta and
+ set(kwds).issubset(relativedelta_fast)):
+
+ months = ((kwds.get('years', 0) * 12 +
+ kwds.get('months', 0)) * self.n)
+ if months:
+ shifted = liboffsets.shift_months(i.asi8, months)
+ i = type(i)(shifted, freq=i.freq, dtype=i.dtype)
+
+ weeks = (kwds.get('weeks', 0)) * self.n
+ if weeks:
+ # integer addition on PeriodIndex is deprecated,
+ # so we directly use _time_shift instead
+ asper = i.to_period('W')
+ if not isinstance(asper._data, np.ndarray):
+ # unwrap PeriodIndex --> PeriodArray
+ asper = asper._data
+ shifted = asper._time_shift(weeks)
+ i = shifted.to_timestamp() + i.to_perioddelta('W')
+
+ timedelta_kwds = {k: v for k, v in kwds.items()
+ if k in ['days', 'hours', 'minutes',
+ 'seconds', 'microseconds']}
+ if timedelta_kwds:
+ delta = Timedelta(**timedelta_kwds)
+ i = i + (self.n * delta)
+ return i
+ elif not self._use_relativedelta and hasattr(self, '_offset'):
+ # timedelta
+ return i + (self._offset * self.n)
+ else:
+ # relativedelta with other keywords
+ kwd = set(kwds) - relativedelta_fast
+ raise NotImplementedError("DateOffset with relativedelta "
+ "keyword(s) {kwd} not able to be "
+ "applied vectorized".format(kwd=kwd))
+
+ def isAnchored(self):
+ # TODO: Does this make sense for the general case? It would help
+ # if there were a canonical docstring for what isAnchored means.
+ return (self.n == 1)
+
+ # TODO: Combine this with BusinessMixin version by defining a whitelisted
+ # set of attributes on each object rather than the existing behavior of
+ # iterating over internal ``__dict__``
+ def _repr_attrs(self):
+ exclude = {'n', 'inc', 'normalize'}
+ attrs = []
+ for attr in sorted(self.__dict__):
+ if attr.startswith('_') or attr == 'kwds':
+ continue
+ elif attr not in exclude:
+ value = getattr(self, attr)
+ attrs.append('{attr}={value}'.format(attr=attr, value=value))
+
+ out = ''
+ if attrs:
+ out += ': ' + ', '.join(attrs)
+ return out
+
+ @property
+ def name(self):
+ return self.rule_code
+
+ def rollback(self, dt):
+ """
+ Roll provided date backward to next offset only if not on offset.
+ """
+ dt = as_timestamp(dt)
+ if not self.onOffset(dt):
+ dt = dt - self.__class__(1, normalize=self.normalize, **self.kwds)
+ return dt
+
+ def rollforward(self, dt):
+ """
+ Roll provided date forward to next offset only if not on offset.
+ """
+ dt = as_timestamp(dt)
+ if not self.onOffset(dt):
+ dt = dt + self.__class__(1, normalize=self.normalize, **self.kwds)
+ return dt
+
+ def onOffset(self, dt):
+ if self.normalize and not _is_normalized(dt):
+ return False
+ # XXX, see #1395
+ if type(self) == DateOffset or isinstance(self, Tick):
+ return True
+
+ # Default (slow) method for determining if some date is a member of the
+ # date range generated by this offset. Subclasses may have this
+ # re-implemented in a nicer way.
+ a = dt
+ b = ((dt + self) - self)
+ return a == b
+
+ # way to get around weirdness with rule_code
+ @property
+ def _prefix(self):
+ raise NotImplementedError('Prefix not defined')
+
+ @property
+ def rule_code(self):
+ return self._prefix
+
+ @cache_readonly
+ def freqstr(self):
+ try:
+ code = self.rule_code
+ except NotImplementedError:
+ return repr(self)
+
+ if self.n != 1:
+ fstr = '{n}{code}'.format(n=self.n, code=code)
+ else:
+ fstr = code
+
+ try:
+ if self._offset:
+ fstr += self._offset_str()
+ except AttributeError:
+ # TODO: standardize `_offset` vs `offset` naming convention
+ pass
+
+ return fstr
+
+ def _offset_str(self):
+ return ''
+
+ @property
+ def nanos(self):
+ raise ValueError("{name} is a non-fixed frequency".format(name=self))
+
+
+class SingleConstructorOffset(DateOffset):
+ @classmethod
+ def _from_name(cls, suffix=None):
+ # default _from_name calls cls with no args
+ if suffix:
+ raise ValueError("Bad freq suffix {suffix}".format(suffix=suffix))
+ return cls()
+
+
+class _CustomMixin(object):
+ """
+ Mixin for classes that define and validate calendar, holidays,
+ and weekdays attributes.
+ """
+ def __init__(self, weekmask, holidays, calendar):
+ calendar, holidays = _get_calendar(weekmask=weekmask,
+ holidays=holidays,
+ calendar=calendar)
+ # Custom offset instances are identified by the
+ # following two attributes. See DateOffset._params()
+ # holidays, weekmask
+
+ object.__setattr__(self, "weekmask", weekmask)
+ object.__setattr__(self, "holidays", holidays)
+ object.__setattr__(self, "calendar", calendar)
+
+
+class BusinessMixin(object):
+ """
+    Mixin for business offset types to provide related functions.
+ """
+
+ @property
+ def offset(self):
+ """
+ Alias for self._offset.
+ """
+ # Alias for backward compat
+ return self._offset
+
+ def _repr_attrs(self):
+ if self.offset:
+ attrs = ['offset={offset!r}'.format(offset=self.offset)]
+ else:
+ attrs = None
+ out = ''
+ if attrs:
+ out += ': ' + ', '.join(attrs)
+ return out
+
+
+class BusinessDay(BusinessMixin, SingleConstructorOffset):
+ """
+ DateOffset subclass representing possibly n business days.
+ """
+ _prefix = 'B'
+ _adjust_dst = True
+ _attributes = frozenset(['n', 'normalize', 'offset'])
+
+ def __init__(self, n=1, normalize=False, offset=timedelta(0)):
+ BaseOffset.__init__(self, n, normalize)
+ object.__setattr__(self, "_offset", offset)
+
+ def _offset_str(self):
+ def get_str(td):
+ off_str = ''
+ if td.days > 0:
+ off_str += str(td.days) + 'D'
+ if td.seconds > 0:
+ s = td.seconds
+ hrs = int(s / 3600)
+ if hrs != 0:
+ off_str += str(hrs) + 'H'
+ s -= hrs * 3600
+ mts = int(s / 60)
+ if mts != 0:
+ off_str += str(mts) + 'Min'
+ s -= mts * 60
+ if s != 0:
+ off_str += str(s) + 's'
+ if td.microseconds > 0:
+ off_str += str(td.microseconds) + 'us'
+ return off_str
+
+ if isinstance(self.offset, timedelta):
+ zero = timedelta(0, 0, 0)
+ if self.offset >= zero:
+ off_str = '+' + get_str(self.offset)
+ else:
+ off_str = '-' + get_str(-self.offset)
+ return off_str
+ else:
+ return '+' + repr(self.offset)
+
+ @apply_wraps
+ def apply(self, other):
+ if isinstance(other, datetime):
+ n = self.n
+ wday = other.weekday()
+
+ # avoid slowness below by operating on weeks first
+ weeks = n // 5
+ if n <= 0 and wday > 4:
+ # roll forward
+ n += 1
+
+ n -= 5 * weeks
+
+ # n is always >= 0 at this point
+ if n == 0 and wday > 4:
+ # roll back
+ days = 4 - wday
+ elif wday > 4:
+ # roll forward
+ days = (7 - wday) + (n - 1)
+ elif wday + n <= 4:
+ # shift by n days without leaving the current week
+ days = n
+ else:
+ # shift by n days plus 2 to get past the weekend
+ days = n + 2
+
+ result = other + timedelta(days=7 * weeks + days)
+ if self.offset:
+ result = result + self.offset
+ return result
+
+ elif isinstance(other, (timedelta, Tick)):
+ return BDay(self.n, offset=self.offset + other,
+ normalize=self.normalize)
+ else:
+ raise ApplyTypeError('Only know how to combine business day with '
+ 'datetime or timedelta.')
+
+ @apply_index_wraps
+ def apply_index(self, i):
+ time = i.to_perioddelta('D')
+ # to_period rolls forward to next BDay; track and
+ # reduce n where it does when rolling forward
+ asper = i.to_period('B')
+ if not isinstance(asper._data, np.ndarray):
+ # unwrap PeriodIndex --> PeriodArray
+ asper = asper._data
+
+ if self.n > 0:
+ shifted = (i.to_perioddelta('B') - time).asi8 != 0
+
+ # Integer-array addition is deprecated, so we use
+ # _time_shift directly
+ roll = np.where(shifted, self.n - 1, self.n)
+ shifted = asper._addsub_int_array(roll, operator.add)
+ else:
+ # Integer addition is deprecated, so we use _time_shift directly
+ roll = self.n
+ shifted = asper._time_shift(roll)
+
+ result = shifted.to_timestamp() + time
+ return result
+
+ def onOffset(self, dt):
+ if self.normalize and not _is_normalized(dt):
+ return False
+ return dt.weekday() < 5
+
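+# [Editor's note] Illustrative sketch, not part of the upstream source: a
+# minimal example of how BusinessDay rolls a weekend, assuming only the
+# public pandas.Timestamp API. Dates are arbitrary.
+#
+#     >>> from pandas import Timestamp
+#     >>> Timestamp('2014-08-01 10:00') + BusinessDay()    # a Friday
+#     # -> Monday 2014-08-04 10:00 (weekend skipped, time preserved)
+#     >>> Timestamp('2014-08-02 10:00') + BusinessDay(0)   # a Saturday, n=0
+#     # -> rolled to Monday 2014-08-04 10:00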
+
+class BusinessHourMixin(BusinessMixin):
+
+ def __init__(self, start='09:00', end='17:00', offset=timedelta(0)):
+ # must be validated here for the equality check
+ start = liboffsets._validate_business_time(start)
+ object.__setattr__(self, "start", start)
+ end = liboffsets._validate_business_time(end)
+ object.__setattr__(self, "end", end)
+ object.__setattr__(self, "_offset", offset)
+
+ @cache_readonly
+ def next_bday(self):
+ """
+ Used for moving to next business day.
+ """
+ if self.n >= 0:
+ nb_offset = 1
+ else:
+ nb_offset = -1
+ if self._prefix.startswith('C'):
+ # CustomBusinessHour
+ return CustomBusinessDay(n=nb_offset,
+ weekmask=self.weekmask,
+ holidays=self.holidays,
+ calendar=self.calendar)
+ else:
+ return BusinessDay(n=nb_offset)
+
+ @cache_readonly
+ def _get_daytime_flag(self):
+ if self.start == self.end:
+ raise ValueError('start and end must not be the same')
+ elif self.start < self.end:
+ return True
+ else:
+ return False
+
+ def _next_opening_time(self, other):
+ """
+ If n is positive, return tomorrow's business day opening time.
+ Otherwise yesterday's business day's opening time.
+
+ Opening time always locates on BusinessDay.
+ Otherwise, closing time may not if business hour extends over midnight.
+ """
+ if not self.next_bday.onOffset(other):
+ other = other + self.next_bday
+ else:
+ if self.n >= 0 and self.start < other.time():
+ other = other + self.next_bday
+ elif self.n < 0 and other.time() < self.start:
+ other = other + self.next_bday
+ return datetime(other.year, other.month, other.day,
+ self.start.hour, self.start.minute)
+
+ def _prev_opening_time(self, other):
+ """
+ If n is positive, return yesterday's business day opening time.
+ Otherwise yesterday business day's opening time.
+ """
+ if not self.next_bday.onOffset(other):
+ other = other - self.next_bday
+ else:
+ if self.n >= 0 and other.time() < self.start:
+ other = other - self.next_bday
+ elif self.n < 0 and other.time() > self.start:
+ other = other - self.next_bday
+ return datetime(other.year, other.month, other.day,
+ self.start.hour, self.start.minute)
+
+ @cache_readonly
+ def _get_business_hours_by_sec(self):
+ """
+ Return business hours in a day by seconds.
+ """
+ if self._get_daytime_flag:
+ # create dummy datetime to calculate businesshours in a day
+ dtstart = datetime(2014, 4, 1, self.start.hour, self.start.minute)
+ until = datetime(2014, 4, 1, self.end.hour, self.end.minute)
+ return (until - dtstart).total_seconds()
+ else:
+ dtstart = datetime(2014, 4, 1, self.start.hour, self.start.minute)
+ until = datetime(2014, 4, 2, self.end.hour, self.end.minute)
+ return (until - dtstart).total_seconds()
+
+ @apply_wraps
+ def rollback(self, dt):
+ """
+ Roll provided date backward to the previous offset, only if not on offset.
+ """
+ if not self.onOffset(dt):
+ businesshours = self._get_business_hours_by_sec
+ if self.n >= 0:
+ dt = self._prev_opening_time(
+ dt) + timedelta(seconds=businesshours)
+ else:
+ dt = self._next_opening_time(
+ dt) + timedelta(seconds=businesshours)
+ return dt
+
+ @apply_wraps
+ def rollforward(self, dt):
+ """
+ Roll provided date forward to next offset only if not on offset.
+ """
+ if not self.onOffset(dt):
+ if self.n >= 0:
+ return self._next_opening_time(dt)
+ else:
+ return self._prev_opening_time(dt)
+ return dt
+
+ @apply_wraps
+ def apply(self, other):
+ daytime = self._get_daytime_flag
+ businesshours = self._get_business_hours_by_sec
+ bhdelta = timedelta(seconds=businesshours)
+
+ if isinstance(other, datetime):
+ # used for detecting edge condition
+ nanosecond = getattr(other, 'nanosecond', 0)
+ # reset timezone and nanosecond
+ # other may be a Timestamp, thus not use replace
+ other = datetime(other.year, other.month, other.day,
+ other.hour, other.minute,
+ other.second, other.microsecond)
+ n = self.n
+ if n >= 0:
+ if (other.time() == self.end or
+ not self._onOffset(other, businesshours)):
+ other = self._next_opening_time(other)
+ else:
+ if other.time() == self.start:
+ # adjustment to move to previous business day
+ other = other - timedelta(seconds=1)
+ if not self._onOffset(other, businesshours):
+ other = self._next_opening_time(other)
+ other = other + bhdelta
+
+ bd, r = divmod(abs(n * 60), businesshours // 60)
+ if n < 0:
+ bd, r = -bd, -r
+
+ if bd != 0:
+ skip_bd = BusinessDay(n=bd)
+ # midnight business hour may not be on a BusinessDay
+ if not self.next_bday.onOffset(other):
+ remain = other - self._prev_opening_time(other)
+ other = self._next_opening_time(other + skip_bd) + remain
+ else:
+ other = other + skip_bd
+
+ hours, minutes = divmod(r, 60)
+ result = other + timedelta(hours=hours, minutes=minutes)
+
+ # because of previous adjustment, time will be larger than start
+ if ((daytime and (result.time() < self.start or
+ self.end < result.time())) or
+ not daytime and (self.end < result.time() < self.start)):
+ if n >= 0:
+ bday_edge = self._prev_opening_time(other)
+ bday_edge = bday_edge + bhdelta
+ # calculate remainder
+ bday_remain = result - bday_edge
+ result = self._next_opening_time(other)
+ result += bday_remain
+ else:
+ bday_edge = self._next_opening_time(other)
+ bday_remain = result - bday_edge
+ result = self._next_opening_time(result) + bhdelta
+ result += bday_remain
+ # edge handling
+ if n >= 0:
+ if result.time() == self.end:
+ result = self._next_opening_time(result)
+ else:
+ if result.time() == self.start and nanosecond == 0:
+ # adjustment to move to previous business day
+ result = self._next_opening_time(
+ result - timedelta(seconds=1)) + bhdelta
+
+ return result
+ else:
+ raise ApplyTypeError(
+ 'Only know how to combine business hour with '
+ 'datetime or timedelta.')
+
+ def onOffset(self, dt):
+ if self.normalize and not _is_normalized(dt):
+ return False
+
+ if dt.tzinfo is not None:
+ dt = datetime(dt.year, dt.month, dt.day, dt.hour,
+ dt.minute, dt.second, dt.microsecond)
+ # A valid business hour can fall on a different BusinessDay around
+ # midnight; distinguish by the time elapsed since the previous opening
+ businesshours = self._get_business_hours_by_sec
+ return self._onOffset(dt, businesshours)
+
+ def _onOffset(self, dt, businesshours):
+ """
+ Slight speedups using calculated values.
+ """
+ # if self.normalize and not _is_normalized(dt):
+ # return False
+ # A valid business hour can fall on a different BusinessDay around
+ # midnight; distinguish by the time elapsed since the previous opening
+ if self.n >= 0:
+ op = self._prev_opening_time(dt)
+ else:
+ op = self._next_opening_time(dt)
+ span = (dt - op).total_seconds()
+ if span <= businesshours:
+ return True
+ else:
+ return False
+
+ def _repr_attrs(self):
+ out = super(BusinessHourMixin, self)._repr_attrs()
+ start = self.start.strftime('%H:%M')
+ end = self.end.strftime('%H:%M')
+ attrs = ['{prefix}={start}-{end}'.format(prefix=self._prefix,
+ start=start, end=end)]
+ out += ': ' + ', '.join(attrs)
+ return out
+
+
+class BusinessHour(BusinessHourMixin, SingleConstructorOffset):
+ """
+ DateOffset subclass representing possibly n business hours.
+
+ .. versionadded:: 0.16.1
+ """
+ _prefix = 'BH'
+ _anchor = 0
+ _attributes = frozenset(['n', 'normalize', 'start', 'end', 'offset'])
+
+ def __init__(self, n=1, normalize=False, start='09:00',
+ end='17:00', offset=timedelta(0)):
+ BaseOffset.__init__(self, n, normalize)
+ super(BusinessHour, self).__init__(start=start, end=end, offset=offset)
+
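+# [Editor's note] Illustrative sketch, not part of the upstream source: with
+# the default 09:00-17:00 window, an hour that does not fit before the close
+# spills into the next business day's opening (public Timestamp API assumed).
+#
+#     >>> from pandas import Timestamp
+#     >>> Timestamp('2014-08-01 16:30') + BusinessHour()    # Friday 16:30
+#     # -> Monday 2014-08-04 09:30 (the remaining 30 minutes carry over)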
+
+class CustomBusinessDay(_CustomMixin, BusinessDay):
+ """
+ DateOffset subclass representing possibly n custom business days,
+ excluding holidays.
+
+ Parameters
+ ----------
+ n : int, default 1
+ normalize : bool, default False
+ Normalize start/end dates to midnight before generating date range
+ weekmask : str, Default 'Mon Tue Wed Thu Fri'
+ weekmask of valid business days, passed to ``numpy.busdaycalendar``
+ holidays : list
+ list/array of dates to exclude from the set of valid business days,
+ passed to ``numpy.busdaycalendar``
+ calendar : pd.HolidayCalendar or np.busdaycalendar
+ offset : timedelta, default timedelta(0)
+ """
+ _prefix = 'C'
+ _attributes = frozenset(['n', 'normalize',
+ 'weekmask', 'holidays', 'calendar', 'offset'])
+
+ def __init__(self, n=1, normalize=False, weekmask='Mon Tue Wed Thu Fri',
+ holidays=None, calendar=None, offset=timedelta(0)):
+ BaseOffset.__init__(self, n, normalize)
+ object.__setattr__(self, "_offset", offset)
+
+ _CustomMixin.__init__(self, weekmask, holidays, calendar)
+
+ @apply_wraps
+ def apply(self, other):
+ if self.n <= 0:
+ roll = 'forward'
+ else:
+ roll = 'backward'
+
+ if isinstance(other, datetime):
+ date_in = other
+ np_dt = np.datetime64(date_in.date())
+
+ np_incr_dt = np.busday_offset(np_dt, self.n, roll=roll,
+ busdaycal=self.calendar)
+
+ dt_date = np_incr_dt.astype(datetime)
+ result = datetime.combine(dt_date, date_in.time())
+
+ if self.offset:
+ result = result + self.offset
+ return result
+
+ elif isinstance(other, (timedelta, Tick)):
+ return BDay(self.n, offset=self.offset + other,
+ normalize=self.normalize)
+ else:
+ raise ApplyTypeError('Only know how to combine trading day with '
+ 'datetime, datetime64 or timedelta.')
+
+ def apply_index(self, i):
+ raise NotImplementedError
+
+ def onOffset(self, dt):
+ if self.normalize and not _is_normalized(dt):
+ return False
+ day64 = _to_dt64(dt, 'datetime64[D]')
+ return np.is_busday(day64, busdaycal=self.calendar)
+
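+# [Editor's note] Illustrative sketch, not part of the upstream source: a
+# CustomBusinessDay skips both weekends and user-supplied holidays via
+# numpy.busday_offset (the holiday date below is arbitrary).
+#
+#     >>> from pandas import Timestamp
+#     >>> Timestamp('2014-08-01') + CustomBusinessDay(holidays=['2014-08-04'])
+#     # -> Tuesday 2014-08-05 (the Monday is excluded as a holiday)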
+
+class CustomBusinessHour(_CustomMixin, BusinessHourMixin,
+ SingleConstructorOffset):
+ """
+ DateOffset subclass representing possibly n custom business hours.
+
+ .. versionadded:: 0.18.1
+ """
+ _prefix = 'CBH'
+ _anchor = 0
+ _attributes = frozenset(['n', 'normalize',
+ 'weekmask', 'holidays', 'calendar',
+ 'start', 'end', 'offset'])
+
+ def __init__(self, n=1, normalize=False, weekmask='Mon Tue Wed Thu Fri',
+ holidays=None, calendar=None,
+ start='09:00', end='17:00', offset=timedelta(0)):
+ BaseOffset.__init__(self, n, normalize)
+ object.__setattr__(self, "_offset", offset)
+
+ _CustomMixin.__init__(self, weekmask, holidays, calendar)
+ BusinessHourMixin.__init__(self, start=start, end=end, offset=offset)
+
+
+# ---------------------------------------------------------------------
+# Month-Based Offset Classes
+
+
+class MonthOffset(SingleConstructorOffset):
+ _adjust_dst = True
+ _attributes = frozenset(['n', 'normalize'])
+
+ __init__ = BaseOffset.__init__
+
+ @property
+ def name(self):
+ if self.isAnchored:
+ return self.rule_code
+ else:
+ month = ccalendar.MONTH_ALIASES[self.n]
+ return "{code}-{month}".format(code=self.rule_code,
+ month=month)
+
+ def onOffset(self, dt):
+ if self.normalize and not _is_normalized(dt):
+ return False
+ return dt.day == self._get_offset_day(dt)
+
+ @apply_wraps
+ def apply(self, other):
+ compare_day = self._get_offset_day(other)
+ n = liboffsets.roll_convention(other.day, self.n, compare_day)
+ return shift_month(other, n, self._day_opt)
+
+ @apply_index_wraps
+ def apply_index(self, i):
+ shifted = liboffsets.shift_months(i.asi8, self.n, self._day_opt)
+ # TODO: going through __new__ raises on call to _validate_frequency;
+ # are we passing incorrect freq?
+ return type(i)._simple_new(shifted, freq=i.freq, dtype=i.dtype)
+
+
+class MonthEnd(MonthOffset):
+ """
+ DateOffset of one month end.
+ """
+ _prefix = 'M'
+ _day_opt = 'end'
+
+
+class MonthBegin(MonthOffset):
+ """
+ DateOffset of one month at beginning.
+ """
+ _prefix = 'MS'
+ _day_opt = 'start'
+
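+# [Editor's note] Illustrative sketch, not part of the upstream source: the
+# shared MonthOffset.apply uses roll_convention, so a date before the anchor
+# day rolls to that day within the same month (dates are arbitrary).
+#
+#     >>> from pandas import Timestamp
+#     >>> Timestamp('2014-08-15') + MonthEnd()    # -> 2014-08-31
+#     >>> Timestamp('2014-08-31') + MonthEnd()    # -> 2014-09-30
+#     >>> Timestamp('2014-08-15') + MonthBegin()  # -> 2014-09-01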
+
+class BusinessMonthEnd(MonthOffset):
+ """
+ DateOffset increments between business EOM dates.
+ """
+ _prefix = 'BM'
+ _day_opt = 'business_end'
+
+
+class BusinessMonthBegin(MonthOffset):
+ """
+ DateOffset of one business month at beginning.
+ """
+ _prefix = 'BMS'
+ _day_opt = 'business_start'
+
+
+class _CustomBusinessMonth(_CustomMixin, BusinessMixin, MonthOffset):
+ """
+ DateOffset subclass representing one custom business month, incrementing
+ between [BEGIN/END] of month dates.
+
+ Parameters
+ ----------
+ n : int, default 1
+ normalize : bool, default False
+ Normalize start/end dates to midnight before generating date range
+ weekmask : str, Default 'Mon Tue Wed Thu Fri'
+ weekmask of valid business days, passed to ``numpy.busdaycalendar``
+ holidays : list
+ list/array of dates to exclude from the set of valid business days,
+ passed to ``numpy.busdaycalendar``
+ calendar : pd.HolidayCalendar or np.busdaycalendar
+ offset : timedelta, default timedelta(0)
+ """
+ _attributes = frozenset(['n', 'normalize',
+ 'weekmask', 'holidays', 'calendar', 'offset'])
+
+ onOffset = DateOffset.onOffset # override MonthOffset method
+ apply_index = DateOffset.apply_index # override MonthOffset method
+
+ def __init__(self, n=1, normalize=False, weekmask='Mon Tue Wed Thu Fri',
+ holidays=None, calendar=None, offset=timedelta(0)):
+ BaseOffset.__init__(self, n, normalize)
+ object.__setattr__(self, "_offset", offset)
+
+ _CustomMixin.__init__(self, weekmask, holidays, calendar)
+
+ @cache_readonly
+ def cbday_roll(self):
+ """
+ Define default roll function to be called in apply method.
+ """
+ cbday = CustomBusinessDay(n=self.n, normalize=False, **self.kwds)
+
+ if self._prefix.endswith('S'):
+ # MonthBegin
+ roll_func = cbday.rollforward
+ else:
+ # MonthEnd
+ roll_func = cbday.rollback
+ return roll_func
+
+ @cache_readonly
+ def m_offset(self):
+ if self._prefix.endswith('S'):
+ # MonthBegin
+ moff = MonthBegin(n=1, normalize=False)
+ else:
+ # MonthEnd
+ moff = MonthEnd(n=1, normalize=False)
+ return moff
+
+ @cache_readonly
+ def month_roll(self):
+ """
+ Define default roll function to be called in apply method.
+ """
+ if self._prefix.endswith('S'):
+ # MonthBegin
+ roll_func = self.m_offset.rollback
+ else:
+ # MonthEnd
+ roll_func = self.m_offset.rollforward
+ return roll_func
+
+ @apply_wraps
+ def apply(self, other):
+ # First move to month offset
+ cur_month_offset_date = self.month_roll(other)
+
+ # Find this custom month offset
+ compare_date = self.cbday_roll(cur_month_offset_date)
+ n = liboffsets.roll_convention(other.day, self.n, compare_date.day)
+
+ new = cur_month_offset_date + n * self.m_offset
+ result = self.cbday_roll(new)
+ return result
+
+
+class CustomBusinessMonthEnd(_CustomBusinessMonth):
+ # TODO(py27): Replace condition with Substitution after dropping Py27
+ if _CustomBusinessMonth.__doc__:
+ __doc__ = _CustomBusinessMonth.__doc__.replace('[BEGIN/END]', 'end')
+ _prefix = 'CBM'
+
+
+class CustomBusinessMonthBegin(_CustomBusinessMonth):
+ # TODO(py27): Replace condition with Substitution after dropping Py27
+ if _CustomBusinessMonth.__doc__:
+ __doc__ = _CustomBusinessMonth.__doc__.replace('[BEGIN/END]',
+ 'beginning')
+ _prefix = 'CBMS'
+
+
+# ---------------------------------------------------------------------
+# Semi-Month Based Offset Classes
+
+class SemiMonthOffset(DateOffset):
+ _adjust_dst = True
+ _default_day_of_month = 15
+ _min_day_of_month = 2
+ _attributes = frozenset(['n', 'normalize', 'day_of_month'])
+
+ def __init__(self, n=1, normalize=False, day_of_month=None):
+ BaseOffset.__init__(self, n, normalize)
+
+ if day_of_month is None:
+ object.__setattr__(self, "day_of_month",
+ self._default_day_of_month)
+ else:
+ object.__setattr__(self, "day_of_month", int(day_of_month))
+ if not self._min_day_of_month <= self.day_of_month <= 27:
+ msg = 'day_of_month must be {min}<=day_of_month<=27, got {day}'
+ raise ValueError(msg.format(min=self._min_day_of_month,
+ day=self.day_of_month))
+
+ @classmethod
+ def _from_name(cls, suffix=None):
+ return cls(day_of_month=suffix)
+
+ @property
+ def rule_code(self):
+ suffix = '-{day_of_month}'.format(day_of_month=self.day_of_month)
+ return self._prefix + suffix
+
+ @apply_wraps
+ def apply(self, other):
+ # shift `other` to self.day_of_month, incrementing `n` if necessary
+ n = liboffsets.roll_convention(other.day, self.n, self.day_of_month)
+
+ days_in_month = ccalendar.get_days_in_month(other.year, other.month)
+
+ # For SemiMonthBegin on other.day == 1 and
+ # SemiMonthEnd on other.day == days_in_month,
+ # shifting `other` to `self.day_of_month` _always_ requires
+ # incrementing/decrementing `n`, regardless of whether it is
+ # initially positive.
+ if type(self) is SemiMonthBegin and (self.n <= 0 and other.day == 1):
+ n -= 1
+ elif type(self) is SemiMonthEnd and (self.n > 0 and
+ other.day == days_in_month):
+ n += 1
+
+ return self._apply(n, other)
+
+ def _apply(self, n, other):
+ """
+ Handle specific apply logic for child classes.
+ """
+ raise AbstractMethodError(self)
+
+ @apply_index_wraps
+ def apply_index(self, i):
+ # determine how many days away from the 1st of the month we are
+ dti = i
+ days_from_start = i.to_perioddelta('M').asi8
+ delta = Timedelta(days=self.day_of_month - 1).value
+
+ # get boolean array for each element before the day_of_month
+ before_day_of_month = days_from_start < delta
+
+ # get boolean array for each element after the day_of_month
+ after_day_of_month = days_from_start > delta
+
+ # determine the correct n for each date in i
+ roll = self._get_roll(i, before_day_of_month, after_day_of_month)
+
+ # isolate the time since it will be stripped away on the next line
+ time = i.to_perioddelta('D')
+
+ # apply the correct number of months
+
+ # integer-array addition on PeriodIndex is deprecated,
+ # so we use _addsub_int_array directly
+ asper = i.to_period('M')
+ if not isinstance(asper._data, np.ndarray):
+ # unwrap PeriodIndex --> PeriodArray
+ asper = asper._data
+
+ shifted = asper._addsub_int_array(roll // 2, operator.add)
+ i = type(dti)(shifted.to_timestamp())
+
+ # apply the correct day
+ i = self._apply_index_days(i, roll)
+
+ return i + time
+
+ def _get_roll(self, i, before_day_of_month, after_day_of_month):
+ """
+ Return an array with the correct n for each date in i.
+
+ The roll array is based on the fact that i gets rolled back to
+ the first day of the month.
+ """
+ raise AbstractMethodError(self)
+
+ def _apply_index_days(self, i, roll):
+ """
+ Apply the correct day for each date in i.
+ """
+ raise AbstractMethodError(self)
+
+
+class SemiMonthEnd(SemiMonthOffset):
+ """
+ Two DateOffsets per month, repeating on the last
+ day of the month and day_of_month.
+
+ .. versionadded:: 0.19.0
+
+ Parameters
+ ----------
+ n : int
+ normalize : bool, default False
+ day_of_month : int, {1, 2, ..., 27}, default 15
+ """
+ _prefix = 'SM'
+ _min_day_of_month = 1
+
+ def onOffset(self, dt):
+ if self.normalize and not _is_normalized(dt):
+ return False
+ days_in_month = ccalendar.get_days_in_month(dt.year, dt.month)
+ return dt.day in (self.day_of_month, days_in_month)
+
+ def _apply(self, n, other):
+ months = n // 2
+ day = 31 if n % 2 else self.day_of_month
+ return shift_month(other, months, day)
+
+ def _get_roll(self, i, before_day_of_month, after_day_of_month):
+ n = self.n
+ is_month_end = i.is_month_end
+ if n > 0:
+ roll_end = np.where(is_month_end, 1, 0)
+ roll_before = np.where(before_day_of_month, n, n + 1)
+ roll = roll_end + roll_before
+ elif n == 0:
+ roll_after = np.where(after_day_of_month, 2, 0)
+ roll_before = np.where(~after_day_of_month, 1, 0)
+ roll = roll_before + roll_after
+ else:
+ roll = np.where(after_day_of_month, n + 2, n + 1)
+ return roll
+
+ def _apply_index_days(self, i, roll):
+ """
+ Add days portion of offset to DatetimeIndex i.
+
+ Parameters
+ ----------
+ i : DatetimeIndex
+ roll : ndarray[int64_t]
+
+ Returns
+ -------
+ result : DatetimeIndex
+ """
+ nanos = (roll % 2) * Timedelta(days=self.day_of_month).value
+ i += nanos.astype('timedelta64[ns]')
+ return i + Timedelta(days=-1)
+
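+# [Editor's note] Illustrative sketch, not part of the upstream source: with
+# the default day_of_month=15, SemiMonthEnd alternates between the 15th and
+# the last day of each month (dates are arbitrary).
+#
+#     >>> from pandas import Timestamp
+#     >>> Timestamp('2014-08-01') + SemiMonthEnd()   # -> 2014-08-15
+#     >>> Timestamp('2014-08-20') + SemiMonthEnd()   # -> 2014-08-31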
+
+class SemiMonthBegin(SemiMonthOffset):
+ """
+ Two DateOffsets per month, repeating on the first
+ day of the month and day_of_month.
+
+ .. versionadded:: 0.19.0
+
+ Parameters
+ ----------
+ n : int
+ normalize : bool, default False
+ day_of_month : int, {2, 3,...,27}, default 15
+ """
+ _prefix = 'SMS'
+
+ def onOffset(self, dt):
+ if self.normalize and not _is_normalized(dt):
+ return False
+ return dt.day in (1, self.day_of_month)
+
+ def _apply(self, n, other):
+ months = n // 2 + n % 2
+ day = 1 if n % 2 else self.day_of_month
+ return shift_month(other, months, day)
+
+ def _get_roll(self, i, before_day_of_month, after_day_of_month):
+ n = self.n
+ is_month_start = i.is_month_start
+ if n > 0:
+ roll = np.where(before_day_of_month, n, n + 1)
+ elif n == 0:
+ roll_start = np.where(is_month_start, 0, 1)
+ roll_after = np.where(after_day_of_month, 1, 0)
+ roll = roll_start + roll_after
+ else:
+ roll_after = np.where(after_day_of_month, n + 2, n + 1)
+ roll_start = np.where(is_month_start, -1, 0)
+ roll = roll_after + roll_start
+ return roll
+
+ def _apply_index_days(self, i, roll):
+ """
+ Add days portion of offset to DatetimeIndex i.
+
+ Parameters
+ ----------
+ i : DatetimeIndex
+ roll : ndarray[int64_t]
+
+ Returns
+ -------
+ result : DatetimeIndex
+ """
+ nanos = (roll % 2) * Timedelta(days=self.day_of_month - 1).value
+ return i + nanos.astype('timedelta64[ns]')
+
+
+# ---------------------------------------------------------------------
+# Week-Based Offset Classes
+
+class Week(DateOffset):
+ """
+ Weekly offset.
+
+ Parameters
+ ----------
+ weekday : int, default None
+ Always generate specific day of week. 0 for Monday
+ """
+ _adjust_dst = True
+ _inc = timedelta(weeks=1)
+ _prefix = 'W'
+ _attributes = frozenset(['n', 'normalize', 'weekday'])
+
+ def __init__(self, n=1, normalize=False, weekday=None):
+ BaseOffset.__init__(self, n, normalize)
+ object.__setattr__(self, "weekday", weekday)
+
+ if self.weekday is not None:
+ if self.weekday < 0 or self.weekday > 6:
+ raise ValueError('Day must be 0<=day<=6, got {day}'
+ .format(day=self.weekday))
+
+ def isAnchored(self):
+ return (self.n == 1 and self.weekday is not None)
+
+ @apply_wraps
+ def apply(self, other):
+ if self.weekday is None:
+ return other + self.n * self._inc
+
+ k = self.n
+ otherDay = other.weekday()
+ if otherDay != self.weekday:
+ other = other + timedelta((self.weekday - otherDay) % 7)
+ if k > 0:
+ k -= 1
+
+ return other + timedelta(weeks=k)
+
+ @apply_index_wraps
+ def apply_index(self, i):
+ if self.weekday is None:
+ # integer addition on PeriodIndex is deprecated,
+ # so we use _time_shift directly
+ asper = i.to_period('W')
+ if not isinstance(asper._data, np.ndarray):
+ # unwrap PeriodIndex --> PeriodArray
+ asper = asper._data
+
+ shifted = asper._time_shift(self.n)
+ return shifted.to_timestamp() + i.to_perioddelta('W')
+ else:
+ return self._end_apply_index(i)
+
+ def _end_apply_index(self, dtindex):
+ """
+ Add self to the given DatetimeIndex, specialized for case where
+ self.weekday is non-null.
+
+ Parameters
+ ----------
+ dtindex : DatetimeIndex
+
+ Returns
+ -------
+ result : DatetimeIndex
+ """
+ off = dtindex.to_perioddelta('D')
+
+ base, mult = libfrequencies.get_freq_code(self.freqstr)
+ base_period = dtindex.to_period(base)
+ if not isinstance(base_period._data, np.ndarray):
+ # unwrap PeriodIndex --> PeriodArray
+ base_period = base_period._data
+
+ if self.n > 0:
+ # when adding, dates on end roll to next
+ normed = dtindex - off + Timedelta(1, 'D') - Timedelta(1, 'ns')
+ roll = np.where(base_period.to_timestamp(how='end') == normed,
+ self.n, self.n - 1)
+ # integer-array addition on PeriodIndex is deprecated,
+ # so we use _addsub_int_array directly
+ shifted = base_period._addsub_int_array(roll, operator.add)
+ base = shifted.to_timestamp(how='end')
+ else:
+ # integer addition on PeriodIndex is deprecated,
+ # so we use _time_shift directly
+ roll = self.n
+ base = base_period._time_shift(roll).to_timestamp(how='end')
+
+ return base + off + Timedelta(1, 'ns') - Timedelta(1, 'D')
+
+ def onOffset(self, dt):
+ if self.normalize and not _is_normalized(dt):
+ return False
+ elif self.weekday is None:
+ return True
+ return dt.weekday() == self.weekday
+
+ @property
+ def rule_code(self):
+ suffix = ''
+ if self.weekday is not None:
+ weekday = ccalendar.int_to_weekday[self.weekday]
+ suffix = '-{weekday}'.format(weekday=weekday)
+ return self._prefix + suffix
+
+ @classmethod
+ def _from_name(cls, suffix=None):
+ if not suffix:
+ weekday = None
+ else:
+ weekday = ccalendar.weekday_to_int[suffix]
+ return cls(weekday=weekday)
+
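+# [Editor's note] Illustrative sketch, not part of the upstream source: with
+# weekday=None the offset is a plain 7-day step; with an anchor weekday it
+# first rolls forward to that weekday (dates are arbitrary).
+#
+#     >>> from pandas import Timestamp
+#     >>> Timestamp('2014-08-01') + Week()            # -> 2014-08-08
+#     >>> Timestamp('2014-08-01') + Week(weekday=0)   # Friday -> Monday 2014-08-04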
+
+class _WeekOfMonthMixin(object):
+ """
+ Mixin for methods common to WeekOfMonth and LastWeekOfMonth.
+ """
+ @apply_wraps
+ def apply(self, other):
+ compare_day = self._get_offset_day(other)
+
+ months = self.n
+ if months > 0 and compare_day > other.day:
+ months -= 1
+ elif months <= 0 and compare_day < other.day:
+ months += 1
+
+ shifted = shift_month(other, months, 'start')
+ to_day = self._get_offset_day(shifted)
+ return liboffsets.shift_day(shifted, to_day - shifted.day)
+
+ def onOffset(self, dt):
+ if self.normalize and not _is_normalized(dt):
+ return False
+ return dt.day == self._get_offset_day(dt)
+
+
+class WeekOfMonth(_WeekOfMonthMixin, DateOffset):
+ """
+ Describes monthly dates like "the Tuesday of the 2nd week of each month".
+
+ Parameters
+ ----------
+ n : int
+ week : int {0, 1, 2, 3}, default 0
+ 0 is the 1st week of the month, 1 the 2nd week, etc.
+ weekday : {0, 1, ..., 6}, default 0
+ 0: Mondays
+ 1: Tuesdays
+ 2: Wednesdays
+ 3: Thursdays
+ 4: Fridays
+ 5: Saturdays
+ 6: Sundays
+ """
+ _prefix = 'WOM'
+ _adjust_dst = True
+ _attributes = frozenset(['n', 'normalize', 'week', 'weekday'])
+
+ def __init__(self, n=1, normalize=False, week=0, weekday=0):
+ BaseOffset.__init__(self, n, normalize)
+ object.__setattr__(self, "weekday", weekday)
+ object.__setattr__(self, "week", week)
+
+ if self.weekday < 0 or self.weekday > 6:
+ raise ValueError('Day must be 0<=day<=6, got {day}'
+ .format(day=self.weekday))
+ if self.week < 0 or self.week > 3:
+ raise ValueError('Week must be 0<=week<=3, got {week}'
+ .format(week=self.week))
+
+ def _get_offset_day(self, other):
+ """
+ Find the day in the same month as other that has the same
+ weekday as self.weekday and is the self.week'th such day in the month.
+
+ Parameters
+ ----------
+ other : datetime
+
+ Returns
+ -------
+ day : int
+ """
+ mstart = datetime(other.year, other.month, 1)
+ wday = mstart.weekday()
+ shift_days = (self.weekday - wday) % 7
+ return 1 + shift_days + self.week * 7
+
+ @property
+ def rule_code(self):
+ weekday = ccalendar.int_to_weekday.get(self.weekday, '')
+ return '{prefix}-{week}{weekday}'.format(prefix=self._prefix,
+ week=self.week + 1,
+ weekday=weekday)
+
+ @classmethod
+ def _from_name(cls, suffix=None):
+ if not suffix:
+ raise ValueError("Prefix {prefix!r} requires a suffix."
+ .format(prefix=cls._prefix))
+ # TODO: handle n here...
+ # only one digit weeks (1 --> week 0, 2 --> week 1, etc.)
+ week = int(suffix[0]) - 1
+ weekday = ccalendar.weekday_to_int[suffix[1:]]
+ return cls(week=week, weekday=weekday)
+
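+# [Editor's note] Illustrative sketch, not part of the upstream source:
+# week=1, weekday=2 anchors on the Wednesday of the second week of the month.
+#
+#     >>> from pandas import Timestamp
+#     >>> Timestamp('2014-08-01') + WeekOfMonth(week=1, weekday=2)
+#     # -> 2014-08-13 (the second Wednesday of August 2014)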
+
+class LastWeekOfMonth(_WeekOfMonthMixin, DateOffset):
+ """
+ Describes monthly dates in last week of month like "the last Tuesday of
+ each month".
+
+ Parameters
+ ----------
+ n : int, default 1
+ weekday : {0, 1, ..., 6}, default 0
+ 0: Mondays
+ 1: Tuesdays
+ 2: Wednesdays
+ 3: Thursdays
+ 4: Fridays
+ 5: Saturdays
+ 6: Sundays
+ """
+ _prefix = 'LWOM'
+ _adjust_dst = True
+ _attributes = frozenset(['n', 'normalize', 'weekday'])
+
+ def __init__(self, n=1, normalize=False, weekday=0):
+ BaseOffset.__init__(self, n, normalize)
+ object.__setattr__(self, "weekday", weekday)
+
+ if self.n == 0:
+ raise ValueError('N cannot be 0')
+
+ if self.weekday < 0 or self.weekday > 6:
+ raise ValueError('Day must be 0<=day<=6, got {day}'
+ .format(day=self.weekday))
+
+ def _get_offset_day(self, other):
+ """
+ Find the day in the same month as other that has the same
+ weekday as self.weekday and is the last such day in the month.
+
+ Parameters
+ ----------
+ other: datetime
+
+ Returns
+ -------
+ day: int
+ """
+ dim = ccalendar.get_days_in_month(other.year, other.month)
+ mend = datetime(other.year, other.month, dim)
+ wday = mend.weekday()
+ shift_days = (wday - self.weekday) % 7
+ return dim - shift_days
+
+ @property
+ def rule_code(self):
+ weekday = ccalendar.int_to_weekday.get(self.weekday, '')
+ return '{prefix}-{weekday}'.format(prefix=self._prefix,
+ weekday=weekday)
+
+ @classmethod
+ def _from_name(cls, suffix=None):
+ if not suffix:
+ raise ValueError("Prefix {prefix!r} requires a suffix."
+ .format(prefix=cls._prefix))
+ # TODO: handle n here...
+ weekday = ccalendar.weekday_to_int[suffix]
+ return cls(weekday=weekday)
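+
+# [Editor's note] Illustrative sketch, not part of the upstream source:
+# weekday=4 anchors on the last Friday of each month (date is arbitrary).
+#
+#     >>> from pandas import Timestamp
+#     >>> Timestamp('2014-08-01') + LastWeekOfMonth(weekday=4)
+#     # -> 2014-08-29 (the last Friday of August 2014)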
+
+# ---------------------------------------------------------------------
+# Quarter-Based Offset Classes
+
+
+class QuarterOffset(DateOffset):
+ """
+ Quarter representation - doesn't call super.
+ """
+ _default_startingMonth = None
+ _from_name_startingMonth = None
+ _adjust_dst = True
+ _attributes = frozenset(['n', 'normalize', 'startingMonth'])
+ # TODO: Consider combining QuarterOffset and YearOffset __init__ at some
+ # point. Also apply_index, onOffset, rule_code if
+ # startingMonth vs month attr names are resolved
+
+ def __init__(self, n=1, normalize=False, startingMonth=None):
+ BaseOffset.__init__(self, n, normalize)
+
+ if startingMonth is None:
+ startingMonth = self._default_startingMonth
+ object.__setattr__(self, "startingMonth", startingMonth)
+
+ def isAnchored(self):
+ return (self.n == 1 and self.startingMonth is not None)
+
+ @classmethod
+ def _from_name(cls, suffix=None):
+ kwargs = {}
+ if suffix:
+ kwargs['startingMonth'] = ccalendar.MONTH_TO_CAL_NUM[suffix]
+ else:
+ if cls._from_name_startingMonth is not None:
+ kwargs['startingMonth'] = cls._from_name_startingMonth
+ return cls(**kwargs)
+
+ @property
+ def rule_code(self):
+ month = ccalendar.MONTH_ALIASES[self.startingMonth]
+ return '{prefix}-{month}'.format(prefix=self._prefix, month=month)
+
+ @apply_wraps
+ def apply(self, other):
+ # months_since: find the calendar quarter containing other.month,
+ # e.g. if other.month == 8, the calendar quarter is [Jul, Aug, Sep].
+ # Then find the month in that quarter containing an onOffset date for
+ # self. `months_since` is the number of months to shift other.month
+ # to get to this on-offset month.
+ months_since = other.month % 3 - self.startingMonth % 3
+ qtrs = liboffsets.roll_qtrday(other, self.n, self.startingMonth,
+ day_opt=self._day_opt, modby=3)
+ months = qtrs * 3 - months_since
+ return shift_month(other, months, self._day_opt)
+
+ def onOffset(self, dt):
+ if self.normalize and not _is_normalized(dt):
+ return False
+ mod_month = (dt.month - self.startingMonth) % 3
+ return mod_month == 0 and dt.day == self._get_offset_day(dt)
+
+ @apply_index_wraps
+ def apply_index(self, dtindex):
+ shifted = liboffsets.shift_quarters(dtindex.asi8, self.n,
+ self.startingMonth, self._day_opt)
+ # TODO: going through __new__ raises on call to _validate_frequency;
+ # are we passing incorrect freq?
+ return type(dtindex)._simple_new(shifted, freq=dtindex.freq,
+ dtype=dtindex.dtype)
+
+
+class BQuarterEnd(QuarterOffset):
+ """
+ DateOffset increments between business Quarter dates.
+
+ startingMonth = 1 corresponds to dates like 1/31/2007, 4/30/2007, ...
+ startingMonth = 2 corresponds to dates like 2/28/2007, 5/31/2007, ...
+ startingMonth = 3 corresponds to dates like 3/30/2007, 6/29/2007, ...
+ """
+ _outputName = 'BusinessQuarterEnd'
+ _default_startingMonth = 3
+ _from_name_startingMonth = 12
+ _prefix = 'BQ'
+ _day_opt = 'business_end'
+
+
+# TODO: This is basically the same as BQuarterEnd
+class BQuarterBegin(QuarterOffset):
+ _outputName = "BusinessQuarterBegin"
+ # I suspect this is wrong for *all* of them.
+ _default_startingMonth = 3
+ _from_name_startingMonth = 1
+ _prefix = 'BQS'
+ _day_opt = 'business_start'
+
+
+class QuarterEnd(QuarterOffset):
+ """
+ DateOffset increments between calendar quarter end dates.
+
+ startingMonth = 1 corresponds to dates like 1/31/2007, 4/30/2007, ...
+ startingMonth = 2 corresponds to dates like 2/28/2007, 5/31/2007, ...
+ startingMonth = 3 corresponds to dates like 3/31/2007, 6/30/2007, ...
+ """
+ _outputName = 'QuarterEnd'
+ _default_startingMonth = 3
+ _prefix = 'Q'
+ _day_opt = 'end'
+
+
+class QuarterBegin(QuarterOffset):
+ _outputName = 'QuarterBegin'
+ _default_startingMonth = 3
+ _from_name_startingMonth = 1
+ _prefix = 'QS'
+ _day_opt = 'start'
+
+
+# ---------------------------------------------------------------------
+# Year-Based Offset Classes
+
+class YearOffset(DateOffset):
+ """
+ DateOffset that just needs a month.
+ """
+ _adjust_dst = True
+ _attributes = frozenset(['n', 'normalize', 'month'])
+
+ def _get_offset_day(self, other):
+ # override BaseOffset method to use self.month instead of other.month
+ # TODO: there may be a more performant way to do this
+ return liboffsets.get_day_of_month(other.replace(month=self.month),
+ self._day_opt)
+
+ @apply_wraps
+ def apply(self, other):
+ years = roll_yearday(other, self.n, self.month, self._day_opt)
+ months = years * 12 + (self.month - other.month)
+ return shift_month(other, months, self._day_opt)
+
+ @apply_index_wraps
+ def apply_index(self, dtindex):
+ shifted = liboffsets.shift_quarters(dtindex.asi8, self.n,
+ self.month, self._day_opt,
+ modby=12)
+ # TODO: going through __new__ raises on call to _validate_frequency;
+ # are we passing incorrect freq?
+ return type(dtindex)._simple_new(shifted, freq=dtindex.freq,
+ dtype=dtindex.dtype)
+
+ def onOffset(self, dt):
+ if self.normalize and not _is_normalized(dt):
+ return False
+ return dt.month == self.month and dt.day == self._get_offset_day(dt)
+
+ def __init__(self, n=1, normalize=False, month=None):
+ BaseOffset.__init__(self, n, normalize)
+
+ month = month if month is not None else self._default_month
+ object.__setattr__(self, "month", month)
+
+ if self.month < 1 or self.month > 12:
+ raise ValueError('Month must go from 1 to 12')
+
+ @classmethod
+ def _from_name(cls, suffix=None):
+ kwargs = {}
+ if suffix:
+ kwargs['month'] = ccalendar.MONTH_TO_CAL_NUM[suffix]
+ return cls(**kwargs)
+
+ @property
+ def rule_code(self):
+ month = ccalendar.MONTH_ALIASES[self.month]
+ return '{prefix}-{month}'.format(prefix=self._prefix, month=month)
+
+
+class BYearEnd(YearOffset):
+ """
+ DateOffset increments between business EOM dates.
+ """
+ _outputName = 'BusinessYearEnd'
+ _default_month = 12
+ _prefix = 'BA'
+ _day_opt = 'business_end'
+
+
+class BYearBegin(YearOffset):
+ """
+ DateOffset increments between business year begin dates.
+ """
+ _outputName = 'BusinessYearBegin'
+ _default_month = 1
+ _prefix = 'BAS'
+ _day_opt = 'business_start'
+
+
+class YearEnd(YearOffset):
+ """
+ DateOffset increments between calendar year ends.
+ """
+ _default_month = 12
+ _prefix = 'A'
+ _day_opt = 'end'
+
+
+class YearBegin(YearOffset):
+ """
+ DateOffset increments between calendar year begin dates.
+ """
+ _default_month = 1
+ _prefix = 'AS'
+ _day_opt = 'start'
+
+
+# ---------------------------------------------------------------------
+# Special Offset Classes
+
+class FY5253(DateOffset):
+ """
+ Describes 52-53 week fiscal year. This is also known as a 4-4-5 calendar.
+
+ It is used by companies that desire that their
+ fiscal year always end on the same day of the week.
+
+ It is a method of managing accounting periods.
+ It is a common calendar structure for some industries,
+ such as retail, manufacturing, and parking.
+
+ For more information see:
+ http://en.wikipedia.org/wiki/4-4-5_calendar
+
+ The year may either:
+ - end on the last X day of the Y month.
+ - end on the last X day closest to the last day of the Y month.
+
+ X is a specific day of the week.
+ Y is a certain month of the year
+
+ Parameters
+ ----------
+ n : int
+ weekday : {0, 1, ..., 6}
+ 0: Mondays
+ 1: Tuesdays
+ 2: Wednesdays
+ 3: Thursdays
+ 4: Fridays
+ 5: Saturdays
+ 6: Sundays
+ startingMonth : The month in which fiscal years end. {1, 2, ... 12}
+ variation : str
+ {"nearest", "last"} for "LastOfMonth" or "NearestEndMonth"
+ """
+ _prefix = 'RE'
+ _adjust_dst = True
+ _attributes = frozenset(['weekday', 'startingMonth', 'variation'])
+
+ def __init__(self, n=1, normalize=False, weekday=0, startingMonth=1,
+ variation="nearest"):
+ BaseOffset.__init__(self, n, normalize)
+ object.__setattr__(self, "startingMonth", startingMonth)
+ object.__setattr__(self, "weekday", weekday)
+
+ object.__setattr__(self, "variation", variation)
+
+ if self.n == 0:
+ raise ValueError('N cannot be 0')
+
+ if self.variation not in ["nearest", "last"]:
+ raise ValueError('{variation} is not a valid variation'
+ .format(variation=self.variation))
+
+ def isAnchored(self):
+ return (self.n == 1 and
+ self.startingMonth is not None and
+ self.weekday is not None)
+
+ def onOffset(self, dt):
+ if self.normalize and not _is_normalized(dt):
+ return False
+ dt = datetime(dt.year, dt.month, dt.day)
+ year_end = self.get_year_end(dt)
+
+ if self.variation == "nearest":
+ # We have to check the year end of "this" cal year AND the previous
+ return (year_end == dt or
+ self.get_year_end(shift_month(dt, -1, None)) == dt)
+ else:
+ return year_end == dt
+
+ @apply_wraps
+ def apply(self, other):
+ norm = Timestamp(other).normalize()
+
+ n = self.n
+ prev_year = self.get_year_end(
+ datetime(other.year - 1, self.startingMonth, 1))
+ cur_year = self.get_year_end(
+ datetime(other.year, self.startingMonth, 1))
+ next_year = self.get_year_end(
+ datetime(other.year + 1, self.startingMonth, 1))
+
+ prev_year = conversion.localize_pydatetime(prev_year, other.tzinfo)
+ cur_year = conversion.localize_pydatetime(cur_year, other.tzinfo)
+ next_year = conversion.localize_pydatetime(next_year, other.tzinfo)
+
+ # Note: next_year.year == other.year + 1, so we will always
+ # have other < next_year
+ if norm == prev_year:
+ n -= 1
+ elif norm == cur_year:
+ pass
+ elif n > 0:
+ if norm < prev_year:
+ n -= 2
+ elif prev_year < norm < cur_year:
+ n -= 1
+ elif cur_year < norm < next_year:
+ pass
+ else:
+ if cur_year < norm < next_year:
+ n += 1
+ elif prev_year < norm < cur_year:
+ pass
+ elif (norm.year == prev_year.year and norm < prev_year and
+ prev_year - norm <= timedelta(6)):
+ # GH#14774, error when next_year.year == cur_year.year
+ # e.g. prev_year == datetime(2004, 1, 3),
+ # other == datetime(2004, 1, 1)
+ n -= 1
+ else:
+ assert False
+
+ shifted = datetime(other.year + n, self.startingMonth, 1)
+ result = self.get_year_end(shifted)
+ result = datetime(result.year, result.month, result.day,
+ other.hour, other.minute, other.second,
+ other.microsecond)
+ return result
+
+ def get_year_end(self, dt):
+ assert dt.tzinfo is None
+
+ dim = ccalendar.get_days_in_month(dt.year, self.startingMonth)
+ target_date = datetime(dt.year, self.startingMonth, dim)
+ wkday_diff = self.weekday - target_date.weekday()
+ if wkday_diff == 0:
+ # year_end is the same for "last" and "nearest" cases
+ return target_date
+
+ if self.variation == "last":
+ days_forward = (wkday_diff % 7) - 7
+
+ # days_forward is always negative, so we always end up
+ # in the same year as dt
+ return target_date + timedelta(days=days_forward)
+ else:
+ # variation == "nearest":
+ days_forward = wkday_diff % 7
+ if days_forward <= 3:
+ # The upcoming self.weekday is closer than the previous one
+ return target_date + timedelta(days_forward)
+ else:
+ # The previous self.weekday is closer than the upcoming one
+ return target_date + timedelta(days_forward - 7)
+
+ @property
+ def rule_code(self):
+ prefix = self._prefix
+ suffix = self.get_rule_code_suffix()
+ return "{prefix}-{suffix}".format(prefix=prefix, suffix=suffix)
+
+ def _get_suffix_prefix(self):
+ if self.variation == "nearest":
+ return 'N'
+ else:
+ return 'L'
+
+ def get_rule_code_suffix(self):
+ prefix = self._get_suffix_prefix()
+ month = ccalendar.MONTH_ALIASES[self.startingMonth]
+ weekday = ccalendar.int_to_weekday[self.weekday]
+ return '{prefix}-{month}-{weekday}'.format(prefix=prefix, month=month,
+ weekday=weekday)
+
+ @classmethod
+ def _parse_suffix(cls, variation_code, startingMonth_code, weekday_code):
+ if variation_code == "N":
+ variation = "nearest"
+ elif variation_code == "L":
+ variation = "last"
+ else:
+ raise ValueError("Unable to parse variation_code: "
+ "{code}".format(code=variation_code))
+
+ startingMonth = ccalendar.MONTH_TO_CAL_NUM[startingMonth_code]
+ weekday = ccalendar.weekday_to_int[weekday_code]
+
+ return {"weekday": weekday,
+ "startingMonth": startingMonth,
+ "variation": variation}
+
+ @classmethod
+ def _from_name(cls, *args):
+ return cls(**cls._parse_suffix(*args))
+
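+# [Editor's note] Illustrative sketch, not part of the upstream source: a
+# fiscal year ending on the Saturday nearest to the end of January (the
+# parameters below are arbitrary).
+#
+#     >>> from pandas import Timestamp
+#     >>> Timestamp('2014-01-01') + FY5253(weekday=5, startingMonth=1,
+#     ...                                  variation='nearest')
+#     # -> 2014-02-01, the Saturday nearest to 2014-01-31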
+
+class FY5253Quarter(DateOffset):
+ """
+ DateOffset increments between business quarter dates
+ for 52-53 week fiscal year (also known as a 4-4-5 calendar).
+
+ It is used by companies that desire that their
+ fiscal year always end on the same day of the week.
+
+ It is a method of managing accounting periods.
+ It is a common calendar structure for some industries,
+ such as retail, manufacturing, and parking.
+
+ For more information see:
+ http://en.wikipedia.org/wiki/4-4-5_calendar
+
+ The year may either:
+ - end on the last X day of the Y month.
+ - end on the last X day closest to the last day of the Y month.
+
+ X is a specific day of the week.
+ Y is a certain month of the year
+
+ startingMonth = 1 corresponds to dates like 1/31/2007, 4/30/2007, ...
+ startingMonth = 2 corresponds to dates like 2/28/2007, 5/31/2007, ...
+ startingMonth = 3 corresponds to dates like 3/30/2007, 6/29/2007, ...
+
+ Parameters
+ ----------
+ n : int
+ weekday : {0, 1, ..., 6}
+ 0: Mondays
+ 1: Tuesdays
+ 2: Wednesdays
+ 3: Thursdays
+ 4: Fridays
+ 5: Saturdays
+ 6: Sundays
+ startingMonth : The month in which fiscal years end. {1, 2, ... 12}
+ qtr_with_extra_week : The quarter number that contains the leap
+ (14-week) quarter when needed. {1, 2, 3, 4}
+ variation : str
+ {"nearest", "last"} for "LastOfMonth" or "NearestEndMonth"
+ """
+
+ _prefix = 'REQ'
+ _adjust_dst = True
+ _attributes = frozenset(['weekday', 'startingMonth', 'qtr_with_extra_week',
+ 'variation'])
+
+ def __init__(self, n=1, normalize=False, weekday=0, startingMonth=1,
+ qtr_with_extra_week=1, variation="nearest"):
+ BaseOffset.__init__(self, n, normalize)
+
+ object.__setattr__(self, "startingMonth", startingMonth)
+ object.__setattr__(self, "weekday", weekday)
+ object.__setattr__(self, "qtr_with_extra_week", qtr_with_extra_week)
+ object.__setattr__(self, "variation", variation)
+
+ if self.n == 0:
+ raise ValueError('N cannot be 0')
+
+ @cache_readonly
+ def _offset(self):
+ return FY5253(startingMonth=self.startingMonth,
+ weekday=self.weekday,
+ variation=self.variation)
+
+ def isAnchored(self):
+ return self.n == 1 and self._offset.isAnchored()
+
+ def _rollback_to_year(self, other):
+ """
+ Roll `other` back to the most recent date that was on a fiscal year
+ end.
+
+ Return the date of that year-end, the number of full quarters
+ elapsed between that year-end and other, and the remaining Timedelta
+ since the most recent quarter-end.
+
+ Parameters
+ ----------
+ other : datetime or Timestamp
+
+ Returns
+ -------
+ tuple of
+ prev_year_end : Timestamp giving most recent fiscal year end
+ num_qtrs : int
+ tdelta : Timedelta
+ """
+ num_qtrs = 0
+
+ norm = Timestamp(other).tz_localize(None)
+ start = self._offset.rollback(norm)
+ # Note: start <= norm and self._offset.onOffset(start)
+
+ if start < norm:
+ # roll adjustment
+ qtr_lens = self.get_weeks(norm)
+
+ # check that qtr_lens is consistent with self._offset addition
+ end = liboffsets.shift_day(start, days=7 * sum(qtr_lens))
+ assert self._offset.onOffset(end), (start, end, qtr_lens)
+
+ tdelta = norm - start
+ for qlen in qtr_lens:
+ if qlen * 7 <= tdelta.days:
+ num_qtrs += 1
+ tdelta -= Timedelta(days=qlen * 7)
+ else:
+ break
+ else:
+ tdelta = Timedelta(0)
+
+ # Note: we always have tdelta.value >= 0
+ return start, num_qtrs, tdelta
+
+ @apply_wraps
+ def apply(self, other):
+ # Note: self.n == 0 is not allowed.
+ n = self.n
+
+ prev_year_end, num_qtrs, tdelta = self._rollback_to_year(other)
+ res = prev_year_end
+ n += num_qtrs
+ if self.n <= 0 and tdelta.value > 0:
+ n += 1
+
+ # Possible speedup by handling years first.
+ years = n // 4
+ if years:
+ res += self._offset * years
+ n -= years * 4
+
+ # Add an extra day to make *sure* we are getting the quarter lengths
+ # for the upcoming year, not the previous year
+ qtr_lens = self.get_weeks(res + Timedelta(days=1))
+
+ # Note: we always have 0 <= n < 4
+ weeks = sum(qtr_lens[:n])
+ if weeks:
+ res = liboffsets.shift_day(res, days=weeks * 7)
+
+ return res
+
+ def get_weeks(self, dt):
+ ret = [13] * 4
+
+ year_has_extra_week = self.year_has_extra_week(dt)
+
+ if year_has_extra_week:
+ ret[self.qtr_with_extra_week - 1] = 14
+
+ return ret
+
+ def year_has_extra_week(self, dt):
+ # Avoid round-down errors --> normalize to get
+ # e.g. '370D' instead of '360D23H'
+ norm = Timestamp(dt).normalize().tz_localize(None)
+
+ next_year_end = self._offset.rollforward(norm)
+ prev_year_end = norm - self._offset
+ weeks_in_year = (next_year_end - prev_year_end).days / 7
+ assert weeks_in_year in [52, 53], weeks_in_year
+ return weeks_in_year == 53
+
+ def onOffset(self, dt):
+ if self.normalize and not _is_normalized(dt):
+ return False
+ if self._offset.onOffset(dt):
+ return True
+
+ next_year_end = dt - self._offset
+
+ qtr_lens = self.get_weeks(dt)
+
+ current = next_year_end
+ for qtr_len in qtr_lens:
+ current = liboffsets.shift_day(current, days=qtr_len * 7)
+ if dt == current:
+ return True
+ return False
+
+ @property
+ def rule_code(self):
+ suffix = self._offset.get_rule_code_suffix()
+ qtr = self.qtr_with_extra_week
+ return "{prefix}-{suffix}-{qtr}".format(prefix=self._prefix,
+ suffix=suffix, qtr=qtr)
+
+ @classmethod
+ def _from_name(cls, *args):
+ return cls(**dict(FY5253._parse_suffix(*args[:-1]),
+ qtr_with_extra_week=int(args[-1])))
+
+
+class Easter(DateOffset):
+ """
+ DateOffset for the Easter holiday using logic defined in dateutil.
+
+ Right now uses the revised method which is valid in years 1583-4099.
+ """
+ _adjust_dst = True
+ _attributes = frozenset(['n', 'normalize'])
+
+ __init__ = BaseOffset.__init__
+
+ @apply_wraps
+ def apply(self, other):
+ current_easter = easter(other.year)
+ current_easter = datetime(current_easter.year,
+ current_easter.month, current_easter.day)
+ current_easter = conversion.localize_pydatetime(current_easter,
+ other.tzinfo)
+
+ n = self.n
+ if n >= 0 and other < current_easter:
+ n -= 1
+ elif n < 0 and other > current_easter:
+ n += 1
+ # TODO: Why does this handle the 0 case the opposite of others?
+
+ # NOTE: easter returns a datetime.date so we have to convert to type of
+ # other
+ new = easter(other.year + n)
+ new = datetime(new.year, new.month, new.day, other.hour,
+ other.minute, other.second, other.microsecond)
+ return new
+
+ def onOffset(self, dt):
+ if self.normalize and not _is_normalized(dt):
+ return False
+ return date(dt.year, dt.month, dt.day) == easter(dt.year)
+
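+# [Editor's note] Illustrative sketch, not part of the upstream source:
+# Easter() rolls to the next Easter Sunday as computed by dateutil.
+#
+#     >>> from pandas import Timestamp
+#     >>> Timestamp('2014-01-01') + Easter()   # -> 2014-04-20 (Easter 2014)
+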
+# ---------------------------------------------------------------------
+# Ticks
+
+
+def _tick_comp(op):
+ assert op not in [operator.eq, operator.ne]
+
+ def f(self, other):
+ try:
+ return op(self.delta, other.delta)
+ except AttributeError:
+ # comparing with a non-Tick object
+ raise TypeError("Invalid comparison between {cls} and {typ}"
+ .format(cls=type(self).__name__,
+ typ=type(other).__name__))
+
+ f.__name__ = '__{opname}__'.format(opname=op.__name__)
+ return f
+
+
+class Tick(liboffsets._Tick, SingleConstructorOffset):
+ _inc = Timedelta(microseconds=1000)
+ _prefix = 'undefined'
+ _attributes = frozenset(['n', 'normalize'])
+
+ def __init__(self, n=1, normalize=False):
+ BaseOffset.__init__(self, n, normalize)
+ if normalize:
+ raise ValueError("Tick offset with `normalize=True` are not "
+ "allowed.") # GH#21427
+
+ __gt__ = _tick_comp(operator.gt)
+ __ge__ = _tick_comp(operator.ge)
+ __lt__ = _tick_comp(operator.lt)
+ __le__ = _tick_comp(operator.le)
+
+ def __add__(self, other):
+ if isinstance(other, Tick):
+ if type(self) == type(other):
+ return type(self)(self.n + other.n)
+ else:
+ return _delta_to_tick(self.delta + other.delta)
+ elif isinstance(other, ABCPeriod):
+ return other + self
+ try:
+ return self.apply(other)
+ except ApplyTypeError:
+ return NotImplemented
+ except OverflowError:
+ raise OverflowError("the add operation between {self} and {other} "
+ "will overflow".format(self=self, other=other))
+
+ def __eq__(self, other):
+ if isinstance(other, compat.string_types):
+ from pandas.tseries.frequencies import to_offset
+ try:
+ # GH#23524 if to_offset fails, we are dealing with an
+ # incomparable type so == is False and != is True
+ other = to_offset(other)
+ except ValueError:
+ # e.g. "infer"
+ return False
+
+ if isinstance(other, Tick):
+ return self.delta == other.delta
+ else:
+ return False
+
+ # This is identical to DateOffset.__hash__, but has to be redefined here
+ # for Python 3, because we've redefined __eq__.
+ def __hash__(self):
+ return hash(self._params)
+
+ def __ne__(self, other):
+ if isinstance(other, compat.string_types):
+ from pandas.tseries.frequencies import to_offset
+ try:
+ # GH#23524 if to_offset fails, we are dealing with an
+ # incomparable type so == is False and != is True
+ other = to_offset(other)
+ except ValueError:
+ # e.g. "infer"
+ return True
+
+ if isinstance(other, Tick):
+ return self.delta != other.delta
+ else:
+ return True
+
+ @property
+ def delta(self):
+ return self.n * self._inc
+
+ @property
+ def nanos(self):
+ return delta_to_nanoseconds(self.delta)
+
+ # TODO: Should Tick have its own apply_index?
+ def apply(self, other):
+ # Timestamp can handle tz and nano sec, thus no need to use apply_wraps
+ if isinstance(other, Timestamp):
+
+ # GH 15126
+ # in order to avoid a recursive
+ # call of __add__ and __radd__ if there is
+ # an exception, when we call using the + operator,
+ # we directly call the known method
+ result = other.__add__(self)
+ if result == NotImplemented:
+ raise OverflowError
+ return result
+ elif isinstance(other, (datetime, np.datetime64, date)):
+ return as_timestamp(other) + self
+
+ if isinstance(other, timedelta):
+ return other + self.delta
+ elif isinstance(other, type(self)):
+ return type(self)(self.n + other.n)
+
+ raise ApplyTypeError('Unhandled type: {type_str}'
+ .format(type_str=type(other).__name__))
+
+ def isAnchored(self):
+ return False
+
+
+def _delta_to_tick(delta):
+ if delta.microseconds == 0 and getattr(delta, "nanoseconds", 0) == 0:
+ # nanoseconds only for pd.Timedelta
+ if delta.seconds == 0:
+ return Day(delta.days)
+ else:
+ seconds = delta.days * 86400 + delta.seconds
+ if seconds % 3600 == 0:
+ return Hour(seconds / 3600)
+ elif seconds % 60 == 0:
+ return Minute(seconds / 60)
+ else:
+ return Second(seconds)
+ else:
+ nanos = delta_to_nanoseconds(delta)
+ if nanos % 1000000 == 0:
+ return Milli(nanos // 1000000)
+ elif nanos % 1000 == 0:
+ return Micro(nanos // 1000)
+ else: # pragma: no cover
+ return Nano(nanos)
+
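+# [Editor's note] Illustrative sketch, not part of the upstream source:
+# adding two different Tick subclasses goes through _delta_to_tick, which
+# picks the coarsest unit that divides the combined Timedelta evenly.
+#
+#     >>> Hour(2) + Minute(30)     # -> Minute(150), i.e. <150 * Minutes>
+#     >>> Minute(90) + Minute(30)  # same type -> Minute(120)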
+
+class Day(Tick):
+ _inc = Timedelta(days=1)
+ _prefix = 'D'
+
+
+class Hour(Tick):
+ _inc = Timedelta(hours=1)
+ _prefix = 'H'
+
+
+class Minute(Tick):
+ _inc = Timedelta(minutes=1)
+ _prefix = 'T'
+
+
+class Second(Tick):
+ _inc = Timedelta(seconds=1)
+ _prefix = 'S'
+
+
+class Milli(Tick):
+ _inc = Timedelta(milliseconds=1)
+ _prefix = 'L'
+
+
+class Micro(Tick):
+ _inc = Timedelta(microseconds=1)
+ _prefix = 'U'
+
+
+class Nano(Tick):
+ _inc = Timedelta(nanoseconds=1)
+ _prefix = 'N'
+
+
+BDay = BusinessDay
+BMonthEnd = BusinessMonthEnd
+BMonthBegin = BusinessMonthBegin
+CBMonthEnd = CustomBusinessMonthEnd
+CBMonthBegin = CustomBusinessMonthBegin
+CDay = CustomBusinessDay
+
+# ---------------------------------------------------------------------
+
+
+def generate_range(start=None, end=None, periods=None, offset=BDay()):
+ """
+ Generates a sequence of dates corresponding to the specified time
+ offset. Similar to dateutil.rrule except it uses pandas DateOffset
+ objects to represent time increments.
+
+ Parameters
+ ----------
+ start : datetime (default None)
+ end : datetime (default None)
+ periods : int, (default None)
+ offset : DateOffset, (default BDay())
+
+ Notes
+ -----
+ * This method is faster for generating weekdays than dateutil.rrule
+ * At least two of (start, end, periods) must be specified.
+ * If both start and end are specified, the returned dates will
+ satisfy start <= date <= end.
+
+ Returns
+ -------
+ dates : generator object
+ """
+ from pandas.tseries.frequencies import to_offset
+ offset = to_offset(offset)
+
+ start = to_datetime(start)
+ end = to_datetime(end)
+
+ if start and not offset.onOffset(start):
+ start = offset.rollforward(start)
+
+ elif end and not offset.onOffset(end):
+ end = offset.rollback(end)
+
+ if periods is None and end < start and offset.n >= 0:
+ end = None
+ periods = 0
+
+ if end is None:
+ end = start + (periods - 1) * offset
+
+ if start is None:
+ start = end - (periods - 1) * offset
+
+ cur = start
+ if offset.n >= 0:
+ while cur <= end:
+ yield cur
+
+ # faster than cur + offset
+ next_date = offset.apply(cur)
+ if next_date <= cur:
+ raise ValueError('Offset {offset} did not increment date'
+ .format(offset=offset))
+ cur = next_date
+ else:
+ while cur >= end:
+ yield cur
+
+ # faster than cur + offset
+ next_date = offset.apply(cur)
+ if next_date >= cur:
+ raise ValueError('Offset {offset} did not decrement date'
+ .format(offset=offset))
+ cur = next_date
+
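+# [Editor's note] Illustrative sketch, not part of the upstream source:
+# generate_range yields on-offset Timestamps lazily; here three business
+# days starting from a Friday (start is already on offset, so it is kept).
+#
+#     >>> from datetime import datetime
+#     >>> list(generate_range(start=datetime(2014, 8, 1), periods=3))
+#     # -> [2014-08-01 (Fri), 2014-08-04 (Mon), 2014-08-05 (Tue)]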
+
+prefix_mapping = {offset._prefix: offset for offset in [
+ YearBegin, # 'AS'
+ YearEnd, # 'A'
+ BYearBegin, # 'BAS'
+ BYearEnd, # 'BA'
+ BusinessDay, # 'B'
+ BusinessMonthBegin, # 'BMS'
+ BusinessMonthEnd, # 'BM'
+ BQuarterEnd, # 'BQ'
+ BQuarterBegin, # 'BQS'
+ BusinessHour, # 'BH'
+ CustomBusinessDay, # 'C'
+ CustomBusinessMonthEnd, # 'CBM'
+ CustomBusinessMonthBegin, # 'CBMS'
+ CustomBusinessHour, # 'CBH'
+ MonthEnd, # 'M'
+ MonthBegin, # 'MS'
+ Nano, # 'N'
+ SemiMonthEnd, # 'SM'
+ SemiMonthBegin, # 'SMS'
+ Week, # 'W'
+ Second, # 'S'
+ Minute, # 'T'
+ Micro, # 'U'
+ QuarterEnd, # 'Q'
+ QuarterBegin, # 'QS'
+ Milli, # 'L'
+ Hour, # 'H'
+ Day, # 'D'
+ WeekOfMonth, # 'WOM'
+ FY5253,
+ FY5253Quarter
+]}
diff --git a/contrib/python/pandas/py2/pandas/tseries/plotting.py b/contrib/python/pandas/py2/pandas/tseries/plotting.py
new file mode 100644
index 00000000000..30201690763
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/tseries/plotting.py
@@ -0,0 +1,3 @@
+# flake8: noqa
+
+from pandas.plotting._timeseries import tsplot
diff --git a/contrib/python/pandas/py2/pandas/util/__init__.py b/contrib/python/pandas/py2/pandas/util/__init__.py
new file mode 100644
index 00000000000..202e58c916e
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/util/__init__.py
@@ -0,0 +1,2 @@
+from pandas.util._decorators import Appender, Substitution, cache_readonly # noqa
+from pandas.core.util.hashing import hash_pandas_object, hash_array # noqa
diff --git a/contrib/python/pandas/py2/pandas/util/_decorators.py b/contrib/python/pandas/py2/pandas/util/_decorators.py
new file mode 100644
index 00000000000..86cd8b1e698
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/util/_decorators.py
@@ -0,0 +1,352 @@
+from functools import wraps
+import inspect
+from textwrap import dedent
+import warnings
+
+from pandas._libs.properties import cache_readonly # noqa
+from pandas.compat import PY2, callable, signature
+
+
+def deprecate(name, alternative, version, alt_name=None,
+ klass=None, stacklevel=2, msg=None):
+ """Return a new function that emits a deprecation warning on use.
+
+ To use this method for a deprecated function, another function
+ `alternative` with the same signature must exist. The deprecated
+ function will emit a deprecation warning, and its docstring will
+ contain the deprecation directive with the provided version, so it
+ can be detected for future removal.
+
+ Parameters
+ ----------
+ name : str
+ Name of function to deprecate.
+ alternative : func
+ Function to use instead.
+ version : str
+ Version of pandas in which the method has been deprecated.
+ alt_name : str, optional
+ Name to use in preference of alternative.__name__.
+ klass : Warning, default FutureWarning
+ stacklevel : int, default 2
+ msg : str
+ The message to display in the warning.
+ Default is '{name} is deprecated. Use {alt_name} instead.'
+ """
+
+ alt_name = alt_name or alternative.__name__
+ klass = klass or FutureWarning
+ warning_msg = msg or '{} is deprecated, use {} instead'.format(name,
+ alt_name)
+
+ @wraps(alternative)
+ def wrapper(*args, **kwargs):
+ warnings.warn(warning_msg, klass, stacklevel=stacklevel)
+ return alternative(*args, **kwargs)
+
+ # adding deprecated directive to the docstring
+ msg = msg or 'Use `{alt_name}` instead.'.format(alt_name=alt_name)
+ doc_error_msg = ('deprecate needs a correctly formatted docstring in '
+ 'the target function (should have a one liner short '
+ 'summary, and opening quotes should be in their own '
+ 'line). Found:\n{}'.format(alternative.__doc__))
+
+ # when python is running in optimized mode (i.e. `-OO`), docstrings are
+ # removed, so we check that a docstring with correct formatting is used
+ # but we allow empty docstrings
+ if alternative.__doc__:
+ if alternative.__doc__.count('\n') < 3:
+ raise AssertionError(doc_error_msg)
+ empty1, summary, empty2, doc = alternative.__doc__.split('\n', 3)
+ if empty1 or empty2 and not summary:
+ raise AssertionError(doc_error_msg)
+ wrapper.__doc__ = dedent("""
+ {summary}
+
+ .. deprecated:: {depr_version}
+ {depr_msg}
+
+ {rest_of_docstring}""").format(summary=summary.strip(),
+ depr_version=version,
+ depr_msg=msg,
+ rest_of_docstring=dedent(doc))
+
+ return wrapper
+
+
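
A hedged sketch of how ``deprecate`` is meant to be used; only ``deprecate`` itself comes from the file above, the ``load_table``/``read_table`` names are made up. Note that the replacement function needs either no docstring or one whose summary sits on its own line, as enforced by the check above:

    from pandas.util._decorators import deprecate

    def load_table(path):
        """
        Read a table from disk.

        Longer description of the new function.
        """
        with open(path) as fh:
            return fh.read()

    # Calling read_table() emits a FutureWarning and forwards to load_table().
    read_table = deprecate('read_table', load_table, '0.24.0')
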
+def deprecate_kwarg(old_arg_name, new_arg_name, mapping=None, stacklevel=2):
+ """
+ Decorator to deprecate a keyword argument of a function.
+
+ Parameters
+ ----------
+ old_arg_name : str
+ Name of argument in function to deprecate
+ new_arg_name : str or None
+ Name of preferred argument in function. Use None to raise warning that
+ ``old_arg_name`` keyword is deprecated.
+ mapping : dict or callable
+ If mapping is present, use it to translate old arguments to
+ new arguments. A callable must do its own value checking;
+ values not found in a dict will be forwarded unchanged.
+
+ Examples
+ --------
+ The following deprecates 'cols', using 'columns' instead
+
+ >>> @deprecate_kwarg(old_arg_name='cols', new_arg_name='columns')
+ ... def f(columns=''):
+ ... print(columns)
+ ...
+ >>> f(columns='should work ok')
+ should work ok
+
+ >>> f(cols='should raise warning')
+ FutureWarning: cols is deprecated, use columns instead
+ warnings.warn(msg, FutureWarning)
+ should raise warning
+
+    >>> f(cols='should error', columns="can't pass both")
+ TypeError: Can only specify 'cols' or 'columns', not both
+
+ >>> @deprecate_kwarg('old', 'new', {'yes': True, 'no': False})
+ ... def f(new=False):
+ ... print('yes!' if new else 'no!')
+ ...
+ >>> f(old='yes')
+ FutureWarning: old='yes' is deprecated, use new=True instead
+ warnings.warn(msg, FutureWarning)
+ yes!
+
+ To raise a warning that a keyword will be removed entirely in the future
+
+ >>> @deprecate_kwarg(old_arg_name='cols', new_arg_name=None)
+ ... def f(cols='', another_param=''):
+ ... print(cols)
+ ...
+ >>> f(cols='should raise warning')
+ FutureWarning: the 'cols' keyword is deprecated and will be removed in a
+    future version. Please take steps to stop the use of 'cols'
+ should raise warning
+ >>> f(another_param='should not raise warning')
+ should not raise warning
+
+ >>> f(cols='should raise warning', another_param='')
+ FutureWarning: the 'cols' keyword is deprecated and will be removed in a
+    future version. Please take steps to stop the use of 'cols'
+ should raise warning
+ """
+
+ if mapping is not None and not hasattr(mapping, 'get') and \
+ not callable(mapping):
+ raise TypeError("mapping from old to new argument values "
+ "must be dict or callable!")
+
+ def _deprecate_kwarg(func):
+ @wraps(func)
+ def wrapper(*args, **kwargs):
+ old_arg_value = kwargs.pop(old_arg_name, None)
+
+ if new_arg_name is None and old_arg_value is not None:
+ msg = (
+ "the '{old_name}' keyword is deprecated and will be "
+ "removed in a future version. "
+ "Please take steps to stop the use of '{old_name}'"
+ ).format(old_name=old_arg_name)
+ warnings.warn(msg, FutureWarning, stacklevel=stacklevel)
+ kwargs[old_arg_name] = old_arg_value
+ return func(*args, **kwargs)
+
+ if old_arg_value is not None:
+ if mapping is not None:
+ if hasattr(mapping, 'get'):
+ new_arg_value = mapping.get(old_arg_value,
+ old_arg_value)
+ else:
+ new_arg_value = mapping(old_arg_value)
+ msg = ("the {old_name}={old_val!r} keyword is deprecated, "
+ "use {new_name}={new_val!r} instead"
+ ).format(old_name=old_arg_name,
+ old_val=old_arg_value,
+ new_name=new_arg_name,
+ new_val=new_arg_value)
+ else:
+ new_arg_value = old_arg_value
+ msg = ("the '{old_name}' keyword is deprecated, "
+ "use '{new_name}' instead"
+ ).format(old_name=old_arg_name,
+ new_name=new_arg_name)
+
+ warnings.warn(msg, FutureWarning, stacklevel=stacklevel)
+ if kwargs.get(new_arg_name, None) is not None:
+ msg = ("Can only specify '{old_name}' or '{new_name}', "
+ "not both").format(old_name=old_arg_name,
+ new_name=new_arg_name)
+ raise TypeError(msg)
+ else:
+ kwargs[new_arg_name] = new_arg_value
+ return func(*args, **kwargs)
+ return wrapper
+ return _deprecate_kwarg
+
+
+def rewrite_axis_style_signature(name, extra_params):
+ def decorate(func):
+ @wraps(func)
+ def wrapper(*args, **kwargs):
+ return func(*args, **kwargs)
+
+ if not PY2:
+ kind = inspect.Parameter.POSITIONAL_OR_KEYWORD
+ params = [
+ inspect.Parameter('self', kind),
+ inspect.Parameter(name, kind, default=None),
+ inspect.Parameter('index', kind, default=None),
+ inspect.Parameter('columns', kind, default=None),
+ inspect.Parameter('axis', kind, default=None),
+ ]
+
+ for pname, default in extra_params:
+ params.append(inspect.Parameter(pname, kind, default=default))
+
+ sig = inspect.Signature(params)
+
+ func.__signature__ = sig
+ return wrapper
+ return decorate
+
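
``rewrite_axis_style_signature`` only changes what ``inspect.signature`` reports (and only on Python 3); the wrapped function still receives plain ``*args``/``**kwargs``. A small sketch with illustrative names:

    import inspect

    from pandas.util._decorators import rewrite_axis_style_signature

    @rewrite_axis_style_signature('mapper', [('copy', True)])
    def rename(self, *args, **kwargs):
        return args, kwargs

    # (self, mapper=None, index=None, columns=None, axis=None, copy=True)
    print(inspect.signature(rename))
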
+# Substitution and Appender are derived from matplotlib.docstring (1.1.0)
+# module http://matplotlib.org/users/license.html
+
+
+class Substitution(object):
+ """
+ A decorator to take a function's docstring and perform string
+ substitution on it.
+
+ This decorator should be robust even if func.__doc__ is None
+ (for example, if -OO was passed to the interpreter)
+
+ Usage: construct a docstring.Substitution with a sequence or
+ dictionary suitable for performing substitution; then
+ decorate a suitable function with the constructed object. e.g.
+
+ sub_author_name = Substitution(author='Jason')
+
+ @sub_author_name
+ def some_function(x):
+ "%(author)s wrote this function"
+
+ # note that some_function.__doc__ is now "Jason wrote this function"
+
+ One can also use positional arguments.
+
+ sub_first_last_names = Substitution('Edgar Allen', 'Poe')
+
+ @sub_first_last_names
+ def some_function(x):
+ "%s %s wrote the Raven"
+ """
+
+ def __init__(self, *args, **kwargs):
+ if (args and kwargs):
+ raise AssertionError("Only positional or keyword args are allowed")
+
+ self.params = args or kwargs
+
+ def __call__(self, func):
+ func.__doc__ = func.__doc__ and func.__doc__ % self.params
+ return func
+
+ def update(self, *args, **kwargs):
+ """
+ Update self.params with supplied args.
+
+ If called, we assume self.params is a dict.
+ """
+
+ self.params.update(*args, **kwargs)
+
+ @classmethod
+ def from_params(cls, params):
+ """
+        In the case where the params is a mutable container (list or dictionary)
+ and it may change before this class is called, one may explicitly use a
+ reference to the params rather than using *args or **kwargs which will
+ copy the values and not reference them.
+ """
+ result = cls()
+ result.params = params
+ return result
+
+
+class Appender(object):
+ """
+ A function decorator that will append an addendum to the docstring
+ of the target function.
+
+ This decorator should be robust even if func.__doc__ is None
+ (for example, if -OO was passed to the interpreter).
+
+ Usage: construct a docstring.Appender with a string to be joined to
+ the original docstring. An optional 'join' parameter may be supplied
+ which will be used to join the docstring and addendum. e.g.
+
+ add_copyright = Appender("Copyright (c) 2009", join='\n')
+
+ @add_copyright
+ def my_dog(has='fleas'):
+ "This docstring will have a copyright below"
+ pass
+ """
+
+ def __init__(self, addendum, join='', indents=0):
+ if indents > 0:
+ self.addendum = indent(addendum, indents=indents)
+ else:
+ self.addendum = addendum
+ self.join = join
+
+ def __call__(self, func):
+ func.__doc__ = func.__doc__ if func.__doc__ else ''
+ self.addendum = self.addendum if self.addendum else ''
+ docitems = [func.__doc__, self.addendum]
+ func.__doc__ = dedent(self.join.join(docitems))
+ return func
+
+
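
The two decorators above are typically stacked so that a shared docstring template can be specialised per class and then extended. A minimal sketch; the ``head`` function and its docstring are made up:

    from pandas.util._decorators import Appender, Substitution

    @Appender("\n\nNotes\n-----\nAddendum appended by Appender.")
    @Substitution(klass='DataFrame')
    def head(obj, n=5):
        """Return the first ``n`` rows of a %(klass)s."""
        return obj[:n]

    print(head.__doc__)   # substituted summary followed by the Notes section
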
+def indent(text, indents=1):
+ if not text or not isinstance(text, str):
+ return ''
+ jointext = ''.join(['\n'] + [' '] * indents)
+ return jointext.join(text.split('\n'))
+
+
+def make_signature(func):
+ """
+    Returns a tuple containing the parameter list with defaults
+ and parameter list.
+
+ Examples
+ --------
+ >>> def f(a, b, c=2):
+ >>> return a * b * c
+ >>> print(make_signature(f))
+ (['a', 'b', 'c=2'], ['a', 'b', 'c'])
+ """
+
+ spec = signature(func)
+ if spec.defaults is None:
+ n_wo_defaults = len(spec.args)
+ defaults = ('',) * n_wo_defaults
+ else:
+ n_wo_defaults = len(spec.args) - len(spec.defaults)
+ defaults = ('',) * n_wo_defaults + tuple(spec.defaults)
+ args = []
+ for var, default in zip(spec.args, defaults):
+ args.append(var if default == '' else var + '=' + repr(default))
+ if spec.varargs:
+ args.append('*' + spec.varargs)
+ if spec.keywords:
+ args.append('**' + spec.keywords)
+ return args, spec.args
diff --git a/contrib/python/pandas/py2/pandas/util/_depr_module.py b/contrib/python/pandas/py2/pandas/util/_depr_module.py
new file mode 100644
index 00000000000..2c8feec798c
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/util/_depr_module.py
@@ -0,0 +1,103 @@
+"""
+This module houses a utility class for mocking deprecated modules.
+It is for internal use only and should not be used beyond this purpose.
+"""
+
+import importlib
+import warnings
+
+
+class _DeprecatedModule(object):
+ """ Class for mocking deprecated modules.
+
+ Parameters
+ ----------
+ deprmod : name of module to be deprecated.
+ deprmodto : name of module as a replacement, optional.
+ If not given, the __module__ attribute will
+ be used when needed.
+ removals : objects or methods in module that will no longer be
+ accessible once module is removed.
+ moved : dict, optional
+ dictionary of function name -> new location for moved
+ objects
+ """
+
+ def __init__(self, deprmod, deprmodto=None, removals=None,
+ moved=None):
+ self.deprmod = deprmod
+ self.deprmodto = deprmodto
+ self.removals = removals
+ if self.removals is not None:
+ self.removals = frozenset(self.removals)
+ self.moved = moved
+
+ # For introspection purposes.
+ self.self_dir = frozenset(dir(self.__class__))
+
+ def __dir__(self):
+ deprmodule = self._import_deprmod()
+ return dir(deprmodule)
+
+ def __repr__(self):
+ deprmodule = self._import_deprmod()
+ return repr(deprmodule)
+
+ __str__ = __repr__
+
+ def __getattr__(self, name):
+ if name in self.self_dir:
+ return object.__getattribute__(self, name)
+
+ try:
+ deprmodule = self._import_deprmod(self.deprmod)
+ except ImportError:
+ if self.deprmodto is None:
+ raise
+
+ # a rename
+ deprmodule = self._import_deprmod(self.deprmodto)
+
+ obj = getattr(deprmodule, name)
+
+ if self.removals is not None and name in self.removals:
+ warnings.warn(
+ "{deprmod}.{name} is deprecated and will be removed in "
+ "a future version.".format(deprmod=self.deprmod, name=name),
+ FutureWarning, stacklevel=2)
+ elif self.moved is not None and name in self.moved:
+ warnings.warn(
+ "{deprmod} is deprecated and will be removed in "
+ "a future version.\nYou can access {name} as {moved}".format(
+ deprmod=self.deprmod,
+ name=name,
+ moved=self.moved[name]),
+ FutureWarning, stacklevel=2)
+ else:
+ deprmodto = self.deprmodto
+ if deprmodto is False:
+ warnings.warn(
+ "{deprmod}.{name} is deprecated and will be removed in "
+ "a future version.".format(
+ deprmod=self.deprmod, name=name),
+ FutureWarning, stacklevel=2)
+ else:
+ if deprmodto is None:
+ deprmodto = obj.__module__
+ # The object is actually located in another module.
+ warnings.warn(
+ "{deprmod}.{name} is deprecated. Please use "
+ "{deprmodto}.{name} instead.".format(
+ deprmod=self.deprmod, name=name, deprmodto=deprmodto),
+ FutureWarning, stacklevel=2)
+
+ return obj
+
+ def _import_deprmod(self, mod=None):
+ if mod is None:
+ mod = self.deprmod
+
+ with warnings.catch_warnings():
+ warnings.filterwarnings('ignore', category=FutureWarning)
+ deprmodule = importlib.import_module(mod)
+ return deprmodule
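
A hedged sketch of how the shim is used: an instance stands in for a deprecated module and forwards attribute access while warning about the new location. The module names reuse files added in this same diff, but the wiring is illustrative, not necessarily how pandas registers it:

    import warnings

    from pandas.util._depr_module import _DeprecatedModule

    shim = _DeprecatedModule(deprmod='pandas.tseries.plotting',
                             deprmodto='pandas.plotting')

    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter('always')
        _ = shim.tsplot                         # attribute still resolves
        print(caught[-1].category.__name__)     # FutureWarning
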
diff --git a/contrib/python/pandas/py2/pandas/util/_doctools.py b/contrib/python/pandas/py2/pandas/util/_doctools.py
new file mode 100644
index 00000000000..4aee0a2e535
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/util/_doctools.py
@@ -0,0 +1,206 @@
+import numpy as np
+
+import pandas.compat as compat
+
+import pandas as pd
+
+
+class TablePlotter(object):
+ """
+    Lay out some DataFrames in a vertical/horizontal layout for explanation.
+ Used in merging.rst
+ """
+
+ def __init__(self, cell_width=0.37, cell_height=0.25, font_size=7.5):
+ self.cell_width = cell_width
+ self.cell_height = cell_height
+ self.font_size = font_size
+
+ def _shape(self, df):
+ """
+        Calculate the table shape, accounting for index levels.
+ """
+
+ row, col = df.shape
+ return row + df.columns.nlevels, col + df.index.nlevels
+
+ def _get_cells(self, left, right, vertical):
+ """
+ Calculate appropriate figure size based on left and right data.
+ """
+
+ if vertical:
+ # calculate required number of cells
+ vcells = max(sum(self._shape(l)[0] for l in left),
+ self._shape(right)[0])
+ hcells = (max(self._shape(l)[1] for l in left) +
+ self._shape(right)[1])
+ else:
+ vcells = max([self._shape(l)[0] for l in left] +
+ [self._shape(right)[0]])
+ hcells = sum([self._shape(l)[1] for l in left] +
+ [self._shape(right)[1]])
+ return hcells, vcells
+
+ def plot(self, left, right, labels=None, vertical=True):
+ """
+ Plot left / right DataFrames in specified layout.
+
+ Parameters
+ ----------
+ left : list of DataFrames before operation is applied
+ right : DataFrame of operation result
+ labels : list of str to be drawn as titles of left DataFrames
+ vertical : bool
+ If True, use vertical layout. If False, use horizontal layout.
+ """
+ import matplotlib.pyplot as plt
+ import matplotlib.gridspec as gridspec
+
+ if not isinstance(left, list):
+ left = [left]
+ left = [self._conv(l) for l in left]
+ right = self._conv(right)
+
+ hcells, vcells = self._get_cells(left, right, vertical)
+
+ if vertical:
+ figsize = self.cell_width * hcells, self.cell_height * vcells
+ else:
+ # include margin for titles
+ figsize = self.cell_width * hcells, self.cell_height * vcells
+ fig = plt.figure(figsize=figsize)
+
+ if vertical:
+ gs = gridspec.GridSpec(len(left), hcells)
+ # left
+ max_left_cols = max(self._shape(l)[1] for l in left)
+ max_left_rows = max(self._shape(l)[0] for l in left)
+ for i, (l, label) in enumerate(zip(left, labels)):
+ ax = fig.add_subplot(gs[i, 0:max_left_cols])
+ self._make_table(ax, l, title=label,
+ height=1.0 / max_left_rows)
+ # right
+ ax = plt.subplot(gs[:, max_left_cols:])
+ self._make_table(ax, right, title='Result', height=1.05 / vcells)
+ fig.subplots_adjust(top=0.9, bottom=0.05, left=0.05, right=0.95)
+ else:
+ max_rows = max(self._shape(df)[0] for df in left + [right])
+ height = 1.0 / np.max(max_rows)
+ gs = gridspec.GridSpec(1, hcells)
+ # left
+ i = 0
+ for l, label in zip(left, labels):
+ sp = self._shape(l)
+ ax = fig.add_subplot(gs[0, i:i + sp[1]])
+ self._make_table(ax, l, title=label, height=height)
+ i += sp[1]
+ # right
+ ax = plt.subplot(gs[0, i:])
+ self._make_table(ax, right, title='Result', height=height)
+ fig.subplots_adjust(top=0.85, bottom=0.05, left=0.05, right=0.95)
+
+ return fig
+
+ def _conv(self, data):
+        """Convert each input to a format appropriate for table output."""
+ if isinstance(data, pd.Series):
+ if data.name is None:
+ data = data.to_frame(name='')
+ else:
+ data = data.to_frame()
+ data = data.fillna('NaN')
+ return data
+
+ def _insert_index(self, data):
+ # insert is destructive
+ data = data.copy()
+ idx_nlevels = data.index.nlevels
+ if idx_nlevels == 1:
+ data.insert(0, 'Index', data.index)
+ else:
+ for i in range(idx_nlevels):
+ data.insert(i, 'Index{0}'.format(i),
+ data.index._get_level_values(i))
+
+ col_nlevels = data.columns.nlevels
+ if col_nlevels > 1:
+ col = data.columns._get_level_values(0)
+ values = [data.columns._get_level_values(i).values
+ for i in range(1, col_nlevels)]
+ col_df = pd.DataFrame(values)
+ data.columns = col_df.columns
+ data = pd.concat([col_df, data])
+ data.columns = col
+ return data
+
+ def _make_table(self, ax, df, title, height=None):
+ if df is None:
+ ax.set_visible(False)
+ return
+
+ import pandas.plotting as plotting
+
+ idx_nlevels = df.index.nlevels
+ col_nlevels = df.columns.nlevels
+        # must be converted here to get index levels for colorization
+ df = self._insert_index(df)
+ tb = plotting.table(ax, df, loc=9)
+ tb.set_fontsize(self.font_size)
+
+ if height is None:
+ height = 1.0 / (len(df) + 1)
+
+ props = tb.properties()
+ for (r, c), cell in compat.iteritems(props['celld']):
+ if c == -1:
+ cell.set_visible(False)
+ elif r < col_nlevels and c < idx_nlevels:
+ cell.set_visible(False)
+ elif r < col_nlevels or c < idx_nlevels:
+ cell.set_facecolor('#AAAAAA')
+ cell.set_height(height)
+
+ ax.set_title(title, size=self.font_size)
+ ax.axis('off')
+
+
+class _WritableDoc(type):
+ # Remove this when Python2 support is dropped
+ # __doc__ is not mutable for new-style classes in Python2, which means
+ # we can't use @Appender to share class docstrings. This can be used
+ # with `add_metaclass` to make cls.__doc__ mutable.
+ pass
+
+
+if __name__ == "__main__":
+ import matplotlib.pyplot as plt
+
+ p = TablePlotter()
+
+ df1 = pd.DataFrame({'A': [10, 11, 12],
+ 'B': [20, 21, 22],
+ 'C': [30, 31, 32]})
+ df2 = pd.DataFrame({'A': [10, 12],
+ 'C': [30, 32]})
+
+ p.plot([df1, df2], pd.concat([df1, df2]),
+ labels=['df1', 'df2'], vertical=True)
+ plt.show()
+
+ df3 = pd.DataFrame({'X': [10, 12],
+ 'Z': [30, 32]})
+
+ p.plot([df1, df3], pd.concat([df1, df3], axis=1),
+ labels=['df1', 'df2'], vertical=False)
+ plt.show()
+
+ idx = pd.MultiIndex.from_tuples([(1, 'A'), (1, 'B'), (1, 'C'),
+ (2, 'A'), (2, 'B'), (2, 'C')])
+ col = pd.MultiIndex.from_tuples([(1, 'A'), (1, 'B')])
+ df3 = pd.DataFrame({'v1': [1, 2, 3, 4, 5, 6],
+ 'v2': [5, 6, 7, 8, 9, 10]},
+ index=idx)
+ df3.columns = col
+ p.plot(df3, df3, labels=['df3'])
+ plt.show()
diff --git a/contrib/python/pandas/py2/pandas/util/_exceptions.py b/contrib/python/pandas/py2/pandas/util/_exceptions.py
new file mode 100644
index 00000000000..953c8a43a21
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/util/_exceptions.py
@@ -0,0 +1,16 @@
+import contextlib
+
+
[email protected]
+def rewrite_exception(old_name, new_name):
+ """Rewrite the message of an exception."""
+ try:
+ yield
+ except Exception as e:
+ msg = e.args[0]
+ msg = msg.replace(old_name, new_name)
+ args = (msg,)
+ if len(e.args) > 1:
+ args = args + e.args[1:]
+ e.args = args
+ raise
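
With the ``contextmanager`` decorator restored above, the helper is used as a context manager that rewrites the first argument of any exception raised inside the block. A small sketch with illustrative names:

    from pandas.util._exceptions import rewrite_exception

    try:
        with rewrite_exception('OldIndex', 'NewIndex'):
            raise KeyError('OldIndex does not support this operation')
    except KeyError as err:
        print(err)   # 'NewIndex does not support this operation'
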
diff --git a/contrib/python/pandas/py2/pandas/util/_print_versions.py b/contrib/python/pandas/py2/pandas/util/_print_versions.py
new file mode 100644
index 00000000000..a5c86c2cc80
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/util/_print_versions.py
@@ -0,0 +1,159 @@
+import codecs
+import importlib
+import locale
+import os
+import platform
+import struct
+import subprocess
+import sys
+
+
+def get_sys_info():
+    "Return system information as a list of (key, value) tuples"
+
+ blob = []
+
+ # get full commit hash
+ commit = None
+ if os.path.isdir(".git") and os.path.isdir("pandas"):
+ try:
+ pipe = subprocess.Popen('git log --format="%H" -n 1'.split(" "),
+ stdout=subprocess.PIPE,
+ stderr=subprocess.PIPE)
+ so, serr = pipe.communicate()
+ except (OSError, ValueError):
+ pass
+ else:
+ if pipe.returncode == 0:
+ commit = so
+ try:
+ commit = so.decode('utf-8')
+ except ValueError:
+ pass
+ commit = commit.strip().strip('"')
+
+ blob.append(('commit', commit))
+
+ try:
+ (sysname, nodename, release,
+ version, machine, processor) = platform.uname()
+ blob.extend([
+ ("python", '.'.join(map(str, sys.version_info))),
+ ("python-bits", struct.calcsize("P") * 8),
+ ("OS", "{sysname}".format(sysname=sysname)),
+ ("OS-release", "{release}".format(release=release)),
+ # ("Version", "{version}".format(version=version)),
+ ("machine", "{machine}".format(machine=machine)),
+ ("processor", "{processor}".format(processor=processor)),
+ ("byteorder", "{byteorder}".format(byteorder=sys.byteorder)),
+ ("LC_ALL", "{lc}".format(lc=os.environ.get('LC_ALL', "None"))),
+ ("LANG", "{lang}".format(lang=os.environ.get('LANG', "None"))),
+ ("LOCALE", '.'.join(map(str, locale.getlocale()))),
+ ])
+ except (KeyError, ValueError):
+ pass
+
+ return blob
+
+
+def show_versions(as_json=False):
+ sys_info = get_sys_info()
+
+ deps = [
+ # (MODULE_NAME, f(mod) -> mod version)
+ ("pandas", lambda mod: mod.__version__),
+ ("pytest", lambda mod: mod.__version__),
+ ("pip", lambda mod: mod.__version__),
+ ("setuptools", lambda mod: mod.__version__),
+ ("Cython", lambda mod: mod.__version__),
+ ("numpy", lambda mod: mod.version.version),
+ ("scipy", lambda mod: mod.version.version),
+ ("pyarrow", lambda mod: mod.__version__),
+ ("xarray", lambda mod: mod.__version__),
+ ("IPython", lambda mod: mod.__version__),
+ ("sphinx", lambda mod: mod.__version__),
+ ("patsy", lambda mod: mod.__version__),
+ ("dateutil", lambda mod: mod.__version__),
+ ("pytz", lambda mod: mod.VERSION),
+ ("blosc", lambda mod: mod.__version__),
+ ("bottleneck", lambda mod: mod.__version__),
+ ("tables", lambda mod: mod.__version__),
+ ("numexpr", lambda mod: mod.__version__),
+ ("feather", lambda mod: mod.__version__),
+ ("matplotlib", lambda mod: mod.__version__),
+ ("openpyxl", lambda mod: mod.__version__),
+ ("xlrd", lambda mod: mod.__VERSION__),
+ ("xlwt", lambda mod: mod.__VERSION__),
+ ("xlsxwriter", lambda mod: mod.__version__),
+ ("lxml.etree", lambda mod: mod.__version__),
+ ("bs4", lambda mod: mod.__version__),
+ ("html5lib", lambda mod: mod.__version__),
+ ("sqlalchemy", lambda mod: mod.__version__),
+ ("pymysql", lambda mod: mod.__version__),
+ ("psycopg2", lambda mod: mod.__version__),
+ ("jinja2", lambda mod: mod.__version__),
+ ("s3fs", lambda mod: mod.__version__),
+ ("fastparquet", lambda mod: mod.__version__),
+ ("pandas_gbq", lambda mod: mod.__version__),
+ ("pandas_datareader", lambda mod: mod.__version__),
+ ("gcsfs", lambda mod: mod.__version__),
+ ]
+
+ deps_blob = list()
+ for (modname, ver_f) in deps:
+ try:
+ if modname in sys.modules:
+ mod = sys.modules[modname]
+ else:
+ mod = importlib.import_module(modname)
+ ver = ver_f(mod)
+ deps_blob.append((modname, ver))
+ except ImportError:
+ deps_blob.append((modname, None))
+
+ if (as_json):
+ try:
+ import json
+ except ImportError:
+ import simplejson as json
+
+ j = dict(system=dict(sys_info), dependencies=dict(deps_blob))
+
+ if as_json is True:
+ print(j)
+ else:
+ with codecs.open(as_json, "wb", encoding='utf8') as f:
+ json.dump(j, f, indent=2)
+
+ else:
+
+ print("\nINSTALLED VERSIONS")
+ print("------------------")
+
+ for k, stat in sys_info:
+ print("{k}: {stat}".format(k=k, stat=stat))
+
+ print("")
+ for k, stat in deps_blob:
+ print("{k}: {stat}".format(k=k, stat=stat))
+
+
+def main():
+ from optparse import OptionParser
+ parser = OptionParser()
+ parser.add_option("-j", "--json", metavar="FILE", nargs=1,
+ help="Save output as JSON into file, pass in "
+ "'-' to output to stdout")
+
+ (options, args) = parser.parse_args()
+
+ if options.json == "-":
+ options.json = True
+
+ show_versions(as_json=options.json)
+
+ return 0
+
+
+if __name__ == "__main__":
+ sys.exit(main())
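
``show_versions`` is re-exported at the pandas top level, so the usual entry points look like this (the output file name is illustrative):

    import pandas as pd

    pd.show_versions()                            # human-readable report on stdout
    pd.show_versions(as_json='versions.json')     # or dump the same data as JSON
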
diff --git a/contrib/python/pandas/py2/pandas/util/_test_decorators.py b/contrib/python/pandas/py2/pandas/util/_test_decorators.py
new file mode 100644
index 00000000000..0331661c313
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/util/_test_decorators.py
@@ -0,0 +1,210 @@
+"""
+This module provides decorator functions which can be applied to test objects
+in order to skip those objects when certain conditions occur. A sample use case
+is to detect if the platform is missing ``matplotlib``. If so, any test objects
+which require ``matplotlib`` and are decorated with ``@td.skip_if_no_mpl`` will be
+skipped by ``pytest`` during the execution of the test suite.
+
+To illustrate, after importing this module:
+
+import pandas.util._test_decorators as td
+
+The decorators can be applied to classes:
+
[email protected]_if_some_reason
+class Foo():
+ ...
+
+Or individual functions:
+
[email protected]_if_some_reason
+def test_foo():
+ ...
+
+For more information, refer to the ``pytest`` documentation on ``skipif``.
+"""
+from distutils.version import LooseVersion
+import locale
+
+import pytest
+
+from pandas.compat import (
+ PY3, import_lzma, is_platform_32bit, is_platform_windows)
+from pandas.compat.numpy import _np_version_under1p15
+
+from pandas.core.computation.expressions import (
+ _NUMEXPR_INSTALLED, _USE_NUMEXPR)
+
+
+def safe_import(mod_name, min_version=None):
+ """
+ Parameters:
+ -----------
+ mod_name : str
+ Name of the module to be imported
+ min_version : str, default None
+ Minimum required version of the specified mod_name
+
+ Returns:
+ --------
+ object
+ The imported module if successful, or False
+ """
+ try:
+ mod = __import__(mod_name)
+ except ImportError:
+ return False
+
+ if not min_version:
+ return mod
+ else:
+ import sys
+ try:
+ version = getattr(sys.modules[mod_name], '__version__')
+ except AttributeError:
+ # xlrd uses a capitalized attribute name
+ version = getattr(sys.modules[mod_name], '__VERSION__')
+ if version:
+ from distutils.version import LooseVersion
+ if LooseVersion(version) >= LooseVersion(min_version):
+ return mod
+
+ return False
+
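
``safe_import`` returns the module when the optional dependency is importable (and new enough) and ``False`` otherwise, which is what the skip marks below key off. A short sketch, assuming ``pytest`` is installed since this module imports it at the top:

    import pandas.util._test_decorators as td

    mpl = td.safe_import('matplotlib', min_version='2.0.0')
    if not mpl:
        print('matplotlib missing or too old; dependent tests would be skipped')
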
+
+def _skip_if_no_mpl():
+ mod = safe_import("matplotlib")
+ if mod:
+ mod.use("Agg", warn=False)
+ else:
+ return True
+
+
+def _skip_if_mpl_2_2():
+ mod = safe_import("matplotlib")
+
+ if mod:
+ v = mod.__version__
+ if LooseVersion(v) > LooseVersion('2.1.2'):
+ return True
+ else:
+ mod.use("Agg", warn=False)
+
+
+def _skip_if_has_locale():
+ lang, _ = locale.getlocale()
+ if lang is not None:
+ return True
+
+
+def _skip_if_not_us_locale():
+ lang, _ = locale.getlocale()
+ if lang != 'en_US':
+ return True
+
+
+def _skip_if_no_scipy():
+ return not (safe_import('scipy.stats') and
+ safe_import('scipy.sparse') and
+ safe_import('scipy.interpolate') and
+ safe_import('scipy.signal'))
+
+
+def _skip_if_no_lzma():
+ try:
+ import_lzma()
+ except ImportError:
+ return True
+
+
+def skip_if_no(package, min_version=None):
+ """
+ Generic function to help skip test functions when required packages are not
+ present on the testing system.
+
+ Intended for use as a decorator, this function will wrap the decorated
+ function with a pytest ``skip_if`` mark. During a pytest test suite
+ execution, that mark will attempt to import the specified ``package`` and
+ optionally ensure it meets the ``min_version``. If the import and version
+ check are unsuccessful, then the decorated function will be skipped.
+
+ Parameters
+ ----------
+ package: str
+ The name of the package required by the decorated function
+ min_version: str or None, default None
+ Optional minimum version of the package required by the decorated
+ function
+
+ Returns
+ -------
+ decorated_func: function
+ The decorated function wrapped within a pytest ``skip_if`` mark
+ """
+ def decorated_func(func):
+ msg = "Could not import '{}'".format(package)
+ if min_version:
+ msg += " satisfying a min_version of {}".format(min_version)
+ return pytest.mark.skipif(
+ not safe_import(package, min_version=min_version), reason=msg
+ )(func)
+ return decorated_func
+
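
A hedged sketch of the decorator form; the test name and required package below are illustrative:

    import pandas.util._test_decorators as td

    @td.skip_if_no('scipy', min_version='0.19.0')
    def test_interpolation_needs_scipy():
        import scipy.interpolate   # noqa: F401
        # body only runs when scipy >= 0.19.0 is importable
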
+
+skip_if_no_mpl = pytest.mark.skipif(_skip_if_no_mpl(),
+ reason="Missing matplotlib dependency")
+skip_if_np_lt_115 = pytest.mark.skipif(_np_version_under1p15,
+ reason="NumPy 1.15 or greater required")
+skip_if_mpl = pytest.mark.skipif(not _skip_if_no_mpl(),
+ reason="matplotlib is present")
+xfail_if_mpl_2_2 = pytest.mark.xfail(_skip_if_mpl_2_2(),
+ reason="matplotlib 2.2",
+ strict=False)
+skip_if_32bit = pytest.mark.skipif(is_platform_32bit(),
+ reason="skipping for 32 bit")
+skip_if_windows = pytest.mark.skipif(is_platform_windows(),
+ reason="Running on Windows")
+skip_if_windows_python_3 = pytest.mark.skipif(is_platform_windows() and PY3,
+ reason=("not used on python3/"
+ "win32"))
+skip_if_has_locale = pytest.mark.skipif(_skip_if_has_locale(),
+ reason="Specific locale is set {lang}"
+ .format(lang=locale.getlocale()[0]))
+skip_if_not_us_locale = pytest.mark.skipif(_skip_if_not_us_locale(),
+ reason="Specific locale is set "
+ "{lang}".format(
+ lang=locale.getlocale()[0]))
+skip_if_no_scipy = pytest.mark.skipif(_skip_if_no_scipy(),
+ reason="Missing SciPy requirement")
+skip_if_no_lzma = pytest.mark.skipif(_skip_if_no_lzma(),
+ reason="need backports.lzma to run")
+skip_if_no_ne = pytest.mark.skipif(not _USE_NUMEXPR,
+ reason="numexpr enabled->{enabled}, "
+ "installed->{installed}".format(
+ enabled=_USE_NUMEXPR,
+ installed=_NUMEXPR_INSTALLED))
+
+
+def parametrize_fixture_doc(*args):
+ """
+    Intended for use as a decorator for a parametrized fixture,
+    this function formats the fixture's docstring by replacing the
+    placeholders {0}, {1}, etc. with the positional arguments passed
+    to the decorator.
+
+ Parameters:
+ ----------
+ args: iterable
+ Positional arguments for docstring.
+
+ Returns:
+ -------
+    documented_fixture: function
+        The decorated fixture function with its docstring formatted
+        using the supplied positional arguments
+ """
+ def documented_fixture(fixture):
+ fixture.__doc__ = fixture.__doc__.format(*args)
+ return fixture
+ return documented_fixture
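
A sketch of the intended pairing with ``pytest.fixture``; the fixture name, parameters and docstring are made up:

    import pytest

    import pandas.util._test_decorators as td

    @td.parametrize_fixture_doc("'D'", "'H'")
    @pytest.fixture(params=['D', 'H'])
    def freq(request):
        """Fixture returning the frequencies {0} and {1}."""
        return request.param
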
diff --git a/contrib/python/pandas/py2/pandas/util/_tester.py b/contrib/python/pandas/py2/pandas/util/_tester.py
new file mode 100644
index 00000000000..18e8d415459
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/util/_tester.py
@@ -0,0 +1,29 @@
+"""
+Entrypoint for testing from the top-level namespace
+"""
+import os
+import sys
+
+PKG = os.path.dirname(os.path.dirname(__file__))
+
+
+def test(extra_args=None):
+ try:
+ import pytest
+ except ImportError:
+ raise ImportError("Need pytest>=3.0 to run tests")
+ try:
+ import hypothesis # noqa
+ except ImportError:
+ raise ImportError("Need hypothesis>=3.58 to run tests")
+ cmd = ['--skip-slow', '--skip-network', '--skip-db']
+ if extra_args:
+ if not isinstance(extra_args, list):
+ extra_args = [extra_args]
+ cmd = extra_args
+ cmd += [PKG]
+ print("running: pytest {}".format(' '.join(cmd)))
+ sys.exit(pytest.main(cmd))
+
+
+__all__ = ['test']
diff --git a/contrib/python/pandas/py2/pandas/util/_validators.py b/contrib/python/pandas/py2/pandas/util/_validators.py
new file mode 100644
index 00000000000..1171478de2e
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/util/_validators.py
@@ -0,0 +1,358 @@
+"""
+Module that contains many useful utilities
+for validating data or function arguments
+"""
+import warnings
+
+from pandas.core.dtypes.common import is_bool
+
+
+def _check_arg_length(fname, args, max_fname_arg_count, compat_args):
+ """
+    Checks whether 'args' has at most as many elements as 'compat_args'.
+    Raises a TypeError if that is not the case, similar to the error Python
+    itself raises when a function is called with too many arguments.
+
+ """
+ if max_fname_arg_count < 0:
+ raise ValueError("'max_fname_arg_count' must be non-negative")
+
+ if len(args) > len(compat_args):
+ max_arg_count = len(compat_args) + max_fname_arg_count
+ actual_arg_count = len(args) + max_fname_arg_count
+ argument = 'argument' if max_arg_count == 1 else 'arguments'
+
+ raise TypeError(
+ "{fname}() takes at most {max_arg} {argument} "
+ "({given_arg} given)".format(
+ fname=fname, max_arg=max_arg_count,
+ argument=argument, given_arg=actual_arg_count))
+
+
+def _check_for_default_values(fname, arg_val_dict, compat_args):
+ """
+ Check that the keys in `arg_val_dict` are mapped to their
+ default values as specified in `compat_args`.
+
+ Note that this function is to be called only when it has been
+ checked that arg_val_dict.keys() is a subset of compat_args
+
+ """
+ for key in arg_val_dict:
+        # try checking equality directly with '==' operator,
+ # as comparison may have been overridden for the left
+ # hand object
+ try:
+ v1 = arg_val_dict[key]
+ v2 = compat_args[key]
+
+ # check for None-ness otherwise we could end up
+ # comparing a numpy array vs None
+ if (v1 is not None and v2 is None) or \
+ (v1 is None and v2 is not None):
+ match = False
+ else:
+ match = (v1 == v2)
+
+ if not is_bool(match):
+ raise ValueError("'match' is not a boolean")
+
+ # could not compare them directly, so try comparison
+ # using the 'is' operator
+ except ValueError:
+ match = (arg_val_dict[key] is compat_args[key])
+
+ if not match:
+ raise ValueError(("the '{arg}' parameter is not "
+ "supported in the pandas "
+ "implementation of {fname}()".
+ format(fname=fname, arg=key)))
+
+
+def validate_args(fname, args, max_fname_arg_count, compat_args):
+ """
+    Checks whether the `*args` tuple passed into a function contains at most
+    `len(compat_args)` arguments and whether or not all of these arguments
+    are set to their default values.
+
+ fname: str
+ The name of the function being passed the `*args` parameter
+
+ args: tuple
+ The `*args` parameter passed into a function
+
+ max_fname_arg_count: int
+ The maximum number of arguments that the function `fname`
+ can accept, excluding those in `args`. Used for displaying
+ appropriate error messages. Must be non-negative.
+
+ compat_args: OrderedDict
+        An ordered dictionary of keys and their associated default values.
+ In order to accommodate buggy behaviour in some versions of `numpy`,
+ where a signature displayed keyword arguments but then passed those
+ arguments **positionally** internally when calling downstream
+ implementations, an ordered dictionary ensures that the original
+ order of the keyword arguments is enforced. Note that if there is
+ only one key, a generic dict can be passed in as well.
+
+ Raises
+ ------
+ TypeError if `args` contains more values than there are `compat_args`
+ ValueError if `args` contains values that do not correspond to those
+ of the default values specified in `compat_args`
+
+ """
+ _check_arg_length(fname, args, max_fname_arg_count, compat_args)
+
+ # We do this so that we can provide a more informative
+ # error message about the parameters that we are not
+ # supporting in the pandas implementation of 'fname'
+ kwargs = dict(zip(compat_args, args))
+ _check_for_default_values(fname, kwargs, compat_args)
+
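
A short sketch of how the numpy-compat validators are driven; the function name and compat signature are illustrative:

    from collections import OrderedDict

    from pandas.util._validators import validate_args

    compat_args = OrderedDict([('axis', None), ('out', None)])

    validate_args('cumsum', (None,), 1, compat_args)   # passes: default value
    # validate_args('cumsum', (0,), 1, compat_args)    # ValueError: 'axis' not supported
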
+
+def _check_for_invalid_keys(fname, kwargs, compat_args):
+ """
+ Checks whether 'kwargs' contains any keys that are not
+ in 'compat_args' and raises a TypeError if there is one.
+
+ """
+ # set(dict) --> set of the dictionary's keys
+ diff = set(kwargs) - set(compat_args)
+
+ if diff:
+ bad_arg = list(diff)[0]
+ raise TypeError(("{fname}() got an unexpected "
+ "keyword argument '{arg}'".
+ format(fname=fname, arg=bad_arg)))
+
+
+def validate_kwargs(fname, kwargs, compat_args):
+ """
+ Checks whether parameters passed to the **kwargs argument in a
+ function `fname` are valid parameters as specified in `*compat_args`
+ and whether or not they are set to their default values.
+
+ Parameters
+ ----------
+ fname: str
+ The name of the function being passed the `**kwargs` parameter
+
+ kwargs: dict
+ The `**kwargs` parameter passed into `fname`
+
+ compat_args: dict
+ A dictionary of keys that `kwargs` is allowed to have and their
+ associated default values
+
+ Raises
+ ------
+ TypeError if `kwargs` contains keys not in `compat_args`
+ ValueError if `kwargs` contains keys in `compat_args` that do not
+ map to the default values specified in `compat_args`
+
+ """
+ kwds = kwargs.copy()
+ _check_for_invalid_keys(fname, kwargs, compat_args)
+ _check_for_default_values(fname, kwds, compat_args)
+
+
+def validate_args_and_kwargs(fname, args, kwargs,
+ max_fname_arg_count,
+ compat_args):
+ """
+ Checks whether parameters passed to the *args and **kwargs argument in a
+ function `fname` are valid parameters as specified in `*compat_args`
+ and whether or not they are set to their default values.
+
+ Parameters
+ ----------
+ fname: str
+ The name of the function being passed the `**kwargs` parameter
+
+ args: tuple
+ The `*args` parameter passed into a function
+
+ kwargs: dict
+ The `**kwargs` parameter passed into `fname`
+
+ max_fname_arg_count: int
+        The maximum number of arguments that the function `fname`
+        can accept, excluding those in `args`. Used for displaying
+ appropriate error messages. Must be non-negative.
+
+ compat_args: OrderedDict
+        An ordered dictionary of keys that `kwargs` is allowed to
+ have and their associated default values. Note that if there
+ is only one key, a generic dict can be passed in as well.
+
+ Raises
+ ------
+ TypeError if `args` contains more values than there are
+ `compat_args` OR `kwargs` contains keys not in `compat_args`
+ ValueError if `args` contains values not at the default value (`None`)
+        OR `kwargs` contains keys in `compat_args` that do not map to the
+        default value as specified in `compat_args`
+
+ See Also
+ --------
+ validate_args : Purely args validation.
+ validate_kwargs : Purely kwargs validation.
+
+ """
+ # Check that the total number of arguments passed in (i.e.
+ # args and kwargs) does not exceed the length of compat_args
+ _check_arg_length(fname, args + tuple(kwargs.values()),
+ max_fname_arg_count, compat_args)
+
+ # Check there is no overlap with the positional and keyword
+ # arguments, similar to what is done in actual Python functions
+ args_dict = dict(zip(compat_args, args))
+
+ for key in args_dict:
+ if key in kwargs:
+ raise TypeError("{fname}() got multiple values for keyword "
+ "argument '{arg}'".format(fname=fname, arg=key))
+
+ kwargs.update(args_dict)
+ validate_kwargs(fname, kwargs, compat_args)
+
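
The combined validator is meant for ``f(*args, **kwargs)`` shims that mimic a numpy signature. A minimal sketch with made-up names:

    from collections import OrderedDict

    from pandas.util._validators import validate_args_and_kwargs

    compat_args = OrderedDict([('axis', None), ('dtype', None)])

    def f(*args, **kwargs):
        validate_args_and_kwargs('f', args, kwargs, 2, compat_args)

    f(None, dtype=None)      # passes: everything at its default
    # f(None, axis=None)     # TypeError: multiple values for 'axis'
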
+
+def validate_bool_kwarg(value, arg_name):
+ """ Ensures that argument passed in arg_name is of type bool. """
+ if not (is_bool(value) or value is None):
+ raise ValueError('For argument "{arg}" expected type bool, received '
+ 'type {typ}.'.format(arg=arg_name,
+ typ=type(value).__name__))
+ return value
+
+
+def validate_axis_style_args(data, args, kwargs, arg_name, method_name):
+ """Argument handler for mixed index, columns / axis functions
+
+ In an attempt to handle both `.method(index, columns)`, and
+ `.method(arg, axis=.)`, we have to do some bad things to argument
+ parsing. This translates all arguments to `{index=., columns=.}` style.
+
+ Parameters
+ ----------
+ data : DataFrame or Panel
+ arg : tuple
+ All positional arguments from the user
+ kwargs : dict
+ All keyword arguments from the user
+ arg_name, method_name : str
+ Used for better error messages
+
+ Returns
+ -------
+ kwargs : dict
+ A dictionary of keyword arguments. Doesn't modify ``kwargs``
+ inplace, so update them with the return value here.
+
+ Examples
+ --------
+ >>> df._validate_axis_style_args((str.upper,), {'columns': id},
+ ... 'mapper', 'rename')
+ {'columns': <function id>, 'index': <method 'upper' of 'str' objects>}
+
+ This emits a warning
+ >>> df._validate_axis_style_args((str.upper, id), {},
+ ... 'mapper', 'rename')
+ {'columns': <function id>, 'index': <method 'upper' of 'str' objects>}
+ """
+ # TODO(PY3): Change to keyword-only args and remove all this
+
+ out = {}
+ # Goal: fill 'out' with index/columns-style arguments
+ # like out = {'index': foo, 'columns': bar}
+
+ # Start by validating for consistency
+ if 'axis' in kwargs and any(x in kwargs for x in data._AXIS_NUMBERS):
+ msg = "Cannot specify both 'axis' and any of 'index' or 'columns'."
+ raise TypeError(msg)
+
+ # First fill with explicit values provided by the user...
+ if arg_name in kwargs:
+ if args:
+ msg = ("{} got multiple values for argument "
+ "'{}'".format(method_name, arg_name))
+ raise TypeError(msg)
+
+ axis = data._get_axis_name(kwargs.get('axis', 0))
+ out[axis] = kwargs[arg_name]
+
+ # More user-provided arguments, now from kwargs
+ for k, v in kwargs.items():
+ try:
+ ax = data._get_axis_name(k)
+ except ValueError:
+ pass
+ else:
+ out[ax] = v
+
+ # All user-provided kwargs have been handled now.
+ # Now we supplement with positional arguments, emitting warnings
+ # when there's ambiguity and raising when there's conflicts
+
+ if len(args) == 0:
+ pass # It's up to the function to decide if this is valid
+ elif len(args) == 1:
+ axis = data._get_axis_name(kwargs.get('axis', 0))
+ out[axis] = args[0]
+ elif len(args) == 2:
+ if 'axis' in kwargs:
+ # Unambiguously wrong
+ msg = ("Cannot specify both 'axis' and any of 'index' "
+ "or 'columns'")
+ raise TypeError(msg)
+
+ msg = ("Interpreting call\n\t'.{method_name}(a, b)' as "
+ "\n\t'.{method_name}(index=a, columns=b)'.\nUse named "
+ "arguments to remove any ambiguity. In the future, using "
+ "positional arguments for 'index' or 'columns' will raise "
+               "a 'TypeError'.")
+ warnings.warn(msg.format(method_name=method_name,), FutureWarning,
+ stacklevel=4)
+ out[data._AXIS_NAMES[0]] = args[0]
+ out[data._AXIS_NAMES[1]] = args[1]
+ else:
+ msg = "Cannot specify all of '{}', 'index', 'columns'."
+ raise TypeError(msg.format(arg_name))
+ return out
+
+
+def validate_fillna_kwargs(value, method, validate_scalar_dict_value=True):
+ """Validate the keyword arguments to 'fillna'.
+
+ This checks that exactly one of 'value' and 'method' is specified.
+ If 'method' is specified, this validates that it's a valid method.
+
+ Parameters
+ ----------
+ value, method : object
+ The 'value' and 'method' keyword arguments for 'fillna'.
+ validate_scalar_dict_value : bool, default True
+ Whether to validate that 'value' is a scalar or dict. Specifically,
+ validate that it is not a list or tuple.
+
+ Returns
+ -------
+ value, method : object
+ """
+ from pandas.core.missing import clean_fill_method
+
+ if value is None and method is None:
+ raise ValueError("Must specify a fill 'value' or 'method'.")
+ elif value is None and method is not None:
+ method = clean_fill_method(method)
+
+ elif value is not None and method is None:
+ if validate_scalar_dict_value and isinstance(value, (list, tuple)):
+ raise TypeError('"value" parameter must be a scalar or dict, but '
+ 'you passed a "{0}"'.format(type(value).__name__))
+
+ elif value is not None and method is not None:
+ raise ValueError("Cannot specify both 'value' and 'method'.")
+
+ return value, method
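
A short sketch of the fillna validation; note that method aliases are normalised by ``clean_fill_method``, so 'ffill' comes back as 'pad':

    from pandas.util._validators import validate_fillna_kwargs

    value, method = validate_fillna_kwargs(None, 'ffill')
    print(value, method)                  # None pad
    # validate_fillna_kwargs(0, 'ffill')  # ValueError: both 'value' and 'method'
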
diff --git a/contrib/python/pandas/py2/pandas/util/move.c b/contrib/python/pandas/py2/pandas/util/move.c
new file mode 100644
index 00000000000..62860adb1c1
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/util/move.c
@@ -0,0 +1,268 @@
+#include <Python.h>
+
+#define COMPILING_IN_PY2 (PY_VERSION_HEX <= 0x03000000)
+
+#if !COMPILING_IN_PY2
+/* alias this because it is not aliased in Python 3 */
+#define PyString_CheckExact PyBytes_CheckExact
+#define PyString_AS_STRING PyBytes_AS_STRING
+#define PyString_GET_SIZE PyBytes_GET_SIZE
+
+/* in python 3, we cannot intern bytes objects so this is always false */
+#define PyString_CHECK_INTERNED(cs) 0
+#endif /* !COMPILING_IN_PY2 */
+
+#ifndef Py_TPFLAGS_HAVE_GETCHARBUFFER
+#define Py_TPFLAGS_HAVE_GETCHARBUFFER 0
+#endif
+
+#ifndef Py_TPFLAGS_HAVE_NEWBUFFER
+#define Py_TPFLAGS_HAVE_NEWBUFFER 0
+#endif
+
+static PyObject *badmove; /* bad move exception class */
+
+typedef struct {
+ PyObject_HEAD
+ /* the bytes that own the buffer we are mutating */
+ PyObject *invalid_bytes;
+} stolenbufobject;
+
+static PyTypeObject stolenbuf_type; /* forward declare type */
+
+static void
+stolenbuf_dealloc(stolenbufobject *self)
+{
+ Py_DECREF(self->invalid_bytes);
+ PyObject_Del(self);
+}
+
+static int
+stolenbuf_getbuffer(stolenbufobject *self, Py_buffer *view, int flags)
+{
+ return PyBuffer_FillInfo(view,
+ (PyObject*) self,
+ (void*) PyString_AS_STRING(self->invalid_bytes),
+ PyString_GET_SIZE(self->invalid_bytes),
+ 0, /* not readonly */
+ flags);
+}
+
+#if COMPILING_IN_PY2
+
+static Py_ssize_t
+stolenbuf_getreadwritebuf(stolenbufobject *self, Py_ssize_t segment, void **out)
+{
+ if (segment != 0) {
+ PyErr_SetString(PyExc_SystemError,
+ "accessing non-existent string segment");
+ return -1;
+ }
+ *out = PyString_AS_STRING(self->invalid_bytes);
+ return PyString_GET_SIZE(self->invalid_bytes);
+}
+
+static Py_ssize_t
+stolenbuf_getsegcount(stolenbufobject *self, Py_ssize_t *len)
+{
+ if (len) {
+ *len = PyString_GET_SIZE(self->invalid_bytes);
+ }
+ return 1;
+}
+
+static PyBufferProcs stolenbuf_as_buffer = {
+ (readbufferproc) stolenbuf_getreadwritebuf,
+ (writebufferproc) stolenbuf_getreadwritebuf,
+ (segcountproc) stolenbuf_getsegcount,
+ (charbufferproc) stolenbuf_getreadwritebuf,
+ (getbufferproc) stolenbuf_getbuffer,
+};
+
+#else /* Python 3 */
+
+static PyBufferProcs stolenbuf_as_buffer = {
+ (getbufferproc) stolenbuf_getbuffer,
+ NULL,
+};
+
+#endif /* COMPILING_IN_PY2 */
+
+PyDoc_STRVAR(stolenbuf_doc,
+ "A buffer that is wrapping a stolen bytes object's buffer.");
+
+static PyTypeObject stolenbuf_type = {
+ PyVarObject_HEAD_INIT(NULL, 0)
+ "pandas.util._move.stolenbuf", /* tp_name */
+ sizeof(stolenbufobject), /* tp_basicsize */
+ 0, /* tp_itemsize */
+ (destructor) stolenbuf_dealloc, /* tp_dealloc */
+ 0, /* tp_print */
+ 0, /* tp_getattr */
+ 0, /* tp_setattr */
+ 0, /* tp_reserved */
+ 0, /* tp_repr */
+ 0, /* tp_as_number */
+ 0, /* tp_as_sequence */
+ 0, /* tp_as_mapping */
+ 0, /* tp_hash */
+ 0, /* tp_call */
+ 0, /* tp_str */
+ 0, /* tp_getattro */
+ 0, /* tp_setattro */
+ &stolenbuf_as_buffer, /* tp_as_buffer */
+ Py_TPFLAGS_DEFAULT |
+ Py_TPFLAGS_HAVE_NEWBUFFER |
+ Py_TPFLAGS_HAVE_GETCHARBUFFER, /* tp_flags */
+ stolenbuf_doc, /* tp_doc */
+};
+
+PyDoc_STRVAR(
+ move_into_mutable_buffer_doc,
+ "Moves a bytes object that is about to be destroyed into a mutable buffer\n"
+ "without copying the data.\n"
+ "\n"
+ "Parameters\n"
+ "----------\n"
+ "bytes_rvalue : bytes with 1 refcount.\n"
+ " The bytes object that you want to move into a mutable buffer. This\n"
+ " cannot be a named object. It must only have a single reference.\n"
+ "\n"
+ "Returns\n"
+ "-------\n"
+ "buf : stolenbuf\n"
+ " An object that supports the buffer protocol which can give a mutable\n"
+ " view of the data that was previously owned by ``bytes_rvalue``.\n"
+ "\n"
+ "Raises\n"
+ "------\n"
+ "BadMove\n"
+ " Raised when a move is attempted on an object with more than one\n"
+ " reference.\n"
+ "\n"
+ "Notes\n"
+ "-----\n"
+ "If you want to use this function you are probably wrong.\n"
+ "\n"
+ "Warning: Do not call this function through *unpacking. This can\n"
+ "potentially trick the reference checks which may allow you to get a\n"
+ "mutable reference to a shared string!\n"
+ "\n");
+
+/* This is implemented as a standalone function instead of the ``tp_new`` of
+ ``stolenbuf`` because we need to create a function using the METH_O flag
+ to support Python 3.6. In python 3.6, PyCFunction calls from python code now
+ count the reference owned by the argument tuple. This would cause the object
+ to have 2 references if used with a direct call like: ``stolenbuf(a)``;
+ however, if called through *unpacking like ``stolenbuf(*(a,))`` it would
+ only have the one reference (the tuple). */
+static PyObject*
+move_into_mutable_buffer(PyObject *self, PyObject *bytes_rvalue)
+{
+ stolenbufobject *ret;
+
+ if (!PyString_CheckExact(bytes_rvalue)) {
+ PyErr_SetString(PyExc_TypeError,
+ "stolenbuf can only steal from bytes objects");
+ return NULL;
+ }
+
+ if (Py_REFCNT(bytes_rvalue) != 1 || PyString_CHECK_INTERNED(bytes_rvalue)) {
+ /* there is a reference other than the caller's stack or the string is
+ interned */
+ PyErr_SetObject(badmove, bytes_rvalue);
+ return NULL;
+ }
+
+ if (!(ret = PyObject_New(stolenbufobject, &stolenbuf_type))) {
+ return NULL;
+ }
+
+ /* store the original bytes object in a field that is not
+ exposed to python */
+ Py_INCREF(bytes_rvalue);
+ ret->invalid_bytes = bytes_rvalue;
+ return (PyObject*) ret;
+}
+
+static PyMethodDef methods[] = {
+ {"move_into_mutable_buffer",
+ (PyCFunction) move_into_mutable_buffer,
+ METH_O,
+ move_into_mutable_buffer_doc},
+ {NULL},
+};
+
+#define MODULE_NAME "pandas.util._move"
+
+#if !COMPILING_IN_PY2
+static PyModuleDef move_module = {
+ PyModuleDef_HEAD_INIT,
+ MODULE_NAME,
+ NULL,
+ -1,
+ methods,
+};
+#endif /* !COMPILING_IN_PY2 */
+
+PyDoc_STRVAR(
+ badmove_doc,
+ "Exception used to indicate that a move was attempted on a value with\n"
+ "more than a single reference.\n"
+ "\n"
+ "Parameters\n"
+ "----------\n"
+ "data : any\n"
+ " The data which was passed to ``move_into_mutable_buffer``.\n"
+ "\n"
+ "See Also\n"
+ "--------\n"
+ "pandas.util._move.move_into_mutable_buffer\n");
+
+PyMODINIT_FUNC
+#if !COMPILING_IN_PY2
+#define ERROR_RETURN NULL
+PyInit__move(void)
+#else
+#define ERROR_RETURN
+init_move(void)
+#endif /* !COMPILING_IN_PY2 */
+{
+ PyObject *m;
+
+ if (!(badmove = PyErr_NewExceptionWithDoc("pandas.util._move.BadMove",
+ badmove_doc,
+ NULL,
+ NULL))) {
+ return ERROR_RETURN;
+ }
+
+ if (PyType_Ready(&stolenbuf_type)) {
+ return ERROR_RETURN;
+ }
+
+#if !COMPILING_IN_PY2
+ if (!(m = PyModule_Create(&move_module)))
+#else
+ if (!(m = Py_InitModule(MODULE_NAME, methods)))
+#endif /* !COMPILING_IN_PY2 */
+ {
+ return ERROR_RETURN;
+ }
+
+ if (PyModule_AddObject(m,
+ "stolenbuf",
+ (PyObject*) &stolenbuf_type)) {
+ Py_DECREF(m);
+ return ERROR_RETURN;
+ }
+
+ if (PyModule_AddObject(m, "BadMove", badmove)) {
+ Py_DECREF(m);
+ return ERROR_RETURN;
+ }
+
+#if !COMPILING_IN_PY2
+ return m;
+#endif /* !COMPILING_IN_PY2 */
+}
diff --git a/contrib/python/pandas/py2/pandas/util/testing.py b/contrib/python/pandas/py2/pandas/util/testing.py
new file mode 100644
index 00000000000..f441dd20f39
--- /dev/null
+++ b/contrib/python/pandas/py2/pandas/util/testing.py
@@ -0,0 +1,3067 @@
+from __future__ import division
+
+from contextlib import contextmanager
+from datetime import datetime
+from functools import wraps
+import locale
+import os
+import re
+from shutil import rmtree
+import string
+import subprocess
+import sys
+import tempfile
+import traceback
+import warnings
+
+import numpy as np
+from numpy.random import rand, randn
+
+from pandas._libs import testing as _testing
+import pandas.compat as compat
+from pandas.compat import (
+ PY2, PY3, Counter, callable, filter, httplib, lmap, lrange, lzip, map,
+ raise_with_traceback, range, string_types, u, unichr, zip)
+
+from pandas.core.dtypes.common import (
+ is_bool, is_categorical_dtype, is_datetime64_dtype, is_datetime64tz_dtype,
+ is_datetimelike_v_numeric, is_datetimelike_v_object,
+ is_extension_array_dtype, is_interval_dtype, is_list_like, is_number,
+ is_period_dtype, is_sequence, is_timedelta64_dtype, needs_i8_conversion)
+from pandas.core.dtypes.missing import array_equivalent
+
+import pandas as pd
+from pandas import (
+ Categorical, CategoricalIndex, DataFrame, DatetimeIndex, Index,
+ IntervalIndex, MultiIndex, Panel, RangeIndex, Series, bdate_range)
+from pandas.core.algorithms import take_1d
+from pandas.core.arrays import (
+ DatetimeArray, ExtensionArray, IntervalArray, PeriodArray, TimedeltaArray,
+ period_array)
+import pandas.core.common as com
+
+from pandas.io.common import urlopen
+from pandas.io.formats.printing import pprint_thing
+
+N = 30
+K = 4
+_RAISE_NETWORK_ERROR_DEFAULT = False
+
+# set testing_mode
+_testing_mode_warnings = (DeprecationWarning, compat.ResourceWarning)
+
+
+def set_testing_mode():
+ # set the testing mode filters
+ testing_mode = os.environ.get('PANDAS_TESTING_MODE', 'None')
+ if 'deprecate' in testing_mode:
+ warnings.simplefilter('always', _testing_mode_warnings)
+
+
+def reset_testing_mode():
+ # reset the testing mode filters
+ testing_mode = os.environ.get('PANDAS_TESTING_MODE', 'None')
+ if 'deprecate' in testing_mode:
+ warnings.simplefilter('ignore', _testing_mode_warnings)
+
+
+set_testing_mode()
+
+
+def reset_display_options():
+ """
+ Reset the display options for printing and representing objects.
+ """
+
+ pd.reset_option('^display.', silent=True)
+
+
+def round_trip_pickle(obj, path=None):
+ """
+ Pickle an object and then read it again.
+
+ Parameters
+ ----------
+ obj : pandas object
+ The object to pickle and then re-read.
+ path : str, default None
+ The path where the pickled object is written and then read.
+
+ Returns
+ -------
+ round_trip_pickled_object : pandas object
+ The original object that was pickled and then re-read.
+ """
+
+ if path is None:
+ path = u('__{random_bytes}__.pickle'.format(random_bytes=rands(10)))
+ with ensure_clean(path) as path:
+ pd.to_pickle(obj, path)
+ return pd.read_pickle(path)
+
+
+def round_trip_pathlib(writer, reader, path=None):
+ """
+ Write an object to file specified by a pathlib.Path and read it back
+
+ Parameters
+ ----------
+ writer : callable bound to pandas object
+ IO writing function (e.g. DataFrame.to_csv )
+ reader : callable
+ IO reading function (e.g. pd.read_csv )
+ path : str, default None
+ The path where the object is written and then read.
+
+ Returns
+ -------
+ round_trip_object : pandas object
+ The original object that was serialized and then re-read.
+ """
+
+ import pytest
+ Path = pytest.importorskip('pathlib').Path
+ if path is None:
+ path = '___pathlib___'
+ with ensure_clean(path) as path:
+ writer(Path(path))
+ obj = reader(Path(path))
+ return obj
+
+
+def round_trip_localpath(writer, reader, path=None):
+ """
+ Write an object to file specified by a py.path LocalPath and read it back
+
+ Parameters
+ ----------
+ writer : callable bound to pandas object
+ IO writing function (e.g. DataFrame.to_csv )
+ reader : callable
+ IO reading function (e.g. pd.read_csv )
+ path : str, default None
+ The path where the object is written and then read.
+
+ Returns
+ -------
+ round_trip_object : pandas object
+ The original object that was serialized and then re-read.
+ """
+ import pytest
+ LocalPath = pytest.importorskip('py.path').local
+ if path is None:
+ path = '___localpath___'
+ with ensure_clean(path) as path:
+ writer(LocalPath(path))
+ obj = reader(LocalPath(path))
+ return obj
+
+
+@contextmanager
+def decompress_file(path, compression):
+ """
+ Open a compressed file and return a file object
+
+ Parameters
+ ----------
+ path : str
+ The path where the file is read from
+
+ compression : {'gzip', 'bz2', 'zip', 'xz', None}
+ Name of the decompression to use
+
+ Returns
+ -------
+ f : file object
+ """
+
+ if compression is None:
+ f = open(path, 'rb')
+ elif compression == 'gzip':
+ import gzip
+ f = gzip.open(path, 'rb')
+ elif compression == 'bz2':
+ import bz2
+ f = bz2.BZ2File(path, 'rb')
+ elif compression == 'xz':
+ lzma = compat.import_lzma()
+ f = lzma.LZMAFile(path, 'rb')
+ elif compression == 'zip':
+ import zipfile
+ zip_file = zipfile.ZipFile(path)
+ zip_names = zip_file.namelist()
+ if len(zip_names) == 1:
+ f = zip_file.open(zip_names.pop())
+ else:
+ raise ValueError('ZIP file {} error. Only one file per ZIP.'
+ .format(path))
+ else:
+ msg = 'Unrecognized compression type: {}'.format(compression)
+ raise ValueError(msg)
+
+ try:
+ yield f
+ finally:
+ f.close()
+ if compression == "zip":
+ zip_file.close()
+
+
+def write_to_compressed(compression, path, data, dest="test"):
+ """
+ Write data to a compressed file.
+
+ Parameters
+ ----------
+ compression : {'gzip', 'bz2', 'zip', 'xz'}
+ The compression type to use.
+ path : str
+ The file path to write the data.
+ data : str
+ The data to write.
+ dest : str, default "test"
+ The destination file (for ZIP only)
+
+ Raises
+ ------
+ ValueError : An invalid compression value was passed in.
+ """
+
+ if compression == "zip":
+ import zipfile
+ compress_method = zipfile.ZipFile
+ elif compression == "gzip":
+ import gzip
+ compress_method = gzip.GzipFile
+ elif compression == "bz2":
+ import bz2
+ compress_method = bz2.BZ2File
+ elif compression == "xz":
+ lzma = compat.import_lzma()
+ compress_method = lzma.LZMAFile
+ else:
+ msg = "Unrecognized compression type: {}".format(compression)
+ raise ValueError(msg)
+
+ if compression == "zip":
+ mode = "w"
+ args = (dest, data)
+ method = "writestr"
+ else:
+ mode = "wb"
+ args = (data,)
+ method = "write"
+
+ with compress_method(path, mode=mode) as f:
+ getattr(f, method)(*args)
+
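
The two helpers above are designed to round-trip: ``write_to_compressed`` produces a file that ``decompress_file`` can open again. A small sketch writing to a temporary directory; the path handling is illustrative:

    import os
    import tempfile

    import pandas.util.testing as tm

    path = os.path.join(tempfile.mkdtemp(), 'data.csv.gz')
    tm.write_to_compressed('gzip', path, b'a,b\n1,2\n')

    with tm.decompress_file(path, compression='gzip') as f:
        print(f.read())   # b'a,b\n1,2\n'
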
+
+def assert_almost_equal(left, right, check_dtype="equiv",
+ check_less_precise=False, **kwargs):
+ """
+ Check that the left and right objects are approximately equal.
+
+    By approximately equal, we refer to objects that are numbers or that
+    contain numbers which may be equivalent at specific levels of precision.
+
+ Parameters
+ ----------
+ left : object
+ right : object
+ check_dtype : bool / string {'equiv'}, default 'equiv'
+ Check dtype if both a and b are the same type. If 'equiv' is passed in,
+ then `RangeIndex` and `Int64Index` are also considered equivalent
+ when doing type checking.
+ check_less_precise : bool or int, default False
+ Specify comparison precision. 5 digits (False) or 3 digits (True)
+ after decimal points are compared. If int, then specify the number
+ of digits to compare.
+
+ When comparing two numbers, if the first number has magnitude less
+ than 1e-5, we compare the two numbers directly and check whether
+ they are equivalent within the specified precision. Otherwise, we
+ compare the **ratio** of the second number to the first number and
+ check whether it is equivalent to 1 within the specified precision.
+ """
+
+ if isinstance(left, pd.Index):
+ return assert_index_equal(left, right,
+ check_exact=False,
+ exact=check_dtype,
+ check_less_precise=check_less_precise,
+ **kwargs)
+
+ elif isinstance(left, pd.Series):
+ return assert_series_equal(left, right,
+ check_exact=False,
+ check_dtype=check_dtype,
+ check_less_precise=check_less_precise,
+ **kwargs)
+
+ elif isinstance(left, pd.DataFrame):
+ return assert_frame_equal(left, right,
+ check_exact=False,
+ check_dtype=check_dtype,
+ check_less_precise=check_less_precise,
+ **kwargs)
+
+ else:
+ # Other sequences.
+ if check_dtype:
+ if is_number(left) and is_number(right):
+ # Do not compare numeric classes, like np.float64 and float.
+ pass
+ elif is_bool(left) and is_bool(right):
+ # Do not compare bool classes, like np.bool_ and bool.
+ pass
+ else:
+ if (isinstance(left, np.ndarray) or
+ isinstance(right, np.ndarray)):
+ obj = "numpy array"
+ else:
+ obj = "Input"
+ assert_class_equal(left, right, obj=obj)
+ return _testing.assert_almost_equal(
+ left, right,
+ check_dtype=check_dtype,
+ check_less_precise=check_less_precise,
+ **kwargs)
+
+
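+ # A minimal sketch (hypothetical helper): ``assert_almost_equal`` accepts
+ # scalars as well as pandas containers and dispatches accordingly. With the
+ # default precision (5 decimal digits), differences of ~1e-6 pass.
+ def _example_assert_almost_equal():
+     # scalars: compared via the ratio to the first number
+     assert_almost_equal(1.000001, 1.000002)
+     # Series: dispatches to assert_series_equal with check_exact=False
+     assert_almost_equal(pd.Series([0.300000, 0.5]),
+                         pd.Series([0.300001, 0.5]))
+
+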
+def _check_isinstance(left, right, cls):
+ """
+ Helper method for our assert_* methods that ensures that
+ the two objects being compared have the right type before
+ proceeding with the comparison.
+
+ Parameters
+ ----------
+ left : The first object being compared.
+ right : The second object being compared.
+ cls : The class type to check against.
+
+ Raises
+ ------
+ AssertionError : Either `left` or `right` is not an instance of `cls`.
+ """
+
+ err_msg = "{name} Expected type {exp_type}, found {act_type} instead"
+ cls_name = cls.__name__
+
+ if not isinstance(left, cls):
+ raise AssertionError(err_msg.format(name=cls_name, exp_type=cls,
+ act_type=type(left)))
+ if not isinstance(right, cls):
+ raise AssertionError(err_msg.format(name=cls_name, exp_type=cls,
+ act_type=type(right)))
+
+
+def assert_dict_equal(left, right, compare_keys=True):
+
+ _check_isinstance(left, right, dict)
+ return _testing.assert_dict_equal(left, right, compare_keys=compare_keys)
+
+
+def randbool(size=(), p=0.5):
+ return rand(*size) <= p
+
+
+RANDS_CHARS = np.array(list(string.ascii_letters + string.digits),
+ dtype=(np.str_, 1))
+RANDU_CHARS = np.array(list(u("").join(map(unichr, lrange(1488, 1488 + 26))) +
+ string.digits), dtype=(np.unicode_, 1))
+
+
+def rands_array(nchars, size, dtype='O'):
+ """Generate an array of byte strings."""
+ retval = (np.random.choice(RANDS_CHARS, size=nchars * np.prod(size))
+ .view((np.str_, nchars)).reshape(size))
+ if dtype is None:
+ return retval
+ else:
+ return retval.astype(dtype)
+
+
+def randu_array(nchars, size, dtype='O'):
+ """Generate an array of unicode strings."""
+ retval = (np.random.choice(RANDU_CHARS, size=nchars * np.prod(size))
+ .view((np.unicode_, nchars)).reshape(size))
+ if dtype is None:
+ return retval
+ else:
+ return retval.astype(dtype)
+
+
+def rands(nchars):
+ """
+ Generate one random byte string.
+
+ See `rands_array` if you want to create an array of random strings.
+
+ """
+ return ''.join(np.random.choice(RANDS_CHARS, nchars))
+
+
+def randu(nchars):
+ """
+ Generate one random unicode string.
+
+ See `randu_array` if you want to create an array of random unicode strings.
+
+ """
+ return ''.join(np.random.choice(RANDU_CHARS, nchars))
+
+
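+ # A minimal sketch (hypothetical helper): the random-string utilities above
+ # are typically used to build throw-away labels and index values in tests.
+ def _example_random_strings():
+     label = rands(10)                        # one 10-character string
+     labels = rands_array(nchars=4, size=5)   # object array of 5 strings
+     assert len(label) == 10
+     assert labels.shape == (5,)
+
+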
+def close(fignum=None):
+ from matplotlib.pyplot import get_fignums, close as _close
+
+ if fignum is None:
+ for fignum in get_fignums():
+ _close(fignum)
+ else:
+ _close(fignum)
+
+
+# -----------------------------------------------------------------------------
+# locale utilities
+
+
+def check_output(*popenargs, **kwargs):
+ # shamelessly taken from Python 2.7 source
+ r"""Run command with arguments and return its output as a byte string.
+
+ If the exit code was non-zero it raises a CalledProcessError. The
+ CalledProcessError object will have the return code in the returncode
+ attribute and output in the output attribute.
+
+ The arguments are the same as for the Popen constructor. Example:
+
+ >>> check_output(["ls", "-l", "/dev/null"])
+ 'crw-rw-rw- 1 root root 1, 3 Oct 18 2007 /dev/null\n'
+
+ The stdout argument is not allowed as it is used internally.
+ To capture standard error in the result, use stderr=STDOUT.
+
+ >>> check_output(["/bin/sh", "-c",
+ ... "ls -l non_existent_file ; exit 0"],
+ ... stderr=STDOUT)
+ 'ls: non_existent_file: No such file or directory\n'
+ """
+ if 'stdout' in kwargs:
+ raise ValueError('stdout argument not allowed, it will be overridden.')
+ process = subprocess.Popen(stdout=subprocess.PIPE, stderr=subprocess.PIPE,
+ *popenargs, **kwargs)
+ output, unused_err = process.communicate()
+ retcode = process.poll()
+ if retcode:
+ cmd = kwargs.get("args")
+ if cmd is None:
+ cmd = popenargs[0]
+ raise subprocess.CalledProcessError(retcode, cmd, output=output)
+ return output
+
+
+def _default_locale_getter():
+ try:
+ raw_locales = check_output(['locale -a'], shell=True)
+ except subprocess.CalledProcessError as e:
+ raise type(e)("{exception}, the 'locale -a' command cannot be found "
+ "on your system".format(exception=e))
+ return raw_locales
+
+
+def get_locales(prefix=None, normalize=True,
+ locale_getter=_default_locale_getter):
+ """Get all the locales that are available on the system.
+
+ Parameters
+ ----------
+ prefix : str
+ If not ``None`` then return only those locales with the prefix
+ provided. For example to get all English language locales (those that
+ start with ``"en"``), pass ``prefix="en"``.
+ normalize : bool
+ Call ``locale.normalize`` on the resulting list of available locales.
+ If ``True``, only locales that can be set without throwing an
+ ``Exception`` are returned.
+ locale_getter : callable
+ The function to use to retrieve the current locales. This should return
+ a string with each locale separated by a newline character.
+
+ Returns
+ -------
+ locales : list of strings
+ A list of locale strings that can be set with ``locale.setlocale()``.
+ For example::
+
+ locale.setlocale(locale.LC_ALL, locale_string)
+
+ On error will return None (no locale available, e.g. Windows)
+
+ """
+ try:
+ raw_locales = locale_getter()
+ except Exception:
+ return None
+
+ try:
+ # raw_locales is a "\n"-separated list of locales;
+ # it may contain non-decodable parts, so split first,
+ # extract what we can, and then rejoin.
+ raw_locales = raw_locales.split(b'\n')
+ out_locales = []
+ for x in raw_locales:
+ if PY3:
+ out_locales.append(str(
+ x, encoding=pd.options.display.encoding))
+ else:
+ out_locales.append(str(x))
+
+ except TypeError:
+ pass
+
+ if prefix is None:
+ return _valid_locales(out_locales, normalize)
+
+ pattern = re.compile('{prefix}.*'.format(prefix=prefix))
+ found = pattern.findall('\n'.join(out_locales))
+ return _valid_locales(found, normalize)
+
+
+@contextmanager
+def set_locale(new_locale, lc_var=locale.LC_ALL):
+ """Context manager for temporarily setting a locale.
+
+ Parameters
+ ----------
+ new_locale : str or tuple
+ A string of the form <language_country>.<encoding>. For example to set
+ the current locale to US English with a UTF8 encoding, you would pass
+ "en_US.UTF-8".
+ lc_var : int, default `locale.LC_ALL`
+ The category of the locale being set.
+
+ Notes
+ -----
+ This is useful when you want to run a particular block of code under a
+ particular locale, without globally setting the locale. This probably isn't
+ thread-safe.
+ """
+ current_locale = locale.getlocale()
+
+ try:
+ locale.setlocale(lc_var, new_locale)
+ normalized_locale = locale.getlocale()
+ if com._all_not_none(*normalized_locale):
+ yield '.'.join(normalized_locale)
+ else:
+ yield new_locale
+ finally:
+ locale.setlocale(lc_var, current_locale)
+
+
+def can_set_locale(lc, lc_var=locale.LC_ALL):
+ """
+ Check to see if we can set a locale, and subsequently get the locale,
+ without raising an Exception.
+
+ Parameters
+ ----------
+ lc : str
+ The locale to attempt to set.
+ lc_var : int, default `locale.LC_ALL`
+ The category of the locale being set.
+
+ Returns
+ -------
+ is_valid : bool
+ Whether the passed locale can be set
+ """
+
+ try:
+ with set_locale(lc, lc_var=lc_var):
+ pass
+ except (ValueError,
+ locale.Error): # horrible name for an Exception subclass
+ return False
+ else:
+ return True
+
+
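+ # A minimal sketch (hypothetical helper): locale availability is platform
+ # dependent, so real tests guard on ``get_locales``/``can_set_locale`` and
+ # fall back to a no-op when nothing suitable is installed.
+ def _example_locale_helpers():
+     for loc in get_locales(prefix='en') or []:
+         if can_set_locale(loc):
+             with set_locale(loc):
+                 # code here runs under ``loc``; the previous locale is
+                 # restored when the block exits
+                 pass
+             break
+
+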
+def _valid_locales(locales, normalize):
+ """Return a list of normalized locales that do not throw an ``Exception``
+ when set.
+
+ Parameters
+ ----------
+ locales : list of str
+ Each available locale as a separate string.
+ normalize : bool
+ Whether to call ``locale.normalize`` on each locale.
+
+ Returns
+ -------
+ valid_locales : list
+ A list of valid locales.
+ """
+ if normalize:
+ normalizer = lambda x: locale.normalize(x.strip())
+ else:
+ normalizer = lambda x: x.strip()
+
+ return list(filter(can_set_locale, map(normalizer, locales)))
+
+# -----------------------------------------------------------------------------
+# Stdout / stderr decorators
+
+
+@contextmanager
+def set_defaultencoding(encoding):
+ """
+ Set default encoding (as given by sys.getdefaultencoding()) to the given
+ encoding; restore on exit.
+
+ Parameters
+ ----------
+ encoding : str
+ """
+ if not PY2:
+ raise ValueError("set_defaultencoding context is only available "
+ "in Python 2.")
+ orig = sys.getdefaultencoding()
+ reload(sys) # noqa:F821
+ sys.setdefaultencoding(encoding)
+ try:
+ yield
+ finally:
+ sys.setdefaultencoding(orig)
+
+
+# -----------------------------------------------------------------------------
+# Console debugging tools
+
+
+def debug(f, *args, **kwargs):
+ from pdb import Pdb as OldPdb
+ try:
+ from IPython.core.debugger import Pdb
+ kw = dict(color_scheme='Linux')
+ except ImportError:
+ Pdb = OldPdb
+ kw = {}
+ pdb = Pdb(**kw)
+ return pdb.runcall(f, *args, **kwargs)
+
+
+def pudebug(f, *args, **kwargs):
+ import pudb
+ return pudb.runcall(f, *args, **kwargs)
+
+
+def set_trace():
+ from IPython.core.debugger import Pdb
+ try:
+ Pdb(color_scheme='Linux').set_trace(sys._getframe().f_back)
+ except Exception:
+ from pdb import Pdb as OldPdb
+ OldPdb().set_trace(sys._getframe().f_back)
+
+# -----------------------------------------------------------------------------
+# contextmanager to ensure the file cleanup
+
+
+@contextmanager
+def ensure_clean(filename=None, return_filelike=False):
+ """Gets a temporary path and agrees to remove on close.
+
+ Parameters
+ ----------
+ filename : str (optional)
+ If None, creates a temporary file which is then removed when out of
+ scope. If passed, creates a temporary file with ``filename`` as the ending.
+ return_filelike : bool (default False)
+ If True, returns a file-like object which is *always* cleaned up. Necessary
+ for savefig and other functions which want to append extensions.
+ """
+ filename = filename or ''
+ fd = None
+
+ if return_filelike:
+ f = tempfile.TemporaryFile(suffix=filename)
+ try:
+ yield f
+ finally:
+ f.close()
+ else:
+ # don't generate tempfile if using a path with directory specified
+ if len(os.path.dirname(filename)):
+ raise ValueError("Can't pass a qualified name to ensure_clean()")
+
+ try:
+ fd, filename = tempfile.mkstemp(suffix=filename)
+ except UnicodeEncodeError:
+ import pytest
+ pytest.skip('no unicode file names on this system')
+
+ try:
+ yield filename
+ finally:
+ try:
+ os.close(fd)
+ except Exception:
+ print("Couldn't close file descriptor: {fdesc} (file: {fname})"
+ .format(fdesc=fd, fname=filename))
+ try:
+ if os.path.exists(filename):
+ os.remove(filename)
+ except Exception as e:
+ print("Exception on removing file: {error}".format(error=e))
+
+
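+ # A minimal sketch (hypothetical helper): ``ensure_clean`` is the usual way
+ # in this module to obtain a throw-away file path for an IO round trip; the
+ # file is removed when the block exits.
+ def _example_ensure_clean():
+     df = pd.DataFrame({'a': [1, 2], 'b': [3.0, 4.0]})
+     with ensure_clean('__example__.csv') as path:
+         df.to_csv(path, index=False)
+         result = pd.read_csv(path)
+     assert_frame_equal(df, result)
+
+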
+@contextmanager
+def ensure_clean_dir():
+ """
+ Get a temporary directory path and remove it when the context exits.
+
+ Yields
+ ------
+ Temporary directory path
+ """
+ directory_name = tempfile.mkdtemp(suffix='')
+ try:
+ yield directory_name
+ finally:
+ try:
+ rmtree(directory_name)
+ except Exception:
+ pass
+
+
+@contextmanager
+def ensure_safe_environment_variables():
+ """
+ Get a context manager to safely set environment variables
+
+ All changes will be undone on close, hence environment variables set
+ within this contextmanager will neither persist nor change global state.
+ """
+ saved_environ = dict(os.environ)
+ try:
+ yield
+ finally:
+ os.environ.clear()
+ os.environ.update(saved_environ)
+
+
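+ # A minimal sketch (hypothetical helper and environment variable name,
+ # assuming the variable is not already set): mutate os.environ inside the
+ # context manager; the original environment is restored afterwards.
+ def _example_safe_environment():
+     with ensure_safe_environment_variables():
+         os.environ['PANDAS_EXAMPLE_FLAG'] = '1'
+         # code that depends on the variable runs here
+     assert 'PANDAS_EXAMPLE_FLAG' not in os.environ
+
+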
+# -----------------------------------------------------------------------------
+# Comparators
+
+
+def equalContents(arr1, arr2):
+ """Checks if the set of unique elements of arr1 and arr2 are equivalent.
+ """
+ return frozenset(arr1) == frozenset(arr2)
+
+
+def assert_index_equal(left, right, exact='equiv', check_names=True,
+ check_less_precise=False, check_exact=True,
+ check_categorical=True, obj='Index'):
+ """Check that left and right Index are equal.
+
+ Parameters
+ ----------
+ left : Index
+ right : Index
+ exact : bool / string {'equiv'}, default 'equiv'
+ Whether to check the Index class, dtype and inferred_type
+ are identical. If 'equiv', then RangeIndex can be substituted for
+ Int64Index as well.
+ check_names : bool, default True
+ Whether to check the names attribute.
+ check_less_precise : bool or int, default False
+ Specify comparison precision. Only used when check_exact is False.
+ 5 digits (False) or 3 digits (True) after decimal points are compared.
+ If int, then specify the digits to compare
+ check_exact : bool, default True
+ Whether to compare numbers exactly.
+ check_categorical : bool, default True
+ Whether to compare internal Categorical exactly.
+ obj : str, default 'Index'
+ Specify object name being compared, internally used to show appropriate
+ assertion message
+ """
+ __tracebackhide__ = True
+
+ def _check_types(l, r, obj='Index'):
+ if exact:
+ assert_class_equal(l, r, exact=exact, obj=obj)
+
+ # Skip exact dtype checking when `check_categorical` is False
+ if check_categorical:
+ assert_attr_equal('dtype', l, r, obj=obj)
+
+ # allow string-like to have different inferred_types
+ if l.inferred_type in ('string', 'unicode'):
+ assert r.inferred_type in ('string', 'unicode')
+ else:
+ assert_attr_equal('inferred_type', l, r, obj=obj)
+
+ def _get_ilevel_values(index, level):
+ # accept level number only
+ unique = index.levels[level]
+ labels = index.codes[level]
+ filled = take_1d(unique.values, labels, fill_value=unique._na_value)
+ values = unique._shallow_copy(filled, name=index.names[level])
+ return values
+
+ # instance validation
+ _check_isinstance(left, right, Index)
+
+ # class / dtype comparison
+ _check_types(left, right, obj=obj)
+
+ # level comparison
+ if left.nlevels != right.nlevels:
+ msg1 = '{obj} levels are different'.format(obj=obj)
+ msg2 = '{nlevels}, {left}'.format(nlevels=left.nlevels, left=left)
+ msg3 = '{nlevels}, {right}'.format(nlevels=right.nlevels, right=right)
+ raise_assert_detail(obj, msg1, msg2, msg3)
+
+ # length comparison
+ if len(left) != len(right):
+ msg1 = '{obj} length are different'.format(obj=obj)
+ msg2 = '{length}, {left}'.format(length=len(left), left=left)
+ msg3 = '{length}, {right}'.format(length=len(right), right=right)
+ raise_assert_detail(obj, msg1, msg2, msg3)
+
+ # MultiIndex special comparison for more friendly error messages
+ if left.nlevels > 1:
+ for level in range(left.nlevels):
+ # cannot use get_level_values here because it can change dtype
+ llevel = _get_ilevel_values(left, level)
+ rlevel = _get_ilevel_values(right, level)
+
+ lobj = 'MultiIndex level [{level}]'.format(level=level)
+ assert_index_equal(llevel, rlevel,
+ exact=exact, check_names=check_names,
+ check_less_precise=check_less_precise,
+ check_exact=check_exact, obj=lobj)
+ # get_level_values may change dtype
+ _check_types(left.levels[level], right.levels[level], obj=obj)
+
+ # skip exact index checking when `check_categorical` is False
+ if check_exact and check_categorical:
+ if not left.equals(right):
+ diff = np.sum((left.values != right.values)
+ .astype(int)) * 100.0 / len(left)
+ msg = '{obj} values are different ({pct} %)'.format(
+ obj=obj, pct=np.round(diff, 5))
+ raise_assert_detail(obj, msg, left, right)
+ else:
+ _testing.assert_almost_equal(left.values, right.values,
+ check_less_precise=check_less_precise,
+ check_dtype=exact,
+ obj=obj, lobj=left, robj=right)
+
+ # metadata comparison
+ if check_names:
+ assert_attr_equal('names', left, right, obj=obj)
+ if isinstance(left, pd.PeriodIndex) or isinstance(right, pd.PeriodIndex):
+ assert_attr_equal('freq', left, right, obj=obj)
+ if (isinstance(left, pd.IntervalIndex) or
+ isinstance(right, pd.IntervalIndex)):
+ assert_interval_array_equal(left.values, right.values)
+
+ if check_categorical:
+ if is_categorical_dtype(left) or is_categorical_dtype(right):
+ assert_categorical_equal(left.values, right.values,
+ obj='{obj} category'.format(obj=obj))
+
+
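+ # A minimal sketch (hypothetical helper): with the default exact='equiv' a
+ # RangeIndex and the equivalent Int64Index compare equal, while exact=True
+ # treats the class difference as a failure.
+ def _example_assert_index_equal():
+     left = pd.RangeIndex(3)
+     right = pd.Index([0, 1, 2])
+     assert_index_equal(left, right)            # passes under 'equiv'
+     try:
+         assert_index_equal(left, right, exact=True)
+     except AssertionError:
+         pass                                   # classes differ exactly
+
+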
+def assert_class_equal(left, right, exact=True, obj='Input'):
+ """checks classes are equal."""
+ __tracebackhide__ = True
+
+ def repr_class(x):
+ if isinstance(x, Index):
+ # return Index as it is to include values in the error message
+ return x
+
+ try:
+ return x.__class__.__name__
+ except AttributeError:
+ return repr(type(x))
+
+ if exact == 'equiv':
+ if type(left) != type(right):
+ # allow equivalence of Int64Index/RangeIndex
+ types = {type(left).__name__, type(right).__name__}
+ if len(types - {'Int64Index', 'RangeIndex'}):
+ msg = '{obj} classes are not equivalent'.format(obj=obj)
+ raise_assert_detail(obj, msg, repr_class(left),
+ repr_class(right))
+ elif exact:
+ if type(left) != type(right):
+ msg = '{obj} classes are different'.format(obj=obj)
+ raise_assert_detail(obj, msg, repr_class(left),
+ repr_class(right))
+
+
+def assert_attr_equal(attr, left, right, obj='Attributes'):
+ """checks attributes are equal. Both objects must have attribute.
+
+ Parameters
+ ----------
+ attr : str
+ Attribute name being compared.
+ left : object
+ right : object
+ obj : str, default 'Attributes'
+ Specify object name being compared, internally used to show appropriate
+ assertion message
+ """
+ __tracebackhide__ = True
+
+ left_attr = getattr(left, attr)
+ right_attr = getattr(right, attr)
+
+ if left_attr is right_attr:
+ return True
+ elif (is_number(left_attr) and np.isnan(left_attr) and
+ is_number(right_attr) and np.isnan(right_attr)):
+ # np.nan
+ return True
+
+ try:
+ result = left_attr == right_attr
+ except TypeError:
+ # datetimetz on rhs may raise TypeError
+ result = False
+ if not isinstance(result, bool):
+ result = result.all()
+
+ if result:
+ return True
+ else:
+ msg = 'Attribute "{attr}" are different'.format(attr=attr)
+ raise_assert_detail(obj, msg, left_attr, right_attr)
+
+
+def assert_is_valid_plot_return_object(objs):
+ import matplotlib.pyplot as plt
+ if isinstance(objs, (pd.Series, np.ndarray)):
+ for el in objs.ravel():
+ msg = ("one of 'objs' is not a matplotlib Axes instance, type "
+ "encountered {name!r}").format(name=el.__class__.__name__)
+ assert isinstance(el, (plt.Axes, dict)), msg
+ else:
+ assert isinstance(objs, (plt.Artist, tuple, dict)), (
+ 'objs is neither an ndarray of Artist instances nor a '
+ 'single Artist instance, tuple, or dict, "objs" is a {name!r}'
+ .format(name=objs.__class__.__name__))
+
+
+def isiterable(obj):
+ return hasattr(obj, '__iter__')
+
+
+def is_sorted(seq):
+ if isinstance(seq, (Index, Series)):
+ seq = seq.values
+ # sorting does not change precisions
+ return assert_numpy_array_equal(seq, np.sort(np.array(seq)))
+
+
+def assert_categorical_equal(left, right, check_dtype=True,
+ check_category_order=True, obj='Categorical'):
+ """Test that Categoricals are equivalent.
+
+ Parameters
+ ----------
+ left : Categorical
+ right : Categorical
+ check_dtype : bool, default True
+ Check that the integer dtype of the codes is the same
+ check_category_order : bool, default True
+ Whether the order of the categories should be compared, which
+ implies identical integer codes. If False, only the resulting
+ values are compared. The ordered attribute is
+ checked regardless.
+ obj : str, default 'Categorical'
+ Specify object name being compared, internally used to show appropriate
+ assertion message
+ """
+ _check_isinstance(left, right, Categorical)
+
+ if check_category_order:
+ assert_index_equal(left.categories, right.categories,
+ obj='{obj}.categories'.format(obj=obj))
+ assert_numpy_array_equal(left.codes, right.codes,
+ check_dtype=check_dtype,
+ obj='{obj}.codes'.format(obj=obj))
+ else:
+ assert_index_equal(left.categories.sort_values(),
+ right.categories.sort_values(),
+ obj='{obj}.categories'.format(obj=obj))
+ assert_index_equal(left.categories.take(left.codes),
+ right.categories.take(right.codes),
+ obj='{obj}.values'.format(obj=obj))
+
+ assert_attr_equal('ordered', left, right, obj=obj)
+
+
+def assert_interval_array_equal(left, right, exact='equiv',
+ obj='IntervalArray'):
+ """Test that two IntervalArrays are equivalent.
+
+ Parameters
+ ----------
+ left, right : IntervalArray
+ The IntervalArrays to compare.
+ exact : bool / string {'equiv'}, default 'equiv'
+ Whether to check the Index class, dtype and inferred_type
+ are identical. If 'equiv', then RangeIndex can be substituted for
+ Int64Index as well.
+ obj : str, default 'IntervalArray'
+ Specify object name being compared, internally used to show appropriate
+ assertion message
+ """
+ _check_isinstance(left, right, IntervalArray)
+
+ assert_index_equal(left.left, right.left, exact=exact,
+ obj='{obj}.left'.format(obj=obj))
+ assert_index_equal(left.right, right.right, exact=exact,
+ obj='{obj}.right'.format(obj=obj))
+ assert_attr_equal('closed', left, right, obj=obj)
+
+
+def assert_period_array_equal(left, right, obj='PeriodArray'):
+ _check_isinstance(left, right, PeriodArray)
+
+ assert_numpy_array_equal(left._data, right._data,
+ obj='{obj}.values'.format(obj=obj))
+ assert_attr_equal('freq', left, right, obj=obj)
+
+
+def assert_datetime_array_equal(left, right, obj='DatetimeArray'):
+ __tracebackhide__ = True
+ _check_isinstance(left, right, DatetimeArray)
+
+ assert_numpy_array_equal(left._data, right._data,
+ obj='{obj}._data'.format(obj=obj))
+ assert_attr_equal('freq', left, right, obj=obj)
+ assert_attr_equal('tz', left, right, obj=obj)
+
+
+def assert_timedelta_array_equal(left, right, obj='TimedeltaArray'):
+ __tracebackhide__ = True
+ _check_isinstance(left, right, TimedeltaArray)
+ assert_numpy_array_equal(left._data, right._data,
+ obj='{obj}._data'.format(obj=obj))
+ assert_attr_equal('freq', left, right, obj=obj)
+
+
+def raise_assert_detail(obj, message, left, right, diff=None):
+ __tracebackhide__ = True
+
+ if isinstance(left, np.ndarray):
+ left = pprint_thing(left)
+ elif is_categorical_dtype(left):
+ left = repr(left)
+
+ if PY2 and isinstance(left, string_types):
+ # left needs to be printable in native text type in python2
+ left = left.encode('utf-8')
+
+ if isinstance(right, np.ndarray):
+ right = pprint_thing(right)
+ elif is_categorical_dtype(right):
+ right = repr(right)
+
+ if PY2 and isinstance(right, string_types):
+ # right needs to be printable in native text type in python2
+ right = right.encode('utf-8')
+
+ msg = """{obj} are different
+
+{message}
+[left]: {left}
+[right]: {right}""".format(obj=obj, message=message, left=left, right=right)
+
+ if diff is not None:
+ msg += "\n[diff]: {diff}".format(diff=diff)
+
+ raise AssertionError(msg)
+
+
+def assert_numpy_array_equal(left, right, strict_nan=False,
+ check_dtype=True, err_msg=None,
+ check_same=None, obj='numpy array'):
+ """ Checks that 'np.ndarray' is equivalent
+
+ Parameters
+ ----------
+ left : np.ndarray or iterable
+ right : np.ndarray or iterable
+ strict_nan : bool, default False
+ If True, consider NaN and None to be different.
+ check_dtype : bool, default True
+ Check dtype if both left and right are np.ndarray.
+ err_msg : str, default None
+ If provided, used as assertion message
+ check_same : None|'copy'|'same', default None
+ Ensure left and right refer/do not refer to the same memory area
+ obj : str, default 'numpy array'
+ Specify object name being compared, internally used to show appropriate
+ assertion message
+ """
+ __tracebackhide__ = True
+
+ # instance validation
+ # Show a detailed error message when classes are different
+ assert_class_equal(left, right, obj=obj)
+ # both classes must be an np.ndarray
+ _check_isinstance(left, right, np.ndarray)
+
+ def _get_base(obj):
+ return obj.base if getattr(obj, 'base', None) is not None else obj
+
+ left_base = _get_base(left)
+ right_base = _get_base(right)
+
+ if check_same == 'same':
+ if left_base is not right_base:
+ msg = "{left!r} is not {right!r}".format(
+ left=left_base, right=right_base)
+ raise AssertionError(msg)
+ elif check_same == 'copy':
+ if left_base is right_base:
+ msg = "{left!r} is {right!r}".format(
+ left=left_base, right=right_base)
+ raise AssertionError(msg)
+
+ def _raise(left, right, err_msg):
+ if err_msg is None:
+ if left.shape != right.shape:
+ raise_assert_detail(obj, '{obj} shapes are different'
+ .format(obj=obj), left.shape, right.shape)
+
+ diff = 0
+ for l, r in zip(left, right):
+ # count up differences
+ if not array_equivalent(l, r, strict_nan=strict_nan):
+ diff += 1
+
+ diff = diff * 100.0 / left.size
+ msg = '{obj} values are different ({pct} %)'.format(
+ obj=obj, pct=np.round(diff, 5))
+ raise_assert_detail(obj, msg, left, right)
+
+ raise AssertionError(err_msg)
+
+ # compare shape and values
+ if not array_equivalent(left, right, strict_nan=strict_nan):
+ _raise(left, right, err_msg)
+
+ if check_dtype:
+ if isinstance(left, np.ndarray) and isinstance(right, np.ndarray):
+ assert_attr_equal('dtype', left, right, obj=obj)
+
+ return True
+
+
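+ # A minimal sketch (hypothetical helper): besides value equality,
+ # ``check_same`` asserts whether two arrays do or do not share memory.
+ def _example_assert_numpy_array_equal():
+     arr = np.array([1, 2, 3])
+     view = arr[:]
+     copy = arr.copy()
+     assert_numpy_array_equal(arr, view, check_same='same')
+     assert_numpy_array_equal(arr, copy, check_same='copy')
+
+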
+def assert_extension_array_equal(left, right, check_dtype=True,
+ check_less_precise=False,
+ check_exact=False):
+ """Check that left and right ExtensionArrays are equal.
+
+ Parameters
+ ----------
+ left, right : ExtensionArray
+ The two arrays to compare
+ check_dtype : bool, default True
+ Whether to check if the ExtensionArray dtypes are identical.
+ check_less_precise : bool or int, default False
+ Specify comparison precision. Only used when check_exact is False.
+ 5 digits (False) or 3 digits (True) after decimal points are compared.
+ If int, then specify the digits to compare.
+ check_exact : bool, default False
+ Whether to compare numbers exactly.
+
+ Notes
+ -----
+ Missing values are checked separately from valid values.
+ A mask of missing values is computed for each and checked to match.
+ The remaining all-valid values are cast to object dtype and checked.
+ """
+ assert isinstance(left, ExtensionArray), 'left is not an ExtensionArray'
+ assert isinstance(right, ExtensionArray), 'right is not an ExtensionArray'
+ if check_dtype:
+ assert_attr_equal('dtype', left, right, obj='ExtensionArray')
+
+ if hasattr(left, "asi8") and type(right) == type(left):
+ # Avoid slow object-dtype comparisons
+ assert_numpy_array_equal(left.asi8, right.asi8)
+ return
+
+ left_na = np.asarray(left.isna())
+ right_na = np.asarray(right.isna())
+ assert_numpy_array_equal(left_na, right_na, obj='ExtensionArray NA mask')
+
+ left_valid = np.asarray(left[~left_na].astype(object))
+ right_valid = np.asarray(right[~right_na].astype(object))
+ if check_exact:
+ assert_numpy_array_equal(left_valid, right_valid, obj='ExtensionArray')
+ else:
+ _testing.assert_almost_equal(left_valid, right_valid,
+ check_dtype=check_dtype,
+ check_less_precise=check_less_precise,
+ obj='ExtensionArray')
+
+
+# This could be refactored to use the NDFrame.equals method
+def assert_series_equal(left, right, check_dtype=True,
+ check_index_type='equiv',
+ check_series_type=True,
+ check_less_precise=False,
+ check_names=True,
+ check_exact=False,
+ check_datetimelike_compat=False,
+ check_categorical=True,
+ obj='Series'):
+ """Check that left and right Series are equal.
+
+ Parameters
+ ----------
+ left : Series
+ right : Series
+ check_dtype : bool, default True
+ Whether to check the Series dtype is identical.
+ check_index_type : bool / string {'equiv'}, default 'equiv'
+ Whether to check the Index class, dtype and inferred_type
+ are identical.
+ check_series_type : bool, default True
+ Whether to check the Series class is identical.
+ check_less_precise : bool or int, default False
+ Specify comparison precision. Only used when check_exact is False.
+ 5 digits (False) or 3 digits (True) after decimal points are compared.
+ If int, then specify the digits to compare.
+ check_names : bool, default True
+ Whether to check the Series and Index names attribute.
+ check_exact : bool, default False
+ Whether to compare numbers exactly.
+ check_datetimelike_compat : bool, default False
+ Compare datetime-like which is comparable ignoring dtype.
+ check_categorical : bool, default True
+ Whether to compare internal Categorical exactly.
+ obj : str, default 'Series'
+ Specify object name being compared, internally used to show appropriate
+ assertion message.
+ """
+ __tracebackhide__ = True
+
+ # instance validation
+ _check_isinstance(left, right, Series)
+
+ if check_series_type:
+ # TODO: some tests pass a sparse rhs and a dense lhs;
+ # should use assert_class_equal in the future
+ assert isinstance(left, type(right))
+ # assert_class_equal(left, right, obj=obj)
+
+ # length comparison
+ if len(left) != len(right):
+ msg1 = '{len}, {left}'.format(len=len(left), left=left.index)
+ msg2 = '{len}, {right}'.format(len=len(right), right=right.index)
+ raise_assert_detail(obj, 'Series length are different', msg1, msg2)
+
+ # index comparison
+ assert_index_equal(left.index, right.index, exact=check_index_type,
+ check_names=check_names,
+ check_less_precise=check_less_precise,
+ check_exact=check_exact,
+ check_categorical=check_categorical,
+ obj='{obj}.index'.format(obj=obj))
+
+ if check_dtype:
+ # We want to skip exact dtype checking when `check_categorical`
+ # is False. We'll still raise if only one is a `Categorical`,
+ # regardless of `check_categorical`
+ if (is_categorical_dtype(left) and is_categorical_dtype(right) and
+ not check_categorical):
+ pass
+ else:
+ assert_attr_equal('dtype', left, right)
+
+ if check_exact:
+ assert_numpy_array_equal(left.get_values(), right.get_values(),
+ check_dtype=check_dtype,
+ obj='{obj}'.format(obj=obj),)
+ elif check_datetimelike_compat:
+ # we want to check only if we have compat dtypes
+ # e.g. integer and M|m are NOT compat, but we can simply check
+ # the values in that case
+ if (is_datetimelike_v_numeric(left, right) or
+ is_datetimelike_v_object(left, right) or
+ needs_i8_conversion(left) or
+ needs_i8_conversion(right)):
+
+ # datetimelike may have different objects (e.g. datetime.datetime
+ # vs Timestamp) but will compare equal
+ if not Index(left.values).equals(Index(right.values)):
+ msg = ('[datetimelike_compat=True] {left} is not equal to '
+ '{right}.').format(left=left.values, right=right.values)
+ raise AssertionError(msg)
+ else:
+ assert_numpy_array_equal(left.get_values(), right.get_values(),
+ check_dtype=check_dtype)
+ elif is_interval_dtype(left) or is_interval_dtype(right):
+ assert_interval_array_equal(left.array, right.array)
+
+ elif (is_extension_array_dtype(left.dtype) and
+ is_datetime64tz_dtype(left.dtype)):
+ # .values is an ndarray, but ._values is the ExtensionArray.
+ # TODO: Use .array
+ assert is_extension_array_dtype(right.dtype)
+ return assert_extension_array_equal(left._values, right._values)
+
+ elif (is_extension_array_dtype(left) and not is_categorical_dtype(left) and
+ is_extension_array_dtype(right) and not is_categorical_dtype(right)):
+ return assert_extension_array_equal(left.array, right.array)
+
+ else:
+ _testing.assert_almost_equal(left.get_values(), right.get_values(),
+ check_less_precise=check_less_precise,
+ check_dtype=check_dtype,
+ obj='{obj}'.format(obj=obj))
+
+ # metadata comparison
+ if check_names:
+ assert_attr_equal('name', left, right, obj=obj)
+
+ if check_categorical:
+ if is_categorical_dtype(left) or is_categorical_dtype(right):
+ assert_categorical_equal(left.values, right.values,
+ obj='{obj} category'.format(obj=obj))
+
+
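+ # A minimal sketch (hypothetical helper): individual checks can be relaxed,
+ # e.g. ignoring the Series name while still comparing values and index.
+ def _example_assert_series_equal():
+     left = Series([0.1, 0.2, 0.3], name='x')
+     right = Series([0.1, 0.2, 0.3], name='y')
+     assert_series_equal(left, right, check_names=False)
+
+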
+# This could be refactored to use the NDFrame.equals method
+def assert_frame_equal(left, right, check_dtype=True,
+ check_index_type='equiv',
+ check_column_type='equiv',
+ check_frame_type=True,
+ check_less_precise=False,
+ check_names=True,
+ by_blocks=False,
+ check_exact=False,
+ check_datetimelike_compat=False,
+ check_categorical=True,
+ check_like=False,
+ obj='DataFrame'):
+ """
+ Check that left and right DataFrame are equal.
+
+ This function is intended to compare two DataFrames and output any
+ differences. It is mostly intended for use in unit tests.
+ Additional parameters allow varying the strictness of the
+ equality checks performed.
+
+ Parameters
+ ----------
+ left : DataFrame
+ First DataFrame to compare.
+ right : DataFrame
+ Second DataFrame to compare.
+ check_dtype : bool, default True
+ Whether to check the DataFrame dtype is identical.
+ check_index_type : bool / string {'equiv'}, default 'equiv'
+ Whether to check the Index class, dtype and inferred_type
+ are identical.
+ check_column_type : bool / string {'equiv'}, default 'equiv'
+ Whether to check the columns class, dtype and inferred_type
+ are identical. Is passed as the ``exact`` argument of
+ :func:`assert_index_equal`.
+ check_frame_type : bool, default True
+ Whether to check the DataFrame class is identical.
+ check_less_precise : bool or int, default False
+ Specify comparison precision. Only used when check_exact is False.
+ 5 digits (False) or 3 digits (True) after decimal points are compared.
+ If int, then specify the digits to compare.
+ check_names : bool, default True
+ Whether to check that the `names` attribute for both the `index`
+ and `column` attributes of the DataFrame is identical, i.e.
+
+ * left.index.names == right.index.names
+ * left.columns.names == right.columns.names
+ by_blocks : bool, default False
+ Specify how to compare internal data. If False, compare by columns.
+ If True, compare by blocks.
+ check_exact : bool, default False
+ Whether to compare numbers exactly.
+ check_datetimelike_compat : bool, default False
+ Compare datetime-like which is comparable ignoring dtype.
+ check_categorical : bool, default True
+ Whether to compare internal Categorical exactly.
+ check_like : bool, default False
+ If True, ignore the order of index & columns.
+ Note: index labels must match their respective rows
+ (same as in columns) - same labels must be with the same data.
+ obj : str, default 'DataFrame'
+ Specify object name being compared, internally used to show appropriate
+ assertion message.
+
+ See Also
+ --------
+ assert_series_equal : Equivalent method for asserting Series equality.
+ DataFrame.equals : Check DataFrame equality.
+
+ Examples
+ --------
+ This example shows comparing two DataFrames that are equal
+ but with columns of differing dtypes.
+
+ >>> from pandas.util.testing import assert_frame_equal
+ >>> df1 = pd.DataFrame({'a': [1, 2], 'b': [3, 4]})
+ >>> df2 = pd.DataFrame({'a': [1, 2], 'b': [3.0, 4.0]})
+
+ df1 equals itself.
+ >>> assert_frame_equal(df1, df1)
+
+ df1 differs from df2 as column 'b' is of a different type.
+ >>> assert_frame_equal(df1, df2)
+ Traceback (most recent call last):
+ AssertionError: Attributes are different
+
+ Attribute "dtype" are different
+ [left]: int64
+ [right]: float64
+
+ Ignore differing dtypes in columns with check_dtype.
+ >>> assert_frame_equal(df1, df2, check_dtype=False)
+ """
+ __tracebackhide__ = True
+
+ # instance validation
+ _check_isinstance(left, right, DataFrame)
+
+ if check_frame_type:
+ # TODO: some tests pass a SparseDataFrame rhs and a DataFrame lhs;
+ # should use assert_class_equal in the future
+ assert isinstance(left, type(right))
+ # assert_class_equal(left, right, obj=obj)
+
+ # shape comparison
+ if left.shape != right.shape:
+ raise_assert_detail(obj,
+ 'DataFrame shape mismatch',
+ '{shape!r}'.format(shape=left.shape),
+ '{shape!r}'.format(shape=right.shape))
+
+ if check_like:
+ left, right = left.reindex_like(right), right
+
+ # index comparison
+ assert_index_equal(left.index, right.index, exact=check_index_type,
+ check_names=check_names,
+ check_less_precise=check_less_precise,
+ check_exact=check_exact,
+ check_categorical=check_categorical,
+ obj='{obj}.index'.format(obj=obj))
+
+ # column comparison
+ assert_index_equal(left.columns, right.columns, exact=check_column_type,
+ check_names=check_names,
+ check_less_precise=check_less_precise,
+ check_exact=check_exact,
+ check_categorical=check_categorical,
+ obj='{obj}.columns'.format(obj=obj))
+
+ # compare by blocks
+ if by_blocks:
+ rblocks = right._to_dict_of_blocks()
+ lblocks = left._to_dict_of_blocks()
+ for dtype in list(set(list(lblocks.keys()) + list(rblocks.keys()))):
+ assert dtype in lblocks
+ assert dtype in rblocks
+ assert_frame_equal(lblocks[dtype], rblocks[dtype],
+ check_dtype=check_dtype, obj='DataFrame.blocks')
+
+ # compare by columns
+ else:
+ for i, col in enumerate(left.columns):
+ assert col in right
+ lcol = left.iloc[:, i]
+ rcol = right.iloc[:, i]
+ assert_series_equal(
+ lcol, rcol, check_dtype=check_dtype,
+ check_index_type=check_index_type,
+ check_less_precise=check_less_precise,
+ check_exact=check_exact, check_names=check_names,
+ check_datetimelike_compat=check_datetimelike_compat,
+ check_categorical=check_categorical,
+ obj='DataFrame.iloc[:, {idx}]'.format(idx=i))
+
+
+def assert_panel_equal(left, right,
+ check_dtype=True,
+ check_panel_type=False,
+ check_less_precise=False,
+ check_names=False,
+ by_blocks=False,
+ obj='Panel'):
+ """Check that left and right Panels are equal.
+
+ Parameters
+ ----------
+ left : Panel (or nd)
+ right : Panel (or nd)
+ check_dtype : bool, default True
+ Whether to check the Panel dtype is identical.
+ check_panel_type : bool, default False
+ Whether to check the Panel class is identical.
+ check_less_precise : bool or int, default False
+ Specify comparison precision. Only used when check_exact is False.
+ 5 digits (False) or 3 digits (True) after decimal points are compared.
+ If int, then specify the digits to compare
+ check_names : bool, default True
+ Whether to check the Index names attribute.
+ by_blocks : bool, default False
+ Specify how to compare internal data. If False, compare by columns.
+ If True, compare by blocks.
+ obj : str, default 'Panel'
+ Specify the object name being compared, internally used to show
+ the appropriate assertion message.
+ """
+
+ if check_panel_type:
+ assert_class_equal(left, right, obj=obj)
+
+ for axis in left._AXIS_ORDERS:
+ left_ind = getattr(left, axis)
+ right_ind = getattr(right, axis)
+ assert_index_equal(left_ind, right_ind, check_names=check_names)
+
+ if by_blocks:
+ rblocks = right._to_dict_of_blocks()
+ lblocks = left._to_dict_of_blocks()
+ for dtype in list(set(list(lblocks.keys()) + list(rblocks.keys()))):
+ assert dtype in lblocks
+ assert dtype in rblocks
+ array_equivalent(lblocks[dtype].values, rblocks[dtype].values)
+ else:
+
+ # can potentially be slow
+ for i, item in enumerate(left._get_axis(0)):
+ msg = "non-matching item (right) '{item}'".format(item=item)
+ assert item in right, msg
+ litem = left.iloc[i]
+ ritem = right.iloc[i]
+ assert_frame_equal(litem, ritem,
+ check_less_precise=check_less_precise,
+ check_names=check_names)
+
+ for i, item in enumerate(right._get_axis(0)):
+ msg = "non-matching item (left) '{item}'".format(item=item)
+ assert item in left, msg
+
+
+def assert_equal(left, right, **kwargs):
+ """
+ Wrapper for tm.assert_*_equal to dispatch to the appropriate test function.
+
+ Parameters
+ ----------
+ left : Index, Series, DataFrame, ExtensionArray, or np.ndarray
+ right : Index, Series, DataFrame, ExtensionArray, or np.ndarray
+ **kwargs
+ """
+ __tracebackhide__ = True
+
+ if isinstance(left, pd.Index):
+ assert_index_equal(left, right, **kwargs)
+ elif isinstance(left, pd.Series):
+ assert_series_equal(left, right, **kwargs)
+ elif isinstance(left, pd.DataFrame):
+ assert_frame_equal(left, right, **kwargs)
+ elif isinstance(left, IntervalArray):
+ assert_interval_array_equal(left, right, **kwargs)
+ elif isinstance(left, PeriodArray):
+ assert_period_array_equal(left, right, **kwargs)
+ elif isinstance(left, DatetimeArray):
+ assert_datetime_array_equal(left, right, **kwargs)
+ elif isinstance(left, TimedeltaArray):
+ assert_timedelta_array_equal(left, right, **kwargs)
+ elif isinstance(left, ExtensionArray):
+ assert_extension_array_equal(left, right, **kwargs)
+ elif isinstance(left, np.ndarray):
+ assert_numpy_array_equal(left, right, **kwargs)
+ else:
+ raise NotImplementedError(type(left))
+
+
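+ # A minimal sketch (hypothetical helper): ``assert_equal`` is convenient in
+ # parametrized tests where the expected object may be any supported type.
+ def _example_assert_equal_dispatch():
+     assert_equal(pd.Index([1, 2]), pd.Index([1, 2]))
+     assert_equal(np.array([1, 2]), np.array([1, 2]))
+     assert_equal(DataFrame({'a': [1]}), DataFrame({'a': [1]}))
+
+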
+def box_expected(expected, box_cls, transpose=True):
+ """
+ Helper function to wrap the expected output of a test in a given box_class.
+
+ Parameters
+ ----------
+ expected : np.ndarray, Index, Series
+ box_cls : {Index, Series, DataFrame}
+
+ Returns
+ -------
+ subclass of box_cls
+ """
+ if box_cls is pd.Index:
+ expected = pd.Index(expected)
+ elif box_cls is pd.Series:
+ expected = pd.Series(expected)
+ elif box_cls is pd.DataFrame:
+ expected = pd.Series(expected).to_frame()
+ if transpose:
+ # for vector operations, we need a DataFrame to be a single-row,
+ # not a single-column, in order to operate against non-DataFrame
+ # vectors of the same length.
+ expected = expected.T
+ elif box_cls is PeriodArray:
+ # the PeriodArray constructor is not as flexible as period_array
+ expected = period_array(expected)
+ elif box_cls is DatetimeArray:
+ expected = DatetimeArray(expected)
+ elif box_cls is TimedeltaArray:
+ expected = TimedeltaArray(expected)
+ elif box_cls is np.ndarray:
+ expected = np.array(expected)
+ elif box_cls is to_array:
+ expected = to_array(expected)
+ else:
+ raise NotImplementedError(box_cls)
+ return expected
+
+
+def to_array(obj):
+ # temporary implementation until we get pd.array in place
+ if is_period_dtype(obj):
+ return period_array(obj)
+ elif is_datetime64_dtype(obj) or is_datetime64tz_dtype(obj):
+ return DatetimeArray._from_sequence(obj)
+ elif is_timedelta64_dtype(obj):
+ return TimedeltaArray._from_sequence(obj)
+ else:
+ return np.array(obj)
+
+
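+ # A minimal sketch (hypothetical helper): ``box_expected`` wraps raw
+ # expected values in the container a test is parametrized over; note that
+ # the DataFrame box is transposed to a single row by default.
+ def _example_box_expected():
+     data = [1, 2, 3]
+     assert isinstance(box_expected(data, pd.Index), pd.Index)
+     assert isinstance(box_expected(data, pd.Series), pd.Series)
+     assert box_expected(data, pd.DataFrame).shape == (1, 3)
+
+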
+# -----------------------------------------------------------------------------
+# Sparse
+
+
+def assert_sp_array_equal(left, right, check_dtype=True, check_kind=True,
+ check_fill_value=True,
+ consolidate_block_indices=False):
+ """Check that the left and right SparseArray are equal.
+
+ Parameters
+ ----------
+ left : SparseArray
+ right : SparseArray
+ check_dtype : bool, default True
+ Whether to check the data dtype is identical.
+ check_kind : bool, default True
+ Whether to check just the kind of the sparse index for each column.
+ check_fill_value : bool, default True
+ Whether to check that left.fill_value matches right.fill_value
+ consolidate_block_indices : bool, default False
+ Whether to consolidate contiguous blocks for sparse arrays with
+ a BlockIndex. Some operations, e.g. concat, will end up with
+ block indices that could be consolidated. Setting this to true will
+ create a new BlockIndex for that array, with consolidated
+ block indices.
+ """
+
+ _check_isinstance(left, right, pd.SparseArray)
+
+ assert_numpy_array_equal(left.sp_values, right.sp_values,
+ check_dtype=check_dtype)
+
+ # SparseIndex comparison
+ assert isinstance(left.sp_index, pd._libs.sparse.SparseIndex)
+ assert isinstance(right.sp_index, pd._libs.sparse.SparseIndex)
+
+ if not check_kind:
+ left_index = left.sp_index.to_block_index()
+ right_index = right.sp_index.to_block_index()
+ else:
+ left_index = left.sp_index
+ right_index = right.sp_index
+
+ if consolidate_block_indices and left.kind == 'block':
+ # we'll probably remove this hack...
+ left_index = left_index.to_int_index().to_block_index()
+ right_index = right_index.to_int_index().to_block_index()
+
+ if not left_index.equals(right_index):
+ raise_assert_detail('SparseArray.index', 'index are not equal',
+ left_index, right_index)
+ else:
+ # indexes already compare equal; nothing further to check
+ pass
+
+ if check_fill_value:
+ assert_attr_equal('fill_value', left, right)
+ if check_dtype:
+ assert_attr_equal('dtype', left, right)
+ assert_numpy_array_equal(left.values, right.values,
+ check_dtype=check_dtype)
+
+
+def assert_sp_series_equal(left, right, check_dtype=True, exact_indices=True,
+ check_series_type=True, check_names=True,
+ check_kind=True,
+ check_fill_value=True,
+ consolidate_block_indices=False,
+ obj='SparseSeries'):
+ """Check that the left and right SparseSeries are equal.
+
+ Parameters
+ ----------
+ left : SparseSeries
+ right : SparseSeries
+ check_dtype : bool, default True
+ Whether to check the Series dtype is identical.
+ exact_indices : bool, default True
+ check_series_type : bool, default True
+ Whether to check the SparseSeries class is identical.
+ check_names : bool, default True
+ Whether to check the SparseSeries name attribute.
+ check_kind : bool, default True
+ Whether to check just the kind of the sparse index for each column.
+ check_fill_value : bool, default True
+ Whether to check that left.fill_value matches right.fill_value
+ consolidate_block_indices : bool, default False
+ Whether to consolidate contiguous blocks for sparse arrays with
+ a BlockIndex. Some operations, e.g. concat, will end up with
+ block indices that could be consolidated. Setting this to true will
+ create a new BlockIndex for that array, with consolidated
+ block indices.
+ obj : str, default 'SparseSeries'
+ Specify the object name being compared, internally used to show
+ the appropriate assertion message.
+ """
+ _check_isinstance(left, right, pd.SparseSeries)
+
+ if check_series_type:
+ assert_class_equal(left, right, obj=obj)
+
+ assert_index_equal(left.index, right.index,
+ obj='{obj}.index'.format(obj=obj))
+
+ assert_sp_array_equal(left.values, right.values,
+ check_kind=check_kind,
+ check_fill_value=check_fill_value,
+ consolidate_block_indices=consolidate_block_indices)
+
+ if check_names:
+ assert_attr_equal('name', left, right)
+ if check_dtype:
+ assert_attr_equal('dtype', left, right)
+
+ assert_numpy_array_equal(np.asarray(left.values),
+ np.asarray(right.values))
+
+
+def assert_sp_frame_equal(left, right, check_dtype=True, exact_indices=True,
+ check_frame_type=True, check_kind=True,
+ check_fill_value=True,
+ consolidate_block_indices=False,
+ obj='SparseDataFrame'):
+ """Check that the left and right SparseDataFrame are equal.
+
+ Parameters
+ ----------
+ left : SparseDataFrame
+ right : SparseDataFrame
+ check_dtype : bool, default True
+ Whether to check the Series dtype is identical.
+ exact_indices : bool, default True
+ SparseSeries SparseIndex objects must be exactly the same,
+ otherwise just compare dense representations.
+ check_frame_type : bool, default True
+ Whether to check the SparseDataFrame class is identical.
+ check_kind : bool, default True
+ Whether to check just the kind of the sparse index for each column.
+ check_fill_value : bool, default True
+ Whether to check that left.fill_value matches right.fill_value
+ consolidate_block_indices : bool, default False
+ Whether to consolidate contiguous blocks for sparse arrays with
+ a BlockIndex. Some operations, e.g. concat, will end up with
+ block indices that could be consolidated. Setting this to true will
+ create a new BlockIndex for that array, with consolidated
+ block indices.
+ obj : str, default 'SparseDataFrame'
+ Specify the object name being compared, internally used to show
+ the appropriate assertion message.
+ """
+ _check_isinstance(left, right, pd.SparseDataFrame)
+
+ if check_frame_type:
+ assert_class_equal(left, right, obj=obj)
+
+ assert_index_equal(left.index, right.index,
+ obj='{obj}.index'.format(obj=obj))
+ assert_index_equal(left.columns, right.columns,
+ obj='{obj}.columns'.format(obj=obj))
+
+ if check_fill_value:
+ assert_attr_equal('default_fill_value', left, right, obj=obj)
+
+ for col, series in compat.iteritems(left):
+ assert (col in right)
+ # trade-off?
+
+ if exact_indices:
+ assert_sp_series_equal(
+ series, right[col],
+ check_dtype=check_dtype,
+ check_kind=check_kind,
+ check_fill_value=check_fill_value,
+ consolidate_block_indices=consolidate_block_indices
+ )
+ else:
+ assert_series_equal(series.to_dense(), right[col].to_dense(),
+ check_dtype=check_dtype)
+
+ # do I care?
+ # assert(left.default_kind == right.default_kind)
+
+ for col in right:
+ assert (col in left)
+
+# -----------------------------------------------------------------------------
+# Others
+
+
+def assert_contains_all(iterable, dic):
+ for k in iterable:
+ assert k in dic, "Did not contain item: '{key!r}'".format(key=k)
+
+
+def assert_copy(iter1, iter2, **eql_kwargs):
+ """
+ iter1, iter2: iterables that produce elements
+ comparable with assert_almost_equal
+
+ Checks that the elements are equal, but not
+ the same object. (Does not check that items
+ in sequences are also not the same object)
+ """
+ for elem1, elem2 in zip(iter1, iter2):
+ assert_almost_equal(elem1, elem2, **eql_kwargs)
+ msg = ("Expected object {obj1!r} and object {obj2!r} to be "
+ "different objects, but they were the same object."
+ ).format(obj1=type(elem1), obj2=type(elem2))
+ assert elem1 is not elem2, msg
+
+
+def getCols(k):
+ return string.ascii_uppercase[:k]
+
+
+# make index
+def makeStringIndex(k=10, name=None):
+ return Index(rands_array(nchars=10, size=k), name=name)
+
+
+def makeUnicodeIndex(k=10, name=None):
+ return Index(randu_array(nchars=10, size=k), name=name)
+
+
+def makeCategoricalIndex(k=10, n=3, name=None, **kwargs):
+ """ make a length k index or n categories """
+ x = rands_array(nchars=4, size=n)
+ return CategoricalIndex(np.random.choice(x, k), name=name, **kwargs)
+
+
+def makeIntervalIndex(k=10, name=None, **kwargs):
+ """ make a length k IntervalIndex """
+ x = np.linspace(0, 100, num=(k + 1))
+ return IntervalIndex.from_breaks(x, name=name, **kwargs)
+
+
+def makeBoolIndex(k=10, name=None):
+ if k == 1:
+ return Index([True], name=name)
+ elif k == 2:
+ return Index([False, True], name=name)
+ return Index([False, True] + [False] * (k - 2), name=name)
+
+
+def makeIntIndex(k=10, name=None):
+ return Index(lrange(k), name=name)
+
+
+def makeUIntIndex(k=10, name=None):
+ return Index([2**63 + i for i in lrange(k)], name=name)
+
+
+def makeRangeIndex(k=10, name=None, **kwargs):
+ return RangeIndex(0, k, 1, name=name, **kwargs)
+
+
+def makeFloatIndex(k=10, name=None):
+ values = sorted(np.random.random_sample(k)) - np.random.random_sample(1)
+ return Index(values * (10 ** np.random.randint(0, 9)), name=name)
+
+
+def makeDateIndex(k=10, freq='B', name=None, **kwargs):
+ dt = datetime(2000, 1, 1)
+ dr = bdate_range(dt, periods=k, freq=freq, name=name)
+ return DatetimeIndex(dr, name=name, **kwargs)
+
+
+def makeTimedeltaIndex(k=10, freq='D', name=None, **kwargs):
+ return pd.timedelta_range(start='1 day', periods=k, freq=freq,
+ name=name, **kwargs)
+
+
+def makePeriodIndex(k=10, name=None, **kwargs):
+ dt = datetime(2000, 1, 1)
+ dr = pd.period_range(start=dt, periods=k, freq='B', name=name, **kwargs)
+ return dr
+
+
+def makeMultiIndex(k=10, names=None, **kwargs):
+ return MultiIndex.from_product(
+ (('foo', 'bar'), (1, 2)), names=names, **kwargs)
+
+
+def all_index_generator(k=10):
+ """Generator which can be iterated over to get instances of all the various
+ index classes.
+
+ Parameters
+ ----------
+ k: length of each of the index instances
+ """
+ all_make_index_funcs = [makeIntIndex, makeFloatIndex, makeStringIndex,
+ makeUnicodeIndex, makeDateIndex, makePeriodIndex,
+ makeTimedeltaIndex, makeBoolIndex, makeRangeIndex,
+ makeIntervalIndex,
+ makeCategoricalIndex]
+ for make_index_func in all_make_index_funcs:
+ yield make_index_func(k=k)
+
+
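+ # A minimal sketch (hypothetical helper): the generator above is meant to
+ # be consumed in loops or pytest parametrization to cover all index types.
+ def _example_all_index_generator():
+     for index in all_index_generator(k=5):
+         assert len(index) == 5
+
+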
+def index_subclass_makers_generator():
+ make_index_funcs = [
+ makeDateIndex, makePeriodIndex,
+ makeTimedeltaIndex, makeRangeIndex,
+ makeIntervalIndex, makeCategoricalIndex,
+ makeMultiIndex
+ ]
+ for make_index_func in make_index_funcs:
+ yield make_index_func
+
+
+def all_timeseries_index_generator(k=10):
+ """Generator which can be iterated over to get instances of all the classes
+ which represent time-series.
+
+ Parameters
+ ----------
+ k: length of each of the index instances
+ """
+ make_index_funcs = [makeDateIndex, makePeriodIndex, makeTimedeltaIndex]
+ for make_index_func in make_index_funcs:
+ yield make_index_func(k=k)
+
+
+# make series
+def makeFloatSeries(name=None):
+ index = makeStringIndex(N)
+ return Series(randn(N), index=index, name=name)
+
+
+def makeStringSeries(name=None):
+ index = makeStringIndex(N)
+ return Series(randn(N), index=index, name=name)
+
+
+def makeObjectSeries(name=None):
+ dateIndex = makeDateIndex(N)
+ dateIndex = Index(dateIndex, dtype=object)
+ index = makeStringIndex(N)
+ return Series(dateIndex, index=index, name=name)
+
+
+def getSeriesData():
+ index = makeStringIndex(N)
+ return {c: Series(randn(N), index=index) for c in getCols(K)}
+
+
+def makeTimeSeries(nper=None, freq='B', name=None):
+ if nper is None:
+ nper = N
+ return Series(randn(nper), index=makeDateIndex(nper, freq=freq), name=name)
+
+
+def makePeriodSeries(nper=None, name=None):
+ if nper is None:
+ nper = N
+ return Series(randn(nper), index=makePeriodIndex(nper), name=name)
+
+
+def getTimeSeriesData(nper=None, freq='B'):
+ return {c: makeTimeSeries(nper, freq) for c in getCols(K)}
+
+
+def getPeriodData(nper=None):
+ return {c: makePeriodSeries(nper) for c in getCols(K)}
+
+
+# make frame
+def makeTimeDataFrame(nper=None, freq='B'):
+ data = getTimeSeriesData(nper, freq)
+ return DataFrame(data)
+
+
+def makeDataFrame():
+ data = getSeriesData()
+ return DataFrame(data)
+
+
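+ # A minimal sketch (hypothetical helper): the frame makers return small
+ # random frames sized by the module-level N/K constants and are handy as
+ # quick fixtures.
+ def _example_frame_makers():
+     df = makeDataFrame()
+     ts_df = makeTimeDataFrame(nper=10)
+     assert isinstance(df, DataFrame)
+     assert len(ts_df) == 10
+
+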
+def getMixedTypeDict():
+ index = Index(['a', 'b', 'c', 'd', 'e'])
+
+ data = {
+ 'A': [0., 1., 2., 3., 4.],
+ 'B': [0., 1., 0., 1., 0.],
+ 'C': ['foo1', 'foo2', 'foo3', 'foo4', 'foo5'],
+ 'D': bdate_range('1/1/2009', periods=5)
+ }
+
+ return index, data
+
+
+def makeMixedDataFrame():
+ return DataFrame(getMixedTypeDict()[1])
+
+
+def makePeriodFrame(nper=None):
+ data = getPeriodData(nper)
+ return DataFrame(data)
+
+
+def makePanel(nper=None):
+ with warnings.catch_warnings(record=True):
+ warnings.filterwarnings("ignore", "\\nPanel", FutureWarning)
+ cols = ['Item' + c for c in string.ascii_uppercase[:K - 1]]
+ data = {c: makeTimeDataFrame(nper) for c in cols}
+ return Panel.fromDict(data)
+
+
+def makePeriodPanel(nper=None):
+ with warnings.catch_warnings(record=True):
+ warnings.filterwarnings("ignore", "\\nPanel", FutureWarning)
+ cols = ['Item' + c for c in string.ascii_uppercase[:K - 1]]
+ data = {c: makePeriodFrame(nper) for c in cols}
+ return Panel.fromDict(data)
+
+
+def makeCustomIndex(nentries, nlevels, prefix='#', names=False, ndupe_l=None,
+ idx_type=None):
+ """Create an index/multindex with given dimensions, levels, names, etc'
+
+ nentries - number of entries in index
+ nlevels - number of levels (> 1 produces multindex)
+ prefix - a string prefix for labels
+ names - (Optional), bool or list of strings. if True will use default
+ names, if false will use no names, if a list is given, the name of
+ each level in the index will be taken from the list.
+ ndupe_l - (Optional), list of ints, the number of rows for which the
+ label will be repeated at the corresponding level; you can specify just
+ the first few levels, the rest will use the default ndupe_l of 1.
+ len(ndupe_l) <= nlevels.
+ idx_type - "i"/"f"/"s"/"u"/"dt"/"p"/"td".
+ If idx_type is not None, `idx_nlevels` must be 1.
+ "i"/"f" creates an integer/float index,
+ "s"/"u" creates a string/unicode index
+ "dt" create a datetime index.
+ "td" create a datetime index.
+
+ if unspecified, string labels will be generated.
+ """
+
+ if ndupe_l is None:
+ ndupe_l = [1] * nlevels
+ assert (is_sequence(ndupe_l) and len(ndupe_l) <= nlevels)
+ assert (names is None or names is False or
+ names is True or len(names) is nlevels)
+ assert idx_type is None or (idx_type in ('i', 'f', 's', 'u',
+ 'dt', 'p', 'td')
+ and nlevels == 1)
+
+ if names is True:
+ # build default names
+ names = [prefix + str(i) for i in range(nlevels)]
+ if names is False:
+ # pass None to index constructor for no name
+ names = None
+
+ # make singleton case uniform
+ if isinstance(names, compat.string_types) and nlevels == 1:
+ names = [names]
+
+ # specific 1D index type requested?
+ idx_func = dict(i=makeIntIndex, f=makeFloatIndex,
+ s=makeStringIndex, u=makeUnicodeIndex,
+ dt=makeDateIndex, td=makeTimedeltaIndex,
+ p=makePeriodIndex).get(idx_type)
+ if idx_func:
+ idx = idx_func(nentries)
+ # but we need to fill in the name
+ if names:
+ idx.name = names[0]
+ return idx
+ elif idx_type is not None:
+ raise ValueError('"{idx_type}" is not a legal value for `idx_type`, '
+ 'use "i"/"f"/"s"/"u"/"dt"/"p"/"td".'
+ .format(idx_type=idx_type))
+
+ if len(ndupe_l) < nlevels:
+ ndupe_l.extend([1] * (nlevels - len(ndupe_l)))
+ assert len(ndupe_l) == nlevels
+
+ assert all(x > 0 for x in ndupe_l)
+
+ tuples = []
+ for i in range(nlevels):
+ def keyfunc(x):
+ import re
+ numeric_tuple = re.sub(r"[^\d_]_?", "", x).split("_")
+ return lmap(int, numeric_tuple)
+
+ # build a list of lists to create the index from
+ div_factor = nentries // ndupe_l[i] + 1
+ cnt = Counter()
+ for j in range(div_factor):
+ label = '{prefix}_l{i}_g{j}'.format(prefix=prefix, i=i, j=j)
+ cnt[label] = ndupe_l[i]
+ # cute Counter trick
+ result = list(sorted(cnt.elements(), key=keyfunc))[:nentries]
+ tuples.append(result)
+
+ tuples = lzip(*tuples)
+
+ # convert tuples to index
+ if nentries == 1:
+ # we have a single level of tuples, i.e. a regular Index
+ index = Index(tuples[0], name=names[0])
+ elif nlevels == 1:
+ name = None if names is None else names[0]
+ index = Index((x[0] for x in tuples), name=name)
+ else:
+ index = MultiIndex.from_tuples(tuples, names=names)
+ return index
+
+
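+# --- Illustrative usage (editor's sketch, not part of upstream pandas) ---
+# A hedged example of makeCustomIndex: a 6-entry, 2-level MultiIndex with
+# default level names built from the '#' prefix and the first level's
+# labels each repeated twice. Wrapped in a function so nothing runs at
+# import time.
+def _example_makeCustomIndex():
+ mi = makeCustomIndex(6, nlevels=2, names=True, ndupe_l=[2])
+ assert mi.nlevels == 2
+ assert len(mi) == 6
+ assert list(mi.names) == ['#0', '#1']
+ return mi
+
+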
+def makeCustomDataframe(nrows, ncols, c_idx_names=True, r_idx_names=True,
+ c_idx_nlevels=1, r_idx_nlevels=1, data_gen_f=None,
+ c_ndupe_l=None, r_ndupe_l=None, dtype=None,
+ c_idx_type=None, r_idx_type=None):
+ """
+ nrows, ncols - number of data rows/cols
+ c_idx_names, r_idx_names - False/True/list of strings, yields no names,
+ default names, or the provided names for the levels of the
+ corresponding index. You can provide a single string when
+ c_idx_nlevels == 1.
+ c_idx_nlevels - number of levels in columns index. > 1 will yield MultiIndex
+ r_idx_nlevels - number of levels in rows index. > 1 will yield MultiIndex
+ data_gen_f - a function f(row,col) which returns the data value
+ at that position, the default generator used yields values of the form
+ "RxCy" based on position.
+ c_ndupe_l, r_ndupe_l - list of integers, determines the number
+ of duplicates for each label at a given level of the corresponding
+ index. The default `None` value produces a multiplicity of 1 across
+ all levels, i.e. a unique index. Will accept a partial list of length
+ N < idx_nlevels, for just the first N levels. If ndupe doesn't divide
+ nrows/ncol, the last label might have lower multiplicity.
+ dtype - passed to the DataFrame constructor as is, in case you wish to
+ have more control in conjunction with a custom `data_gen_f`
+ r_idx_type, c_idx_type - "i"/"f"/"s"/"u"/"dt"/"p"/"td".
+ If idx_type is not None, `idx_nlevels` must be 1.
+ "i"/"f" creates an integer/float index,
+ "s"/"u" creates a string/unicode index
+ "dt" creates a datetime index,
+ "p" creates a period index,
+ "td" creates a timedelta index.
+
+ if unspecified, string labels will be generated.
+
+ Examples:
+
+ # 5 rows, 3 columns, default names on both, single index on both axes
+ >> makeCustomDataframe(5,3)
+
+ # make the data a random int between 1 and 100
+ >> mkdf(5,3,data_gen_f=lambda r,c:randint(1,100))
+
+ # 2-level multiindex on rows with each label duplicated
+ # twice on the first level, default names on both axes,
+ # single index on columns
+ >> a=makeCustomDataframe(5,3,r_idx_nlevels=2,r_ndupe_l=[2])
+
+ # DatetimeIndex on row, index with unicode labels on columns
+ # no names on either axis
+ >> a=makeCustomDataframe(5,3,c_idx_names=False,r_idx_names=False,
+ r_idx_type="dt",c_idx_type="u")
+
+ # 4-level multiindex on rows with names provided, 2-level multiindex
+ # on columns with default labels and default names.
+ >> a=makeCustomDataframe(5,3,r_idx_nlevels=4,
+ r_idx_names=["FEE","FI","FO","FAM"],
+ c_idx_nlevels=2)
+
+ >> a=mkdf(5,3,r_idx_nlevels=2,c_idx_nlevels=4)
+ """
+
+ assert c_idx_nlevels > 0
+ assert r_idx_nlevels > 0
+ assert r_idx_type is None or (r_idx_type in ('i', 'f', 's',
+ 'u', 'dt', 'p', 'td')
+ and r_idx_nlevels == 1)
+ assert c_idx_type is None or (c_idx_type in ('i', 'f', 's',
+ 'u', 'dt', 'p', 'td')
+ and c_idx_nlevels == 1)
+
+ columns = makeCustomIndex(ncols, nlevels=c_idx_nlevels, prefix='C',
+ names=c_idx_names, ndupe_l=c_ndupe_l,
+ idx_type=c_idx_type)
+ index = makeCustomIndex(nrows, nlevels=r_idx_nlevels, prefix='R',
+ names=r_idx_names, ndupe_l=r_ndupe_l,
+ idx_type=r_idx_type)
+
+ # by default, generate data based on location
+ if data_gen_f is None:
+ data_gen_f = lambda r, c: "R{rows}C{cols}".format(rows=r, cols=c)
+
+ data = [[data_gen_f(r, c) for c in range(ncols)] for r in range(nrows)]
+
+ return DataFrame(data, index, columns, dtype=dtype)
+
+
+def _create_missing_idx(nrows, ncols, density, random_state=None):
+ if random_state is None:
+ random_state = np.random
+ else:
+ random_state = np.random.RandomState(random_state)
+
+ # below is cribbed from scipy.sparse
+ size = int(np.round((1 - density) * nrows * ncols))
+ # generate a few more to ensure unique values
+ min_rows = 5
+ fac = 1.02
+ extra_size = min(size + min_rows, fac * size)
+
+ def _gen_unique_rand(rng, _extra_size):
+ ind = rng.rand(int(_extra_size))
+ return np.unique(np.floor(ind * nrows * ncols))[:size]
+
+ ind = _gen_unique_rand(random_state, extra_size)
+ while ind.size < size:
+ extra_size *= 1.05
+ ind = _gen_unique_rand(random_state, extra_size)
+
+ j = np.floor(ind * 1. / nrows).astype(int)
+ i = (ind - j * nrows).astype(int)
+ return i.tolist(), j.tolist()
+
+
+def makeMissingCustomDataframe(nrows, ncols, density=.9, random_state=None,
+ c_idx_names=True, r_idx_names=True,
+ c_idx_nlevels=1, r_idx_nlevels=1,
+ data_gen_f=None,
+ c_ndupe_l=None, r_ndupe_l=None, dtype=None,
+ c_idx_type=None, r_idx_type=None):
+ """
+ Parameters
+ ----------
+ density : float, optional
+ Float in (0, 1) giving the fraction of entries in the DataFrame
+ that are non-missing.
+ random_state : {np.random.RandomState, int}, optional
+ Random number generator or random seed.
+
+ See makeCustomDataframe for descriptions of the rest of the parameters.
+ """
+ df = makeCustomDataframe(nrows, ncols, c_idx_names=c_idx_names,
+ r_idx_names=r_idx_names,
+ c_idx_nlevels=c_idx_nlevels,
+ r_idx_nlevels=r_idx_nlevels,
+ data_gen_f=data_gen_f,
+ c_ndupe_l=c_ndupe_l, r_ndupe_l=r_ndupe_l,
+ dtype=dtype, c_idx_type=c_idx_type,
+ r_idx_type=r_idx_type)
+
+ i, j = _create_missing_idx(nrows, ncols, density, random_state)
+ df.values[i, j] = np.nan
+ return df
+
+
+def makeMissingDataframe(density=.9, random_state=None):
+ df = makeDataFrame()
+ i, j = _create_missing_idx(*df.shape, density=density,
+ random_state=random_state)
+ df.values[i, j] = np.nan
+ return df
+
+
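+# --- Illustrative usage (editor's sketch, not part of upstream pandas) ---
+# A hedged example of the missing-data helpers: with density=0.8 roughly
+# 20% of the cells are replaced by NaN, reproducibly when random_state is
+# given. Wrapped in a function so nothing runs at import time.
+def _example_missing_data_helpers():
+ df = makeMissingDataframe(density=0.8, random_state=42)
+ custom = makeMissingCustomDataframe(10, 4, density=0.8, random_state=42)
+ assert df.shape == (N, K)
+ assert custom.shape == (10, 4)
+ return df, custom
+
+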
+def add_nans(panel):
+ I, J, N = panel.shape
+ for i, item in enumerate(panel.items):
+ dm = panel[item]
+ for j, col in enumerate(dm.columns):
+ dm[col][:i + j] = np.NaN
+ return panel
+
+
+class TestSubDict(dict):
+
+ def __init__(self, *args, **kwargs):
+ dict.__init__(self, *args, **kwargs)
+
+
+def optional_args(decorator):
+ """allows a decorator to take optional positional and keyword arguments.
+ Assumes that taking a single, callable, positional argument means that
+ it is decorating a function, i.e. something like this::
+
+ @my_decorator
+ def function(): pass
+
+ Calls decorator with decorator(f, *args, **kwargs)"""
+
+ @wraps(decorator)
+ def wrapper(*args, **kwargs):
+ def dec(f):
+ return decorator(f, *args, **kwargs)
+
+ is_decorating = not kwargs and len(args) == 1 and callable(args[0])
+ if is_decorating:
+ f = args[0]
+ args = []
+ return dec(f)
+ else:
+ return dec
+
+ return wrapper
+
+
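+# --- Illustrative usage (editor's sketch, not part of upstream pandas) ---
+# A hedged example of optional_args: the hypothetical _example_tag decorator
+# below can be applied either bare (@_example_tag) or with keyword arguments
+# (@_example_tag(label='slow')), because optional_args dispatches on whether
+# it was handed the decorated function directly.
+@optional_args
+def _example_tag(f, label='default'):
+ f._example_label = label
+ return f
+
+
+@_example_tag
+def _example_tagged_bare():
+ return None
+
+
+@_example_tag(label='slow')
+def _example_tagged_with_kwargs():
+ return None
+
+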
+# skip tests on exceptions with this message
+_network_error_messages = (
+ # 'urlopen error timed out',
+ # 'timeout: timed out',
+ # 'socket.timeout: timed out',
+ 'timed out',
+ 'Server Hangup',
+ 'HTTP Error 503: Service Unavailable',
+ '502: Proxy Error',
+ 'HTTP Error 502: internal error',
+ 'HTTP Error 502',
+ 'HTTP Error 503',
+ 'HTTP Error 403',
+ 'HTTP Error 400',
+ 'Temporary failure in name resolution',
+ 'Name or service not known',
+ 'Connection refused',
+ 'certificate verify',
+)
+
+# or this e.errno/e.reason.errno
+_network_errno_vals = (
+ 101, # Network is unreachable
+ 111, # Connection refused
+ 110, # Connection timed out
+ 104, # Connection reset Error
+ 54, # Connection reset by peer
+ 60, # urllib.error.URLError: [Errno 60] Connection timed out
+)
+
+# Both of the above shouldn't mask real issues such as 404s
+# or refused connections (changed DNS).
+# But some tests (test_data yahoo) contact incredibly flaky
+# servers.
+
+# and conditionally raise on these exception types
+_network_error_classes = (IOError, httplib.HTTPException)
+
+if PY3:
+ _network_error_classes += (TimeoutError,) # noqa
+
+
+def can_connect(url, error_classes=_network_error_classes):
+ """Try to connect to the given url. True if succeeds, False if IOError
+ raised
+
+ Parameters
+ ----------
+ url : basestring
+ The URL to try to connect to
+
+ Returns
+ -------
+ connectable : bool
+ Return True if no IOError (unable to connect) or URLError (bad url) was
+ raised
+ """
+ try:
+ with urlopen(url):
+ pass
+ except error_classes:
+ return False
+ else:
+ return True
+
+
+@optional_args
+def network(t, url="http://www.google.com",
+ raise_on_error=_RAISE_NETWORK_ERROR_DEFAULT,
+ check_before_test=False,
+ error_classes=_network_error_classes,
+ skip_errnos=_network_errno_vals,
+ _skip_on_messages=_network_error_messages,
+ ):
+ """
+ Label a test as requiring network connection and, if an error is
+ encountered, only raise if it does not find a network connection.
+
+ This decorator adds a contract to your test: you must assert that,
+ under normal conditions, your test will ONLY fail if it does not have
+ network connectivity.
+
+ You can call this in 3 ways: as a standard decorator, with keyword
+ arguments, or with a positional argument that is the url to check.
+
+ Parameters
+ ----------
+ t : callable
+ The test requiring network connectivity.
+ url : path
+ The url to test via ``pandas.io.common.urlopen`` to check
+ for connectivity. Defaults to 'http://www.google.com'.
+ raise_on_error : bool
+ If True, never catches errors.
+ check_before_test : bool
+ If True, checks connectivity before running the test case.
+ error_classes : tuple or Exception
+ Error classes to ignore. If not in ``error_classes``, raises the error.
+ Defaults to IOError. Be careful about changing the error classes here.
+ skip_errnos : iterable of int
+ Any exception that has .errno or .reason.errno set to one
+ of these values will be skipped with an appropriate
+ message.
+ _skip_on_messages: iterable of string
+ any exception e for which one of the strings is
+ a substring of str(e) will be skipped with an appropriate
+ message. Intended to suppress errors where an errno isn't available.
+
+ Notes
+ -----
+ * ``raise_on_error`` supersedes ``check_before_test``
+
+ Returns
+ -------
+ t : callable
+ The decorated test ``t``, with checks for connectivity errors.
+
+ Example
+ -------
+
+ Tests decorated with @network re-raise network errors only when it is
+ possible to connect to the check URL (defaults to google.com); otherwise
+ the error is attributed to missing connectivity and the test is skipped::
+
+ >>> from pandas.util.testing import network
+ >>> from pandas.io.common import urlopen
+ >>> @network
+ ... def test_network():
+ ... with urlopen("rabbit://bonanza.com"):
+ ... pass
+ Traceback
+ ...
+ URLError: <urlopen error unknown url type: rabbit>
+
+ You can specify alternative URLs::
+
+ >>> @network("http://www.yahoo.com")
+ ... def test_something_with_yahoo():
+ ... raise IOError("Failure Message")
+ >>> test_something_with_yahoo()
+ Traceback (most recent call last):
+ ...
+ IOError: Failure Message
+
+ If you set check_before_test, it will check the url first and not run the
+ test on failure::
+
+ >>> @network("failing://url.blaher", check_before_test=True)
+ ... def test_something():
+ ... print("I ran!")
+ ... raise ValueError("Failure")
+ >>> test_something()
+ Traceback (most recent call last):
+ ...
+
+ Errors not related to networking will always be raised.
+ """
+ from pytest import skip
+ t.network = True
+
+ @compat.wraps(t)
+ def wrapper(*args, **kwargs):
+ if check_before_test and not raise_on_error:
+ if not can_connect(url, error_classes):
+ skip()
+ try:
+ return t(*args, **kwargs)
+ except Exception as e:
+ errno = getattr(e, 'errno', None)
+ if not errno and hasattr(e, "reason"):
+ errno = getattr(e.reason, 'errno', None)
+
+ if errno in skip_errnos:
+ skip("Skipping test due to known errno"
+ " and error {error}".format(error=e))
+
+ try:
+ e_str = traceback.format_exc(e)
+ except Exception:
+ e_str = str(e)
+
+ if any(m.lower() in e_str.lower() for m in _skip_on_messages):
+ skip("Skipping test because exception "
+ "message is known and error {error}".format(error=e))
+
+ if not isinstance(e, error_classes):
+ raise
+
+ if raise_on_error or can_connect(url, error_classes):
+ raise
+ else:
+ skip("Skipping test due to lack of connectivity"
+ " and error {error}".format(error=e))
+
+ return wrapper
+
+
+with_connectivity_check = network
+
+
+def assert_raises_regex(_exception, _regexp, _callable=None,
+ *args, **kwargs):
+ r"""
+ Check that the specified Exception is raised and that the error message
+ matches a given regular expression pattern. This may be a regular
+ expression object or a string containing a regular expression suitable
+ for use by `re.search()`. This is a port of the `assertRaisesRegexp`
+ function from unittest in Python 2.7.
+
+ .. deprecated:: 0.24.0
+ Use `pytest.raises` instead.
+
+ Examples
+ --------
+ >>> assert_raises_regex(ValueError, 'invalid literal for.*XYZ', int, 'XYZ')
+ >>> import re
+ >>> assert_raises_regex(ValueError, re.compile('literal'), int, 'XYZ')
+
+ If an exception of a different type is raised, it bubbles up.
+
+ >>> assert_raises_regex(TypeError, 'literal', int, 'XYZ')
+ Traceback (most recent call last):
+ ...
+ ValueError: invalid literal for int() with base 10: 'XYZ'
+ >>> dct = dict()
+ >>> assert_raises_regex(KeyError, 'pear', dct.__getitem__, 'apple')
+ Traceback (most recent call last):
+ ...
+ AssertionError: "pear" does not match "'apple'"
+
+ You can also use this in a with statement.
+
+ >>> with assert_raises_regex(TypeError, r'unsupported operand type\(s\)'):
+ ... 1 + {}
+ >>> with assert_raises_regex(TypeError, 'banana'):
+ ... 'apple'[0] = 'b'
+ Traceback (most recent call last):
+ ...
+ AssertionError: "banana" does not match "'str' object does not support \
+item assignment"
+ """
+ warnings.warn(("assert_raises_regex has been deprecated and will "
+ "be removed in the next release. Please use "
+ "`pytest.raises` instead."), FutureWarning, stacklevel=2)
+
+ manager = _AssertRaisesContextmanager(exception=_exception, regexp=_regexp)
+ if _callable is not None:
+ with manager:
+ _callable(*args, **kwargs)
+ else:
+ return manager
+
+
+class _AssertRaisesContextmanager(object):
+ """
+ Context manager behind `assert_raises_regex`.
+ """
+
+ def __init__(self, exception, regexp=None):
+ """
+ Initialize an _AssertRaisesContextmanager instance.
+
+ Parameters
+ ----------
+ exception : class
+ The expected Exception class.
+ regexp : str, default None
+ The regex to compare against the Exception message.
+ """
+
+ self.exception = exception
+
+ if regexp is not None and not hasattr(regexp, "search"):
+ regexp = re.compile(regexp, re.DOTALL)
+
+ self.regexp = regexp
+
+ def __enter__(self):
+ return self
+
+ def __exit__(self, exc_type, exc_value, trace_back):
+ expected = self.exception
+
+ if not exc_type:
+ exp_name = getattr(expected, "__name__", str(expected))
+ raise AssertionError("{name} not raised.".format(name=exp_name))
+
+ return self.exception_matches(exc_type, exc_value, trace_back)
+
+ def exception_matches(self, exc_type, exc_value, trace_back):
+ """
+ Check that the Exception raised matches the expected Exception
+ and expected error message regular expression.
+
+ Parameters
+ ----------
+ exc_type : class
+ The type of Exception raised.
+ exc_value : Exception
+ The instance of `exc_type` raised.
+ trace_back : stack trace object
+ The traceback object associated with `exc_value`.
+
+ Returns
+ -------
+ is_matched : bool
+ Whether or not the Exception raised matches the expected
+ Exception class and expected error message regular expression.
+
+ Raises
+ ------
+ AssertionError : The error message provided does not match
+ the expected error message regular expression.
+ """
+
+ if issubclass(exc_type, self.exception):
+ if self.regexp is not None:
+ val = str(exc_value)
+
+ if not self.regexp.search(val):
+ msg = '"{pat}" does not match "{val}"'.format(
+ pat=self.regexp.pattern, val=val)
+ e = AssertionError(msg)
+ raise_with_traceback(e, trace_back)
+
+ return True
+ else:
+ # Failed, so allow Exception to bubble up.
+ return False
+
+
+@contextmanager
+def assert_produces_warning(expected_warning=Warning, filter_level="always",
+ clear=None, check_stacklevel=True):
+ """
+ Context manager for running code expected to either raise a specific
+ warning, or not raise any warnings. Verifies that the code raises the
+ expected warning, and that it does not raise any other unexpected
+ warnings. It is basically a wrapper around ``warnings.catch_warnings``.
+
+ Parameters
+ ----------
+ expected_warning : {Warning, False, None}, default Warning
+ The type of warning expected to be raised. ``Warning`` is the base
+ class for all warnings. To check that no warning is raised,
+ specify ``False`` or ``None``.
+ filter_level : str, default "always"
+ Specifies whether warnings are ignored, displayed, or turned
+ into errors.
+ Valid values are:
+
+ * "error" - turns matching warnings into exceptions
+ * "ignore" - discard the warning
+ * "always" - always emit a warning
+ * "default" - print the warning the first time it is generated
+ from each location
+ * "module" - print the warning the first time it is generated
+ from each module
+ * "once" - print the warning the first time it is generated
+
+ clear : str, default None
+ If not ``None`` then remove any previously raised warnings from
+ the ``__warningsregistry__`` to ensure that no warning messages are
+ suppressed by this context manager. If ``None`` is specified,
+ the ``__warningsregistry__`` keeps track of which warnings have been
+ shown, and does not show them again.
+ check_stacklevel : bool, default True
+ If True, displays the line that called the function containing
+ the warning to show where the function is called. Otherwise, the
+ line that implements the function is displayed.
+
+ Examples
+ --------
+ >>> import warnings
+ >>> with assert_produces_warning():
+ ... warnings.warn(UserWarning())
+ ...
+ >>> with assert_produces_warning(False):
+ ... warnings.warn(RuntimeWarning())
+ ...
+ Traceback (most recent call last):
+ ...
+ AssertionError: Caused unexpected warning(s): ['RuntimeWarning'].
+ >>> with assert_produces_warning(UserWarning):
+ ... warnings.warn(RuntimeWarning())
+ Traceback (most recent call last):
+ ...
+ AssertionError: Did not see expected warning of class 'UserWarning'.
+
+ .. warning:: This is *not* thread-safe.
+ """
+ __tracebackhide__ = True
+
+ with warnings.catch_warnings(record=True) as w:
+
+ if clear is not None:
+ # make sure that we are clearing these warnings
+ # if they have happened before
+ # to guarantee that we will catch them
+ if not is_list_like(clear):
+ clear = [clear]
+ for m in clear:
+ try:
+ m.__warningregistry__.clear()
+ except Exception:
+ pass
+
+ saw_warning = False
+ warnings.simplefilter(filter_level)
+ yield w
+ extra_warnings = []
+
+ for actual_warning in w:
+ if (expected_warning and issubclass(actual_warning.category,
+ expected_warning)):
+ saw_warning = True
+
+ if check_stacklevel and issubclass(actual_warning.category,
+ (FutureWarning,
+ DeprecationWarning)):
+ from inspect import getframeinfo, stack
+ caller = getframeinfo(stack()[2][0])
+ msg = ("Warning not set with correct stacklevel. "
+ "File where warning is raised: {actual} != "
+ "{caller}. Warning message: {message}"
+ ).format(actual=actual_warning.filename,
+ caller=caller.filename,
+ message=actual_warning.message)
+ assert actual_warning.filename == caller.filename, msg
+ else:
+ extra_warnings.append((actual_warning.category.__name__,
+ actual_warning.message,
+ actual_warning.filename,
+ actual_warning.lineno))
+ if expected_warning:
+ msg = "Did not see expected warning of class {name!r}.".format(
+ name=expected_warning.__name__)
+ assert saw_warning, msg
+ assert not extra_warnings, ("Caused unexpected warning(s): {extra!r}."
+ ).format(extra=extra_warnings)
+
+
+class RNGContext(object):
+ """
+ Context manager to set the numpy random number generator seed. Returns
+ to the original value upon exiting the context manager.
+
+ Parameters
+ ----------
+ seed : int
+ Seed for numpy.random.seed
+
+ Examples
+ --------
+
+ with RNGContext(42):
+ np.random.randn()
+ """
+
+ def __init__(self, seed):
+ self.seed = seed
+
+ def __enter__(self):
+
+ self.start_state = np.random.get_state()
+ np.random.seed(self.seed)
+
+ def __exit__(self, exc_type, exc_value, traceback):
+
+ np.random.set_state(self.start_state)
+
+
+@contextmanager
+def with_csv_dialect(name, **kwargs):
+ """
+ Context manager to temporarily register a CSV dialect for parsing CSV.
+
+ Parameters
+ ----------
+ name : str
+ The name of the dialect.
+ kwargs : mapping
+ The parameters for the dialect.
+
+ Raises
+ ------
+ ValueError : the name of the dialect conflicts with a builtin one.
+
+ See Also
+ --------
+ csv : Python's CSV library.
+ """
+ import csv
+ _BUILTIN_DIALECTS = {"excel", "excel-tab", "unix"}
+
+ if name in _BUILTIN_DIALECTS:
+ raise ValueError("Cannot override builtin dialect.")
+
+ csv.register_dialect(name, **kwargs)
+ try:
+ yield
+ finally:
+ csv.unregister_dialect(name)
+
+
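+# --- Illustrative usage (editor's sketch, not part of upstream pandas) ---
+# A hedged example of with_csv_dialect: register a temporary 'semi' dialect
+# that splits on semicolons, parse with it, and rely on the context manager
+# to unregister it again on exit. Wrapped in a function so nothing runs at
+# import time.
+def _example_with_csv_dialect():
+ data = 'a;b\n1;2\n3;4'
+ with with_csv_dialect('semi', delimiter=';'):
+ df = pd.read_csv(compat.StringIO(data), dialect='semi')
+ assert list(df.columns) == ['a', 'b']
+ return df
+
+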
+@contextmanager
+def use_numexpr(use, min_elements=None):
+ from pandas.core.computation import expressions as expr
+ if min_elements is None:
+ min_elements = expr._MIN_ELEMENTS
+
+ olduse = expr._USE_NUMEXPR
+ oldmin = expr._MIN_ELEMENTS
+ expr.set_use_numexpr(use)
+ expr._MIN_ELEMENTS = min_elements
+ try:
+ yield
+ finally:
+ expr._MIN_ELEMENTS = oldmin
+ expr.set_use_numexpr(olduse)
+
+
+def test_parallel(num_threads=2, kwargs_list=None):
+ """Decorator to run the same function multiple times in parallel.
+
+ Parameters
+ ----------
+ num_threads : int, optional
+ The number of times the function is run in parallel.
+ kwargs_list : list of dicts, optional
+ The list of kwargs to update original
+ function kwargs on different threads.
+
+ Notes
+ -----
+ This decorator does not pass the return value of the decorated function.
+
+ Original from scikit-image:
+
+ https://github.com/scikit-image/scikit-image/pull/1519
+
+ """
+
+ assert num_threads > 0
+ has_kwargs_list = kwargs_list is not None
+ if has_kwargs_list:
+ assert len(kwargs_list) == num_threads
+ import threading
+
+ def wrapper(func):
+ @wraps(func)
+ def inner(*args, **kwargs):
+ if has_kwargs_list:
+ update_kwargs = lambda i: dict(kwargs, **kwargs_list[i])
+ else:
+ update_kwargs = lambda i: kwargs
+ threads = []
+ for i in range(num_threads):
+ updated_kwargs = update_kwargs(i)
+ thread = threading.Thread(target=func, args=args,
+ kwargs=updated_kwargs)
+ threads.append(thread)
+ for thread in threads:
+ thread.start()
+ for thread in threads:
+ thread.join()
+ return inner
+ return wrapper
+
+
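+# --- Illustrative usage (editor's sketch, not part of upstream pandas) ---
+# A hedged example of test_parallel: the decorated function below runs on
+# three threads per call and records its calls in a module-level list, since
+# the decorator discards return values. Nothing runs at import time beyond
+# constructing the decorated wrapper.
+_example_parallel_calls = []
+
+
+@test_parallel(num_threads=3)
+def _example_parallel_append():
+ _example_parallel_calls.append(1)
+
+
+# Calling _example_parallel_append() once appends three items to
+# _example_parallel_calls (one per thread).
+
+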
+class SubclassedSeries(Series):
+ _metadata = ['testattr', 'name']
+
+ @property
+ def _constructor(self):
+ return SubclassedSeries
+
+ @property
+ def _constructor_expanddim(self):
+ return SubclassedDataFrame
+
+
+class SubclassedDataFrame(DataFrame):
+ _metadata = ['testattr']
+
+ @property
+ def _constructor(self):
+ return SubclassedDataFrame
+
+ @property
+ def _constructor_sliced(self):
+ return SubclassedSeries
+
+
+class SubclassedSparseSeries(pd.SparseSeries):
+ _metadata = ['testattr']
+
+ @property
+ def _constructor(self):
+ return SubclassedSparseSeries
+
+ @property
+ def _constructor_expanddim(self):
+ return SubclassedSparseDataFrame
+
+
+class SubclassedSparseDataFrame(pd.SparseDataFrame):
+ _metadata = ['testattr']
+
+ @property
+ def _constructor(self):
+ return SubclassedSparseDataFrame
+
+ @property
+ def _constructor_sliced(self):
+ return SubclassedSparseSeries
+
+
+class SubclassedCategorical(Categorical):
+
+ @property
+ def _constructor(self):
+ return SubclassedCategorical
+
+
+@contextmanager
+def set_timezone(tz):
+ """Context manager for temporarily setting a timezone.
+
+ Parameters
+ ----------
+ tz : str
+ A string representing a valid timezone.
+
+ Examples
+ --------
+
+ >>> from datetime import datetime
+ >>> from dateutil.tz import tzlocal
+ >>> tzlocal().tzname(datetime.now())
+ 'IST'
+
+ >>> with set_timezone('US/Eastern'):
+ ... tzlocal().tzname(datetime.now())
+ ...
+ 'EDT'
+ """
+
+ import os
+ import time
+
+ def setTZ(tz):
+ if tz is None:
+ try:
+ del os.environ['TZ']
+ except KeyError:
+ pass
+ else:
+ os.environ['TZ'] = tz
+ time.tzset()
+
+ orig_tz = os.environ.get('TZ')
+ setTZ(tz)
+ try:
+ yield
+ finally:
+ setTZ(orig_tz)
+
+
+def _make_skipna_wrapper(alternative, skipna_alternative=None):
+ """Create a function for calling on an array.
+
+ Parameters
+ ----------
+ alternative : function
+ The function to be called on the array with no NaNs.
+ Only used when 'skipna_alternative' is None.
+ skipna_alternative : function
+ The function to be called on the original array
+
+ Returns
+ -------
+ skipna_wrapper : function
+ """
+ if skipna_alternative:
+ def skipna_wrapper(x):
+ return skipna_alternative(x.values)
+ else:
+ def skipna_wrapper(x):
+ nona = x.dropna()
+ if len(nona) == 0:
+ return np.nan
+ return alternative(nona)
+
+ return skipna_wrapper
+
+
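+# --- Illustrative usage (editor's sketch, not part of upstream pandas) ---
+# A hedged example of _make_skipna_wrapper: build a NaN-ignoring version of
+# np.sum, and compare it with handing np.nansum in as skipna_alternative.
+# Wrapped in a function so nothing runs at import time.
+def _example_make_skipna_wrapper():
+ s = Series([1.0, np.nan, 2.0])
+ via_dropna = _make_skipna_wrapper(np.sum)
+ via_nansum = _make_skipna_wrapper(np.sum, skipna_alternative=np.nansum)
+ assert via_dropna(s) == 3.0
+ assert via_nansum(s) == 3.0
+ return via_dropna(s), via_nansum(s)
+
+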
+def convert_rows_list_to_csv_str(rows_list):
+ """
+ Convert list of CSV rows to single CSV-formatted string for current OS.
+
+ This method is used for creating expected value of to_csv() method.
+
+ Parameters
+ ----------
+ rows_list : list
+ The list of strings. Each element represents a row of the CSV.
+
+ Returns
+ -------
+ expected : string
+ Expected output of to_csv() on the current OS.
+ """
+ sep = os.linesep
+ expected = sep.join(rows_list) + sep
+ return expected
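+
+
+# --- Illustrative usage (editor's sketch, not part of upstream pandas) ---
+# A hedged example of convert_rows_list_to_csv_str: the expected to_csv()
+# output for two rows is simply the rows joined with, and terminated by,
+# the OS line separator. Wrapped in a function so nothing runs at import
+# time.
+def _example_convert_rows_list_to_csv_str():
+ rows = ['a,b', '1,2']
+ expected = convert_rows_list_to_csv_str(rows)
+ assert expected == 'a,b' + os.linesep + '1,2' + os.linesep
+ return expected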
diff --git a/contrib/python/pandas/py2/symbols.cmake b/contrib/python/pandas/py2/symbols.cmake
new file mode 100644
index 00000000000..d4e9cf71a4f
--- /dev/null
+++ b/contrib/python/pandas/py2/symbols.cmake
@@ -0,0 +1,175 @@
+CFLAGS(
+ -DBuffer_AppendDoubleUnchecked=_pandas_Buffer_AppendDoubleUnchecked
+ -DBuffer_AppendIntUnchecked=_pandas_Buffer_AppendIntUnchecked
+ -DBuffer_AppendLongUnchecked=_pandas_Buffer_AppendLongUnchecked
+ -DBuffer_EscapeStringUnvalidated=_pandas_Buffer_EscapeStringUnvalidated
+ -DBuffer_EscapeStringValidated=_pandas_Buffer_EscapeStringValidated
+ -DBuffer_Realloc=_pandas_Buffer_Realloc
+ -DDataFrame_iterBegin=_pandas_DataFrame_iterBegin
+ -DDataFrame_iterEnd=_pandas_DataFrame_iterEnd
+ -DDataFrame_iterGetName=_pandas_DataFrame_iterGetName
+ -DDataFrame_iterGetValue=_pandas_DataFrame_iterGetValue
+ -DDataFrame_iterNext=_pandas_DataFrame_iterNext
+ -DDict_iterBegin=_pandas_Dict_iterBegin
+ -DDict_iterEnd=_pandas_Dict_iterEnd
+ -DDict_iterGetName=_pandas_Dict_iterGetName
+ -DDict_iterGetValue=_pandas_Dict_iterGetValue
+ -DDict_iterNext=_pandas_Dict_iterNext
+ -DDir_iterBegin=_pandas_Dir_iterBegin
+ -DDir_iterEnd=_pandas_Dir_iterEnd
+ -DDir_iterGetName=_pandas_Dir_iterGetName
+ -DDir_iterGetValue=_pandas_Dir_iterGetValue
+ -DDir_iterNext=_pandas_Dir_iterNext
+ -DIndex_iterBegin=_pandas_Index_iterBegin
+ -DIndex_iterEnd=_pandas_Index_iterEnd
+ -DIndex_iterGetName=_pandas_Index_iterGetName
+ -DIndex_iterGetValue=_pandas_Index_iterGetValue
+ -DIndex_iterNext=_pandas_Index_iterNext
+ -DIter_iterBegin=_pandas_Iter_iterBegin
+ -DIter_iterEnd=_pandas_Iter_iterEnd
+ -DIter_iterGetName=_pandas_Iter_iterGetName
+ -DIter_iterGetValue=_pandas_Iter_iterGetValue
+ -DIter_iterNext=_pandas_Iter_iterNext
+ -DJSONFileToObj=_pandas_JSONFileToObj
+ -DJSONToObj=_pandas_JSONToObj
+ -DJSON_DecodeObject=_pandas_JSON_DecodeObject
+ -DJSON_EncodeObject=_pandas_JSON_EncodeObject
+ -DList_iterBegin=_pandas_List_iterBegin
+ -DList_iterEnd=_pandas_List_iterEnd
+ -DList_iterGetName=_pandas_List_iterGetName
+ -DList_iterGetValue=_pandas_List_iterGetValue
+ -DList_iterNext=_pandas_List_iterNext
+ -DNpyArrPassThru_iterBegin=_pandas_NpyArrPassThru_iterBegin
+ -DNpyArrPassThru_iterEnd=_pandas_NpyArrPassThru_iterEnd
+ -DNpyArr_encodeLabels=_pandas_NpyArr_encodeLabels
+ -DNpyArr_freeLabels=_pandas_NpyArr_freeLabels
+ -DNpyArr_iterBegin=_pandas_NpyArr_iterBegin
+ -DNpyArr_iterEnd=_pandas_NpyArr_iterEnd
+ -DNpyArr_iterGetName=_pandas_NpyArr_iterGetName
+ -DNpyArr_iterGetValue=_pandas_NpyArr_iterGetValue
+ -DNpyArr_iterNext=_pandas_NpyArr_iterNext
+ -DNpyArr_iterNextItem=_pandas_NpyArr_iterNextItem
+ -DNpyArr_iterNextNone=_pandas_NpyArr_iterNextNone
+ -DNpy_releaseContext=_pandas_Npy_releaseContext
+ -DNpy_returnLabelled=_pandas_Npy_returnLabelled
+ -DObject_arrayAddItem=_pandas_Object_arrayAddItem
+ -DObject_beginTypeContext=_pandas_Object_beginTypeContext
+ -DObject_endArray=_pandas_Object_endArray
+ -DObject_endObject=_pandas_Object_endObject
+ -DObject_endTypeContext=_pandas_Object_endTypeContext
+ -DObject_getDoubleValue=_pandas_Object_getDoubleValue
+ -DObject_getIntValue=_pandas_Object_getIntValue
+ -DObject_getLongValue=_pandas_Object_getLongValue
+ -DObject_getStringValue=_pandas_Object_getStringValue
+ -DObject_invokeDefaultHandler=_pandas_Object_invokeDefaultHandler
+ -DObject_iterBegin=_pandas_Object_iterBegin
+ -DObject_iterEnd=_pandas_Object_iterEnd
+ -DObject_iterGetName=_pandas_Object_iterGetName
+ -DObject_iterGetValue=_pandas_Object_iterGetValue
+ -DObject_iterNext=_pandas_Object_iterNext
+ -DObject_newArray=_pandas_Object_newArray
+ -DObject_newDouble=_pandas_Object_newDouble
+ -DObject_newFalse=_pandas_Object_newFalse
+ -DObject_newInteger=_pandas_Object_newInteger
+ -DObject_newLong=_pandas_Object_newLong
+ -DObject_newNull=_pandas_Object_newNull
+ -DObject_newObject=_pandas_Object_newObject
+ -DObject_newString=_pandas_Object_newString
+ -DObject_newTrue=_pandas_Object_newTrue
+ -DObject_npyArrayAddItem=_pandas_Object_npyArrayAddItem
+ -DObject_npyArrayListAddItem=_pandas_Object_npyArrayListAddItem
+ -DObject_npyEndArray=_pandas_Object_npyEndArray
+ -DObject_npyEndArrayList=_pandas_Object_npyEndArrayList
+ -DObject_npyEndObject=_pandas_Object_npyEndObject
+ -DObject_npyNewArray=_pandas_Object_npyNewArray
+ -DObject_npyNewArrayList=_pandas_Object_npyNewArrayList
+ -DObject_npyNewObject=_pandas_Object_npyNewObject
+ -DObject_npyObjectAddKey=_pandas_Object_npyObjectAddKey
+ -DObject_objectAddKey=_pandas_Object_objectAddKey
+ -DPdBlockPassThru_iterBegin=_pandas_PdBlockPassThru_iterBegin
+ -DPdBlockPassThru_iterEnd=_pandas_PdBlockPassThru_iterEnd
+ -DPdBlock_iterBegin=_pandas_PdBlock_iterBegin
+ -DPdBlock_iterEnd=_pandas_PdBlock_iterEnd
+ -DPdBlock_iterGetName=_pandas_PdBlock_iterGetName
+ -DPdBlock_iterGetName_Transpose=_pandas_PdBlock_iterGetName_Transpose
+ -DPdBlock_iterNext=_pandas_PdBlock_iterNext
+ -DPdBlock_iterNextItem=_pandas_PdBlock_iterNextItem
+ -DSeries_iterBegin=_pandas_Series_iterBegin
+ -DSeries_iterEnd=_pandas_Series_iterEnd
+ -DSeries_iterGetName=_pandas_Series_iterGetName
+ -DSeries_iterGetValue=_pandas_Series_iterGetValue
+ -DSeries_iterNext=_pandas_Series_iterNext
+ -DSkipWhitespace=_pandas_SkipWhitespace
+ -DTuple_iterBegin=_pandas_Tuple_iterBegin
+ -DTuple_iterEnd=_pandas_Tuple_iterEnd
+ -DTuple_iterGetName=_pandas_Tuple_iterGetName
+ -DTuple_iterGetValue=_pandas_Tuple_iterGetValue
+ -DTuple_iterNext=_pandas_Tuple_iterNext
+ -DUJSON_NUMPY=_pandas_UJSON_NUMPY
+ -D_NS_MAX_DTS=_pandas__NS_MAX_DTS
+ -D_NS_MIN_DTS=_pandas__NS_MIN_DTS
+ -D_tokenize_helper=_pandas__tokenize_helper
+ -Dadd_minutes_to_datetimestruct=_pandas_add_minutes_to_datetimestruct
+ -Dadd_seconds_to_datetimestruct=_pandas_add_seconds_to_datetimestruct
+ -Dbuffer_file_bytes=_pandas_buffer_file_bytes
+ -Dbuffer_mmap_bytes=_pandas_buffer_mmap_bytes
+ -Dbuffer_rd_bytes=_pandas_buffer_rd_bytes
+ -Dcmp_npy_datetimestruct=_pandas_cmp_npy_datetimestruct
+ -Dcoliter_new=_pandas_coliter_new
+ -Dcoliter_setup=_pandas_coliter_setup
+ -Dconvert_pydatetime_to_datetimestruct=_pandas_convert_pydatetime_to_datetimestruct
+ -DcreateDouble=_pandas_createDouble
+ -Ddays_per_month_table=_pandas_days_per_month_table
+ -DdecodePreciseFloat=_pandas_decodePreciseFloat
+ -Ddecode_any=_pandas_decode_any
+ -Ddecode_array=_pandas_decode_array
+ -Ddecode_false=_pandas_decode_false
+ -Ddecode_null=_pandas_decode_null
+ -Ddecode_numeric=_pandas_decode_numeric
+ -Ddecode_object=_pandas_decode_object
+ -Ddecode_string=_pandas_decode_string
+ -Ddecode_true=_pandas_decode_true
+ -Ddel_file_source=_pandas_del_file_source
+ -Ddel_mmap=_pandas_del_mmap
+ -Ddel_rd_source=_pandas_del_rd_source
+ -Dencode=_pandas_encode
+ -Dfloatify=_pandas_floatify
+ -Dget_datetime_iso_8601_strlen=_pandas_get_datetime_iso_8601_strlen
+ -Dget_datetimestruct_days=_pandas_get_datetimestruct_days
+ -Dget_nat=_pandas_get_nat
+ -Dget_parser_memory_footprint=_pandas_get_parser_memory_footprint
+ -DinitObjToJSON=_pandas_initObjToJSON
+ -Dis_leapyear=_pandas_is_leapyear
+ -Dmake_iso_8601_datetime=_pandas_make_iso_8601_datetime
+ -Dnew_file_source=_pandas_new_file_source
+ -Dnew_mmap=_pandas_new_mmap
+ -Dnew_rd_source=_pandas_new_rd_source
+ -Dnpy_datetimestruct_to_datetime=_pandas_npy_datetimestruct_to_datetime
+ -DobjToJSON=_pandas_objToJSON
+ -DobjToJSONFile=_pandas_objToJSONFile
+ -Dparse_iso_8601_datetime=_pandas_parse_iso_8601_datetime
+ -Dparser_add_skiprow=_pandas_parser_add_skiprow
+ -Dparser_cleanup=_pandas_parser_cleanup
+ -Dparser_clear_data_buffers=_pandas_parser_clear_data_buffers
+ -Dparser_consume_rows=_pandas_parser_consume_rows
+ -Dparser_del=_pandas_parser_del
+ -Dparser_free=_pandas_parser_free
+ -Dparser_init=_pandas_parser_init
+ -Dparser_new=_pandas_parser_new
+ -Dparser_set_default_options=_pandas_parser_set_default_options
+ -Dparser_set_skipfirstnrows=_pandas_parser_set_skipfirstnrows
+ -Dparser_trim_buffers=_pandas_parser_trim_buffers
+ -Dprecise_xstrtod=_pandas_precise_xstrtod
+ -Dround_trip=_pandas_round_trip
+ -Dskip_this_line=_pandas_skip_this_line
+ -Dstr_to_int64=_pandas_str_to_int64
+ -Dstr_to_uint64=_pandas_str_to_uint64
+ -Dto_boolean=_pandas_to_boolean
+ -Dto_double=_pandas_to_double
+ -Dtokenize_all_rows=_pandas_tokenize_all_rows
+ -Dtokenize_bytes=_pandas_tokenize_bytes
+ -Dtokenize_nrows=_pandas_tokenize_nrows
+ -Duint64_conflict=_pandas_uint64_conflict
+ -Duint_state_init=_pandas_uint_state_init
+ -Dxstrtod=_pandas_xstrtod
+)
diff --git a/contrib/python/pandas/py2/ya.make b/contrib/python/pandas/py2/ya.make
new file mode 100644
index 00000000000..8f6c5a8c918
--- /dev/null
+++ b/contrib/python/pandas/py2/ya.make
@@ -0,0 +1,299 @@
+PY2_LIBRARY()
+
+LICENSE(BSD-3-Clause)
+
+VERSION(0.24.2)
+
+PEERDIR(
+ contrib/python/Jinja2
+ contrib/python/python-dateutil
+ contrib/python/numpy
+ contrib/python/pytz
+)
+
+ADDINCL(
+ FOR cython contrib/python/pandas/py2
+ contrib/python/pandas/py2/pandas/_libs
+ contrib/python/pandas/py2/pandas/_libs/src
+ contrib/python/pandas/py2/pandas/_libs/src/klib
+ contrib/python/pandas/py2/pandas/_libs/src/msgpack
+ contrib/python/pandas/py2/pandas/_libs/src/ujson/lib
+ contrib/python/pandas/py2/pandas/_libs/tslibs
+)
+
+NO_CHECK_IMPORTS(
+ pandas.io.*
+ pandas.plotting.*
+ pandas.tseries.*
+ pandas.util.*
+)
+
+NO_COMPILER_WARNINGS()
+
+NO_LINT()
+
+INCLUDE(symbols.cmake)
+
+PY_REGISTER(pandas._libs.json)
+PY_REGISTER(pandas.util._move)
+
+SRCS(
+ pandas/_libs/src/parser/io.c
+ pandas/_libs/src/parser/tokenizer.c
+ pandas/_libs/src/ujson/lib/ultrajsondec.c
+ pandas/_libs/src/ujson/lib/ultrajsonenc.c
+ pandas/_libs/src/ujson/python/JSONtoObj.c
+ pandas/_libs/src/ujson/python/objToJSON.c
+ pandas/_libs/src/ujson/python/ujson.c
+ pandas/_libs/tslibs/src/datetime/np_datetime.c
+ pandas/_libs/tslibs/src/datetime/np_datetime_strings.c
+ pandas/util/move.c
+)
+
+PY_SRCS(
+ TOP_LEVEL
+
+ CYTHON_C
+ pandas/_libs/algos.pyx
+ pandas/_libs/groupby.pyx
+ pandas/_libs/hashing.pyx
+ pandas/_libs/hashtable.pyx
+ pandas/_libs/index.pyx
+ pandas/_libs/indexing.pyx
+ pandas/_libs/internals.pyx
+ pandas/_libs/interval.pyx
+ pandas/_libs/join.pyx
+ pandas/_libs/lib.pyx
+ pandas/_libs/missing.pyx
+ pandas/_libs/ops.pyx
+ pandas/_libs/parsers.pyx
+ pandas/_libs/properties.pyx
+ pandas/_libs/reduction.pyx
+ pandas/_libs/reshape.pyx
+ pandas/_libs/skiplist.pyx
+ pandas/_libs/sparse.pyx
+ pandas/_libs/testing.pyx
+ pandas/_libs/tslib.pyx
+ pandas/_libs/tslibs/ccalendar.pyx
+ pandas/_libs/tslibs/conversion.pyx
+ pandas/_libs/tslibs/fields.pyx
+ pandas/_libs/tslibs/frequencies.pyx
+ pandas/_libs/tslibs/nattype.pyx
+ pandas/_libs/tslibs/np_datetime.pyx
+ pandas/_libs/tslibs/offsets.pyx
+ pandas/_libs/tslibs/parsing.pyx
+ pandas/_libs/tslibs/period.pyx
+ pandas/_libs/tslibs/resolution.pyx
+ pandas/_libs/tslibs/strptime.pyx
+ pandas/_libs/tslibs/timedeltas.pyx
+ pandas/_libs/tslibs/timestamps.pyx
+ pandas/_libs/tslibs/timezones.pyx
+ pandas/_libs/writers.pyx
+ pandas/io/sas/sas.pyx
+
+ CYTHON_CPP
+ pandas/_libs/window.pyx
+ pandas/io/msgpack/_packer.pyx
+ pandas/io/msgpack/_unpacker.pyx
+
+ pandas/__init__.py
+ pandas/_libs/__init__.py
+ pandas/_libs/tslibs/__init__.py
+ pandas/_version.py
+ pandas/api/__init__.py
+ pandas/api/extensions/__init__.py
+ pandas/api/types/__init__.py
+ pandas/arrays/__init__.py
+ pandas/compat/__init__.py
+ pandas/compat/chainmap.py
+ pandas/compat/chainmap_impl.py
+ pandas/compat/numpy/__init__.py
+ pandas/compat/numpy/function.py
+ pandas/compat/pickle_compat.py
+ pandas/core/__init__.py
+ pandas/core/accessor.py
+ pandas/core/algorithms.py
+ pandas/core/api.py
+ pandas/core/apply.py
+ pandas/core/arrays/__init__.py
+ pandas/core/arrays/_ranges.py
+ pandas/core/arrays/array_.py
+ pandas/core/arrays/base.py
+ pandas/core/arrays/categorical.py
+ pandas/core/arrays/datetimelike.py
+ pandas/core/arrays/datetimes.py
+ pandas/core/arrays/integer.py
+ pandas/core/arrays/interval.py
+ pandas/core/arrays/numpy_.py
+ pandas/core/arrays/period.py
+ pandas/core/arrays/sparse.py
+ pandas/core/arrays/timedeltas.py
+ pandas/core/base.py
+ pandas/core/categorical.py
+ pandas/core/common.py
+ pandas/core/computation/__init__.py
+ pandas/core/computation/align.py
+ pandas/core/computation/api.py
+ pandas/core/computation/check.py
+ pandas/core/computation/common.py
+ pandas/core/computation/engines.py
+ pandas/core/computation/eval.py
+ pandas/core/computation/expr.py
+ pandas/core/computation/expressions.py
+ pandas/core/computation/ops.py
+ pandas/core/computation/pytables.py
+ pandas/core/computation/scope.py
+ pandas/core/config.py
+ pandas/core/config_init.py
+ pandas/core/dtypes/__init__.py
+ pandas/core/dtypes/api.py
+ pandas/core/dtypes/base.py
+ pandas/core/dtypes/cast.py
+ pandas/core/dtypes/common.py
+ pandas/core/dtypes/concat.py
+ pandas/core/dtypes/dtypes.py
+ pandas/core/dtypes/generic.py
+ pandas/core/dtypes/inference.py
+ pandas/core/dtypes/missing.py
+ pandas/core/frame.py
+ pandas/core/generic.py
+ pandas/core/groupby/__init__.py
+ pandas/core/groupby/base.py
+ pandas/core/groupby/categorical.py
+ pandas/core/groupby/generic.py
+ pandas/core/groupby/groupby.py
+ pandas/core/groupby/grouper.py
+ pandas/core/groupby/ops.py
+ pandas/core/index.py
+ pandas/core/indexes/__init__.py
+ pandas/core/indexes/accessors.py
+ pandas/core/indexes/api.py
+ pandas/core/indexes/base.py
+ pandas/core/indexes/category.py
+ pandas/core/indexes/datetimelike.py
+ pandas/core/indexes/datetimes.py
+ pandas/core/indexes/frozen.py
+ pandas/core/indexes/interval.py
+ pandas/core/indexes/multi.py
+ pandas/core/indexes/numeric.py
+ pandas/core/indexes/period.py
+ pandas/core/indexes/range.py
+ pandas/core/indexes/timedeltas.py
+ pandas/core/indexing.py
+ pandas/core/internals/__init__.py
+ pandas/core/internals/arrays.py
+ pandas/core/internals/blocks.py
+ pandas/core/internals/concat.py
+ pandas/core/internals/construction.py
+ pandas/core/internals/managers.py
+ pandas/core/missing.py
+ pandas/core/nanops.py
+ pandas/core/ops.py
+ pandas/core/panel.py
+ pandas/core/resample.py
+ pandas/core/reshape/__init__.py
+ pandas/core/reshape/api.py
+ pandas/core/reshape/concat.py
+ pandas/core/reshape/melt.py
+ pandas/core/reshape/merge.py
+ pandas/core/reshape/pivot.py
+ pandas/core/reshape/reshape.py
+ pandas/core/reshape/tile.py
+ pandas/core/reshape/util.py
+ pandas/core/series.py
+ pandas/core/sorting.py
+ pandas/core/sparse/__init__.py
+ pandas/core/sparse/api.py
+ pandas/core/sparse/frame.py
+ pandas/core/sparse/scipy_sparse.py
+ pandas/core/sparse/series.py
+ pandas/core/strings.py
+ pandas/core/tools/__init__.py
+ pandas/core/tools/datetimes.py
+ pandas/core/tools/numeric.py
+ pandas/core/tools/timedeltas.py
+ pandas/core/util/__init__.py
+ pandas/core/util/hashing.py
+ pandas/core/window.py
+ pandas/errors/__init__.py
+ pandas/io/__init__.py
+ pandas/io/api.py
+ pandas/io/clipboard/__init__.py
+ pandas/io/clipboard/clipboards.py
+ pandas/io/clipboard/exceptions.py
+ pandas/io/clipboard/windows.py
+ pandas/io/clipboards.py
+ pandas/io/common.py
+ pandas/io/date_converters.py
+ pandas/io/excel.py
+ pandas/io/feather_format.py
+ pandas/io/formats/__init__.py
+ pandas/io/formats/console.py
+ pandas/io/formats/css.py
+ pandas/io/formats/csvs.py
+ pandas/io/formats/excel.py
+ pandas/io/formats/format.py
+ pandas/io/formats/html.py
+ pandas/io/formats/latex.py
+ pandas/io/formats/printing.py
+ pandas/io/formats/style.py
+ pandas/io/formats/terminal.py
+ pandas/io/gbq.py
+ pandas/io/gcs.py
+ pandas/io/html.py
+ pandas/io/json/__init__.py
+ pandas/io/json/json.py
+ pandas/io/json/normalize.py
+ pandas/io/json/table_schema.py
+ pandas/io/msgpack/__init__.py
+ pandas/io/msgpack/_version.py
+ pandas/io/msgpack/exceptions.py
+ pandas/io/packers.py
+ pandas/io/parquet.py
+ pandas/io/parsers.py
+ pandas/io/pickle.py
+ pandas/io/pytables.py
+ pandas/io/s3.py
+ pandas/io/sas/__init__.py
+ pandas/io/sas/sas7bdat.py
+ pandas/io/sas/sas_constants.py
+ pandas/io/sas/sas_xport.py
+ pandas/io/sas/sasreader.py
+ pandas/io/sql.py
+ pandas/io/stata.py
+ pandas/plotting/__init__.py
+ pandas/plotting/_compat.py
+ pandas/plotting/_converter.py
+ pandas/plotting/_core.py
+ pandas/plotting/_misc.py
+ pandas/plotting/_style.py
+ pandas/plotting/_timeseries.py
+ pandas/plotting/_tools.py
+ pandas/testing.py
+ pandas/tseries/__init__.py
+ pandas/tseries/api.py
+ pandas/tseries/converter.py
+ pandas/tseries/frequencies.py
+ pandas/tseries/holiday.py
+ pandas/tseries/offsets.py
+ pandas/tseries/plotting.py
+ pandas/util/__init__.py
+ pandas/util/_decorators.py
+ pandas/util/_depr_module.py
+ pandas/util/_doctools.py
+ pandas/util/_exceptions.py
+ pandas/util/_print_versions.py
+ pandas/util/_test_decorators.py
+ pandas/util/_tester.py
+ pandas/util/_validators.py
+ pandas/util/testing.py
+)
+
+RESOURCE_FILES(
+ PREFIX contrib/python/pandas/py2/
+ .dist-info/METADATA
+ .dist-info/top_level.txt
+ pandas/io/formats/templates/html.tpl
+)
+
+END()